In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

***
## data loading

In [2]:
# Cut-off date for the hold-out split: rows with date <= split_date become
# `train`, rows with a later date become `valid` (see the split cell below).
split_date = "2021-03-01"

In [3]:
# Load the raw sales table and the list of SKUs flagged as unpredictable,
# then keep only the rows whose SKU is absent from that list.
data = pd.read_parquet("../data/train.parquet")
unpredictable = pd.read_csv("../data/unpredictable.csv")
is_unpredictable = data["sku"].isin(unpredictable["sku"])
data = data[~is_unpredictable].reset_index(drop=True)

In [4]:
# Temporal hold-out split: rows dated on or before `split_date` go to `train`,
# strictly later rows go to `valid`.
# NOTE(review): the dtype of `date` is not visible here; if it is stored as a
# string the comparison is lexicographic, which only matches chronological
# order for ISO-8601 (YYYY-MM-DD) values — confirm.
train = data.query("date <= @split_date")
valid = data.query("date > @split_date")

In [5]:
def _zero_sales_skus(frame):
    """Return the set of SKUs whose `sold_quantity` sums to exactly 0 in `frame`."""
    # Aggregate once and reuse the result for both the mask and the selection.
    total_sold = frame.groupby("sku")["sold_quantity"].sum()
    return set(total_sold[total_sold == 0].index)


# SKUs without a single sale in the training window / in the validation window.
to_remove1 = _zero_sales_skus(train)
to_remove2 = _zero_sales_skus(valid)
# Sanity check: no SKU is expected to have zero sales in both windows.
assert len(to_remove1 & to_remove2) == 0, "found SKUs with zero sales in both train and valid"
to_remove = to_remove1 | to_remove2

In [6]:
# Discard every SKU listed in `to_remove` from both splits and renumber the rows.
train = train[~train["sku"].isin(to_remove)].reset_index(drop=True)
valid = valid[~valid["sku"].isin(to_remove)].reset_index(drop=True)

In [7]:
# Number of distinct SKUs left in the training split.
train["sku"].nunique()

496797

In [8]:
# Number of distinct SKUs left in the validation split.
valid["sku"].nunique()

509939

In [9]:
# SKUs that appear in the training split as well as in the validation split.
in_both = set(train["sku"]).intersection(set(valid["sku"]))
len(in_both)

496797

In [10]:
# Restrict both splits to the SKUs they have in common and renumber the rows.
train = train[train["sku"].isin(in_both)].reset_index(drop=True)
valid = valid[valid["sku"].isin(in_both)].reset_index(drop=True)

In [11]:
# Persist the SKUs used for assessment. `in_both` is a set, whose iteration
# order is an implementation detail, so sort it to make the row order of the
# written CSV reproducible from run to run.
skus_for_assess = pd.DataFrame({"sku": sorted(in_both)})
skus_for_assess.to_csv("../data/skus_for_assess.csv", index=False)
skus_for_assess

Unnamed: 0,sku
0,1
1,3
2,4
3,5
4,6
...,...
496792,660911
496793,660912
496794,660913
496795,660914


***
## generating multiple validation sets

In [12]:
def create_validation_set(dataset, seed, n_days=30):
    """Draw one random stock-out target per SKU from its cumulative sales.

    For every SKU the `sold_quantity` values are ordered by `date` and
    accumulated. A SKU is kept only if it has exactly `n_days` rows and its
    final cumulative total is greater than 0. For a kept SKU a non-zero value
    of the cumulative series is drawn at random and reported together with the
    1-based position of the first row at which the series equals that value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns 'sku', 'date' and 'sold_quantity'.
    seed : int
        Seed for the random draws. The draws come from a private
        `numpy.random.RandomState(seed)`, which yields the same sequence as
        `numpy.random.seed(seed)` followed by `numpy.random.choice`, without
        modifying NumPy's global random state.
    n_days : int, optional
        Exact number of rows a SKU must have to be included. Defaults to 30,
        the value that used to be hard-coded.

    Returns
    -------
    list of dict
        One dict per kept SKU with the keys 'sku', 'target_stock' and
        'inventory_days', in ascending SKU order.
    """
    # Local generator: reproducible per seed, no side effect on np.random.
    rng = np.random.RandomState(seed)

    print('Sorting records...')
    temp_pd = dataset.loc[:, ['sku', 'date', 'sold_quantity']].sort_values(['sku', 'date'])

    print('Grouping quantity...')
    temp_dict = temp_pd.groupby('sku')['sold_quantity'].agg(list).to_dict()

    result = []
    for idx, list_quantity in tqdm(temp_dict.items(), desc='Making targets...'):
        cumsum = np.array(list_quantity).cumsum()

        # Check the length first so the last element is only read from a
        # series of the expected size; `not (... > 0)` also rejects NaN totals.
        if len(cumsum) != n_days or not cumsum[-1] > 0:
            continue

        # Redraw until a non-zero target comes up; this terminates because the
        # final cumulative value is known to be positive.
        stock_target = 0
        while stock_target == 0:
            stock_target = rng.choice(cumsum)

        # 1-based index of the first row whose cumulative sales equal the target.
        day_to_stockout = np.flatnonzero(cumsum == stock_target)[0] + 1

        result.append({'sku': idx, 'target_stock': stock_target, 'inventory_days': day_to_stockout})
    return result

In [13]:
# One validation file is generated and written per seed in this list.
seed_list = [2, 3, 5, 7, 11, 13, 17, 19, 23]

for seed in seed_list:
    out_path = f"../data/validation_seed{seed}.csv"
    valid_dataset = pd.DataFrame(create_validation_set(valid, seed=seed))
    print("Number of skus:", valid_dataset.sku.nunique())
    valid_dataset.to_csv(out_path, index=False)

Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:19<00:00, 25421.23it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:19<00:00, 25750.39it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:19<00:00, 26038.12it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:19<00:00, 26084.60it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:19<00:00, 25963.24it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:19<00:00, 25880.39it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:19<00:00, 25916.90it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:18<00:00, 26186.46it/s]


Number of skus: 496797
Sorting records...
Grouping quantity...


Making targets...: 100%|██████████| 496797/496797 [00:18<00:00, 26336.47it/s]


Number of skus: 496797


***