In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

In [None]:
def mimic_missing(patch_ts, p=0.3, max_missing_patch=7, min_missing_patch=3, seed=None):
    """Artificially mask observed values of a patched time series with NaN.

    Two masking mechanisms are applied, both restricted to patches that contain
    no NaN in the input:

    * Continuous-patch masking: at every patch index ``i > max_missing_patch``
      each series draws a length ``L`` uniformly from
      ``[min_missing_patch, max_missing_patch]``. With probability ``p / 10``,
      and only if the current run of fully observed patches is longer than
      ``L``, the patches ``i-L+1 .. i-1`` are masked entirely.
    * Intra-patch masking: each fully observed patch is selected with
      probability ``p``; a start offset ``s`` is drawn from
      ``[0, int(patch_len * (1 - p)))`` and, with equal chance, either the
      positions ``t >= s`` or the positions ``t <= patch_len - s`` are masked.

    Parameters
    ----------
    patch_ts : np.ndarray
        Array indexed as (batch, patch, position-in-patch); NaN marks values
        that are already missing.
    p : float
        Masking probability (see above).
    max_missing_patch, min_missing_patch : int
        Inclusive bounds of the drawn continuous-masking length.
    seed : int or None
        If given, a private ``np.random.RandomState(seed)`` is used so the
        result is reproducible. If None, the global ``np.random`` stream is
        used (so ``np.random.seed`` still applies).

    Returns
    -------
    patch_ts_missing : np.ndarray
        Copy of ``patch_ts`` with the sampled positions set to NaN.
    valid_idx : np.ndarray of bool
        True exactly where the input was not NaN and the output is NaN.
    """
    # RandomState exposes the same rand/randint API as the np.random module.
    rand_src = np.random if seed is None else np.random.RandomState(seed)

    batch_size, patch_num, patch_len = patch_ts.shape[0], patch_ts.shape[1], patch_ts.shape[-1]
    patch_time = np.arange(patch_len)[None, None, :]  # broadcastable position index

    # A patch is "fully observed" when it has no NaN at all.
    patch_missing_cnt = np.isnan(patch_ts).sum(axis=-1, keepdims=True)
    non_missing_idx = patch_missing_cnt == 0  # (batch, patch_num, 1)

    # ---- continuous-patch masking -------------------------------------------
    run_len = np.zeros_like(non_missing_idx[:, 0].astype(int))  # (batch, 1)
    conti_idx = np.zeros_like(non_missing_idx)                  # (batch, patch_num, 1)
    # Countdown max..1 over the trailing window of `max_missing_patch` patches
    # (loop-invariant, so built once).
    countdown = (max_missing_patch - np.arange(max_missing_patch))[None, :, None]
    for i in range(patch_num):
        # Length of the current run of fully observed patches ending at patch i.
        run_len = np.where(non_missing_idx[:, i], run_len + 1, 0)
        if i > max_missing_patch:
            conti_len = rand_src.randint(low=min_missing_patch, high=max_missing_patch + 1, size=run_len.shape)
            conti_len = np.where((conti_len < run_len) & (rand_src.rand(*run_len.shape) < p / 10), conti_len, 0)
            # NOTE(review): the `> 1` excludes patch i itself, so only
            # conti_len - 1 patches are masked — confirm this is intended.
            window_mask = (countdown <= conti_len[:, None]) & (countdown > 1)
            window = slice(i - max_missing_patch + 1, i + 1)
            conti_idx[:, window] = conti_idx[:, window] | window_mask

    # ---- intra-patch masking ------------------------------------------------
    intra_rand_idx = (rand_src.rand(*patch_missing_cnt.shape) < p) & non_missing_idx
    # randint requires high > low; clamp so small patch_len / large p do not raise.
    start_high = max(1, int(patch_len * (1 - p)))
    start_time = rand_src.randint(low=0, high=start_high, size=patch_missing_cnt.shape)
    end_time = patch_len - start_time
    tail_mask = patch_time >= start_time
    # NOTE(review): `<=` masks one more position than the tail variant for the
    # same draw — confirm whether `<` was intended.
    head_mask = patch_time <= end_time
    coin_shape = list(tail_mask.shape)
    coin_shape[-1] = 1  # one coin flip per patch
    intra_missing_idx = np.where(rand_src.rand(*coin_shape) <= 0.5, tail_mask, head_mask)
    intra_idx = intra_rand_idx & intra_missing_idx  # (batch, patch_num, patch_len)

    sample_idx = intra_idx | conti_idx

    patch_ts_missing = np.where(sample_idx, np.nan, patch_ts)
    valid_idx = ~np.isnan(patch_ts) & np.isnan(patch_ts_missing)
    return patch_ts_missing, valid_idx

In [2]:
# Load the train split as a DataFrame and order rows by store, product and
# date so that each (store, product) series is contiguous and chronological.
data = (
    load_dataset("Dingdong-Inc/FreshRetailNet-50K")["train"]
    .to_pandas()
    .sort_values(by=["store_id", "product_id", "dt"])
)

In [None]:
data.head()
# hours_stock_status: per-hour stock flag — 0 (in stock), 1 (out of stock)

Unnamed: 0,city_id,store_id,management_group_id,first_category_id,second_category_id,third_category_id,product_id,dt,sale_amount,hours_sale,stock_hour6_22_cnt,hours_stock_status,discount,holiday_flag,activity_flag,precpt,avg_temperature,avg_humidity,avg_wind_level
720,0,0,2,29,78,82,4,2024-03-28,0.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, ...",13,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...",0.882,0,1,1.6999,15.48,73.54,1.97
721,0,0,2,29,78,82,4,2024-03-29,1.3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.882,0,1,3.019,15.08,76.56,1.71
722,0,0,2,29,78,82,4,2024-03-30,5.3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.882,1,1,2.0942,15.91,76.47,1.73
723,0,0,2,29,78,82,4,2024-03-31,4.2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.879,1,1,1.5618,16.13,77.4,1.76
724,0,0,2,29,78,82,4,2024-04-01,0.7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, ...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.882,0,1,3.5386,15.37,78.26,1.25


In [38]:
horizon = 90  # number of daily rows per series
MIMIC_MISSING_SEED = 42  # fixes the random masking so the saved dataset is reproducible

# The reshapes below require an exact multiple of `horizon` rows.
assert data.shape[0] % horizon == 0, "row count is not a multiple of horizon"
series_num = data.shape[0] // horizon

hours_sale = np.array(data['hours_sale'].tolist())
hours_stock_status = np.array(data['hours_stock_status'].tolist())

# Keep only the hour slots 6..21 of each day.
hours_sale_origin = hours_sale.reshape(series_num, horizon, -1)[..., 6:22] # (num_series, num_days, 16 hours)
hours_stock_status = hours_stock_status.reshape(series_num, horizon, -1)[..., 6:22] # (num_series, num_days, 16 hours)
hours_sale = np.where(hours_stock_status == 1, np.nan, hours_sale_origin) # set sales to nan when out of stock
print(f"{(np.isnan(hours_sale).sum() * 100 / hours_sale.size):.2f}% missing values in hours_sale")

covariate = data[['discount', 'holiday_flag', 'precpt', 'avg_temperature']].values.reshape(series_num, horizon, -1) # (num_series, num_days, num_covariates)
# Scale each covariate by its per-series maximum over days; +0.1 avoids division by zero when the max is 0.
# NOTE(review): this assumes the per-series maximum is not close to -0.1 (e.g. sub-zero temperatures) — confirm.
covariate = covariate/(covariate.max(axis=1, keepdims=True)+0.1) # normalize covariates

# mimic_missing draws from NumPy's global random stream; seed it right before the call.
np.random.seed(MIMIC_MISSING_SEED)
hours_sale, valid_idx = mimic_missing(hours_sale, p=0.3, max_missing_patch=7, min_missing_patch=3)
print(f"{(np.isnan(hours_sale).sum() * 100 / hours_sale.size):.2f}% missing values in hours_sale after mimic missing")

19.88% missing values in hours_sale
32.79% missing values in hours_sale after mimic missing


In [39]:
# Hour slots per day; axis 2 holds them both before and after the expansion below.
num_hours = hours_sale.shape[2]

# The ndim guards keep this cell safe to re-run: without them a second
# execution would expand the arrays again and the concatenate would fail.
if covariate.ndim == 3:
    # Repeat the daily covariates for every hour of the day (read-only broadcast view).
    covariate = np.broadcast_to(
        covariate[:, :, None, :],
        (series_num, horizon, num_hours, covariate.shape[-1]),
    )
if hours_sale.ndim == 3:
    hours_sale = np.expand_dims(hours_sale, axis=-1)
if hours_stock_status.ndim == 3:
    hours_stock_status = np.expand_dims(hours_stock_status, axis=-1)

# Hour-of-day position scaled to [0, 1]; derived from num_hours instead of a hard-coded 16/15.
hour_encoding = np.broadcast_to(
    (np.arange(num_hours) / max(num_hours - 1, 1))[None, None, :, None],
    (series_num, horizon, num_hours, 1),
)
train_set = np.concatenate([hours_sale, hour_encoding, covariate], axis=-1) # (num_series, num_days, num_hours, hour_sales+hour_encoding+num_covariates)
train_set = train_set.reshape(series_num, horizon*num_hours, -1) # (num_series, num_days*num_hours, feature_dim)
valid_idx = valid_idx.reshape(series_num, horizon*num_hours, 1) # (num_series, num_days*num_hours, 1)

In [41]:
# Bundle the arrays to persist: the model input tensor, the mask of
# artificially hidden values, and the hourly sales before any masking.
dataset = dict(
    train_set=train_set,
    valid_idx=valid_idx,
    hours_sale_origin=hours_sale_origin,
)
np.savez_compressed('../data/processed_data.npz', **dataset)

In [50]:
# Total number of NaN entries across every feature channel of train_set.
np.sum(np.isnan(train_set))

np.int64(23606027)

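valid_idx:
- True (1): the value was observed in the input but was artificially masked (set to NaN) by `mimic_missing`
- False (0): everything else — values left observed, and values that were already missing before masking (out of stock)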