In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

In [2]:
# Fetch the FreshRetailNet-50K train split as a DataFrame and order the rows by
# store, product and date; later cells reshape the frame by row position and
# therefore depend on this ordering.
data = (
    load_dataset("Dingdong-Inc/FreshRetailNet-50K")["train"]
    .to_pandas()
    .sort_values(by=["store_id", "product_id", "dt"])
)

In [3]:
# Snapshot the sorted, un-imputed frame to CSV; the DataFrame index is not written.
output_path = '/home/guest/DemandForecasting/data/original_data.csv'
data.to_csv(path_or_buf=output_path, index=False)

In [3]:
# Build the model input tensor `train_set` and the stockout mask `mask` from the
# sorted daily rows. Every reshape below treats each consecutive run of
# `horizon` rows as one (store_id, product_id) series.
horizon = 90                    # daily rows per series
hour_lo, hour_hi = 6, 22        # hourly slots kept: [6, 22)
num_hours = hour_hi - hour_lo   # 16 slots per day

# Fail loudly if the row layout does not match the reshape assumption; otherwise
# values from different series would be mixed silently.
if data.shape[0] % horizon != 0:
    raise ValueError(
        f"Row count {data.shape[0]} is not a multiple of horizon={horizon}"
    )
series_num = data.shape[0] // horizon
rows_per_series = data.groupby(
    ['store_id', 'product_id'], sort=False, observed=True
).size()
if len(rows_per_series) != series_num or not (rows_per_series == horizon).all():
    raise ValueError(
        f"Expected {series_num} (store_id, product_id) series with exactly "
        f"{horizon} rows each"
    )

# Per-row hourly vectors stacked into 2-D arrays: (num_rows, hours_per_row)
hours_sale = np.array(data['hours_sale'].tolist())
hours_stock_status = np.array(data['hours_stock_status'].tolist())

# Stock status 1 marks a stockout hour; restrict to the kept hourly window.
window = slice(hour_lo, hour_hi)
stockout = hours_stock_status[..., window] == 1

# mask: 1 = observed, 0 = stockout -> (num_series, num_days*num_hours)
mask = np.where(stockout, 0, 1).reshape(series_num, horizon * num_hours)

# Sales with stockout hours blanked to NaN -> (num_series, num_days*num_hours, 1)
hours_sale = np.where(stockout, np.nan, hours_sale[..., window])
hours_sale = hours_sale.reshape(series_num, horizon * num_hours, 1)

# Daily covariates -> (num_series, num_days, num_covariates)
covariate_cols = ['discount', 'holiday_flag', 'precpt', 'avg_temperature']
covariate = data[covariate_cols].values.reshape(series_num, horizon, -1)
# Scale each covariate by its per-series maximum (+0.1 offset).
# NOTE(review): if a per-series maximum is close to -0.1 (e.g. a negative
# avg_temperature), the denominator approaches zero. Left unchanged because the
# saved checkpoint presumably expects exactly this scaling -- confirm.
covariate = covariate / (covariate.max(axis=1, keepdims=True) + 0.1)
# Repeat each day's covariates for every hourly slot
# -> (num_series, num_days, num_hours, num_covariates)
covariate = np.broadcast_to(
    covariate[:, :, None, :],
    (series_num, horizon, num_hours, covariate.shape[-1]),
)
# -> (num_series, num_days*num_hours, num_covariates)
covariate = covariate.reshape(series_num, horizon * num_hours, covariate.shape[-1])

# Position of the hour inside the window, scaled to [0, 1]
# -> (num_series, num_days*num_hours, 1)
hour_encoding = np.broadcast_to(
    np.arange(num_hours)[None, None, :, None] / (num_hours - 1),
    (series_num, horizon, num_hours, 1),
)
hour_encoding = hour_encoding.reshape(series_num, horizon * num_hours, 1)

# Concatenate all features: [hours_sale, hour_encoding, covariate]
train_set = np.concatenate([hours_sale, hour_encoding, covariate], axis=-1)

In [4]:
# Observation mask in the same layout as train_set (shape: (50000, 1440, 6)).
# Convention: 1 = observed, 0 = missing (stockout).
mask_model = np.ones((series_num, horizon * 16, 6), dtype=np.float32)
# Only feature 0 (hours_sale) can be missing; features 1-5 keep mask = 1.
mask_model[..., 0] = mask

# Share of hourly slots flagged as stockout, in percent
stockout_pct = (mask == 0).sum() / mask.size * 100

summary_lines = (
    "Final shapes for model:",
    f"  train_set: {train_set.shape}",
    f"  mask_model: {mask_model.shape}",
    "  Features: [hours_sale, hour_encoding, discount, holiday, precpt, temp]",
    f"  Stockout (missing) rate: {stockout_pct:.2f}%",
    "\n✓ Ready for imputation!",
)
for line in summary_lines:
    print(line)

Final shapes for model:
  train_set: (50000, 1440, 6)
  mask_model: (50000, 1440, 6)
  Features: [hours_sale, hour_encoding, discount, holiday, precpt, temp]
  Stockout (missing) rate: 19.88%

✓ Ready for imputation!


In [5]:
import torch
import sys
sys.path.append('latent_demand_recovery')
from model.timesnet import Model

class Config:
    """Fixed settings object handed to the imputation model's constructor.

    Every setting is stored as a plain instance attribute; the constructor
    takes no arguments.
    """

    def __init__(self):
        # (name, value) pairs in the order the attributes are created.
        settings = (
            ('seq_len', 1440),
            ('pred_len', 0),
            ('label_len', 0),
            ('task_name', 'imputation'),
            ('enc_in', 6),
            ('c_out', 6),
            ('d_model', 64),
            ('d_ff', 128),
            ('num_kernels', 5),
            ('top_k', 7),
            ('e_layers', 2),
            ('embed', 'timeF'),
            ('freq', 'h'),
            ('dropout', 0.),
        )
        for attr_name, attr_value in settings:
            setattr(self, attr_name, attr_value)

# Load the trained checkpoint, run the model over every series in batches, and
# replace the stockout hours of hours_sale with the model's predictions.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model(Config()).to(device)
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source (consider weights_only=True where the installed torch supports it).
model.load_state_dict(torch.load('/home/guest/DemandForecasting/latent_demand_recovery/checkpoints/timesnet_best.pth', map_location=device))
model.eval()

# NaNs (stockout hours) become 0 in the model input. nan_to_num already returns
# a new array, so an extra .copy() would only duplicate the whole tensor.
train_set_filled = np.nan_to_num(train_set, nan=0.0)

print(f"Imputing {len(train_set)} sequences...")
imputed_all = []
batch_size = 128

with torch.no_grad():
    for i in range(0, len(train_set), batch_size):
        batch = train_set_filled[i:i+batch_size]
        batch_mask = mask_model[i:i+batch_size]

        # torch.tensor copies the data and builds float32 tensors on the target device
        x_enc = torch.tensor(batch, dtype=torch.float32, device=device)
        batch_mask_t = torch.tensor(batch_mask, dtype=torch.float32, device=device)

        # Positional arguments as passed here: input, None, a clone of the
        # input, None, observation mask.
        output = model(x_enc, None, x_enc.clone(), None, batch_mask_t)
        imputed_all.append(output.cpu().numpy())

imputed_data = np.concatenate(imputed_all, axis=0)
# Channel 0 is hours_sale; negative predictions are clipped to zero.
model_predictions_flat = imputed_data[:, :, 0]
model_predictions_flat = np.clip(model_predictions_flat, 0.0, None)

# Keep observed sales as they are and fill only the masked (mask == 0) positions.
hours_sale_raw_flat = train_set[:, :, 0]
hours_sale_imputed_flat = np.nan_to_num(hours_sale_raw_flat, nan=0.0)
missing_mask_flat = (mask == 0)
hours_sale_imputed_flat[missing_mask_flat] = model_predictions_flat[missing_mask_flat]

print(f"✓ Imputation completed!")
print(f"  Imputed {missing_mask_flat.sum():,} missing values")

Imputing 50000 sequences...
✓ Imputation completed!
  Imputed 14,311,536 missing values
✓ Imputation completed!
  Imputed 14,311,536 missing values


In [6]:
# Put the imputed 16-hour window back into the 24-hour layout and export the
# result as CSV.
hours_sale_imputed = hours_sale_imputed_flat.reshape(series_num, horizon, 16)
del hours_sale_imputed_flat

# Start from the original 24-hour sales so hours outside the 6:22 window keep
# their recorded values instead of being overwritten with zeros. np.array
# builds a fresh array, so `data` itself is not modified.
hours_sale_full = np.array(data['hours_sale'].tolist(), dtype=np.float64)
hours_sale_full = hours_sale_full.reshape(series_num, horizon, 24)
hours_sale_full[:, :, 6:22] = hours_sale_imputed
del hours_sale_imputed

data_imputed = data.copy()
# One 24-element list per row, in the same row order as `data`
hours_sale_list = hours_sale_full.reshape(series_num * horizon, 24).tolist()
del hours_sale_full

data_imputed['hours_sale'] = hours_sale_list
del hours_sale_list

output_path = '/home/guest/DemandForecasting/data/imputed_data.csv'
data_imputed.to_csv(output_path, index=False)

print(f"✓ Imputed data saved to: {output_path}")
print(f"  Total rows: {len(data_imputed):,}")

✓ Imputed data saved to: /home/guest/DemandForecasting/data/imputed_data.csv
  Total rows: 4,500,000
