In [30]:
import pandas as pd
import numpy as np

# Load the raw table, order it per series, and decode the text-encoded columns.
# NOTE: this is an absolute, machine-specific path; edit it to point at your copy.
data_path = '/home/guest/DemandForecasting/data/original_data.csv'
data = pd.read_csv(data_path)

# Parse the dates *before* sorting so rows are ordered chronologically rather
# than by the lexical order of the raw strings.
data['dt'] = pd.to_datetime(data['dt'])
data = data.sort_values(by=['store_id', 'product_id', 'dt'])

# fix hours_sale column: each cell is a bracketed string; strip the brackets and
# split it into a list of number strings (converted to float further down).
# The variable is deliberately not called `type`, which would shadow the builtin
# for every later cell in the kernel.
dataset_name = data_path.split('/')[-1][:-4]  # file name without its ".csv" suffix
if dataset_name == 'imputed_data':
    # comma-separated values: "[a, b, c]"
    data['hours_sale'] = data['hours_sale'].map(lambda x: x[1:-1].split(', '))
else:
    # whitespace-separated values, possibly wrapped over several lines;
    # split() with no argument already treats newlines as separators.
    data['hours_sale'] = data['hours_sale'].map(lambda x: x[1:-1].split())

# add time encoded columns
data['dayofweek'] = data['dt'].dt.dayofweek  # 0-6
data['day'] = data['dt'].dt.day              # day of month, 1-31

In [31]:
# Preview the first rows to check that hours_sale was parsed and the
# dayofweek / day columns were added.
data.head()

Unnamed: 0,city_id,store_id,management_group_id,first_category_id,second_category_id,third_category_id,product_id,dt,sale_amount,hours_sale,...,hours_stock_status,discount,holiday_flag,activity_flag,precpt,avg_temperature,avg_humidity,avg_wind_level,dayofweek,day
0,0,0,2,29,78,82,4,2024-03-28,0.5,"[0., 0., 0., 0., 0., 0., 0., 0., 0.5, 0., 0., ...",...,[1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1],0.882,0,1,1.6999,15.48,73.54,1.97,3,28
1,0,0,2,29,78,82,4,2024-03-29,1.3,"[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0...",...,[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1],0.882,0,1,3.019,15.08,76.56,1.71,4,29
2,0,0,2,29,78,82,4,2024-03-30,5.3,"[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.7, 1.4,...",...,[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0],0.882,1,1,2.0942,15.91,76.47,1.73,5,30
3,0,0,2,29,78,82,4,2024-03-31,4.2,"[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.5, ...",...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0],0.879,1,1,1.5618,16.13,77.4,1.76,6,31
4,0,0,2,29,78,82,4,2024-04-01,0.7,"[0., 0., 0., 0., 0., 0., 0., 0., 0.2, 0., 0., ...",...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0],0.882,0,1,3.5386,15.37,78.26,1.25,0,1


In [None]:
from sklearn.preprocessing import StandardScaler

# Feature groups
numerical_features = ['sale_amount', 'discount', 'precpt', 'avg_temperature', 'avg_humidity', 'avg_wind_level']
binary_features = ['holiday_flag', 'activity_flag']
time_features = ['dayofweek', 'day']  # hour-of-day is encoded later, because only the 6:22 hourly slots are kept

horizon = 90  # rows (days) per series
series_num = data.shape[0] // horizon

# The reshapes below assume the table is a stack of complete series, each with
# exactly `horizon` rows. Fail early with a clear message instead of letting the
# floor division hide a mismatch (cryptic reshape error or misaligned series).
if (
    data.shape[0] != series_num * horizon
    or not data.groupby(['store_id', 'product_id'], sort=False).size().eq(horizon).all()
):
    raise ValueError(
        f"expected every (store_id, product_id) series to have exactly {horizon} rows; "
        f"got {data.shape[0]} rows in total"
    )

# Hourly sales: (rows, 24) -> (series, day, 24), then keep hourly slots 6..21.
hours_sale = np.array(data['hours_sale'].tolist(), dtype=float)
hours_sale = hours_sale.reshape(series_num, horizon, 24)[..., 6:22]

# numerical features (z-score)
# NOTE(review): the scaler is fit on every row, including days that later serve
# as forecast targets; fit it on the training period only if leakage matters.
numerical_data = data[numerical_features].values.astype(float)
scaler = StandardScaler()
numerical_normalized = scaler.fit_transform(numerical_data)

# time features (MinMax to 0-1)
time_data = data[time_features].values.astype(float)
time_data[:, 0] = time_data[:, 0] / 6  # dayofweek: 0-6 -> 0-1
time_data[:, 1] = (time_data[:, 1] - 1) / 30  # day: 1-31 -> 0-1

# concat: numerical + binary + time, then split rows back into (series, day, feature)
binary_data = data[binary_features].values.astype(float)
features_combined = np.concatenate([numerical_normalized, binary_data, time_data], axis=1)
features = features_combined.reshape(series_num, horizon, -1)


In [35]:
# Sanity check: hours_sale is (series, day, hourly slot) and features is
# (series, day, feature column).
# NOTE(review): the feature lists defined above add up to 10 columns, while the
# saved output shows 11 - the output looks stale; re-run the notebook to refresh.
hours_sale.shape, features.shape

((50000, 90, 16), (50000, 90, 11))

In [36]:
# Give every hourly slot its own row: daily features are repeated across the
# hour axis and an hour-of-day encoding is added.
# The ndim guards make this cell safe to re-run (a second run would otherwise
# stack yet another axis onto the arrays).
if hours_sale.ndim == 3:
    hours_sale = np.expand_dims(hours_sale, axis=-1)  # (series, day, hour) -> (series, day, hour, 1)
n_hours = hours_sale.shape[2]  # number of hourly slots kept per day

if features.ndim == 3:
    features = np.expand_dims(features, axis=2)
    # broadcast_to returns a read-only view, so no memory is spent on the repeats here
    features = np.broadcast_to(features, (series_num, horizon, n_hours, features.shape[-1]))

# Hour slot position scaled to 0-1, derived from the actual slot count instead
# of a hard-coded 16 / 15.
hour_encoding = np.broadcast_to(
    (np.arange(n_hours) / max(n_hours - 1, 1))[None, None, :, None],
    (series_num, horizon, n_hours, 1),
)

In [37]:
# Sanity check: all three arrays must agree on the first three axes
# (series, day, hourly slot) so they can be concatenated on the last one.
hours_sale.shape, features.shape, hour_encoding.shape

((50000, 90, 16, 1), (50000, 90, 16, 11), (50000, 90, 16, 1))

In [38]:
# Column layout on the last axis: daily features, hour encoding, hourly sales (last column).
# Concatenate straight into float32: the dataset converts to float32 anyway, and
# this avoids materialising a float64 array twice the size first.
ds = np.concatenate([features, hour_encoding, hours_sale], axis=-1, dtype=np.float32)

In [39]:
# Merge the (day, hourly slot) axes into a single time axis:
# (series, day, hour, column) -> (series, day * hour, column).
# Shapes are taken from `ds` itself so the slot count is not hard-coded again.
ds = ds.reshape(ds.shape[0], -1, ds.shape[-1])

In [40]:
# Free RAM: drop the intermediates, only `ds` is needed from here on.
# After this cell the earlier cells cannot be re-run individually; restart from the top.
del data, hours_sale, numerical_data, scaler, numerical_normalized, time_data, binary_data, features_combined, features, hour_encoding

In [41]:
# Final layout: (series, time step, column), where the columns are the daily
# features, the hour encoding and the hourly sales.
ds.shape

(50000, 1440, 13)

In [42]:
import torch
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a stack of equally long series.

    Every window of ``input_len + target_len`` consecutive time steps taken
    from one series is a sample; it is split into an input part and a target
    part. Samples are ordered series by series, and by start position within
    a series.

    Args:
        data: NumPy array whose first axis is the series and whose second axis
            is time; any further axes are carried along unchanged. It is
            converted to a float32 tensor.
        input_len: number of time steps in the input part ``x``.
        target_len: number of time steps in the target part ``y``.
    """

    def __init__(self, data, input_len, target_len):
        self.data = torch.from_numpy(data).float()
        self.input_len = input_len
        self.target_len = target_len
        self.total_len = input_len + target_len

        # Windows are addressed arithmetically (flat index -> series, start)
        # instead of storing one Python tuple per window: for tens of thousands
        # of series that list holds tens of millions of tuples.
        self.n_series = self.data.shape[0]
        # Clamp at 0: a series shorter than one window yields no samples.
        self.n_seqs = max(self.data.shape[1] - self.total_len + 1, 0)

    @property
    def indices(self):
        """All ``(series_idx, start)`` pairs in sample order.

        Kept for code that reads ``dataset.indices``; the list is built on
        every access, so avoid it on large datasets.
        """
        return [(i, start) for i in range(self.n_series) for start in range(self.n_seqs)]

    def __len__(self):
        """Total number of windows across all series."""
        return self.n_series * self.n_seqs

    def __getitem__(self, idx):
        """Return ``(x, y)`` for sample ``idx``.

        ``x`` holds the first ``input_len`` steps of the window and ``y`` the
        remaining ``target_len`` steps. Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is out of range.
        """
        # Indexing a range gives list-like semantics for free: negative
        # indices, integer-like objects, and IndexError when out of bounds.
        flat = range(len(self))[idx]
        series_idx, start = divmod(flat, self.n_seqs)
        seq = self.data[series_idx, start:start + self.total_len]

        # Split into input and target
        x = seq[:self.input_len]
        y = seq[self.input_len:]
        return x, y

# Usage: the time axis has 16 steps per day (hourly slots 6:22), so this is
# 30 days of input and 7 days of target per window.
dataset = TimeSeriesDataset(ds, input_len=30*16, target_len=7*16)

In [43]:
# The dataset holds the data as a tensor, so the NumPy name is no longer needed.
del ds