In [1]:
import numpy as np
import pandas as pd
import os, gc, datetime, pickle, warnings

from utils import *
from processts import *


# Install a blanket "ignore" filter: no message/category/module is given, so
# every warning raised after this point in the kernel session is suppressed.
# NOTE(review): this also hides any deprecation or dtype warnings emitted by
# the feature-generation steps — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Step 2

In [2]:
# Root folder of the M5 data set. The raw files live directly in it and the
# derived artefacts go into a 'processed/' sub-folder.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
dir_ = 'D:/Github/knowledge/time-series/data/m5/'
raw_data_dir, processed_data_dir = dir_, dir_ + 'processed/'

# Make sure both folders exist before anything is read from or written to them;
# exist_ok=True keeps the cell re-runnable.
for _folder in (raw_data_dir, processed_data_dir):
    os.makedirs(_folder, exist_ok=True)

In [3]:
# ----------------------------------------------------------------------------
# Feature-engineering configuration
# ----------------------------------------------------------------------------

# Column names handed to the feature generators.
TARGET_COL = 'sales'        # main target column
DATE_COL = 'date'
PRICE_COL = 'sell_price'

# Columns used as the grouping key by the feature generators.
group_columns = ['store_id', 'item_id']

# Plain lags: every day from 7 through 56, then four longer horizons.
LAGS = [*range(7, 57), 60, 90, 182, 364]

# Rolling features: shift periods (weekly steps 7..56 plus the long horizons)
# and window lengths (weekly steps 7..28 plus the long horizons).
ROLL_LAGS = [*range(7, 57, 7), 60, 90, 182, 364]
ROLL_WINDOWS = [*range(7, 29, 7), 60, 90, 182, 364]

# Exponentially weighted features: same shift periods as the rolling ones
# (kept as a separate list object) and the smoothing factors to try.
EWM_LAGS = [*range(7, 57, 7), 60, 90, 182, 364]
EWM_ALPHAS = [
    0.99, 0.95, 0.9,
    0.8, 0.5, 0.2, 0.1,
]

# Column groups for target encoding: each hierarchy level on its own,
# then the same levels crossed with the store.
ENC_COLS = [
    ['cat_id'],
    ['dept_id'],
    ['item_id'],
    *(['store_id', level] for level in ('cat_id', 'dept_id', 'item_id')),
]

In [4]:
# PreProcessing is provided by the star imports at the top of the notebook.
# NOTE(review): its two arguments look like a log name and a log directory — confirm.
pp = PreProcessing('m5_log','D:/Github/knowledge/time-series/data/logs')

# Load the initial data set, keep only store CA_1, then run every feature
# generator in sequence. Each step receives the frame produced by the previous
# one as its first argument and returns the frame for the next.
# NOTE(review): 'CA_1' appears both in the filter here and in the output file
# name below — keep the two in sync when switching stores.
df = (
    pd.read_parquet(f'{processed_data_dir}init_data.parquet')
    .loc[lambda frame: frame['store_id']=='CA_1']
    .reset_index(drop=True)
    .pipe(pp.generate_grid_date, DATE_COL)
    .pipe(pp.generate_grid_price, group_columns, PRICE_COL)
    .pipe(pp.generate_lag_feature, group_columns, TARGET_COL, LAGS)
    .pipe(pp.generate_roll_feature, group_columns, TARGET_COL, ROLL_WINDOWS, ROLL_LAGS)
    .pipe(pp.generate_ewm_feature, group_columns, TARGET_COL, EWM_ALPHAS, EWM_LAGS)
    .pipe(pp.generate_target_encoding_feature, TARGET_COL, ENC_COLS)
)

# Persist the finished feature table next to the other processed artefacts.
df.to_parquet(f'{processed_data_dir}features_data_CA_1.parquet')

2024-09-26 10:28:08,097	INFO	generate_grid_date
2024-09-26 10:28:09,879	INFO	generate_grid_price


Mem. usage decreased to 213.02 MB (50.4% reduction)


2024-09-26 10:28:13,237	INFO	generate lag features


Mem. usage decreased to 335.30 MB (34.8% reduction)


2024-09-26 10:28:20,730	INFO	generate roll features
2024-09-26 10:28:20,730	INFO	Shifting period: 7


Mem. usage decreased to 843.21 MB (64.4% reduction)


2024-09-26 10:28:46,914	INFO	Shifting period: 14
2024-09-26 10:29:13,815	INFO	Shifting period: 21
2024-09-26 10:29:40,158	INFO	Shifting period: 28
2024-09-26 10:30:09,092	INFO	Shifting period: 35
2024-09-26 10:30:37,096	INFO	Shifting period: 42
2024-09-26 10:31:06,866	INFO	Shifting period: 49
2024-09-26 10:31:37,441	INFO	Shifting period: 56
2024-09-26 10:32:08,854	INFO	Shifting period: 60
2024-09-26 10:32:42,916	INFO	Shifting period: 90
2024-09-26 10:33:11,562	INFO	Shifting period: 182
2024-09-26 10:33:40,266	INFO	Shifting period: 364
2024-09-26 10:34:36,850	INFO	generate exponentially weighted moving features
2024-09-26 10:34:36,851	INFO	Shifting period:7


Mem. usage decreased to 2649.12 MB (67.2% reduction)


2024-09-26 10:34:45,762	INFO	Shifting period:14
2024-09-26 10:34:54,408	INFO	Shifting period:21
2024-09-26 10:35:03,225	INFO	Shifting period:28
2024-09-26 10:35:12,052	INFO	Shifting period:35
2024-09-26 10:35:20,924	INFO	Shifting period:42
2024-09-26 10:35:29,777	INFO	Shifting period:49
2024-09-26 10:35:38,690	INFO	Shifting period:56
2024-09-26 10:35:47,505	INFO	Shifting period:60
2024-09-26 10:35:57,720	INFO	Shifting period:90
2024-09-26 10:36:09,159	INFO	Shifting period:182
2024-09-26 10:36:20,755	INFO	Shifting period:364
2024-09-26 10:37:01,053	INFO	generate target encoding feature
2024-09-26 10:37:01,054	INFO	encoding	['cat_id']


Mem. usage decreased to 3439.21 MB (40.8% reduction)


2024-09-26 10:37:01,272	INFO	encoding	['dept_id']
2024-09-26 10:37:01,494	INFO	encoding	['item_id']
2024-09-26 10:37:01,708	INFO	encoding	['store_id', 'cat_id']
2024-09-26 10:37:02,193	INFO	encoding	['store_id', 'dept_id']
2024-09-26 10:37:02,747	INFO	encoding	['store_id', 'item_id']


Mem. usage decreased to 3552.08 MB (8.7% reduction)
