In [1]:
import numpy as np
import pandas as pd
import sys, os, warnings, glob

# Make the parent of the current working directory importable
# (presumably where the project-local `utils` / `processts` modules live -- TODO confirm).
parent_dir = os.path.dirname(os.getcwd())
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils import *
from processts import *

# Suppress every Python warning from here on: no category, message or module
# filter is given, so this matches all Warning subclasses.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
warnings.filterwarnings('ignore')

In [2]:
# --- Paths -------------------------------------------------------------------
# Root folder of the sell-out data; derived files go into its `processed/`
# sub-folder. Both folders are created if they do not exist yet.
dir_ = 'D:/Github/knowledge/time-series/data/daikin/sell-out/'
raw_data_dir, processed_data_dir = dir_, dir_ + 'processed/'

for _data_dir in (raw_data_dir, processed_data_dir):
    os.makedirs(_data_dir, exist_ok=True)

# --- Column roles ------------------------------------------------------------
TARGET_COL, DATE_COL = 'qty', 'date'
group_columns = ['item', 'location_name']

# --- Feature-engineering parameters ------------------------------------------
# Lags / windows are passed to the PreProcessing helpers further down
# (presumably counted in weekly steps, since the data is aggregated per week).
COMMON_LAGS = [26, 39, 52, 78, 104]
LAGS = [*range(4, 25)]                  # 4, 5, ..., 24
ROLL_LAGS = [4, 8, 12, 16, 20, 26, 39, 52]
ROLL_WINDOWS = [4, 8, 12, 16, 26, 39, 52]
EWM_LAGS = list(ROLL_LAGS)              # same offsets as ROLL_LAGS, independent list object
EWM_ALPHAS = [0.99, 0.95, 0.9, 0.8, 0.5, 0.2, 0.1]
# Column combinations for target encoding: each group column on its own,
# then all group columns together.
ENC_COLS = [[col] for col in group_columns] + [list(group_columns)]

In [3]:
# Load the processed sell-out records and roll them up to one row per
# (item, location_name, week).
sellout_df = pd.read_parquet(f'{processed_data_dir}sellout.parquet')

# Bucket every date into its weekly period. 'W-MON' periods end on Monday, so
# the default to_timestamp() (period start) labels each week by its Tuesday.
weekly_period = sellout_df[DATE_COL].dt.to_period('W-MON')
sellout_df[DATE_COL] = weekly_period.dt.to_timestamp()

# Total quantity per group and week.
weekly_keys = group_columns + [DATE_COL]
sellout_df = sellout_df.groupby(weekly_keys)[TARGET_COL].sum().reset_index()

In [4]:
# Build a weekly calendar table: per week, the number of holidays and the
# number of "offline" days (holiday or weekend), plus year / ISO-week columns.
date_df = pd.read_parquet(f'{processed_data_dir}date.parquet')

# Daily 0/1 holiday flag. Missing values (NaN / NaT / pd.NA) are explicitly
# treated as "no holiday": NaN is truthy in Python, so a bare truthiness test
# would count it as a holiday, and pd.NA would raise.
date_df['is_holiday'] = date_df['holiday'].apply(lambda x: int(pd.notna(x) and bool(x)))
# weekday: Monday=0 ... Sunday=6, so >= 5 marks Saturday and Sunday.
date_df['is_wknd'] = (date_df[DATE_COL].dt.weekday >= 5).astype(int)
date_df['is_offline'] = np.maximum(date_df['is_holiday'], date_df['is_wknd'])

# Collapse days into weeks with the same 'W-MON' bucketing used for the
# sell-out data, so both tables share identical week labels.
date_df[DATE_COL] = date_df[DATE_COL].dt.to_period('W-MON').dt.to_timestamp()
# After summing, is_holiday / is_offline are day COUNTS per week, not 0/1 flags.
date_df = date_df.groupby([DATE_COL])[['is_holiday','is_offline']].sum().reset_index()

# NOTE(review): `year` is the calendar year of the week label while
# `week_of_year` is the ISO week number, so the two can disagree around New Year.
date_df['year'] = date_df[DATE_COL].dt.year
date_df['week_of_year'] = date_df[DATE_COL].dt.isocalendar().week.astype(np.int8)

In [5]:
# Build a dense (item, location_name) x week grid, attach the observed
# quantities, and keep only the weeks from each group's first sale onwards.
group_df = sellout_df[group_columns].drop_duplicates()

# Every group paired with every calendar week.
group_df = group_df.merge(date_df, how='cross')

# Util.merge_by_concat is a project helper; presumably it left-joins the second
# frame onto the first using the given key columns -- TODO confirm.
sellout_df = Util.merge_by_concat(group_df, sellout_df, group_columns+[DATE_COL])

# "Release" = first week in which a group shows a positive quantity.
release_df = (
    sellout_df[sellout_df[TARGET_COL] > 0]
    .groupby(group_columns)[DATE_COL]
    .min()
    .reset_index(name='release')
)
sellout_df = Util.merge_by_concat(sellout_df, release_df, group_columns)
del release_df

# Last date (inclusive) kept in the modelling window.
CUTOFF_DATE = '2024-03-31'
# Presumably groups without any positive sale have a missing release date and
# are dropped here, because comparisons against NaT evaluate to False.
idx = (sellout_df[DATE_COL] >= sellout_df['release']) & (sellout_df[DATE_COL] <= CUTOFF_DATE)
# .copy() gives an independent frame, so the column assignments below are not
# chained assignments on a slice of the unfiltered frame.
sellout_df = sellout_df[idx].copy()
# Grid weeks without a recorded sale count as zero quantity.
sellout_df[TARGET_COL] = sellout_df[TARGET_COL].fillna(0)
# Replace the release date by the number of weeks elapsed since release
# (uses DATE_COL, consistent with the rest of the cell, instead of a literal).
sellout_df['release'] = (sellout_df[DATE_COL] - sellout_df['release']).dt.days / 7
sellout_df = sellout_df.sort_values(group_columns+[DATE_COL]).reset_index(drop=True)

In [None]:
# Feature engineering: run the project's PreProcessing helpers over the weekly
# grid one after another, then persist the resulting table as CSV.
# (The second constructor argument looks like a log directory -- TODO confirm.)
pp = PreProcessing('daikin_sellout_log','D:/Github/knowledge/time-series/data/logs')

category_cols = [*group_columns, 'week_of_year']

# Each entry is (bound helper, extra positional arguments). Every helper takes
# the current frame as its first argument and returns the frame that is fed
# into the next step, so the order of this list is the execution order.
feature_steps = [
    (pp.generate_grid_category, (category_cols,)),
    # disabled: sellout_df = pp.generate_grid_date(sellout_df, DATE_COL)
    # disabled: df = pp.generate_grid_price(df, group_columns, PRICE_COL)
    (pp.generate_lag_feature, (group_columns, TARGET_COL, LAGS + COMMON_LAGS)),
    (pp.generate_roll_feature, (group_columns, TARGET_COL, ROLL_WINDOWS, ROLL_LAGS)),
    (pp.generate_ewm_feature, (group_columns, TARGET_COL, EWM_ALPHAS, EWM_LAGS)),
    (pp.generate_target_encoding_feature, (TARGET_COL, ENC_COLS)),
]
for build_features, extra_args in feature_steps:
    sellout_df = build_features(sellout_df, *extra_args)

# Write the full feature table next to the other processed files.
features_path = f'{processed_data_dir}features_data.csv'
sellout_df.to_csv(features_path, index=False, header=True)

2024-10-26 13:21:16,991	INFO	generate lag features
2024-10-26 13:21:28,202	INFO	generate roll features
2024-10-26 13:21:28,202	INFO	Shifting period: 4


Mem. usage decreased to 1103.76 MB (71.9% reduction)


2024-10-26 13:24:15,540	INFO	Shifting period: 8
