In [1]:
import numpy as np
import pandas as pd
import sys, os, warnings, glob

# Put the parent of the current working directory on the import path
# (presumably where the project-local `utils` and `processts` modules live).
parent_dir = os.path.dirname(os.getcwd())
# Guard the append: re-running this cell must not keep stacking duplicate
# entries onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils import *
from processts import *

# Silence every warning category from this point on.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. chained
# assignment / deprecation notices) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Root folder of the sell-in data. The original absolute location stays the
# default; set the SELLIN_DATA_DIR environment variable to run the notebook
# against another location without editing this cell.
DEFAULT_DATA_DIR = 'D:/Github/knowledge/time-series/data/daikin/sell-in/'
# os.path.join(path, '') guarantees a trailing separator, which the string
# concatenation / f-string path building below and in later cells relies on.
# `or` also falls back to the default when the variable is set but empty.
dir_ = os.path.join(os.environ.get('SELLIN_DATA_DIR') or DEFAULT_DATA_DIR, '')

raw_data_dir = dir_
processed_data_dir = dir_+'processed/'

# Create both folders up front; exist_ok makes the cell safe to re-run.
os.makedirs(raw_data_dir, exist_ok=True)
os.makedirs(processed_data_dir, exist_ok=True)

# Column names shared by every cell below.
TARGET_COL = 'qty'    # value that is aggregated and lagged
DATE_COL = 'date'     # timestamp column

# Columns used as grouping keys throughout the notebook.
group_columns = ['item', 'department']

# Shift / window settings handed to the feature generators
# (presumably expressed in weekly rows — confirm against `processts`).
COMMON_LAGS = [26, 39, 52, 78, 104]
LAGS = [*range(4, 25)]  # 4, 5, ..., 24
ROLL_LAGS = [4, 8, 12, 16, 20, 26, 39, 52]
ROLL_WINDOWS = [4, 8, 12, 16, 26, 39, 52]
EWM_LAGS = [4, 8, 12, 16, 20, 26, 39, 52]
EWM_ALPHAS = [0.99, 0.95, 0.9, 0.8, 0.5, 0.2, 0.1]

# Column combinations passed to the target-encoding step.
ENC_COLS = [['item'], ['department'], ['item', 'department']]

In [3]:
# Load the sell-in records written to the processed folder.
sellin_path = f'{processed_data_dir}sellin.parquet'
sellin_df = pd.read_parquet(sellin_path)

# Replace each date with the start timestamp of its 'W-MON' weekly period so
# that all rows of the same week share one key.
weekly_start = sellin_df[DATE_COL].dt.to_period('W-MON').dt.to_timestamp()
sellin_df[DATE_COL] = weekly_start

# Collapse to one row per (group columns, week) carrying the summed target.
weekly_keys = group_columns + [DATE_COL]
sellin_df = sellin_df.groupby(weekly_keys)[TARGET_COL].sum().reset_index()

In [4]:
# Daily calendar table; the code below reads its `holiday` and date columns.
date_df = pd.read_parquet(f'{processed_data_dir}date.parquet')

# Flag a day as holiday only when `holiday` is present AND truthy. A bare
# truthiness test would count NaN as a holiday (NaN is truthy in Python) and
# would raise on pd.NA, so missing values are checked first.
date_df['is_holiday'] = date_df['holiday'].apply(lambda x: int(pd.notna(x) and bool(x)))
# weekday 5/6 are Saturday/Sunday.
date_df['is_wknd'] = (date_df[DATE_COL].dt.weekday >= 5).astype(int)
# A day is "offline" when it is a holiday or a weekend day.
date_df['is_offline'] = np.maximum(date_df['is_holiday'], date_df['is_wknd'])

# Map each day onto the start timestamp of its 'W-MON' weekly period.
# NOTE(review): 'W-MON' periods end on Monday, so the resulting timestamp is
# the preceding Tuesday — confirm this is the intended week anchor.
date_df[DATE_COL] = date_df[DATE_COL].dt.to_period('W-MON').dt.to_timestamp()

# Per week: number of holiday days and number of offline days.
date_df = date_df.groupby(DATE_COL)[['is_holiday','is_offline']].sum().reset_index()

# NOTE(review): `year` is the calendar year of the week timestamp while
# `week_of_year` is the ISO week number, so the pair can disagree around
# New Year — confirm this is acceptable for the model.
date_df['year'] = date_df[DATE_COL].dt.year
date_df['week_of_year'] = date_df[DATE_COL].dt.isocalendar().week.astype(np.int8)
# date_df

In [5]:
# Build the full grid: every distinct group-column combination crossed with
# every calendar week, so weeks without sales get an explicit row.
group_df = sellin_df[group_columns].drop_duplicates()
group_df = group_df.merge(date_df, how='cross')

# Attach the weekly quantities to the grid on (group columns, date).
# NOTE(review): assumes Util.merge_by_concat keeps every row of its first
# argument, like a left join — confirm against `utils`.
sellin_df = Util.merge_by_concat(group_df, sellin_df, group_columns+[DATE_COL])

# First week with a strictly positive quantity per group = its release week.
release_df = (
    sellin_df[sellin_df[TARGET_COL]>0]
    .groupby(group_columns)[DATE_COL]
    .min()
    .rename('release')
    .reset_index()
)
sellin_df = Util.merge_by_concat(sellin_df, release_df, group_columns)
del release_df

# Keep only weeks from the release week up to the cutoff date. Groups that
# never had a positive quantity have no release date; the comparison is
# False for them, so they drop out here.
CUTOFF_DATE = '2024-08-31'
idx = (sellin_df[DATE_COL]>=sellin_df['release']) & (sellin_df[DATE_COL]<=CUTOFF_DATE)
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
sellin_df = sellin_df[idx].copy()
# Grid weeks without a sell-in record count as zero quantity.
sellin_df[TARGET_COL] = sellin_df[TARGET_COL].fillna(0)
# From here on `release` holds the number of weeks since the release week
# (uses DATE_COL rather than a hard-coded column name).
sellin_df['release'] = (sellin_df[DATE_COL] - sellin_df['release']).dt.days / 7
sellin_df = sellin_df.sort_values(group_columns+[DATE_COL])
sellin_df = sellin_df.reset_index(drop=True)

In [6]:
# Feature-engineering helper; it is constructed with a log name and a log folder.
pp = PreProcessing('daikin_sellin_log','D:/Github/knowledge/time-series/data/logs')

category_cols = group_columns + ['week_of_year']

# Ordered pipeline: each entry is (step, extra positional arguments). Every
# step receives the current frame first and returns the frame for the next.
# generate_grid_date and generate_grid_price remain disabled here.
feature_steps = [
    (pp.generate_grid_category, (category_cols,)),
    (pp.generate_lag_feature, (group_columns, TARGET_COL, LAGS + COMMON_LAGS)),
    (pp.generate_roll_feature, (group_columns, TARGET_COL, ROLL_WINDOWS, ROLL_LAGS)),
    (pp.generate_ewm_feature, (group_columns, TARGET_COL, EWM_ALPHAS, EWM_LAGS)),
    (pp.generate_target_encoding_feature, (TARGET_COL, ENC_COLS)),
]
for feature_step, step_args in feature_steps:
    sellin_df = feature_step(sellin_df, *step_args)

# Persist the finished feature table next to the other processed files.
features_path = f'{processed_data_dir}features_data.csv'
sellin_df.to_csv(features_path, index=False, header=True)

2024-11-06 13:18:38,671	INFO	generate lag features
2024-11-06 13:18:38,801	INFO	generate roll features
2024-11-06 13:18:38,802	INFO	Shifting period: 4


Mem. usage decreased to 16.88 MB (71.8% reduction)


2024-11-06 13:18:40,609	INFO	Shifting period: 8
2024-11-06 13:18:42,337	INFO	Shifting period: 12
2024-11-06 13:18:44,091	INFO	Shifting period: 16
2024-11-06 13:18:45,836	INFO	Shifting period: 20
2024-11-06 13:18:47,611	INFO	Shifting period: 26
2024-11-06 13:18:49,384	INFO	Shifting period: 39
2024-11-06 13:18:51,133	INFO	Shifting period: 52
2024-11-06 13:18:53,314	INFO	generate exponentially weighted moving features
2024-11-06 13:18:53,326	INFO	Shifting period:4


Mem. usage decreased to 69.31 MB (69.4% reduction)


2024-11-06 13:18:54,178	INFO	Shifting period:8
2024-11-06 13:18:54,947	INFO	Shifting period:12
2024-11-06 13:18:55,760	INFO	Shifting period:16
2024-11-06 13:18:56,518	INFO	Shifting period:20
2024-11-06 13:18:57,347	INFO	Shifting period:26
2024-11-06 13:18:58,117	INFO	Shifting period:39
2024-11-06 13:18:58,943	INFO	Shifting period:52
2024-11-06 13:19:00,328	INFO	generate target encoding feature
2024-11-06 13:19:00,329	INFO	encoding	['item']
2024-11-06 13:19:00,341	INFO	encoding	['department']
2024-11-06 13:19:00,354	INFO	encoding	['item', 'department']


Mem. usage decreased to 95.52 MB (45.2% reduction)
Mem. usage decreased to 98.33 MB (5.4% reduction)


In [7]:
# Sanity check: list any rows with a negative target value. The column is
# addressed through TARGET_COL so the check follows the configured target.
sellin_df[sellin_df[TARGET_COL]<0]

Unnamed: 0,item,department,date,is_holiday,is_offline,year,week_of_year,qty,release,qty_lag_4,...,qty_ewm_lag_52_alpha_08,qty_ewm_lag_52_alpha_05,qty_ewm_lag_52_alpha_02,qty_ewm_lag_52_alpha_01,enc_item_mean,enc_item_std,enc_department_mean,enc_department_std,enc_item_department_mean,enc_item_department_std
