In [1]:
import numpy as np
import pandas as pd
import os, gc, datetime, pickle, warnings

from ydata_profiling import ProfileReport

from utils import *
from processts import *


# Suppress every warning for the rest of the session: no category or message
# filter is given, so library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

# Step 1: build the initial training grid (sales + calendar + prices) and save it as init_data.parquet

In [2]:
# Root folder holding the raw m5 CSV files. The original hard-coded location
# stays the default; set the M5_DATA_DIR environment variable to point the
# notebook at another location without editing this cell. An empty value is
# treated as "not set".
dir_ = os.environ.get('M5_DATA_DIR') or 'D:/Github/knowledge/time-series/data/m5/'

# Later cells build file names by plain string concatenation, so the root
# must end with a path separator.
if not dir_.endswith(('/', '\\')):
    dir_ += '/'

raw_data_dir = dir_
processed_data_dir = dir_ + 'processed/'

# Create both folders up front; exist_ok makes this a no-op when they
# already exist.
os.makedirs(raw_data_dir, exist_ok=True)
os.makedirs(processed_data_dir, exist_ok=True)

In [3]:
# --- Notebook-wide constants -------------------------------------------------
# Main target; also used as the name of the melted value column.
TARGET_COL = 'sales'

# Identifier columns kept as id_vars when the sales table is melted.
index_columns = [
    'id',
    'item_id',
    'dept_id',
    'cat_id',
    'store_id',
    'state_id',
]

# Key pair used to group the price table and to join onto the sales rows.
group_columns = ['store_id', 'item_id']

In [None]:
# --- Load data -----------------------------------------------------------------
print('Load Main Data')

# Read the three raw CSV files in full: no row limits and no dtype
# overrides are applied at this point.
train_df, prices_df, calendar_df = (
    pd.read_csv(f'{raw_data_dir}{csv_name}')
    for csv_name in (
        'sales_train_validation.csv',
        'sell_prices.csv',
        'calendar.csv',
    )
)

In [5]:
# Project helper class, not imported by name, so it presumably comes from one
# of the wildcard imports (utils / processts); its generate_grid_category
# method is used by later cells.
# NOTE(review): the two arguments look like a log name and a log directory --
# confirm against PreProcessing's signature.
pp = PreProcessing(
    'm5_log',
    'D:/Github/knowledge/time-series/data/logs',
)

In [6]:
# Reshape sales to long format: the identifier columns stay as they are,
# every other column becomes one row with its label in 'd' and its value in
# TARGET_COL. train_df is rebound here, so this cell expects the freshly
# loaded frame.
train_df = train_df.melt(
    id_vars=index_columns,
    var_name='d',
    value_name=TARGET_COL,
)

# Project helper applied to the identifier columns.
# NOTE(review): presumably casts them to pandas 'category' -- confirm.
train_df = pp.generate_grid_category(train_df, index_columns)

# Drop the first two characters of the day label and keep the rest as int.
train_df['d'] = train_df['d'].str.slice(2).astype(int)

# Same helper for the key columns of the price table.
prices_df = pp.generate_grid_category(prices_df, group_columns)

In [7]:
# Smallest wm_yr_wk that has a price row for each (store_id, item_id) pair,
# exposed as the 'release' column.
release_df = (
    prices_df
    .groupby(group_columns)['wm_yr_wk']
    .agg(['min'])
    .reset_index()
    .rename(columns={'min': 'release'})
)

# Attach 'release' to the sales rows through the project merge helper, then
# drop the lookup table, which is not needed afterwards.
train_df = Util.merge_by_concat(train_df, release_df, group_columns)
del release_df

# Strip the first two characters of the day label and cast to int so that 'd'
# can serve as the merge key below.
calendar_df['d'] = calendar_df['d'].str.slice(2).astype(int)

# Event and snap columns handed to the project category helper.
icols = [
    'event_name_1', 'event_type_1',
    'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
]
calendar_df = pp.generate_grid_category(calendar_df, icols)

# Every calendar column except 'weekday' and 'wday' is merged onto sales.
calendar_columns = [
    col for col in calendar_df.columns
    if col not in ('weekday', 'wday')
]
train_df = Util.merge_by_concat(
    train_df,
    calendar_df.loc[:, calendar_columns],
    ['d'],
)

# Keep only rows whose week is on or after the pair's release week, then
# renumber the rows from 0.
train_df = (
    train_df
    .loc[train_df['wm_yr_wk'] >= train_df['release']]
    .reset_index(drop=True)
)

# Re-base 'release' so that its minimum is 0 and store it as int16.
train_df['release'] = (
    train_df['release'] - train_df['release'].min()
).astype(np.int16)

# Join the price rows on (store_id, item_id, wm_yr_wk) via the project helper.
train_df = Util.merge_by_concat(
    train_df,
    prices_df,
    [*group_columns, 'wm_yr_wk'],
)

In [None]:
# Project helper; presumably downcasts column dtypes to shrink the frame.
# NOTE(review): the positional True looks like a verbose flag -- confirm
# against Util.reduce_mem_usage before relying on it.
train_df = Util.reduce_mem_usage(train_df, True)
# Print the resulting column dtypes and memory summary.
train_df.info()

In [9]:
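# Optional profiling step, left disabled. Uncomment both lines to render a
# ydata-profiling report of train_df as notebook widgets.
# train_df['sell_price'] = train_df['sell_price'].astype(np.float32)
# ProfileReport(train_df, title="Profiling Train Report").to_widgets()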

In [10]:
# Persist the assembled frame as init_data.parquet in the processed-data folder.
init_data_path = f'{processed_data_dir}init_data.parquet'
train_df.to_parquet(init_data_path)