In [4]:
#Import All required packages
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import shutil
import multiprocessing as mp
from tqdm import tqdm
import gc

## Data Preperation

In [3]:
df=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
df1=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv') #this is used for training
df2=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
df3=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv') # this is used for testing

In [4]:
#Clearly in calender.csv we have many entries which contain NaN in event_type_1,event_type_2,event_name_1 & enent_name_2
#We have replaced all those entries with no_event
df=df.fillna(value='no_event')

In [5]:
l=[]
for i in range(1,1914):
  l.append("d_"+str(i))
df_final=pd.melt(df1,id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],\
                 value_vars=l,var_name="d",value_name="sales")

In [6]:
# Taking only last 28 data days of test bcz eariler values are same as for train
l=[]
for i in range(1914,1942):
  l.append("d_"+str(i))
df_final_test=pd.melt(df3,id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],\
                 value_vars=l,var_name="d",value_name="sales")

In [7]:
for i in range(1942,1970):
    df3['d_'+str(i)]=0

In [8]:
#Also create future data to be used for futures sales data
l=[]
for i in range(1942,1970):
    l.append("d_"+str(i))
df_future_data=pd.melt(df3,id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],\
                 value_vars=l,var_name="d",value_name="sales")

In [9]:
#Now we merge all These 3 dataframes to get final csv file train
data=df_final.merge(df,on='d',copy=False)# combine calender.csv and modified trainevaluation.csv on feature 'd'
data=data.merge(df2,on=["store_id", "item_id", "wm_yr_wk"],copy=False) # combine new dataframe with sell_price.csv usnig features "store_id", "item_id", "wm_yr_wk"
data.to_feather('/kaggle/working/final_dataframe.feather')

In [10]:
#Now we merge all These 3 dataframes to get final csv file test
data_test=df_final_test.merge(df,on='d',copy=False)# combine calender.csv and modified trainevaluation.csv on feature 'd'
data_test=data_test.merge(df2,on=["store_id", "item_id", "wm_yr_wk"],copy=False) # combine new dataframe with sell_price.csv usnig features "store_id", "item_id", "wm_yr_wk"
data_test.to_feather('/kaggle/working/final_dataframe_test.feather')

In [11]:
#Now we merge all These 3 dataframes to get final csv file future data
data_future=df_future_data.merge(df,on='d',copy=False)# combine calender.csv and modified trainevaluation.csv on feature 'd'
data_future=data_future.merge(df2,on=["store_id", "item_id", "wm_yr_wk"],copy=False) # combine new dataframe with sell_price.csv usnig features "store_id", "item_id", "wm_yr_wk"
data_future.fillna('no_event',inplace=True)
data_future.to_feather('/kaggle/working/final_future_data.feather')

In [12]:
print("Shape of final dataframe train is=",data.shape)
print("Shape of final dataframe test is=",data_test.shape)

Shape of final dataframe train is= (46027957, 22)
Shape of final dataframe test is= (853720, 22)


In [16]:
# Free up RAM to avoid kernel issues
del data, data_test, data_future
gc.collect()

8

## Feature Engineering Steps

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import multiprocessing as mp
import gc
import datetime
from sklearn.preprocessing import LabelEncoder
import calendar
from scipy.sparse import csr_matrix,hstack
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from tqdm import tqdm
import pickle

In [3]:
#Reading up the dataframes

train=pd.read_feather('Intermediate Data/final_dataframe.feather')
test=pd.read_feather('Intermediate Data/final_dataframe_test.feather')
final_test=pd.read_feather('Intermediate Data/final_future_data.feather')

#### Encoding Categorical Data to Integer to save space in RAM

In [4]:
lbl=LabelEncoder()
train['item_id']=lbl.fit_transform(train['item_id'])
test['item_id']=lbl.transform(test['item_id'])
final_test['item_id']=lbl.transform(final_test['item_id'])
pickle.dump(lbl,open('label_encoder_item_id.sav','wb'))

In [5]:
lbl=LabelEncoder()
train['dept_id']=lbl.fit_transform(train['dept_id'])
test['dept_id']=lbl.transform(test['dept_id'])
final_test['dept_id']=lbl.transform(final_test['dept_id'])
pickle.dump(lbl,open('label_encoder_dept_id.sav','wb'))

In [6]:
lbl=LabelEncoder()
train['cat_id']=lbl.fit_transform(train['cat_id'])
test['cat_id']=lbl.transform(test['cat_id'])
final_test['cat_id']=lbl.transform(final_test['cat_id'])
pickle.dump(lbl,open('label_encoder_cat_id.sav','wb'))

In [7]:
lbl=LabelEncoder()
train['store_id']=lbl.fit_transform(train['store_id'])
test['store_id']=lbl.transform(test['store_id'])
final_test['store_id']=lbl.transform(final_test['store_id'])
pickle.dump(lbl,open('label_encoder_store_id.sav','wb'))

In [8]:
lbl=LabelEncoder()
train['state_id']=lbl.fit_transform(train['state_id'])
test['state_id']=lbl.transform(test['state_id'])
final_test['state_id']=lbl.transform(final_test['state_id'])
pickle.dump(lbl,open('label_encoder_state_id.sav','wb'))

In [9]:
lbl=LabelEncoder()
train['event_name_1']=lbl.fit_transform(train['event_name_1'])
test['event_name_1']=lbl.transform(test['event_name_1'])
final_test['event_name_1']=lbl.transform(final_test['event_name_1'])
pickle.dump(lbl,open('label_encoder_event_name_1.sav','wb'))

In [10]:
lbl=LabelEncoder()
train['event_name_2']=lbl.fit_transform(train['event_name_2'])
test['event_name_2']=lbl.transform(test['event_name_2'])
final_test['event_name_2']=lbl.transform(final_test['event_name_2'])
pickle.dump(lbl,open('label_encoder_event_name_2.sav','wb'))

In [11]:
lbl=LabelEncoder()
train['event_type_1']=lbl.fit_transform(train['event_type_1'])
test['event_type_1']=lbl.transform(test['event_type_1'])
final_test['event_type_1']=lbl.transform(final_test['event_type_1'])
pickle.dump(lbl,open('label_encoder_event_type_1.sav','wb'))

In [12]:
lbl=LabelEncoder()
train['event_type_2']=lbl.fit_transform(train['event_type_2'])
test['event_type_2']=lbl.transform(test['event_type_2'])
final_test['event_type_2']=lbl.transform(final_test['event_type_2'])
pickle.dump(lbl,open('label_encoder_event_type_2.sav','wb'))

In [13]:
lbl=LabelEncoder()
train['year']=lbl.fit_transform(train['year'])
test['year']=lbl.transform(test['year'])
final_test['year']=lbl.transform(final_test['year'])
pickle.dump(lbl,open('label_encoder_year.sav','wb'))

In [14]:
gc.collect()

580

#### Removing Unnecessary Columns to save space

In [15]:
#We are not using these features
cols_to_drop = ['weekday', 'wm_yr_wk']

train.drop(columns=cols_to_drop, inplace=True)
test.drop(columns=cols_to_drop, inplace=True)
final_test.drop(columns=cols_to_drop, inplace=True)


In [16]:
# Efficiently collapse snap columns into one
train['snap'] = np.where(train['state_id'] == 'CA', train['snap_CA'],
                np.where(train['state_id'] == 'TX', train['snap_TX'], train['snap_WI']))

# Drop old columns
train.drop(['snap_CA','snap_TX','snap_WI'], axis=1, inplace=True)

test['snap'] = np.where(test['state_id'] == 'CA', test['snap_CA'],
               np.where(test['state_id'] == 'TX', test['snap_TX'], test['snap_WI']))

# Drop old columns
test.drop(['snap_CA','snap_TX','snap_WI'], axis=1, inplace=True)

final_test['snap'] = np.where(final_test['state_id'] == 'CA', final_test['snap_CA'],
                    np.where(final_test['state_id'] == 'TX', final_test['snap_TX'], final_test['snap_WI']))

# Drop old columns
final_test.drop(['snap_CA','snap_TX','snap_WI'], axis=1, inplace=True)

#### Creating Features

In [17]:
def get_week_number(x):
    """This Function is used to get weeknumber of particular date"""
    date=calendar.datetime.date.fromisoformat(x)
    return date.isocalendar()[1]

In [18]:
train['week_number']=train['date'].apply(lambda x:get_week_number(x))
test['week_number']=test['date'].apply(lambda x:get_week_number(x))
final_test['week_number']=final_test['date'].apply(lambda x:get_week_number(x))

In [19]:
def get_season(x):
    """This function is used to get season in US according to various months"""
    if x in [12,1,2]:
        return 0      #"Winter"
    elif x in [3,4,5]:
        return 1   #"Spring"
    elif x in [6,7,8]:
        return 2   #"Summer"
    else:
        return 3   #"Autumn"

In [20]:
train['season']=train['month'].apply(lambda x:get_season(x))
test['season']=test['month'].apply(lambda x:get_season(x))
final_test['season']=final_test['month'].apply(lambda x:get_season(x))

In [21]:
def check_if_quater_begin(x):
    """This is used to check if day is begining of quater"""
    day=calendar.datetime.date.fromisoformat(x).day
    month=calendar.datetime.date.fromisoformat(x).month
    return 1 if (day==1 and (month in [1,4,7,9])) else 0

In [22]:
train['quater_start']=train['date'].apply(lambda x:check_if_quater_begin(x))
test['quater_start']=test['date'].apply(lambda x:check_if_quater_begin(x))
final_test['quater_start']=final_test['date'].apply(lambda x:check_if_quater_begin(x))

In [23]:
def check_if_quater_end(x):
    """This is used to check if day is end of quater"""
    day=calendar.datetime.date.fromisoformat(x).day
    month=calendar.datetime.date.fromisoformat(x).month
    if (day==31 and month==3) or (day==30 and month==6) or (day==30 and month==9) or (day==31 and month==12):
        return 1
    else:
        return 0

In [24]:
train['quater_end']=train['date'].apply(lambda x:check_if_quater_end(x))
test['quater_end']=test['date'].apply(lambda x:check_if_quater_end(x))
final_test['quater_end']=final_test['date'].apply(lambda x:check_if_quater_end(x))

In [25]:
def month_start(x):
    """This is used to check if day is begining of month"""
    day=calendar.datetime.date.fromisoformat(x).day
    return 1 if day==1 else 0

In [26]:
train['month_start']=train['date'].apply(lambda x:month_start(x))
test['month_start']=test['date'].apply(lambda x:month_start(x))
final_test['month_start']=final_test['date'].apply(lambda x:month_start(x))

In [27]:
def month_end(x):
    """This is used to check if day is end of month"""
    day=calendar.datetime.date.fromisoformat(x).day
    month=calendar.datetime.date.fromisoformat(x).month
    year=calendar.datetime.date.fromisoformat(x).year
    leap_yr=(year%4==0) # to check if it is a leap year
    val=(day==31 and month==1) or (day==29 if leap_yr else day==28) or (day==31 and month==3) or (day==30 and month==4) or\
        (day==31 and month==5) or (day==30 and month==6) or (day==31 and month==7) or (day==31 and month==8) or\
        (day==30 and month==9) or (day==31 and month==10) or (day==30 and month==11) or (day==31 and month==12)
    return 1 if val else 0

In [28]:
train['month_end']=train['date'].apply(lambda x:month_end(x))
test['month_end']=test['date'].apply(lambda x:month_end(x))
final_test['month_end']=final_test['date'].apply(lambda x:month_end(x))

In [29]:
def year_start(x):
    """This is used to check if day is begining of year"""
    day=calendar.datetime.date.fromisoformat(x).day
    month=calendar.datetime.date.fromisoformat(x).month
    return 1 if (day==1 and month==1) else 0

In [30]:
train['year_start']=train['date'].apply(lambda x:year_start(x))
test['year_start']=test['date'].apply(lambda x:year_start(x))
final_test['year_start']=final_test['date'].apply(lambda x:year_start(x))

In [31]:
def year_end(x):
    """This is used to check if day is end of year"""
    day=calendar.datetime.date.fromisoformat(x).day
    month=calendar.datetime.date.fromisoformat(x).month
    return 1 if (day==31 and month==12) else 0

In [32]:
train['year_end']=train['date'].apply(lambda x:year_end(x))
test['year_end']=test['date'].apply(lambda x:year_end(x))
final_test['year_end']=final_test['date'].apply(lambda x:year_end(x))

In [33]:
#Cross Validation data will be used for hyperparameter tuning
cv=train[train['date']>='2016-03-28']
train=train[train['date']<'2016-03-28']

#### Timeseries Features

In [34]:
#Firstly we will create these Direct features for train  and CV test and final test data
# Code to create one large data for all days
gc.collect()
tt=pd.concat([train,cv,test,final_test])
tt.sort_values(['id','date'],inplace=True)
df=tt.pivot_table(index=['item_id','store_id'],columns='date',values='sales')
df.fillna(0,inplace=True)

In [35]:
#Rolling Features
# Here we are taking 28 days shift so as to avoid Data Leakage Problem
for aggregate in ['mean','std']:
    for shif in [28]:
        for r in [7,14,30,60,360]:
            roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)
            dates=roll.columns
            name="roll_"+str(r)+"_shift_"+str(shif)+"_"+aggregate
            roll=roll.astype('float16')
            roll.reset_index(level=[0,1],inplace=True)
            roll=pd.melt(roll,id_vars=['item_id','store_id'],value_vars=dates,var_name='date',value_name=name)
            roll.fillna(-1,inplace=True)
            train=train.merge(roll,on=['item_id','store_id','date'])
            cv=cv.merge(roll,on=['item_id','store_id','date'])
            final_test=final_test.merge(roll,on=['item_id','store_id','date'])
            test=test.merge(roll,on=['item_id','store_id','date'])
            print("Feature created named :=",name)
            del roll
            gc.collect()

  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_7_shift_28_mean


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_14_shift_28_mean


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_30_shift_28_mean


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_60_shift_28_mean


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_360_shift_28_mean


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_7_shift_28_std


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_14_shift_28_std


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_30_shift_28_std


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_60_shift_28_std


  roll=df.rolling(r,axis=1).agg(aggregate).shift(shif)


Feature created named := roll_360_shift_28_std


In [36]:
# Adding  Exponential weighted average with shift of 28 days
# Shift of 28 days is used to prevent data leakage Problem
roll=df.shift(28,axis=1).ewm(alpha=0.99,axis=1,adjust=False).mean()
dates=roll.columns
roll=roll.astype('float16')
roll.reset_index(level=[0,1],inplace=True)
roll=pd.melt(roll,id_vars=['item_id','store_id'],value_vars=dates,var_name='date',value_name='direct_ewm')
roll.fillna(-1,inplace=True)
train=train.merge(roll,on=['item_id','store_id','date'])
cv=cv.merge(roll,on=['item_id','store_id','date'])
test=test.merge(roll,on=['item_id','store_id','date'])
final_test=final_test.merge(roll,on=['item_id','store_id','date'])
print("Direct Feature created ewa window of size")

  roll=df.shift(28,axis=1).ewm(alpha=0.99,axis=1,adjust=False).mean()


Direct Feature created ewa window of size


In [37]:
# Now we will also calculate lag features with lag of 28,35,42,49,56,63,70,77,84,91,98 days
for lag in range(28,100,7):
    i='direct_lag_'+str(lag)
    lag_i=df.shift(lag,axis=1)
    dates=lag_i.columns
    lag_i.reset_index(level=[0,1],inplace=True)
    lag_i=pd.melt(lag_i,id_vars=['item_id','store_id'],value_vars=dates,var_name='date',value_name=i)
    lag_i.fillna(-1,inplace=True)
    lag_i[i]=lag_i[i].astype('int16')
    train=train.merge(lag_i,on=['item_id','store_id','date'])
    cv=cv.merge(lag_i,on=['item_id','store_id','date'])
    test=test.merge(lag_i,on=['item_id','store_id','date'])
    final_test=final_test.merge(lag_i,on=['item_id','store_id','date'])
    print("Feature created for lag",lag)
    del lag_i
    gc.collect()

Feature created for lag 28
Feature created for lag 35
Feature created for lag 42
Feature created for lag 49
Feature created for lag 56
Feature created for lag 63
Feature created for lag 70
Feature created for lag 77
Feature created for lag 84
Feature created for lag 91


  lag_i.reset_index(level=[0,1],inplace=True)


Feature created for lag 98


In [42]:
# Saving the final feature engineered data
train.to_feather('Intermediate Data/train1.feather')
cv.to_feather('Intermediate Data/cv1.feather')
test.to_feather('Intermediate Data/test1.feather')
final_test.to_feather('Intermediate Data/final_test1.feather')

In [41]:
# Saving the final feature engineered data
train.to_csv('Intermediate Data/train1.csv')
cv.to_csv('Intermediate Data/cv1.csv')
test.to_csv('Intermediate Data/test1.csv')
final_test.to_csv('Intermediate Data/final_test1.csv')