## Stacking Model

In [1]:
import warnings
import pickle
import random

import pandas as pd
import numpy as np

SEED = 42
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
random.seed(SEED)

### Data preprocessing

*Our stacking process consists of the following steps:*
1. Use the predictions from all the models we created (light GBM, RNN, XGBoost and ensemble model) as features
2. Split the validation and evaluation parts of dataset. As the ground truth for validation part was available in M5 dataset, we added it to the validation part. This process created the training dataset comprising of unit sales prediction from all 7 models as features and ground truth of unit sales as response
3. Train this validation dataset using XGBoost and Light GBM
4. Predict the values for evaluation part
5. Merge the prediction from step 4 with the validation part of the best model outcome we created

Creating training and testing datasets

In [2]:
df1 = pd.read_csv('dept_lgb_pred_v1.csv')
df2 = pd.read_csv('ensemble_v7.csv')
df3 = pd.read_csv('GRU_v3.csv')
df4 = pd.read_csv('state_cat_lgb_pred_v1.csv')
df5 = pd.read_csv('store_cat_lgb_pred_v1.csv')
df6 = pd.read_csv('store_lgb_pred_v2.csv')
df7 = pd.read_csv('store_xgb_pred.csv')

In [3]:
df1_w = pd.melt(df1, id_vars='id')
df2_w = pd.melt(df2, id_vars='id')
df3_w = pd.melt(df3, id_vars='id')
df4_w = pd.melt(df4, id_vars='id')
df5_w = pd.melt(df5, id_vars='id')
df6_w = pd.melt(df6, id_vars='id')
df7_w = pd.melt(df7, id_vars='id')

In [4]:
df1_w['type'] = df1_w['id'].apply(lambda x: x.split('_')[-1])
df2_w['type'] = df2_w['id'].apply(lambda x: x.split('_')[-1])
df3_w['type'] = df3_w['id'].apply(lambda x: x.split('_')[-1])
df4_w['type'] = df4_w['id'].apply(lambda x: x.split('_')[-1])
df5_w['type'] = df5_w['id'].apply(lambda x: x.split('_')[-1])
df6_w['type'] = df6_w['id'].apply(lambda x: x.split('_')[-1])
df7_w['type'] = df7_w['id'].apply(lambda x: x.split('_')[-1])

In [5]:
df1_v = df1_w[df1_w['type']=='validation']
df2_v = df2_w[df2_w['type']=='validation']
df3_v = df3_w[df3_w['type']=='validation']
df4_v = df4_w[df4_w['type']=='validation']
df5_v = df5_w[df5_w['type']=='validation']
df6_v = df6_w[df6_w['type']=='validation']
df7_v = df7_w[df7_w['type']=='validation']

df1_t = df1_w[df1_w['type']=='evaluation']
df2_t = df2_w[df2_w['type']=='evaluation']
df3_t = df3_w[df3_w['type']=='evaluation']
df4_t = df4_w[df4_w['type']=='evaluation']
df5_t = df5_w[df5_w['type']=='evaluation']
df6_t = df6_w[df6_w['type']=='evaluation']
df7_t = df7_w[df7_w['type']=='evaluation']

In [6]:
df1_v = df1_v[['id', 'variable', 'value']]
df2_v = df2_v[['id', 'variable', 'value']]
df3_v = df3_v[['id', 'variable', 'value']]
df4_v = df4_v[['id', 'variable', 'value']]
df5_v = df5_v[['id', 'variable', 'value']]
df6_v = df6_v[['id', 'variable', 'value']]
df7_v = df7_v[['id', 'variable', 'value']]

df1_t = df1_t[['id', 'variable', 'value']]
df2_t = df2_t[['id', 'variable', 'value']]
df3_t = df3_t[['id', 'variable', 'value']]
df4_t = df4_t[['id', 'variable', 'value']]
df5_t = df5_t[['id', 'variable', 'value']]
df6_t = df6_t[['id', 'variable', 'value']]
df7_t = df7_t[['id', 'variable', 'value']]

In [7]:
df_v = df1_v.merge(df2_v, on=['id', 'variable'])
df_v = df_v.merge(df3_v, on=['id', 'variable'])
df_v = df_v.merge(df4_v, on=['id', 'variable'])
df_v = df_v.merge(df5_v, on=['id', 'variable'])
df_v = df_v.merge(df6_v, on=['id', 'variable'])
df_v = df_v.merge(df7_v, on=['id', 'variable'])

df_t = df1_t.merge(df2_t, on=['id', 'variable'])
df_t = df_t.merge(df3_t, on=['id', 'variable'])
df_t = df_t.merge(df4_t, on=['id', 'variable'])
df_t = df_t.merge(df5_t, on=['id', 'variable'])
df_t = df_t.merge(df6_t, on=['id', 'variable'])
df_t = df_t.merge(df7_t, on=['id', 'variable'])

In [8]:
df_v.columns = ['id', 'variable', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']
df_t.columns = ['id', 'variable', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']

In [9]:
df_v['d_int'] = df_v['variable'].apply(lambda x:int(x[1:]))
df_t['d_int'] = df_t['variable'].apply(lambda x:int(x[1:]))

In [10]:
df_v['d'] = df_v['d_int'].apply(lambda x: 'd_'+str(int(x)+1913))
df_t['d'] = df_t['d_int'].apply(lambda x: 'd_'+str(int(x)+1913))

In [11]:
df_v['d_int'] = df_v['d_int'] + 1913
df_t['d_int'] = df_t['d_int'] + 1913

Adding ground truth for validation dataset

In [12]:
gt = pd.read_pickle('base_sales.pkl')

In [13]:
gt['d_int'] = gt['d'].apply(lambda x:int(x[2:]))

In [14]:
gta = gt[gt['d_int']>1913]

In [15]:
gta = gta[['id', 'd', 'd_int', 'units_sold']]

In [16]:
gta['id'] = gta['id'].apply(lambda x:x[:-11])

In [17]:
gta['id'] = gta['id'].apply(lambda x:x+'_validation')

In [18]:
gta['variable'] = gta['d_int'].apply(lambda x:'F'+str(x-1913))

In [19]:
training_model = df_v.merge(gta, on=['id', 'd', 'd_int', 'variable'])

In [20]:
training_model.head()

Unnamed: 0,id,variable,m1,m2,m3,m4,m5,m6,m7,d_int,d,units_sold
0,HOBBIES_1_001_CA_1_validation,F1,0.763622,0.82241,0.998859,0.803111,0.766149,0.768833,0.726918,1914,d_1914,0.0
1,HOBBIES_1_002_CA_1_validation,F1,0.290976,0.252726,0.328968,0.297042,0.273581,0.324216,0.327158,1914,d_1914,0.0
2,HOBBIES_1_003_CA_1_validation,F1,0.328426,0.481968,0.82313,0.381381,0.440857,0.361297,0.354969,1914,d_1914,0.0
3,HOBBIES_1_004_CA_1_validation,F1,1.804192,1.606938,2.134121,1.72868,1.657468,1.732153,1.778546,1914,d_1914,0.0
4,HOBBIES_1_005_CA_1_validation,F1,0.963251,0.95521,1.302828,1.012629,1.083892,0.900793,0.958628,1914,d_1914,1.0


Saving training dataset

In [21]:
training_model.to_csv('training_model.csv', index=False)

### Model building and tuning

- We used Randomized Search cross validation for hyperparameter tuning
- Used XGBoost and Light GBM models

In [22]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict, GridSearchCV, KFold
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

In [23]:
training_model.head()

Unnamed: 0,id,variable,m1,m2,m3,m4,m5,m6,m7,d_int,d,units_sold
0,HOBBIES_1_001_CA_1_validation,F1,0.763622,0.82241,0.998859,0.803111,0.766149,0.768833,0.726918,1914,d_1914,0.0
1,HOBBIES_1_002_CA_1_validation,F1,0.290976,0.252726,0.328968,0.297042,0.273581,0.324216,0.327158,1914,d_1914,0.0
2,HOBBIES_1_003_CA_1_validation,F1,0.328426,0.481968,0.82313,0.381381,0.440857,0.361297,0.354969,1914,d_1914,0.0
3,HOBBIES_1_004_CA_1_validation,F1,1.804192,1.606938,2.134121,1.72868,1.657468,1.732153,1.778546,1914,d_1914,0.0
4,HOBBIES_1_005_CA_1_validation,F1,0.963251,0.95521,1.302828,1.012629,1.083892,0.900793,0.958628,1914,d_1914,1.0


In [24]:
df_t.head()

Unnamed: 0,id,variable,m1,m2,m3,m4,m5,m6,m7,d_int,d
0,HOBBIES_1_001_CA_1_evaluation,F1,0.777035,0.825341,1.209278,0.767291,0.712727,0.701686,0.62472,1914,d_1914
1,HOBBIES_1_002_CA_1_evaluation,F1,0.213887,0.281647,0.422702,0.215666,0.212461,0.233253,0.216344,1914,d_1914
2,HOBBIES_1_003_CA_1_evaluation,F1,0.435674,0.504139,0.839649,0.519991,0.541638,0.431166,0.508959,1914,d_1914
3,HOBBIES_1_004_CA_1_evaluation,F1,1.694604,1.453415,1.804767,1.651264,1.510755,1.726345,1.582032,1914,d_1914
4,HOBBIES_1_005_CA_1_evaluation,F1,1.193925,0.988548,1.24972,1.146886,1.119197,1.127674,1.142516,1914,d_1914


In [25]:
# Spliting the data
X = training_model.iloc[:, 2:10]
Y = training_model.iloc[:, 11]

#### XGBoost Model

###### Hyperparameter tuning:

1. n_estimators – Number of boosted trees to fit
2. max_depth - limits the number of nodes in the tree and the best value depends on the interaction of the input variables
3. learning_rate - how much the contribution of each tree will shrink
4. min_child_weight - minimum sum of instance weight (hessian) needed in a child
5. booster - booster you have chosen

In [1]:
# Set up possible values of parameters to optimize over
booster=['gbtree','gblinear']
base_score=[0.25,0.5,0.75,1]
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]

# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': n_estimators,
    'booster':booster,
    'base_score':base_score,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight
    }

clf = XGBRegressor()

grid_search = RandomizedSearchCV(clf, param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X, Y)
grid_search.best_estimator_

In [29]:
# We start with initializing our classifier. 
clf = XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=5, max_leaves=0, min_child_weight=1, missing=np.nan,
             monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0)

#### Light GBM Model

###### Hyperparameter tuning:

1. Boosting_type – ‘gbdt’, traditional Gradient Boosting Decision Tree. ‘dart’, Dropouts meet Multiple Additive Regression Trees. ‘goss’, Gradient-based One-Side Sampling. ‘rf’, Random Forest

2. Num_leaves – Maximum tree leaves for base learners

3. Max_depth – Maximum tree depth for base learners, <=0 means no limit

4. Learning_rate – Boosting learning rate

5. n_estimators – Number of boosted trees to fit

6. Min_child_weight – Minimum sum of instance weight (Hessian) needed in a child (leaf)

In [30]:
# Set up possible values of parameters to optimize over
boosting_type=['gbdt','dart', 'goss', 'rf']
num_leaves=[5,10,15,20,25]
max_depth=[3,5,7,9,11]
learning_rate=[0.01, 0.03, 0.06, 0.09, 0.12]
n_estimators=[20,40,60,80,100,120]
min_child_weight=[0.001, 0.01, 0.1, 1, 10, 100]

# Define the grid of hyperparameters to search
param_grid = {
    'boosting_type': boosting_type,
    'num_leaves':num_leaves,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'n_estimators':n_estimators,
    'min_child_weight':min_child_weight
    }

clf = lgb.LGBMRegressor()

grid_search = RandomizedSearchCV(clf, param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X, Y)
grid_search.best_estimator_

LGBMRegressor(boosting_type='goss', learning_rate=0.12, max_depth=9,
              min_child_weight=100, n_estimators=60, num_leaves=20)

In [31]:
# We start with initializing our classifier. 
clf = lgb.LGBMRegressor(boosting_type='goss', learning_rate=0.12, max_depth=9,
              min_child_weight=100, n_estimators=60, num_leaves=20)

Fitting the data

In [32]:
clf = clf.fit(X,Y)

Making prediction

In [33]:
# Spliting for testing:
Z = df_t.iloc[:, 2:10]

In [34]:
y_pred = clf.predict(Z)

In [35]:
df_t['units_sold'] = y_pred

In [36]:
sub = df_t[['id', 'variable', 'units_sold']]

In [37]:
df2_w['id'] = df2_w['id'].apply(lambda x:x[:-11])
df2_w['id'] = df2_w['id'].apply(lambda x:x + '_validation')
df2_w = df2_w[['id', 'variable', 'value']]
df2_w.columns = ['id', 'variable', 'units_sold']
sub2 = df2_w
sub2.head()

Unnamed: 0,id,variable,units_sold
0,FOODS_1_001_CA_1_validation,F1,0.667769
1,FOODS_1_001_CA_1_validation,F1,0.833985
2,FOODS_1_001_CA_2_validation,F1,0.877909
3,FOODS_1_001_CA_2_validation,F1,0.888873
4,FOODS_1_001_CA_3_validation,F1,0.637286


In [38]:
ss = pd.concat([sub, sub2])

In [39]:
subW = pd.pivot_table(ss, index=['id'],
              columns=['variable']).reset_index()

In [40]:
subW.columns = ['id', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16',
                'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28']

In [41]:
subW.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_evaluation,0.478523,0.534348,0.414144,0.933272,0.949845,0.749429,0.642199,0.579755,0.543261,...,0.594517,0.711089,0.721609,0.46954,0.522296,0.622431,0.697918,0.546335,0.475311,0.502994
1,FOODS_1_001_CA_1_validation,0.750877,0.86397,0.656228,1.184584,1.169438,0.914786,0.876346,0.852332,0.791866,...,0.81868,0.953143,0.841315,0.690597,0.715093,0.840486,0.932053,0.901149,0.80145,0.899201
2,FOODS_1_001_CA_2_evaluation,0.613527,1.350456,0.849205,1.411943,1.387238,1.180179,1.222913,1.460685,1.322387,...,1.069855,1.013084,0.925004,0.658377,0.797145,0.631974,0.84114,0.883675,1.333359,0.89064
3,FOODS_1_001_CA_2_validation,0.883391,1.110109,0.968824,1.183642,1.316519,1.227837,1.083409,1.194896,1.113735,...,0.995352,1.218162,1.139178,0.903968,0.893613,0.83053,1.189422,1.28015,1.169404,1.006378
4,FOODS_1_001_CA_3_evaluation,0.367909,0.444788,0.578945,0.575849,0.850895,1.415541,0.476489,0.339994,0.459453,...,0.863357,0.985784,1.462307,0.420665,0.407006,0.726849,1.038272,1.276933,0.806201,0.521896


Saving submission file

In [42]:
subW.to_csv('submissionF.csv', index=False)