In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.




In [2]:
#importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn import preprocessing
from sklearn import utils
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import eli5
from eli5.sklearn import PermutationImportance
import category_encoders as ce
import pickle
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV



In [3]:
#reading data from csv files
train_df = pd.read_csv('train.csv')
store_df = pd.read_csv('stores.csv')
feature_df = pd.read_csv('features.csv')

PREPROCESSING

In [4]:
#merging data and filling null values
feature_store = feature_df.merge(store_df, how='inner', on = "Store").reset_index(drop=True)
feature_store.head()
train = train_df.merge(feature_store, how='inner', on = ['Store','Date','IsHoliday']).sort_values(by=['Store','Dept','Date']).reset_index(drop=True)
train.head()

train.fillna(0,inplace=True)
li = [x for x in train.columns if 'MarkDown' in x ]
print(li)
train['isMarkdown'] = train[li].sum(axis=1).astype('bool')


['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']


In [5]:
#date transformation function
def date_transformer(data_frame, date_column):
    data_frame[date_column]=pd.to_datetime(train['Date'])
    data_frame['Day'] = data_frame['Date'].dt.day
    data_frame['Week'] = data_frame['Date'].dt.isocalendar().week
    data_frame['Month'] = data_frame['Date'].dt.month
    data_frame['Year'] = data_frame['Date'].dt.year
    return data_frame
date_transformer(train,"Date")

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,MarkDown5,CPI,Unemployment,Type,Size,isMarkdown,Day,Week,Month,Year
0,1,1,2010-02-05,24924.50,False,42.31,2.572,0.00,0.00,0.00,...,0.00,211.096358,8.106,A,151315,False,5,5,2,2010
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.00,0.00,0.00,...,0.00,211.242170,8.106,A,151315,False,12,6,2,2010
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.00,0.00,0.00,...,0.00,211.289143,8.106,A,151315,False,19,7,2,2010
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.00,0.00,0.00,...,0.00,211.319643,8.106,A,151315,False,26,8,2,2010
4,1,1,2010-03-05,21827.90,False,46.50,2.625,0.00,0.00,0.00,...,0.00,211.350143,8.106,A,151315,False,5,9,3,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False,64.88,3.997,4556.61,20.64,1.50,...,3288.25,192.013558,8.684,B,118221,True,28,39,9,2012
421566,45,98,2012-10-05,628.10,False,64.89,3.985,5046.74,0.00,18.82,...,2340.01,192.170412,8.667,B,118221,True,5,40,10,2012
421567,45,98,2012-10-12,1061.02,False,54.47,4.000,1956.28,0.00,7.89,...,3990.54,192.327265,8.667,B,118221,True,12,41,10,2012
421568,45,98,2012-10-19,760.01,False,56.47,3.969,2004.02,0.00,3.18,...,1537.49,192.330854,8.667,B,118221,True,19,42,10,2012


In [26]:
#one-hot-encoding
train[['Type_A','Type_B','Type_C']] = pd.get_dummies(train['Type'])
train['Week'] = train['Week'].astype('int')
train['Year']=train['Year'].astype('int')


In [39]:
#markdowns summation
train['MarkdownSum']=  train['MarkDown1'] + train['MarkDown2'] + train['MarkDown3'] + train['MarkDown4'] + train['MarkDown5'] 
features = [feature for feature in train.columns if feature not in ('Date','Weekly_Sales','Month','Day','Type')]
X = train[features].copy()
y = train[['Weekly_Sales']].copy()
X_train, X_test, y_train , y_test = train_test_split(X,y,test_size=.2)



In [8]:
#XGB regressor
gbm = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=400, max_depth=15, learning_rate=0.35)
gbm.fit(X_train,y_train)
train

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Size,isMarkdown,Day,Week,Month,Year,Type_A,Type_B,Type_C,MarkdownSum
0,1,1,2010-02-05,24924.50,False,42.31,2.572,0.00,0.00,0.00,...,151315,False,5,5,2,2010,1,0,0,0.00
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.00,0.00,0.00,...,151315,False,12,6,2,2010,1,0,0,0.00
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.00,0.00,0.00,...,151315,False,19,7,2,2010,1,0,0,0.00
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.00,0.00,0.00,...,151315,False,26,8,2,2010,1,0,0,0.00
4,1,1,2010-03-05,21827.90,False,46.50,2.625,0.00,0.00,0.00,...,151315,False,5,9,3,2010,1,0,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False,64.88,3.997,4556.61,20.64,1.50,...,118221,True,28,39,9,2012,0,1,0,9468.01
421566,45,98,2012-10-05,628.10,False,64.89,3.985,5046.74,0.00,18.82,...,118221,True,5,40,10,2012,0,1,0,9659.00
421567,45,98,2012-10-12,1061.02,False,54.47,4.000,1956.28,0.00,7.89,...,118221,True,12,41,10,2012,0,1,0,6554.03
421568,45,98,2012-10-19,760.01,False,56.47,3.969,2004.02,0.00,3.18,...,118221,True,19,42,10,2012,0,1,0,3982.42


In [9]:
#permutation importance


perm = PermutationImportance(gbm, random_state=1).fit(X_train, y_train)
features = eli5.show_weights(perm, top=len(X_train.columns), feature_names = X_test.columns.tolist())

In [10]:
#feature weights
features_weights = eli5.show_weights(perm, top=len(X_train.columns), feature_names = X_test.columns.tolist())
features_weights

Weight,Feature
1.8037  ± 0.0096,Dept
0.5489  ± 0.0059,Size
0.1503  ± 0.0019,Store
0.1444  ± 0.0051,Week
0.0702  ± 0.0022,CPI
0.0296  ± 0.0006,Type_B
0.0207  ± 0.0007,Temperature
0.0149  ± 0.0004,Unemployment
0.0084  ± 0.0005,Type_A
0.0070  ± 0.0001,Fuel_Price


In [54]:
#binary encoding
encoder = ce.BinaryEncoder(cols=['Dept'],return_df=True).fit(train['Dept'])
enc = encoder.transform(train['Dept'])
train = train.join(enc)
pickle.dump(encoder, open('enc.pkl','wb'))

In [40]:
#train-test-split
X = train[['Dept_0','Dept_1','Dept_2','Dept_3','Dept_4','Dept_5','Dept_6','Store','Size',"Week",'Year','IsHoliday','Type_A','Type_B','Type_C','Temperature','Fuel_Price','Unemployment','CPI']]
y = train['Weekly_Sales']
print (X.shape,y.shape)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30, random_state=0) # perform 80-20 split on dataset
display(X)

(421570, 19) (421570,)


Unnamed: 0,Dept_0,Dept_1,Dept_2,Dept_3,Dept_4,Dept_5,Dept_6,Store,Size,Week,Year,IsHoliday,Type_A,Type_B,Type_C,Temperature,Fuel_Price,Unemployment,CPI
0,0,0,0,0,0,0,1,1,151315,5,2010,False,1,0,0,42.31,2.572,8.106,211.096358
1,0,0,0,0,0,0,1,1,151315,6,2010,True,1,0,0,38.51,2.548,8.106,211.242170
2,0,0,0,0,0,0,1,1,151315,7,2010,False,1,0,0,39.93,2.514,8.106,211.289143
3,0,0,0,0,0,0,1,1,151315,8,2010,False,1,0,0,46.63,2.561,8.106,211.319643
4,0,0,0,0,0,0,1,1,151315,9,2010,False,1,0,0,46.50,2.625,8.106,211.350143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,1,0,0,1,1,0,0,45,118221,39,2012,False,0,1,0,64.88,3.997,8.684,192.013558
421566,1,0,0,1,1,0,0,45,118221,40,2012,False,0,1,0,64.89,3.985,8.667,192.170412
421567,1,0,0,1,1,0,0,45,118221,41,2012,False,0,1,0,54.47,4.000,8.667,192.327265
421568,1,0,0,1,1,0,0,45,118221,42,2012,False,0,1,0,56.47,3.969,8.667,192.330854


In [13]:
#function for binary+one hot encoding
def enc(x = [[1,2,False,42.31,2.572,211.096358,8.106,'A',151315,5,2010]]):
    df1 = pd.DataFrame(x,columns=['Store','Dept','IsHoliday','Temperature','Fuel_Price','CPI','Unemployment','Type','Size','Week','Year'])
    encoded_dept = encoder.transform(df1['Dept'])
    df1 = df1.join(encoded_dept)
    df1 = pd.get_dummies(df1)
    df1 = df1.reindex(columns = X.columns, fill_value=0)
    df1 = df1[['Dept_0','Dept_1','Dept_2','Dept_3','Dept_4','Dept_5','Dept_6','Store','Size',"Week",'IsHoliday','Type_A','Type_B','Type_C','Temperature','Fuel_Price','Unemployment','CPI']]
    return df1

OPTIMIZATION

In [14]:
#searching for best parameters
params = {'n_estimators':np.arange(100,200,10),'max_depth':np.arange(5,15),'min_samples_split':np.arange(1,10)}
cv2 = RandomizedSearchCV(RandomForestRegressor(),params,cv=4,n_iter=25)
cv2.fit(X_train, y_train)

12 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\LENOVO\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\LENOVO\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "c:\Users\LENOVO\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Users\LENOVO\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\LENOVO\

RandomizedSearchCV(cv=4, estimator=RandomForestRegressor(), n_iter=25,
                   param_distributions={'max_depth': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'n_estimators': array([100, 110, 120, 130, 140, 150, 160, 170, 180, 190])})

In [27]:
cv2.best_params_

{'n_estimators': 180, 'min_samples_split': 8, 'max_depth': 14}

FINAL MODEL

In [42]:
#random forest model

model4=RandomForestRegressor(n_estimators=180,max_depth=14,min_samples_split=8)
model4.fit(X_train,y_train)
print(model4.score(X_test,y_test))
print(model4.score(X_train,y_train))


0.9513057234140718
0.9649820773801129


PICKLE

In [55]:
pickle.dump(model4, open('model.pkl','wb'))

------------------------------------------PART 2---------------------------------------

In [46]:
#re-reading the data
train_df = pd.read_csv('train.csv')
store_df = pd.read_csv('stores.csv')
feature_df = pd.read_csv('features.csv')

In [47]:
#re-merginng and filling null values
feature_store = feature_df.merge(store_df, how='inner', on = "Store").reset_index(drop=True)
train = train_df.merge(feature_store, how='inner', on = ['Store','Date','IsHoliday']).sort_values(by=['Store','Dept','Date']).reset_index(drop=True)
train.fillna(0,inplace=True)


In [48]:
#calling date_transformer function
date_transformer(train,"Date")

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Day,Week,Month,Year
0,1,1,2010-02-05,24924.50,False,42.31,2.572,0.00,0.00,0.00,0.00,0.00,211.096358,8.106,A,151315,5,5,2,2010
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.00,0.00,0.00,0.00,0.00,211.242170,8.106,A,151315,12,6,2,2010
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.00,0.00,0.00,0.00,0.00,211.289143,8.106,A,151315,19,7,2,2010
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.00,0.00,0.00,0.00,0.00,211.319643,8.106,A,151315,26,8,2,2010
4,1,1,2010-03-05,21827.90,False,46.50,2.625,0.00,0.00,0.00,0.00,0.00,211.350143,8.106,A,151315,5,9,3,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False,64.88,3.997,4556.61,20.64,1.50,1601.01,3288.25,192.013558,8.684,B,118221,28,39,9,2012
421566,45,98,2012-10-05,628.10,False,64.89,3.985,5046.74,0.00,18.82,2253.43,2340.01,192.170412,8.667,B,118221,5,40,10,2012
421567,45,98,2012-10-12,1061.02,False,54.47,4.000,1956.28,0.00,7.89,599.32,3990.54,192.327265,8.667,B,118221,12,41,10,2012
421568,45,98,2012-10-19,760.01,False,56.47,3.969,2004.02,0.00,3.18,437.73,1537.49,192.330854,8.667,B,118221,19,42,10,2012


In [49]:
#preparing the data frame that will be used in the model
train_sum = train.groupby(by = ["Store","Year","Week"]).sum()
train_sum = train_sum[["Weekly_Sales"]]
train_sum.rename(columns = {'Weekly_Sales':'Weekly sales for store'}, inplace = True)
feature_1 = date_transformer(feature_df,"Date")

feature_new_weeklysales = feature_1.merge(train_sum, how='inner', on = ["Store","Week","Year"]).reset_index(drop=True)
feature_new_weeklysales = feature_new_weeklysales.merge(store_df, how='inner', on = ["Store"]).reset_index(drop=True)
feature_new_weeklysales.fillna(0,inplace=True)
feature_new_weeklysales = pd.get_dummies(feature_new_weeklysales) # one hot encoding


In [67]:
#function for combining the sum of a set of features 
def join_sum(data_frame,list_of_columns):
    df_md = data_frame[list_of_columns]
    Markdown_sum = df_md.sum(axis=1)
    data_frame["Markdown_sum"] = Markdown_sum

list_col = ["MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"]
join_sum(feature_new_weeklysales,list_col)

X = feature_new_weeklysales[['Store','Size',"Week",'IsHoliday','Type_A', 'Type_B','Type_C','Temperature','Fuel_Price','Unemployment','CPI','Year']]
y = feature_new_weeklysales['Weekly sales for store']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30, random_state=50) # perform 80-20 split on dataset


In [60]:
X

Unnamed: 0,Size,Week,IsHoliday,Type_A,Type_B,Type_C,Temperature,Fuel_Price,Unemployment,CPI,Year
0,151315,5,False,1,0,0,42.31,2.572,8.106,211.096358,2010
1,151315,5,False,1,0,0,55.32,3.386,6.573,223.462779,2010
2,151315,6,True,1,0,0,38.51,2.548,8.106,211.242170,2010
3,151315,6,False,1,0,0,61.24,3.314,6.573,223.481307,2010
4,151315,7,False,1,0,0,39.93,2.514,8.106,211.289143,2010
...,...,...,...,...,...,...,...,...,...,...,...
8185,118221,16,False,0,1,0,64.88,3.997,8.684,192.013558,2011
8186,118221,17,False,0,1,0,64.89,3.985,8.667,192.170412,2011
8187,118221,18,False,0,1,0,54.47,4.000,8.667,192.327265,2011
8188,118221,19,False,0,1,0,56.47,3.969,8.667,192.330854,2011


In [58]:
y

0       1643690.90
1       1643690.90
2       1641957.44
3       1641957.44
4       1611968.17
           ...    
8185     813630.44
8186     786561.61
8187     810150.64
8188     793889.10
8189     727163.67
Name: Weekly sales for store, Length: 8190, dtype: float64

OPTIMIZATION

In [22]:
#defining a function as a grid search to optamize for a set of parameters

def grid(estimator_Reg, list_parameters, X_for_training, y_for_training, n_cv = 5, scoring_type = "r2"):
    grid_DTR = GridSearchCV(estimator= estimator_Reg, param_grid= list_parameters, cv = n_cv , scoring= scoring_type)   
    grid_DTR.fit(X_for_training,y_for_training)
    print (grid_DTR.best_score_)
    print (grid_DTR.best_params_)
    return grid_DTR



list_para = [{"max_depth": np.arange(10,50,3), 'n_estimators': np.arange(5,30,5), 'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']}]


In [68]:
#training the model with the best parameters 
list_para = [{"max_depth": [10], 'n_estimators': [100], 'criterion': [ 'friedman_mse']}]
grid(RandomForestRegressor(), list_para,X_train,y_train)

0.9664110319169001
{'criterion': 'friedman_mse', 'max_depth': 10, 'n_estimators': 100}


GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'criterion': ['friedman_mse'], 'max_depth': [10],
                          'n_estimators': [100]}],
             scoring='r2')

In [69]:
#training the model with the best parameters 
list_para = [{"max_depth": [10], 'n_estimators': [100], 'criterion': [ 'friedman_mse']}]
grid_mod = grid(RandomForestRegressor(), list_para,X_train,y_train)
print(grid_mod.score(X_test,y_test))
print(grid_mod.score(X_train,y_train))

0.9662770138257961
{'criterion': 'friedman_mse', 'max_depth': 10, 'n_estimators': 100}
0.9674943020258232
0.9871304618531465


In [70]:
import pickle
pickle.dump(grid_mod, open('model2.pkl','wb'))