### In this Notebook, we will consider three learning models predict Sales for each Item in each Outlet

### Namely, the three models are:
1. Linear Regression model.
2. KNN regression model.
3. Decision tree regression model.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, validation_curve, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from statsmodels.regression.linear_model import OLS, OLSResults, add_constant
from sklearn.externals import joblib

%matplotlib inline

In [2]:
train = pd.read_csv('train_engineered.csv')
test = pd.read_csv('test_engineered.csv')

In [3]:
features = train.drop(columns=['Item_Identifier', 'Outlet_Identifier','Item_Outlet_Sales'])
response = train[['Item_Outlet_Sales']]

In [4]:
#Function to calculate the Adjusted R squared
def r_sq_adj(y_true, y_pred, p):
    r = r2_score(y_true, y_pred)
    n = y_true.shape[0]
    return 1 - (1-r) * ((n - 1)/(n-p-1))

## Baseline Model - Model 0

### In this model, the predicted sales for each item is the mean sales for this item across all Outlets.

In [5]:
mean_sales_type = train.pivot_table(values= 'Item_Outlet_Sales', index= 'Item_Identifier', aggfunc='mean')

In [6]:
pred0 = test.loc[:,'Item_Identifier'].apply(lambda x: mean_sales_type.loc[x])

In [7]:
model0 = pd.concat([test.Item_Identifier, test.Outlet_Identifier, pred0],axis=1)

In [8]:
model0.to_csv('model0.csv', index=False)

## K-Fold Cross Validation

In [9]:
kf = KFold(n_splits = 10)

In [10]:
for train_index, test_index in kf.split(features, y= response.values.reshape(-1)):
    x_train, y_train = features.loc[train_index], response.loc[train_index]
    x_test, y_test = features.loc[test_index], response.loc[test_index]

## Linear Regression - Model 1

### Including all features in training.

In [11]:
lr = LinearRegression()

In [12]:
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:
y_pred = lr.predict(x_test)

In [14]:
RMSE_lr = np.round(np.sqrt(mean_squared_error(y_test, y_pred)),3)

In [15]:
R_Sq_Adj_lr = np.round(np.sqrt(r_sq_adj(y_test, y_pred, features.shape[1])),3)

In [16]:
print('RMSE: {}, Adj R-Sq: {}'.format(RMSE_lr, R_Sq_Adj_lr))

RMSE: 1143.853, Adj R-Sq: 0.728


Exporting submission file.

In [None]:
y_pred_sub = lr.predict(test.drop(columns=['Item_Identifier','Outlet_Identifier']))

In [None]:
y_pred_df = pd.DataFrame(y_pred_sub,columns=['Item_Outlet_Sales'])

In [None]:
LR_model = pd.concat([test.Item_Identifier, test.Outlet_Identifier, y_pred_df],axis=1)

In [None]:
LR_model.to_csv('Linear_Regression_Model.csv', index=False)

## KNN - Model 2

In [None]:
knn = KNeighborsRegressor()

### Grid Search Cross Validation is used to optimize the paramters of KNN. 

In [None]:
param_grid = dict({'n_neighbors':list(range(3,16)),'weights':['uniform','distance'],'algorithm':['ball_tree','kd_tree', 'brute'],'p':[1,2]})

In [None]:
GSCV = GridSearchCV(knn, param_grid, scoring='neg_mean_squared_error', cv= kf)

In [None]:
GSCV.fit(features,y = response.values.reshape(-1))

In [None]:
joblib.dump(GSCV,'knn_fitted.joblib')

In [17]:
knn_fitted = joblib.load('knn_fitted.joblib')

In [18]:
knn_fitted.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}

In [19]:
RMSE_knn= np.round(np.sqrt(np.abs(knn_fitted.best_score_)),3)

In [20]:
y_pred = knn_fitted.predict(x_test)

In [21]:
R_Sq_Adj_knn = np.round(np.sqrt(r_sq_adj(y_test, y_pred, features.shape[1])),3)

In [22]:
print('RMSE: {}, Adj R-Sq: {}'.format(RMSE_knn, R_Sq_Adj_knn))

RMSE: 1128.175, Adj R-Sq: 0.776


Exporting submission file.

In [None]:
y_pred_sub = GSCV.predict(test.drop(columns=['Item_Identifier','Outlet_Identifier']))

In [None]:
y_pred_df = pd.DataFrame(y_pred_sub,columns=['Item_Outlet_Sales'])

In [None]:
KNN_model = pd.concat([test.Item_Identifier, test.Outlet_Identifier, y_pred_df],axis=1)

In [None]:
KNN_model.to_csv('KNN_Regression_Model.csv', index=False)

## Decision Tree - Model3

In [29]:
dt = DecisionTreeRegressor(max_depth= 29, max_features = 31, min_samples_split = 412)

### Grid Search Cross Validation is used to optimize the paramters of Decision Tree. 

In [None]:
param_grid = dict({'min_samples_split': np.arange(2,510,10).tolist(), 'max_features':list(range(1,features.shape[1])), 'max_depth': list(range(10,31))})

In [None]:
GSCV_dt = GridSearchCV(dt, param_grid, scoring='neg_mean_squared_error',cv= kf)

In [None]:
GSCV_dt.fit(features,y = response.values.reshape(-1))

In [None]:
joblib.dump(GSCV_dt,'dt_fitted.joblib')

In [23]:
dt_fitted = joblib.load('dt_fitted.joblib')

In [24]:
dt_fitted.best_params_

{'max_depth': 29, 'max_features': 31, 'min_samples_split': 412}

In [25]:
RMSE_dt = np.round(np.sqrt(np.abs(dt_fitted.best_score_)),3)

In [26]:
y_pred = dt_fitted.predict(x_test)

In [27]:
R_Sq_Adj_dt = np.round(np.sqrt(r_sq_adj(y_test, y_pred, features.shape[1])),3)

In [28]:
print('RMSE: {}, Adj R-Sq: {}'.format(RMSE_dt, R_Sq_Adj_dt))

RMSE: 1093.056, Adj R-Sq: 0.763


Exporting submission file.

In [None]:
y_pred_sub = dt.predict(test.drop(columns=['Item_Identifier','Outlet_Identifier']))

In [None]:
y_pred_df = pd.DataFrame(y_pred_sub,columns=['Item_Outlet_Sales'])

In [None]:
DT_model = pd.concat([test.Item_Identifier, test.Outlet_Identifier, y_pred_df],axis=1)

In [None]:
DT_model.to_csv('DecisionTree_Regression_Model.csv', index=False)

### Visualizing Decision Tree

In [None]:
from graphviz import Source
from IPython.display import display
from IPython.display import SVG

In [None]:
dt.fit(x_train,y_train)

In [None]:
graph2 = Source(export_graphviz(dt, out_file= None, feature_names= features.columns.tolist(),filled = True, rounded= True))

In [None]:
display(SVG(graph2.pipe(format='svg')))

# Model Comparison

In [31]:
pd.DataFrame([[RMSE_lr, R_Sq_Adj_lr], [RMSE_knn, R_Sq_Adj_knn], [RMSE_dt, R_Sq_Adj_dt]],
             columns=['RMSE','Adjusted R-Sq'], 
             index=['Linear Regression','KNN','Decision Tree'])

Unnamed: 0,RMSE,Adjusted R-Sq
Linear Regression,1143.853,0.728
KNN,1128.175,0.776
Decision Tree,1093.056,0.763
