### In this Notebook, we will consider three learning models to predict Sales for each Item in each Outlet

### Namely, the three models are:
1. Linear Regression model.
2. KNN regression model.
3. Decision tree regression model.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, validation_curve, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from statsmodels.regression.linear_model import OLS, OLSResults, add_constant
from sklearn.externals import joblib

%matplotlib inline

In [None]:
# Load the pre-engineered training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train_engineered.csv', 'test_engineered.csv'))

In [None]:
# Predictors: every training column except the two identifiers and the target.
features = train.drop(
    labels=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'],
    axis='columns',
)
# Target kept as a one-column DataFrame (list selector), not a Series.
response = train.loc[:, ['Item_Outlet_Sales']]

## Baseline Model - Model 0

### In this model, the predicted sales for each item are the mean sales of that item across all outlets.

In [None]:
# Average training sales per item; the result is indexed by Item_Identifier
# and has a single 'Item_Outlet_Sales' column.
mean_sales_type = pd.pivot_table(
    train,
    values='Item_Outlet_Sales',
    index='Item_Identifier',
    aggfunc='mean',
)

In [None]:
# Baseline prediction: look up each test item's mean training sales.
# A vectorised map replaces the per-row .loc lookup, which raised KeyError for
# any test item that never appears in the training data. Such items fall back
# to the overall mean of the training sales so the submission has no gaps.
# The result is a one-column DataFrame ('Item_Outlet_Sales') aligned with test.
pred0 = (
    test['Item_Identifier']
    .map(mean_sales_type['Item_Outlet_Sales'])
    .fillna(train['Item_Outlet_Sales'].mean())
    .to_frame(name='Item_Outlet_Sales')
)

In [None]:
# Baseline submission table: both identifier columns followed by the prediction.
model0 = pd.concat(
    [test[['Item_Identifier', 'Outlet_Identifier']], pred0],
    axis='columns',
)

In [None]:
# Persist the baseline submission; the row index is not written.
model0.to_csv(path_or_buf='model0.csv', index=False)

## K-Fold Cross Validation

In [None]:
# 10-fold splitter shared by every cross_val_score / GridSearchCV call below
# (passed as cv=kf), so all models are scored on the same folds.
kf = KFold(n_splits = 10)

## Linear Regression - Model 1

### Including all features in training.

In [None]:
# Ordinary least-squares model with default settings, used with all features.
lr = LinearRegression()

In [None]:
# Cross-validated RMSE of the full-feature linear model. The scorer returns
# negated MSE per fold, so take magnitudes, average the folds, then the root.
neg_mse_folds_lr = cross_val_score(
    lr, features, y=response, scoring='neg_mean_squared_error', cv=kf
)
RMSE_lr = np.round(np.sqrt(np.abs(neg_mse_folds_lr).mean()), 3)

In [None]:
# Mean R² over the same folds, rounded to three decimals.
r2_folds_lr = cross_val_score(lr, features, y=response, scoring='r2', cv=kf)
R_Sq_lr = np.round(r2_folds_lr.mean(), 3)

In [None]:
# Report both cross-validated metrics for the full-feature linear model.
summary_lr = 'RMSE: {}, R-Sq: {}'.format(RMSE_lr, R_Sq_lr)
print(summary_lr)

Exporting submission file.

In [None]:
# cross_val_score fits clones of `lr`, never `lr` itself, so the estimator is
# still unfitted at this point and predict() would raise NotFittedError.
# Fit it on the full training set before scoring the submission rows.
lr.fit(features, response)
# The test table carries the same identifier columns as train; drop them so
# only predictor columns are passed to the model.
y_pred_sub = lr.predict(test.drop(columns=['Item_Identifier', 'Outlet_Identifier']))

In [None]:
# Wrap the raw predictions in a DataFrame with the submission column name.
y_pred_df = pd.DataFrame(data=y_pred_sub, columns=['Item_Outlet_Sales'])

In [None]:
# Submission table: identifier columns followed by the predicted sales.
LR_model = pd.concat(
    [test[['Item_Identifier', 'Outlet_Identifier']], y_pred_df],
    axis='columns',
)

In [None]:
# Write the linear-regression submission without the row index.
LR_model.to_csv(path_or_buf='Linear_Regression_Model.csv', index=False)

## Linear Regression - Selected features - Model 2

### We will exclude features that are highly correlated and check whether RMSE and R-Sq improve.

In [None]:
# Pairwise correlation matrix of the visibility- and price-related features.
features.loc[
    :,
    ['Item_Visibility', 'Item_MRP', 'visibility_ratio', 'Item_MeanPrice_Ratio', 'specific_price'],
].corr()

In [None]:
# Reduced predictor set: the full feature table minus four columns.
selected_features = features.drop(
    labels=['Item_Weight', 'Item_Visibility', 'specific_price', 'visibility_ratio'],
    axis='columns',
)

In [None]:
# Separate default linear model, evaluated on selected_features only.
lr_sf = LinearRegression()

In [None]:
# Cross-validated RMSE of the reduced-feature linear model: magnitudes of the
# per-fold negated MSE, averaged over the folds, then square-rooted.
neg_mse_folds_lr_sf = cross_val_score(
    lr_sf, selected_features, y=response, scoring='neg_mean_squared_error', cv=kf
)
RMSE_lr_sf = np.round(np.sqrt(np.abs(neg_mse_folds_lr_sf).mean()), 3)

In [None]:
# Mean R² of the reduced-feature linear model over the same folds.
r2_folds_lr_sf = cross_val_score(lr_sf, selected_features, y=response, scoring='r2', cv=kf)
R_Sq_lr_sf = np.round(r2_folds_lr_sf.mean(), 3)

In [None]:
# Report both cross-validated metrics for the reduced-feature linear model.
summary_lr_sf = 'RMSE: {}, R-Sq: {}'.format(RMSE_lr_sf, R_Sq_lr_sf)
print(summary_lr_sf)

## KNN - Model 3

In [None]:
# Default KNN regressor; it is the base estimator handed to the grid search
# below, which supplies the hyper-parameter values.
knn = KNeighborsRegressor()

### Grid Search Cross Validation is used to optimize the parameters of KNN.

In [None]:
# KNN search space: neighbour counts 3-15, both weighting schemes, three
# neighbour-search algorithms and two values of the Minkowski exponent p.
param_grid = {
    'n_neighbors': list(range(3, 16)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

In [None]:
# Exhaustive search over param_grid, scored by negated MSE on the shared folds.
GSCV = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
)

In [None]:
# Run the KNN search; the one-column target is flattened to a 1-D array first.
GSCV.fit(features, y=response.values.ravel())

In [None]:
# Save the fitted KNN search object to disk so it can be reloaded later.
joblib.dump(value=GSCV, filename='knn_fitted.joblib')

In [None]:
# Reload the persisted KNN search object from the file written above.
knn_fitted = joblib.load(filename='knn_fitted.joblib')

In [None]:
# Display the hyper-parameter combination selected by the KNN grid search.
knn_fitted.best_params_

In [None]:
# Cross-validated RMSE of the tuned KNN model.
# The grid search already scored every candidate on `kf` with negated MSE, and
# best_score_ is the fold-averaged score of the winning combination, so its
# magnitude's square root is the RMSE (same recipe as the decision-tree cell).
# This replaces an expression with unbalanced parentheses (a SyntaxError) that
# also cross-validated the GridSearchCV object itself, repeating the whole
# search inside every fold.
RMSE_knn = np.round(np.sqrt(np.abs(knn_fitted.best_score_)), 3)

In [None]:
# Mean cross-validated R² of the tuned KNN model.
# Scored on `features`, the table the search was fitted on, rather than
# `selected_features`, so R² and RMSE describe the same model and inputs.
# best_estimator_ is cross-validated instead of the search object itself, so
# the grid search is not re-run inside each fold.
R_Sq_knn = np.round(
    np.mean(
        cross_val_score(
            knn_fitted.best_estimator_,
            features,
            y=response.values.reshape(-1),
            scoring='r2',
            cv=kf,
        )
    ),
    3,
)

In [None]:
# Report the KNN metrics. R_Sq_knn comes from the plain 'r2' scorer, so it is
# labelled 'R-Sq' (as in the linear-regression reports), not 'Adj R-Sq'.
print('RMSE: {}, R-Sq: {}'.format(RMSE_knn, R_Sq_knn))

Exporting submission file.

In [None]:
# Predict test-set sales with the fitted KNN search object; the identifier
# columns are removed so only predictor columns are passed in.
y_pred_sub = GSCV.predict(
    test.drop(labels=['Item_Identifier', 'Outlet_Identifier'], axis='columns')
)

In [None]:
# Wrap the KNN predictions in a DataFrame with the submission column name.
y_pred_df = pd.DataFrame(data=y_pred_sub, columns=['Item_Outlet_Sales'])

In [None]:
# Submission table: identifier columns followed by the KNN predictions.
KNN_model = pd.concat(
    [test[['Item_Identifier', 'Outlet_Identifier']], y_pred_df],
    axis='columns',
)

In [None]:
# Write the KNN submission without the row index.
KNN_model.to_csv(path_or_buf='KNN_Regression_Model.csv', index=False)

## Decision Tree - Model 4

In [None]:
# Regression tree with fixed depth, feature-count and split-size settings.
# NOTE(review): these look like the winners of an earlier grid-search run -
# confirm against dt_fitted.best_params_.
dt = DecisionTreeRegressor(
    max_depth=29,
    max_features=31,
    min_samples_split=412,
)

### Grid Search Cross Validation is used to optimize the parameters of Decision Tree.

In [None]:
# Decision-tree search space.
# min_samples_split: 2, 12, ..., 502; max_depth: 10-30.
# max_features runs from 1 up to AND including the number of predictor
# columns; the upper bound is `shape[1] + 1` because range() excludes its stop
# value, and without it the "use every feature at each split" setting would
# never be tried.
param_grid = {
    'min_samples_split': np.arange(2, 510, 10).tolist(),
    'max_features': list(range(1, features.shape[1] + 1)),
    'max_depth': list(range(10, 31)),
}

In [None]:
# Exhaustive tree search over param_grid, scored by negated MSE on the shared folds.
GSCV_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
)

In [None]:
# Run the tree search; the one-column target is flattened to a 1-D array first.
GSCV_dt.fit(features, y=response.values.ravel())

In [None]:
# Save the fitted tree search object to disk so it can be reloaded later.
joblib.dump(value=GSCV_dt, filename='dt_fitted.joblib')

In [None]:
# Reload the persisted tree search object from the file written above.
dt_fitted = joblib.load(filename='dt_fitted.joblib')

In [None]:
# Display the hyper-parameter combination selected by the tree grid search.
dt_fitted.best_params_

In [None]:
# RMSE of the tuned tree: the search was scored with negated MSE, so take the
# magnitude of its best score before the square root, then round to 3 decimals.
best_mse_dt = np.abs(dt_fitted.best_score_)
RMSE_dt = np.round(np.sqrt(best_mse_dt), 3)

In [None]:
from sklearn.model_selection import cross_val_predict

# `x_test` is not defined anywhere in the visible notebook, so the original
# call could not run. Produce out-of-fold predictions for the training rows
# instead: every row is predicted by a tree that did not see it during
# fitting. best_estimator_ is used so the grid search is not repeated per fold.
y_pred = cross_val_predict(
    dt_fitted.best_estimator_,
    features,
    response.values.reshape(-1),
    cv=kf,
)

In [None]:
# Mean cross-validated R² of the tuned tree, computed the same way as for the
# linear and KNN models (scoring='r2' on the shared folds).
# The previous expression called `r_sq_adj` and read `y_test`, neither of
# which is defined in the visible notebook, and wrapped the score in np.sqrt.
# NOTE: the value is the plain (unadjusted) R²; the variable keeps its
# existing name because later cells read R_Sq_Adj_dt.
R_Sq_Adj_dt = np.round(
    np.mean(
        cross_val_score(
            dt_fitted.best_estimator_,
            features,
            y=response.values.reshape(-1),
            scoring='r2',
            cv=kf,
        )
    ),
    3,
)

In [None]:
# Report the decision-tree metrics with the same 'R-Sq' label the
# linear-regression reports use; no adjusted-R² helper is defined in the
# visible notebook, so 'Adj R-Sq' would overstate what is being shown.
print('RMSE: {}, R-Sq: {}'.format(RMSE_dt, R_Sq_Adj_dt))

Exporting submission file.

In [None]:
# Predict test-set sales with the tuned tree. The fitted search object is used
# (as in the KNN section) because the bare `dt` estimator has not been fitted
# at this point - the grid search only fits clones of it.
y_pred_sub = dt_fitted.predict(test.drop(columns=['Item_Identifier', 'Outlet_Identifier']))

In [None]:
# Wrap the tree predictions in a DataFrame with the submission column name.
y_pred_df = pd.DataFrame(data=y_pred_sub, columns=['Item_Outlet_Sales'])

In [None]:
# Submission table: identifier columns followed by the tree predictions.
DT_model = pd.concat(
    [test[['Item_Identifier', 'Outlet_Identifier']], y_pred_df],
    axis='columns',
)

In [None]:
# Write the decision-tree submission without the row index.
DT_model.to_csv(path_or_buf='DecisionTree_Regression_Model.csv', index=False)

### Visualizing Decision Tree

In [None]:
from graphviz import Source
from IPython.display import display
from IPython.display import SVG

In [None]:
# Fit the fixed-parameter tree for visualisation. `x_train` / `y_train` are
# not defined in the visible notebook, so train on the full feature table and
# the flattened target that every other model in this notebook uses.
dt.fit(features, response.values.reshape(-1))

In [None]:
# export_graphviz is not among the notebook's imports, so bring it into scope
# here; without it this cell raises NameError.
from sklearn.tree import export_graphviz

# Render the fitted tree as DOT source (out_file=None returns it as a string),
# labelling nodes with the predictor column names, and wrap it in a graphviz
# Source object for display.
graph2 = Source(
    export_graphviz(
        dt,
        out_file=None,
        feature_names=features.columns.tolist(),
        filled=True,
        rounded=True,
    )
)

In [None]:
# Render the graph to SVG and show it inline in the notebook.
tree_svg = graph2.pipe(format='svg')
display(SVG(tree_svg))

# Model Comparison

In [None]:
# Side-by-side summary of the cross-validated metrics, one row per model.
# R_Sq_lr and R_Sq_knn come from the plain 'r2' scorer, so the second column
# is labelled 'R-Sq' rather than 'Adjusted R-Sq'.
pd.DataFrame(
    [[RMSE_lr, R_Sq_lr], [RMSE_knn, R_Sq_knn], [RMSE_dt, R_Sq_Adj_dt]],
    columns=['RMSE', 'R-Sq'],
    index=['Linear Regression', 'KNN', 'Decision Tree'],
)