# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from prettytable import PrettyTable
from sklearn.metrics import accuracy_score
import warnings

In [2]:
#Loading our data
train_df = pd.read_csv("FeatureEngineering.csv")

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day,lags: 1,lags: 2,lags: 3,lags: 4,lags: 5,lags: 6,lags: 7,lags: 8,lags: 9,lags: 10,lags: 11,shop_name,item_name,item_category_id,item_category_name
0,0,1,0,30,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"!Якутск Орджоникидзе, 56 фран",007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,Кино - DVD
1,482,1,0,31,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"!Якутск Орджоникидзе, 56 фран",007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray
2,986,0,0,32,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"!Якутск Орджоникидзе, 56 фран",1+1,40,Кино - DVD
3,987,1,0,32,10.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"!Якутск Орджоникидзе, 56 фран",1+1,40,Кино - DVD
4,1771,0,0,33,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"!Якутск Орджоникидзе, 56 фран",1+1 (BD),37,Кино - Blu-Ray


In [4]:
#info of train
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704112 entries, 0 to 704111
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          704112 non-null  int64  
 1   date_block_num      704112 non-null  int64  
 2   shop_id             704112 non-null  int64  
 3   item_id             704112 non-null  int64  
 4   item_cnt_day        704112 non-null  float64
 5   lags: 1             704112 non-null  float64
 6   lags: 2             704112 non-null  float64
 7   lags: 3             704112 non-null  float64
 8   lags: 4             704112 non-null  float64
 9   lags: 5             704112 non-null  float64
 10  lags: 6             704112 non-null  float64
 11  lags: 7             704112 non-null  float64
 12  lags: 8             704112 non-null  float64
 13  lags: 9             704112 non-null  float64
 14  lags: 10            704112 non-null  float64
 15  lags: 11            704112 non-nul

# Separating Numerical From Categorical Data

In [5]:
# Separating categorical data from numerical data
train_categorical_data = train_df.select_dtypes(exclude=['int64', 'float','int32'])
train_numerical_data = train_df.select_dtypes(include=['int64', 'float','int32'])

# Using Label Encoder To Convert Categorical to Numerical Data

In [6]:
# Label Encode and Hot Encode for Categorical Columns
# the category data here will be converted to numbers format
le = LabelEncoder()
train_categorical_data = train_categorical_data.apply(LabelEncoder().fit_transform)

In [7]:
#Looking into our categrical data after conversion
train_categorical_data.head()

Unnamed: 0,shop_name,item_name,item_category_name
0,0,0,28
1,0,1,26
2,0,2,28
3,0,2,28
4,0,3,26


# Concatenating Both Categorical and Numerical Data

In [8]:
# Concatenating both Catagorical Data and Numerical Data
# Y will be our new dataframe why X will be our lable
X = pd.concat([train_categorical_data, train_numerical_data], axis=1)
y = train_df['item_cnt_day']

In [9]:
#Checking the X
X.head()

Unnamed: 0.1,shop_name,item_name,item_category_name,Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day,lags: 1,lags: 2,lags: 3,lags: 4,lags: 5,lags: 6,lags: 7,lags: 8,lags: 9,lags: 10,lags: 11,item_category_id
0,0,0,28,0,1,0,30,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40
1,0,1,26,482,1,0,31,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
2,0,2,28,986,0,0,32,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40
3,0,2,28,987,1,0,32,10.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40
4,0,3,26,1771,0,0,33,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37


In [10]:
# Checking the y
y.head()

0    31.0
1    11.0
2     6.0
3    10.0
4     3.0
Name: item_cnt_day, dtype: float64

# Splitting data into train and test

In [11]:
# Splitting the data into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [12]:
X_test.head()

Unnamed: 0.1,shop_name,item_name,item_category_name,Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day,lags: 1,lags: 2,lags: 3,lags: 4,lags: 5,lags: 6,lags: 7,lags: 8,lags: 9,lags: 10,lags: 11,item_category_id
541491,48,3495,45,544993,25,48,15995,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64
324630,29,3365,44,529236,24,29,15446,1.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,63
262894,26,2864,8,464977,28,26,13400,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15
638026,56,1189,37,220056,30,56,5485,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,55
233939,25,1878,29,347714,11,25,9001,4.0,1.0,3.0,0.0,2.0,2.0,0.0,2.0,1.0,3.0,6.0,0.0,41


In [13]:
# Checking the X_train
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492878 entries, 249036 to 121958
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   shop_name           492878 non-null  int64  
 1   item_name           492878 non-null  int64  
 2   item_category_name  492878 non-null  int64  
 3   Unnamed: 0          492878 non-null  int64  
 4   date_block_num      492878 non-null  int64  
 5   shop_id             492878 non-null  int64  
 6   item_id             492878 non-null  int64  
 7   item_cnt_day        492878 non-null  float64
 8   lags: 1             492878 non-null  float64
 9   lags: 2             492878 non-null  float64
 10  lags: 3             492878 non-null  float64
 11  lags: 4             492878 non-null  float64
 12  lags: 5             492878 non-null  float64
 13  lags: 6             492878 non-null  float64
 14  lags: 7             492878 non-null  float64
 15  lags: 8             492878 no

In [14]:
# Checking the shape of bothe the train and test data
print("The size of training input is", X_train.shape)
print("The size of training output is", y_train.shape)
print("The size of testing input is", X_test.shape)
print("The size of testing output is", y_test.shape)

The size of training input is (492878, 20)
The size of training output is (492878,)
The size of testing input is (211234, 20)
The size of testing output is (211234,)


# Model Building


### In this part we are using GridSearchCV function to run our models, because it will automate the turning of the hyperparameter to create different models and finally will select one parameter that give the best fit for our prediction.


## 1. Ridge Regression

In [15]:
# Performing GridSearchCV on Ridge Regression
params = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
ridge_regressor = GridSearchCV(Ridge(), params, cv = 7, scoring = 'neg_mean_absolute_error', n_jobs = -1)
ridge_regressor.fit(X_train, y_train)


Ill-conditioned matrix (rcond=6.26981e-20): result may not be accurate.



GridSearchCV(cv=7, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                                   10000, 100000]},
             scoring='neg_mean_absolute_error')

In [16]:
# Predicting train and test results
y_test_pred = ridge_regressor.predict(X_test)

In [17]:
# Printing the result for test
print("Test Results for Ridge Regressor Model:")
print("Root Mean Squared Error: ", sqrt(mse(y_test.values, y_test_pred)))
print("R-Squared: ", r2_score(y_test.values, y_test_pred))

Test Results for Ridge Regressor Model:
Root Mean Squared Error:  3.805972809161793e-11
R-Squared:  1.0


## 2. Lasso Regression

In [18]:
# Performing GridSearchCV on Lasso Regression
params = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
lasso_regressor = GridSearchCV(Lasso(), params ,cv = 15,scoring = 'neg_mean_absolute_error', n_jobs = -1)
lasso_regressor.fit(X_train, y_train)

GridSearchCV(cv=15, estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                                   10000, 100000]},
             scoring='neg_mean_absolute_error')

In [19]:
# Predicting train and test results
y_test_pred = lasso_regressor.predict(X_test)

In [20]:
# printing the result for test
print("Test Results for Lasso Regressor Model:")
print("Root Mean squared Error: ", sqrt(mse(y_test, y_test_pred)))
print("R-Squared: ", r2_score(y_test, y_test_pred))

Test Results for Lasso Regressor Model:
Root Mean squared Error:  0.010361208743024555
R-Squared:  0.9999993074567651


## 3. Decision Tree Regression

In [21]:
# Performing GridSearchCV on Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

depth = list(range(3,30))
param_grid = dict(max_depth = depth)
tree = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 10)
tree.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                       15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                       25, 26, 27, 28, 29]})

In [22]:
# Predicting train and test results
y_test_pred = tree.predict(X_test)

In [23]:
#Checking prediction for test

print("Test Results for Decision Tree Regressor Model:")
print("Root Mean Squared Error: ", sqrt(mse(y_test, y_test_pred)))
print("R-Squared: ", r2_score(y_test, y_test_pred))

Test Results for Decision Tree Regressor Model:
Root Mean Squared Error:  2.0793520978157614
R-Squared:  0.9721078421532136


## 4. Random Forest Regression

In [24]:
# Performing GridSearchCV on Random Forest Regression
tuned_params = {'n_estimators': [100, 200], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}  
random_regressor = RandomizedSearchCV(RandomForestRegressor(), tuned_params, n_iter = 3, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1)
random_regressor.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=3, n_jobs=-1,
                   param_distributions={'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200]},
                   scoring='neg_mean_absolute_error')

In [25]:
# Predicting train and test results
y_test_pred = random_regressor.predict(X_test)

In [26]:
# Checking Prediction for test

print("Test Results for Random Forest Regressor Model:")
print("Root Mean Squared Error: ", sqrt(mse(y_test, y_test_pred)))
print("R-Squared: ", r2_score(y_test, y_test_pred))

Test Results for Random Forest Regressor Model:
Root Mean Squared Error:  2.467082334990672
R-Squared:  0.9607361093273559


## 5. PrettyTable
### This is to display the prediction from the models

In [70]:
# Training = Tr.
# Testing = Te.

x = PrettyTable()
x.field_names = ["Model Name", "Te. RMSE", "Te. R-Squared",]
x.add_row(['Ridge Regression','3.852','1.0'])
x.add_row(['Lasso Regression','0.010','0.999'])
x.add_row(['Decision Tree Regressor','2.279','0.966'])
x.add_row(['Random Forest Regressor','2.442','0.962'])
print(x)

+-------------------------+----------+---------------+
|        Model Name       | Te. RMSE | Te. R-Squared |
+-------------------------+----------+---------------+
|     Ridge Regression    |  3.852   |      1.0      |
|     Lasso Regression    |  0.010   |     0.999     |
| Decision Tree Regressor |  2.279   |     0.966     |
| Random Forest Regressor |  2.442   |     0.962     |
+-------------------------+----------+---------------+


### As we can see by comparing the models( Ridge Regression, Lasso Regression, Decision Tree, and Random Forest Regression) Decision Tree and Random Forest perfome best. 