# ML REGRESSION - {"BIGMART SALES" DATASET}

## 1. Importing Modules and Setting Configurations

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from pickle import dump, load
from math import sqrt

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_selection import SelectKBest, mutual_info_regression

from sklearn.linear_model import LinearRegression

from sklearn.pipeline import Pipeline

from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import KFold, cross_val_score, cross_validate


import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this also hides deprecation and input-validation warnings
# raised by the libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

from sklearn import set_config
# Show estimators as HTML diagrams when they are displayed in the notebook.
set_config(display='diagram')

In [2]:
# Pandas display settings (attribute-style access to the option registry)

pd.options.display.min_rows = 5     # rows shown in a truncated view
pd.options.display.max_rows = 25    # frames longer than this are truncated
pd.options.display.precision = 4    # decimal places shown for floats

In [3]:
# Seaborn theme shared by every plot drawn after this cell

sb.set_theme(
    context='notebook',
    style='whitegrid',
    palette='pastel',
    font='times new roman',
    font_scale=1.25,
)

In [4]:
# NOTE(review): this repeats the set_config import and call already made at
# the end of the imports cell, so this cell is redundant and can be removed.
from sklearn import set_config
set_config(display='diagram')

## 2. Importing Dataset

### 2.1 Train Dataset

In [5]:
# Read the training frame from its pickle file, then report its
# dimensions and preview the first rows.
tr = pd.read_pickle(filepath_or_buffer='bms_FE_train_final.pkl')

print(f'Shape of the train dataset : {tr.shape}')
tr.head()  # head() defaults to 5 rows

Shape of the train dataset : (8323, 11)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Age,Item_Outlet_Sales
2171,13.65,Regular,0.0808,Frozen Foods,262.8936,Medium,Tier 1,Supermarket Type1,Foods,14.0,4958.8784
5657,6.98,Low Fat,0.0412,Canned,82.8934,Small,Tier 2,Supermarket Type1,Foods,11.0,818.934
2156,20.1,Low Fat,0.0746,Dairy Foods,110.3228,Small,Tier 1,Supermarket Type1,Foods,16.0,1768.3648
110,13.3,Low Fat,0.0798,Dairy Foods,232.53,Small,Tier 1,Supermarket Type1,Foods,16.0,699.09
6709,10.1,Non Edible,0.0301,Health and Hygiene,154.3656,Medium,Tier 3,Supermarket Type3,Non Consumables,28.0,2471.4496


In [6]:
# Target is the sales column; every remaining column is a predictor.
ytr = tr['Item_Outlet_Sales']
Xtr = tr.drop(columns=['Item_Outlet_Sales'])

### 2.2 Validation Dataset

In [7]:
# Read the validation frame from its pickle file, then report its
# dimensions and preview the first rows.
val = pd.read_pickle(filepath_or_buffer='bms_FE_valid_final.pkl')

print(f'Shape of the validation dataset : {val.shape}')
val.head()  # head() defaults to 5 rows

Shape of the validation dataset : (100, 11)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Category,Outlet_Age
945,13.65,Regular,0.0213,Snack Foods,58.3588,Small,Tier 3,Grocery Store,114.5176,Foods,15
1794,7.3,Low Fat,0.0861,Frozen Foods,149.8076,High,Tier 3,Supermarket Type1,2808.3444,Foods,26
3022,11.3,Regular,0.0667,Frozen Foods,257.2962,Small,Tier 1,Supermarket Type1,4920.9278,Foods,16
23,15.5,Low Fat,0.0576,Baking Goods,107.6938,Small,Tier 1,Grocery Store,214.3876,Foods,28
2601,9.195,Low Fat,0.1149,Frozen Foods,59.0246,Small,Tier 2,Supermarket Type1,1853.5872,Foods,9


In [8]:
# Target is the sales column; every remaining column is a predictor.
yval = val['Item_Outlet_Sales']
Xval = val.drop(columns=['Item_Outlet_Sales'])

## 3. Pipeline Creation for Model Building

In [9]:
def ft_exp(x):
    """Raise ``x`` to the power ``1/1.2`` (roughly 0.833).

    Uses the ``**`` operator, so it accepts anything that supports it
    (plain numbers, NumPy arrays, pandas objects) and returns the same
    kind of object.
    """
    return x ** (1 / 1.2)


def ft_sqrt(x):
    """Return the square root of ``x``, computed as ``x ** 0.5``.

    Uses the ``**`` operator, so it accepts anything that supports it
    (plain numbers, NumPy arrays, pandas objects).
    """
    return x ** 0.5

# Wrap the helper functions so they can act as transformer steps.
ft_mrp_exp = FunctionTransformer(func=ft_exp)
ft_age_sqrt = FunctionTransformer(func=ft_sqrt)

# Numeric preprocessing, addressed by column position in Xtr:
#   2 -> Item_Visibility, 4 -> Item_MRP, 9 -> Outlet_Age
# ColumnTransformer emits the transformed columns first (in the order
# listed here) and appends the passthrough columns after them, so later
# pipeline steps see a reordered column layout.
pre_proc_num = ColumnTransformer(
    transformers=[
        ('pt_it_vis', PowerTransformer(method='yeo-johnson', standardize=False), [2]),
        ('ft_it_mrp', ft_mrp_exp, [4]),
        ('ft_ol_age', ft_age_sqrt, [9]),
    ],
    remainder='passthrough',
)

In [10]:
# Standardise the first four columns of the incoming matrix and pass the
# rest through unchanged.
# Presumably these are the three columns transformed by pre_proc_num plus
# Item_Weight (its first passthrough column) - verify if the column order
# of the input data ever changes.
pre_proc_ss = ColumnTransformer(
    transformers=[('ss', StandardScaler(), [0, 1, 2, 3])],
    remainder='passthrough',
)

In [11]:
# Categorical encoding, addressed by position in the incoming matrix.
#   'oe'  : ordinal codes following the explicit category orders given below
#           (positions 6, 7, 8 - presumably Outlet_Size, Outlet_Location_Type
#           and Outlet_Type, judging by the category labels).
#   'ohe' : one-hot encoding with the first level dropped
#           (positions 4, 5, 9 - presumably Item_Fat_Content, Item_Type and
#           Item_Category; verify if the column order changes).
pre_proc_cat = ColumnTransformer(
    transformers=[
        (
            'oe',
            OrdinalEncoder(
                categories=[
                    ['Small', 'Medium', 'High'],
                    ['Tier 3', 'Tier 2', 'Tier 1'],
                    ['Grocery Store', 'Supermarket Type3', 'Supermarket Type2', 'Supermarket Type1'],
                ],
                dtype='object',
            ),
            [6, 7, 8],
        ),
        (
            'ohe',
            OneHotEncoder(drop='first', sparse_output=False, dtype='int8'),
            [4, 5, 9],
        ),
    ],
    remainder='passthrough',
)

In [12]:
# Feature-scoring step based on mutual information. With k='all' every
# feature is kept, so this step scores features without removing any.
skb = SelectKBest(score_func=mutual_info_regression, k='all')

In [13]:
# Final estimator of the pipeline: linear regression with default settings.
mdl = LinearRegression()

In [14]:
# Pipeline stages, executed in this order:
# numeric transforms -> scaling -> categorical encoding -> feature scoring -> regressor
steps = [
    ('num', pre_proc_num),
    ('ss', pre_proc_ss),
    ('cat', pre_proc_cat),
    ('skb', skb),
    ('mdl', mdl),
]

pipe_mdl = Pipeline(steps=steps)

## 4. Model Training

In [15]:
# Wrap the feature pipeline so the target is Yeo-Johnson transformed (and
# standardised) before fitting; TransformedTargetRegressor inverts that
# transform when predicting.
pipe = TransformedTargetRegressor(
    regressor=pipe_mdl,
    transformer=PowerTransformer(method='yeo-johnson', standardize=True),
)

pipe.fit(Xtr, ytr)

## 5. Model Evaluation On Train and Validation Data

In [16]:
# Predict on both splits with the fitted model.
yhat_tr = pipe.predict(Xtr)
yhat_val = pipe.predict(Xval)

# Compute the metrics first, rounded to 4 decimals, then report them.
r2_tr = round(r2_score(ytr, yhat_tr), 4)
rmse_tr = round(sqrt(mean_squared_error(ytr, yhat_tr)), 4)
r2_val = round(r2_score(yval, yhat_val), 4)
rmse_val = round(sqrt(mean_squared_error(yval, yhat_val)), 4)

print(f'Train Dataset R2 score : {r2_tr}')
print(f'Train Dataset RMSE : {rmse_tr} \n')

print(f'Validation Dataset R2 score : {r2_val}')
print(f'Validation Dataset RMSE : {rmse_val}')

Train Dataset R2 score : 0.3536
Train Dataset RMSE : 1372.1487 

Validation Dataset R2 score : 0.2358
Validation Dataset RMSE : 1303.4476


## 6. Cross Validation

In [17]:
print('Cross Validation Results : ------------------------------------------------------------------------')

# Shuffled 5-fold split with a fixed seed; all three metrics are collected
# from a single cross_validate run on the training data.
kfold = KFold(n_splits=5, shuffle=True, random_state=46)
scoring = ['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error']
cv_scores = cross_validate(estimator=pipe, X=Xtr, y=ytr, scoring=scoring, cv=kfold)

r2_scores = cv_scores['test_r2']
print(f'R2 Scores : {r2_scores}')
print(f'Mean R2 Score : {np.mean(r2_scores)}, St.Dev R2 Score : {np.std(r2_scores)} \n ----------------------------------------')

# The error scorers are the "neg_" variants, so flip the sign to get
# positive MSE / RMSE values.
mse_scores = [-score for score in cv_scores['test_neg_mean_squared_error']]
print(f'MSE Scores : {mse_scores}')
print(f'Mean MSE Score : {np.mean(mse_scores)}, St.Dev MSE Score : {np.std(mse_scores)} \n ------------------------------------')

rmse_scores = [-score for score in cv_scores['test_neg_root_mean_squared_error']]
print(f'RMSE Scores : {rmse_scores}')
print(f'Mean RMSE Score : {np.mean(rmse_scores)}, St.Dev RMSE Score : {np.std(rmse_scores)}')

Cross Validation Results : ------------------------------------------------------------------------
R2 Scores : [0.30364599 0.38922311 0.33698634 0.3620866  0.34113367]
Mean R2 Score : 0.3466151419365644, St.Dev R2 Score : 0.0283686915120103 
 ----------------------------------------
MSE Scores : [2040805.0392226044, 1793964.2410646083, 1959576.0950385523, 1853359.1817296331, 1865347.6386938256]
Mean MSE Score : 1902610.439149845, St.Dev MSE Score : 87133.26856462358 
 ------------------------------------
RMSE Scores : [1428.5674780081633, 1339.3895031187187, 1399.8485971841928, 1361.3813505883033, 1365.7773020129694]
Mean RMSE Score : 1378.9928461824697, St.Dev RMSE Score : 31.451062421112137


## 7. Model Save/Export

In [18]:
# Persist the fitted model. The context manager closes the file handle
# (flushing buffered bytes) even if pickling raises; the original passed a
# bare open() to dump and never closed it.
with open('bms_mdl_simple.pkl', 'wb') as model_file:
    dump(pipe, model_file)
print('Model Object Saved Successfully \n')

# Persist the training feature frame next to the model.
with open('bms_X_simple.pkl', 'wb') as features_file:
    dump(Xtr, features_file)
print('"Xtr" Features Saved Successfully')

Model Object Saved Successfully 

"Xtr" Features Saved Successfully


## 8. Simple Prediction System

### 8.1 Combining Xval and yval

In [19]:
# Draw 10 validation rows and keep their index labels.
# The fixed seed (same value the KFold splitter uses) makes the selection
# repeatable under Restart Kernel -> Run All; without it every run picks
# different rows and the outputs below cannot be reproduced.
samp = Xval.sample(n=10, random_state=46).index.tolist()
samp

[3585, 2780, 1075, 3641, 7887, 4156, 5530, 863, 5457, 3402]

In [20]:
# Re-attach the target to the sampled feature rows, side by side.
sampled_features = Xval.loc[samp]
sampled_target = yval.loc[samp]
val_df = pd.concat([sampled_features, sampled_target], axis='columns')
val_df.head(10)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Age,Item_Outlet_Sales
3585,5.175,Low Fat,0.0372,Canned,86.8224,Medium,Tier 3,Supermarket Type3,Foods,28,1448.7808
2780,19.0,Low Fat,0.1119,Frozen Foods,106.5622,Small,Tier 1,Supermarket Type1,Foods,16,2117.244
1075,9.8,Regular,0.1406,Baking Goods,49.8008,Small,Tier 1,Supermarket Type1,Foods,16,809.6128
3641,16.0,Regular,0.1067,Baking Goods,183.6634,Small,Tier 2,Supermarket Type1,Foods,9,2544.6876
7887,17.75,Non Edible,0.0534,Others,181.766,Small,Tier 1,Grocery Store,Non Consumables,28,179.766
4156,9.5,Regular,0.0488,Starchy Foods,186.5898,Medium,Tier 1,Supermarket Type1,Foods,14,5238.5144
5530,18.75,Non Edible,0.0523,Health and Hygiene,105.728,Medium,Tier 3,Supermarket Type2,Non Consumables,4,1704.448
863,8.05,Regular,0.1645,Dairy Foods,112.2518,High,Tier 3,Supermarket Type1,Foods,26,1707.777
5457,21.25,Non Edible,0.0247,Household,145.1102,Small,Tier 2,Supermarket Type1,Non Consumables,11,1603.9122
3402,19.6,Non Edible,0.0661,Health and Hygiene,150.3024,Medium,Tier 3,Supermarket Type2,Non Consumables,4,2580.6408


### 8.2 Prediction on Validation Data

In [21]:
# Pick one of the sampled validation rows. The draw is seeded so the demo
# selects the same row on every run.
idx = val_df.sample(random_state=46).index.values
# Raw values (no column labels) are passed on; the preprocessing steps
# address columns by position, not by name.
inp_data = Xval.loc[idx].values

print(f'Validation Data with Index : {idx[0]} is Selected for Prediction: \n')

# Echo each feature name with the selected row's value.
for fea_name, fea_val in zip(Xval.columns.tolist(), inp_data[0]):
    print(f'"{fea_name}" ---:--- {fea_val}')

Validation Data with Index : 863 is Selected for Prediction: 

"Item_Weight" ---:--- 8.05
"Item_Fat_Content" ---:--- Regular
"Item_Visibility" ---:--- 0.164542555
"Item_Type" ---:--- Dairy Foods
"Item_MRP" ---:--- 112.2518
"Outlet_Size" ---:--- High
"Outlet_Location_Type" ---:--- Tier 3
"Outlet_Type" ---:--- Supermarket Type1
"Item_Category" ---:--- Foods
"Outlet_Age" ---:--- 26


In [22]:
print(f'------------- Prediction for Validation Data with Index : {idx[0]} --------------------\n')

# .loc makes the label-based lookup explicit (idx holds index labels).
print(f'Actual Sales for the Selected Data : {yval.loc[idx[0]]} \n')

# Reload the persisted model to exercise the save/load round trip. The
# context manager closes the file handle, which the original left open.
# Only unpickle files you created or trust: unpickling can run arbitrary code.
with open('bms_mdl_simple.pkl', 'rb') as model_file:
    pipe = load(model_file)
pred = pipe.predict(inp_data)

print(f'Predicted Sales for the Selected Data : {pred[0]}')

------------- Prediction for Validation Data with Index : 863 --------------------

Actual Sales for the Selected Data : 1707.777 

Predicted Sales for the Selected Data : 2352.245234763344
