# ML REGRESSION - {"BIGMART SALES" DATASET}

## 1. Importing Modules and Setting Configurations

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from pickle import dump, load
from math import sqrt

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_selection import SelectKBest, mutual_info_regression

from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import KFold, cross_validate


import warnings
warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(display='diagram')

In [2]:
# Pandas display options, applied in the order listed.
pd_display_options = {
    'display.min_rows': 5,
    'display.max_rows': 25,
    'display.precision': 4,
}
for option_name, option_value in pd_display_options.items():
    pd.set_option(option_name, option_value)

# Seaborn theme shared by every figure in this notebook.
sb_theme = dict(
    context='notebook',
    style='whitegrid',
    palette='pastel',
    font='times new roman',
    font_scale=1.25,
)
sb.set_theme(**sb_theme)

## 2. Importing Train Dataset

In [3]:
# Read the pickled production (training) frame and preview it.
prod_path = 'bms_FE_prod_final.pkl'
tr = pd.read_pickle(prod_path)

print(f'Shape of the production dataset : {tr.shape}')
tr.head()  # head() defaults to the first 5 rows

Shape of the production dataset : (8423, 11)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Age,Item_Outlet_Sales
2171,13.65,Regular,0.0808,Frozen Foods,262.8936,Medium,Tier 1,Supermarket Type1,Foods,14.0,4958.8784
5657,6.98,Low Fat,0.0412,Canned,82.8934,Small,Tier 2,Supermarket Type1,Foods,11.0,818.934
2156,20.1,Low Fat,0.0746,Dairy Foods,110.3228,Small,Tier 1,Supermarket Type1,Foods,16.0,1768.3648
110,13.3,Low Fat,0.0798,Dairy Foods,232.53,Small,Tier 1,Supermarket Type1,Foods,16.0,699.09
6709,10.1,Non Edible,0.0301,Health and Hygiene,154.3656,Medium,Tier 3,Supermarket Type3,Non Consumables,28.0,2471.4496


In [4]:
# Separate the regression target from the predictor columns.
target_col = 'Item_Outlet_Sales'
ytr = tr[target_col]
Xtr = tr.drop(columns=target_col)

## 3. Production Model With Hyper-Parameters Tuned

### 3.1 Production Model

In [5]:
# Pre Processors -------------------------------------------------------------------------------------------------------------
def ft_exp(x):
    """Raise ``x`` to the power 1/1.2 (element-wise for array-like input).

    Despite the name this is a fractional power, not an exponential. The name
    is kept because the transformer built from it is pickled with the model.
    """
    return x ** (1 / 1.2)

def ft_sqrt(x):
    """Return the square root of ``x``, computed as ``x ** 0.5``.

    The power operator is used so the function applies element-wise to
    whatever array-like the transformer hands it.
    """
    return x ** 0.5

# Wrap the element-wise helpers so they can act as ColumnTransformer steps.
ft_mrp_exp = FunctionTransformer(func=ft_exp)
ft_age_sqrt = FunctionTransformer(func=ft_sqrt)

# Stage 1 - numeric reshaping, addressed by column position in Xtr.
# Per the frame preview, positions 2 / 4 / 9 are Item_Visibility / Item_MRP /
# Outlet_Age. A ColumnTransformer emits the transformed columns first (in the
# order listed) and appends the passthrough remainder, so the following stages
# index into that reordered layout rather than into Xtr.
num_transformers = [
    ('pt_it_vis', PowerTransformer(method='yeo-johnson', standardize=False), [2]),
    ('ft_it_mrp', ft_mrp_exp, [4]),
    ('ft_ol_age', ft_age_sqrt, [9]),
]
pre_proc_num = ColumnTransformer(transformers=num_transformers, remainder='passthrough')

# Stage 2 - standardise the first four columns of the stage-1 output
# (the three reshaped columns plus the first passthrough column, Item_Weight).
pre_proc_ss = ColumnTransformer(
    transformers=[('ss', StandardScaler(), [0,1,2,3])],
    remainder='passthrough',
)

# Stage 3 - categorical encoding. In the stage-2 layout, positions 6-8 are the
# ordered outlet attributes (size, location tier, outlet type) and positions
# 4, 5, 9 are the nominal item attributes (fat content, item type, category).
size_order = ['Small','Medium','High']
location_order = ['Tier 3','Tier 2','Tier 1']
outlet_type_order = ['Grocery Store','Supermarket Type3','Supermarket Type2','Supermarket Type1']

ordinal_enc = OrdinalEncoder(
    categories=[size_order, location_order, outlet_type_order],
    dtype='object',
)
onehot_enc = OneHotEncoder(drop='first', sparse_output=False, dtype='int8')

pre_proc_cat = ColumnTransformer(
    transformers=[
        ('oe', ordinal_enc, [6,7,8]),
        ('ohe', onehot_enc, [4,5,9]),
    ],
    remainder='passthrough',
)

# Feature selection - k='all' keeps every feature.
skb = SelectKBest(mutual_info_regression, k='all')

# Estimator - shallow random forest, each tree fitted on a 25% bootstrap sample.
mdl = RandomForestRegressor(
    bootstrap=True,
    criterion='squared_error',
    max_depth=5,
    max_samples=0.25,
    n_estimators=100,
    oob_score=True,
    random_state=46,
)

# Full feature pipeline: numeric -> scaling -> categorical -> selection -> model.
steps = [
    ('num', pre_proc_num),
    ('ss', pre_proc_ss),
    ('cat', pre_proc_cat),
    ('skb', skb),
    ('mdl', mdl),
]
pipe_mdl = Pipeline(steps=steps)

# The target is Yeo-Johnson transformed (and standardised) for fitting;
# TransformedTargetRegressor inverts that transform on predict.
target_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
pipe_prod = TransformedTargetRegressor(regressor=pipe_mdl, transformer=target_transformer)

pipe_prod.fit(Xtr, ytr)

### 3.2 Production Model Evaluation with cross_validate using scoring=['r2','neg_mean_squared_error','neg_root_mean_squared_error']

In [6]:
print('Cross Validation Results : ------------------------------------------------------------------------ \n')

# 10-fold shuffled CV; the seed fixes the fold assignment.
scoring = ['r2','neg_mean_squared_error','neg_root_mean_squared_error']
kfold = KFold(n_splits=10, shuffle=True, random_state=46)
cv_scores = cross_validate(pipe_prod, Xtr, ytr, scoring=scoring, cv=kfold)

# R2 is reported exactly as the scorer returns it.
r2_scores = cv_scores['test_r2']
r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
print(f'R2 Scores : {r2_scores}')
print(f'Mean R2 Score : {r2_mean}, St.Dev R2 Score : {r2_std} \n --------------------------------------\n ')

# The error scorers are negated by scikit-learn; flip the sign back for reporting.
mse_scores = [-neg_mse for neg_mse in cv_scores['test_neg_mean_squared_error']]
mse_mean, mse_std = np.mean(mse_scores), np.std(mse_scores)
print(f'MSE Scores : {mse_scores}')
print(f'Mean MSE Score : {mse_mean}, St.Dev MSE Score : {mse_std} \n ----------------------------------\n ')

rmse_scores = [-neg_rmse for neg_rmse in cv_scores['test_neg_root_mean_squared_error']]
rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
print(f'RMSE Scores : {rmse_scores}')
print(f'Mean RMSE Score : {rmse_mean}, St.Dev RMSE Score : {rmse_std}')

Cross Validation Results : ------------------------------------------------------------------------ 

R2 Scores : [0.56233248 0.5953046  0.59923284 0.58078315 0.6078227  0.59501358
 0.58875914 0.58614914 0.57506597 0.59400541]
Mean R2 Score : 0.5884469010520589, St.Dev R2 Score : 0.012401318251271762 
 --------------------------------------
 
MSE Scores : [1231656.5025193237, 1092364.794766022, 1149837.0282353936, 1310403.3747983857, 1205989.908427719, 1188579.8626277996, 1162524.3790129009, 1214202.6180125754, 1163685.0036823617, 1223456.67963368]
Mean MSE Score : 1194270.015171616, St.Dev MSE Score : 55322.51173225089 
 ----------------------------------
 
RMSE Scores : [1109.800208379564, 1045.1625685825252, 1072.3045408070384, 1144.728515761875, 1098.1757183746683, 1090.2200982497982, 1078.2042380796418, 1101.9086250740465, 1078.7423249703154, 1106.099760253875]
Mean RMSE Score : 1092.534659853335, St.Dev RMSE Score : 25.25929909506814


## 4. Model Save/Export

In [7]:
# Persist the fitted pipeline and the training features.
# Each file is opened in a context manager so the handle is flushed and closed
# before the success message is printed.
# NOTE: pickle stores plain functions by reference, so ft_exp and ft_sqrt
# (wrapped inside pipe_prod) must be defined in whatever session reloads the model.
with open('bms_mdl_prod.pkl','wb') as mdl_file:
    dump(pipe_prod, mdl_file)
print('Model Object Saved Successfully \n')

with open('bms_X_prod.pkl','wb') as features_file:
    dump(Xtr, features_file)
print('"X" Features Saved Successfully')

Model Object Saved Successfully 

"X" Features Saved Successfully


## 5. Simple Prediction System

### 5.1 Load Production Model

In [8]:
# Reload the saved pipeline; the context manager closes the file handle.
# NOTE(security): unpickling can execute arbitrary code - only load files
# written by this notebook or another trusted source.
with open('bms_mdl_prod.pkl','rb') as mdl_file:
    prod_pipe = load(mdl_file)

### 5.2 Test Dataset

#### 5.2.1 Importing Dataset

In [9]:
# Read the pickled test frame and split it into features and target.
te = pd.read_pickle('bms_FE_test_final.pkl')
print(f'Shape of the test dataset : {te.shape}')

Xte = te.drop(columns='Item_Outlet_Sales')
yte = te['Item_Outlet_Sales']

# The preview must be the cell's final expression to be rendered; placed
# before the split it was evaluated and silently discarded.
te.head(5)

Shape of the test dataset : (100, 11)


#### 5.2.2 Overall Result on Test Data

In [10]:
# Score the reloaded pipeline on the test frame.
yte_pred = prod_pipe.predict(Xte)

test_r2 = round(r2_score(yte, yte_pred), 4)
test_rmse = round(sqrt(mean_squared_error(yte, yte_pred)), 4)

print(f'R2 Score on Test Data : {test_r2} \n')
print(f'Test Dataset RMSE     : {test_rmse}')

R2 Score on Test Data : 0.5693 

Test Dataset RMSE     : 1231.6046


#### 5.2.3 Predictions on Test Samples

In [11]:
# Draw a 10-row demo subset of the test set. The draw is seeded (same seed as
# the model and CV splitter) so the table below is identical on every run.
samp = Xte.sample(10, random_state=46).index.values.tolist()

# Put features and actual sales side by side for the sampled rows.
te_df = pd.concat([Xte.loc[samp], yte.loc[samp]], axis=1)
te_df.head(10)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Age,Item_Outlet_Sales
5100,8.97,Non Edible,0.0932,Household,55.7956,Medium,Tier 1,Supermarket Type1,Non Consumables,14,1037.3164
1849,20.2,Non Edible,0.1123,Household,123.6046,Small,Tier 1,Supermarket Type1,Non Consumables,16,2988.1104
3806,17.5,Regular,0.0076,Dairy Foods,145.8102,Medium,Tier 1,Supermarket Type1,Foods,14,1895.5326
7969,11.65,Low Fat,0.0324,Hard Drinks,38.8164,Small,Tier 3,Grocery Store,Drinks,15,77.2328
1345,11.65,Regular,0.0587,Baking Goods,171.1422,Small,Tier 2,Supermarket Type1,Foods,9,2931.5174
1314,17.35,Low Fat,0.0147,Frozen Foods,74.5038,Small,Tier 2,Supermarket Type1,Foods,9,739.038
257,8.645,Non Edible,0.1433,Health and Hygiene,96.341,High,Tier 3,Supermarket Type1,Non Consumables,26,193.082
2804,6.57,Non Edible,0.0969,Household,193.982,Small,Tier 2,Supermarket Type1,Non Consumables,9,2316.984
2216,9.895,Regular,0.0488,Frozen Foods,260.5278,Small,Tier 2,Supermarket Type1,Foods,11,8851.1452
4772,19.5,Regular,0.0768,Soft Drinks,231.8958,Medium,Tier 3,Supermarket Type3,Drinks,28,8413.0488


In [16]:
# Pick one row from the demo subset to run through the model. The draw is
# seeded so the selected row (and the prediction that follows) is repeatable.
idx = te_df.sample(random_state=46).index.values
inp_data = Xte.loc[idx].values  # raw feature values of the selected row(s), no column labels

print(f'Test Data with Index : {idx[0]} is Selected for Prediction: \n')

# Echo every feature of the selected row next to its column name.
for fea_name, fea_val in zip(Xte.columns.tolist(), inp_data[0]):
    print(f'"{fea_name}" ---:--- {fea_val}')

Test Data with Index : 1345 is Selected for Prediction: 

"Item_Weight" ---:--- 11.65
"Item_Fat_Content" ---:--- Regular
"Item_Visibility" ---:--- 0.058719726
"Item_Type" ---:--- Baking Goods
"Item_MRP" ---:--- 171.1422
"Outlet_Size" ---:--- Small
"Outlet_Location_Type" ---:--- Tier 2
"Outlet_Type" ---:--- Supermarket Type1
"Item_Category" ---:--- Foods
"Outlet_Age" ---:--- 9


In [17]:
# Report the recorded sales next to the model's estimate for the selected row.
row_label = idx[0]
print(f'------------- Prediction for Test Data with Index : {row_label} --------------------\n')

actual_sales = yte[row_label]
print(f'Actual Sales for the Selected Data    : {actual_sales} \n')

# inp_data holds the selected row, so the first prediction is the one reported.
pred = prod_pipe.predict(inp_data)
predicted_sales = pred[0]

print(f'Predicted Sales for the Selected Data : {predicted_sales}')

------------- Prediction for Test Data with Index : 1345 --------------------

Actual Sales for the Selected Data    : 2931.5174 

Predicted Sales for the Selected Data : 2605.9936995638677
