# ML REGRESSION - {"BIGMART SALES" DATASET}

## 1. Importing Modules and Setting Configurations

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from pickle import dump, load
from math import sqrt

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_selection import SelectKBest, mutual_info_regression

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline

from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import KFold, cross_val_score, cross_validate

from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(display='diagram')

In [2]:
# PD Options

pd.set_option('display.min_rows', 5)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 4)

# SB Options

sb.set_theme(context='notebook', style='whitegrid', palette='pastel', font='times new roman', font_scale=1.25)

## 2. Importing Train Dataset

In [3]:
tr = pd.read_pickle('bms_FE_train_final.pkl')

print(f'Shape of the train dataset : {tr.shape}')
tr.head(5)

Shape of the train dataset : (8323, 11)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Age,Item_Outlet_Sales
2171,13.65,Regular,0.0808,Frozen Foods,262.8936,Medium,Tier 1,Supermarket Type1,Foods,14.0,4958.8784
5657,6.98,Low Fat,0.0412,Canned,82.8934,Small,Tier 2,Supermarket Type1,Foods,11.0,818.934
2156,20.1,Low Fat,0.0746,Dairy Foods,110.3228,Small,Tier 1,Supermarket Type1,Foods,16.0,1768.3648
110,13.3,Low Fat,0.0798,Dairy Foods,232.53,Small,Tier 1,Supermarket Type1,Foods,16.0,699.09
6709,10.1,Non Edible,0.0301,Health and Hygiene,154.3656,Medium,Tier 3,Supermarket Type3,Non Consumables,28.0,2471.4496


In [4]:
Xtr = tr.drop(columns='Item_Outlet_Sales')
ytr = tr['Item_Outlet_Sales']

## 3. Comparing Performance of Hyper Parameter Tuned Models

### 3.1 Dictionary of Models

In [5]:
mdl_dict = {
    'Lin_Reg':LinearRegression(),
    
    'Lasso':Lasso(alpha=0.5, max_iter=1000),
    
    'Ridge':Ridge(alpha=0.05, max_iter=1500), 

    'KN_REG':KNeighborsRegressor(algorithm='brute', metric='euclidean', n_neighbors=17, weights='uniform'),

    'SV_REG':SVR(C=1.0, degree=2, gamma='scale', kernel='linear'),

    'DT_REG':DecisionTreeRegressor(criterion='squared_error', max_depth=5, min_impurity_decrease=0.0, min_samples_split=0.3, splitter='best', random_state=46),

    'BAG_REG':BaggingRegressor(bootstrap=True, estimator=KNeighborsRegressor(), max_samples=0.25, n_estimators=200, oob_score=True, random_state=46),

    'RF_REG':RandomForestRegressor(bootstrap=True, criterion='squared_error', max_depth=5, max_samples=0.25, n_estimators=100, oob_score=True, random_state=46),

    'GB_REG':GradientBoostingRegressor(criterion='squared_error',learning_rate=0.1, max_depth=3, n_estimators=50, subsample=0.75, random_state=46),

    'HGB_REG':HistGradientBoostingRegressor(learning_rate=0.1, max_depth=3, max_iter=50, max_leaf_nodes=20, l2_regularization=0.1, random_state=46),

    'XGB_REG':XGBRegressor(objective='reg:squarederror', eval_metric='rmse', seed=46, eta=0.1, gamma=0.01, max_depth=3, n_estimators=50, subsample=0.75)
    }

print(f'Models for Performance Comparison : \n')
for model_name, model in mdl_dict.items():
    print(f'{model_name} : \n {model} \n')

Models for Performance Comparison : 

Lin_Reg : 
 LinearRegression() 

Lasso : 
 Lasso(alpha=0.5) 

Ridge : 
 Ridge(alpha=0.05, max_iter=1500) 

KN_REG : 
 KNeighborsRegressor(algorithm='brute', metric='euclidean', n_neighbors=17) 

SV_REG : 
 SVR(degree=2, kernel='linear') 

DT_REG : 
 DecisionTreeRegressor(max_depth=5, min_samples_split=0.3, random_state=46) 

BAG_REG : 
 BaggingRegressor(estimator=KNeighborsRegressor(), max_samples=0.25,
                 n_estimators=200, oob_score=True, random_state=46) 

RF_REG : 
 RandomForestRegressor(max_depth=5, max_samples=0.25, oob_score=True,
                      random_state=46) 

GB_REG : 
 GradientBoostingRegressor(criterion='squared_error', n_estimators=50,
                          random_state=46, subsample=0.75) 

HGB_REG : 
 HistGradientBoostingRegressor(l2_regularization=0.1, max_depth=3, max_iter=50,
                              max_leaf_nodes=20, random_state=46) 

XGB_REG : 
 XGBRegressor(base_score=None, booster=None, callbac

### 3.2 Calculating Model Preformance

In [6]:
# Pre Processors -------------------------------------------------------------------------------------------------------------
def ft_exp(x):
    y = x**(1/1.2)
    return y

def ft_sqrt(x):
    y = x**(1/2)  
    return y

ft_mrp_exp = FunctionTransformer(func=ft_exp)
ft_age_sqrt = FunctionTransformer(func=ft_sqrt)

pre_proc_num = ColumnTransformer(transformers=[
                ('pt_it_vis',PowerTransformer(method='yeo-johnson', standardize=False),[2]),
                ('ft_it_mrp',ft_mrp_exp,[4]),
                ('ft_ol_age',ft_age_sqrt,[9])
                ],
                remainder='passthrough')

pre_proc_ss = ColumnTransformer(transformers=[
                ('ss',StandardScaler(),[0,1,2,3])
                ],
                remainder='passthrough')

pre_proc_cat = ColumnTransformer(transformers=[
                ('oe',OrdinalEncoder(categories=[['Small','Medium','High'], ['Tier 3','Tier 2','Tier 1'], 
                        ['Grocery Store','Supermarket Type3','Supermarket Type2','Supermarket Type1']], dtype='object'),[6,7,8]),
                ('ohe',OneHotEncoder(drop='first', sparse_output=False, dtype='int8'),[4,5,9]),
                ],
                remainder='passthrough')

# Feature Selection -----------------------------------------------------------------------------------------------------------
skb = SelectKBest(mutual_info_regression, k='all')


# Function to calculate models performance using Pre-Processors, Feature Selection, and Estimators in the Pipeline -------------
def mdl_scores(mod_name, mod):

    output = []
    output.append(mod_name)

    #-----------------------------------------------------------------------------
    steps = [('num',pre_proc_num),
         ('ss',pre_proc_ss),
         ('cat',pre_proc_cat),
         ('skb',skb),    
         ('mdl',mdl)]

    pipe_mdl = Pipeline(steps)
    
    #----------------------------------------------------------------------------
    
    pipe = TransformedTargetRegressor(regressor=pipe_mdl, transformer=PowerTransformer(method='yeo-johnson', standardize=True))
    
    
    # K-fold cross-validation ---------------------------------------------------
    kfold = KFold(n_splits=5, shuffle=True, random_state=46)
    scores = cross_val_score(pipe, Xtr, ytr, cv=kfold, scoring='r2')
    output.append(scores.mean())

    #----------------------------------------------------------------------------
    return output

### 3.3 Comparing Model Performance

In [7]:
mdl_output = []

for mdl_name,mdl in mdl_dict.items():
    mdl_output.append(mdl_scores(mdl_name, mdl))

mdl_perf = pd.DataFrame(mdl_output, columns=['Model','R2'])

print(f'Comparsion of Model Performance : \n')
mdl_perf.sort_values(['R2'], ascending=False)

Comparsion of Model Performance : 



Unnamed: 0,Model,R2
7,RF_REG,0.5885
9,HGB_REG,0.5877
10,XGB_REG,0.5876
8,GB_REG,0.5868
6,BAG_REG,0.5525
3,KN_REG,0.5455
5,DT_REG,0.5011
2,Ridge,0.3491
0,Lin_Reg,0.3466
4,SV_REG,0.3267


## 4. Best Model With Hyper-Parameters Tuned

### 4.1 Best Model

In [8]:
# Pre Processors -------------------------------------------------------------------------------------------------------------
def ft_exp(x):
    y = x**(1/1.2)
    return y

def ft_sqrt(x):
    y = x**(1/2)  
    return y

ft_mrp_exp = FunctionTransformer(func=ft_exp)
ft_age_sqrt = FunctionTransformer(func=ft_sqrt)

pre_proc_num = ColumnTransformer(transformers=[
                ('pt_it_vis',PowerTransformer(method='yeo-johnson', standardize=False),[2]),
                ('ft_it_mrp',ft_mrp_exp,[4]),
                ('ft_ol_age',ft_age_sqrt,[9])
                ],
                remainder='passthrough')

pre_proc_ss = ColumnTransformer(transformers=[
                ('ss',StandardScaler(),[0,1,2,3])
                ],
                remainder='passthrough')

pre_proc_cat = ColumnTransformer(transformers=[
                ('oe',OrdinalEncoder(categories=[['Small','Medium','High'], ['Tier 3','Tier 2','Tier 1'], 
                        ['Grocery Store','Supermarket Type3','Supermarket Type2','Supermarket Type1']], dtype='object'),[6,7,8]),
                ('ohe',OneHotEncoder(drop='first', sparse_output=False, dtype='int8'),[4,5,9]),
                ],
                remainder='passthrough')

# Feature Selection -----------------------------------------------------------------------------------------------------------
skb = SelectKBest(mutual_info_regression, k='all')

mdl = RandomForestRegressor(bootstrap=True, criterion='squared_error', max_depth=5, max_samples=0.25, n_estimators=100, oob_score=True, random_state=46)

# ML Pipeline -----------------------------------------------------------------------------------------------------------------
steps = [('num',pre_proc_num),
     ('ss',pre_proc_ss),
     ('cat',pre_proc_cat),
     ('skb',skb),    
     ('mdl',mdl)]

pipe_mdl = Pipeline(steps)
#----------------------------------------------------------------------------

pipe_best = TransformedTargetRegressor(regressor=pipe_mdl, transformer=PowerTransformer(method='yeo-johnson', standardize=True))

pipe_best.fit(Xtr,ytr)

### 4.2 Best Model Evaluation with cross_validate using scoring=['r2','neg_root_mean_squared_error']

In [9]:
print('Cross Validation Results : ------------------------------------------------------------------------')

kfold = KFold(n_splits=5, shuffle=True, random_state=46)
scoring = ['r2','neg_mean_squared_error','neg_root_mean_squared_error']
cv_scores = cross_validate(pipe_best, Xtr, ytr, cv=kfold, scoring=scoring)

r2_scores = cv_scores['test_r2']
print(f'R2 Scores : {r2_scores}')
print(f'Mean R2 Score : {np.mean(r2_scores)}, St.Dev R2 Score : {np.std(r2_scores)} \n ----------------------------------------')

mse_scores = cv_scores['test_neg_mean_squared_error']
mse_scores = [-1*score for score in mse_scores]
print(f'MSE Scores : {mse_scores}')
print(f'Mean MSE Score : {np.mean(mse_scores)}, St.Dev MSE Score : {np.std(mse_scores)} \n ------------------------------------')

rmse_scores = cv_scores['test_neg_root_mean_squared_error']
rmse_scores = [-1*score for score in rmse_scores]
print(f'RMSE Scores : {rmse_scores}')
print(f'Mean RMSE Score : {np.mean(rmse_scores)}, St.Dev RMSE Score : {np.std(rmse_scores)}')

Cross Validation Results : ------------------------------------------------------------------------
R2 Scores : [0.55153002 0.6030768  0.60628863 0.59814181 0.58346059]
Mean R2 Score : 0.5884995694463127, St.Dev R2 Score : 0.020070165890641884 
 ----------------------------------------
MSE Scores : [1314331.189516851, 1165836.569334437, 1163637.2372110842, 1167537.1058807692, 1179284.4390250074]
Mean MSE Score : 1198125.3081936296, St.Dev MSE Score : 58354.799328045774 
 ------------------------------------
RMSE Scores : [1146.442841801043, 1079.7391209613722, 1078.7201848538314, 1080.5263096661595, 1085.9486355371544]
Mean RMSE Score : 1094.2754185639121, St.Dev RMSE Score : 26.203368493473278


## 5. Model Save/Export

In [10]:
dump(pipe_best, open('bms_mdl_best.pkl','wb'))
print('Model Object Saved Successfully \n')

dump(Xtr, open('bms_X_best.pkl','wb'))
print('"X" Features Saved Successfully')

Model Object Saved Successfully 

"X" Features Saved Successfully


## 6. Simple Prediction System

### 6.1 Load Best Model

In [11]:
mdl_pipe = load(open('bms_mdl_best.pkl','rb'))

### 6.2 Validation Dataset 

#### 6.2.1 Importing Dataset

In [12]:
val = pd.read_pickle('bms_FE_valid_final.pkl')

print(f'Shape of the validation dataset : {val.shape}')
val.head(5)

Xval = val.drop(columns='Item_Outlet_Sales')
yval = val['Item_Outlet_Sales']

Shape of the validation dataset : (100, 11)


#### 6.2.2 Overall Result on Validation Data

In [13]:
yval_pred = mdl_pipe.predict(Xval)

print(f'R2 Score on Validation Data : {round(r2_score(yval, yval_pred),4)} \n')
print(f'Validation Dataset RMSE     : {round(sqrt(mean_squared_error(yval, yval_pred)),4)}')

R2 Score on Validation Data : 0.6415 

Validation Dataset RMSE     : 892.7377


#### 6.2.3 Predictions on Validation Samples

In [14]:
samp = Xval.sample(10).index.values.tolist()
samp

val_df = pd.concat([Xval.loc[samp],yval.loc[samp]],axis=1)
val_df.head(10)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Age,Item_Outlet_Sales
5954,9.5,Low Fat,0.0406,Hard Drinks,224.6088,Medium,Tier 3,Supermarket Type3,Drinks,28,4474.176
3360,8.785,Low Fat,0.0661,Fruits and Vegetables,120.5414,High,Tier 3,Supermarket Type1,Foods,26,1583.9382
7604,6.135,Low Fat,0.0793,Soft Drinks,111.286,Medium,Tier 3,Supermarket Type2,Drinks,4,2263.72
2780,19.0,Low Fat,0.1119,Frozen Foods,106.5622,Small,Tier 1,Supermarket Type1,Foods,16,2117.244
945,13.65,Regular,0.0213,Snack Foods,58.3588,Small,Tier 3,Grocery Store,Foods,15,114.5176
6010,7.0,Low Fat,0.1517,Canned,104.828,Small,Tier 2,Supermarket Type1,Foods,11,1278.336
5457,21.25,Non Edible,0.0247,Household,145.1102,Small,Tier 2,Supermarket Type1,Non Consumables,11,1603.9122
237,12.3,Regular,0.0646,Starchy Foods,92.9804,High,Tier 3,Supermarket Type1,Foods,26,1929.4884
6638,7.405,Low Fat,0.0153,Canned,89.7146,Medium,Tier 1,Supermarket Type1,Foods,14,547.2876
4621,10.65,Regular,0.0851,Snack Foods,232.3668,Small,Tier 1,Supermarket Type1,Foods,16,3685.8688


In [15]:
#inp_data = (13.7,0.415,0.68,2.9,0.085,17.0,43.0,1.0014,3.06,0.80,10.0)
#inp_data = np.array(inp_data).reshape(1,-1)

idx = val_df.sample().index.values
inp_data = Xval.loc[idx].values

print(f'Validation Data with Index : {idx[0]} is Selected for Prediction: \n')

for fea_name, fea_val in zip(Xval.columns.tolist(), inp_data[0]):
    print(f'"{fea_name}" ---:--- {fea_val}')

Validation Data with Index : 3360 is Selected for Prediction: 

"Item_Weight" ---:--- 8.785
"Item_Fat_Content" ---:--- Low Fat
"Item_Visibility" ---:--- 0.06614347253994954
"Item_Type" ---:--- Fruits and Vegetables
"Item_MRP" ---:--- 120.5414
"Outlet_Size" ---:--- High
"Outlet_Location_Type" ---:--- Tier 3
"Outlet_Type" ---:--- Supermarket Type1
"Item_Category" ---:--- Foods
"Outlet_Age" ---:--- 26


In [16]:
print(f'------------- Prediction for Validation Data with Index : {idx[0]} --------------------\n')

print(f'Actual Sales for the Selected Data    : {yval[idx[0]]} \n')

pred = mdl_pipe.predict(inp_data)

print(f'Predicted Sales for the Selected Data : {pred[0]}')

------------- Prediction for Validation Data with Index : 3360 --------------------

Actual Sales for the Selected Data    : 1583.9382 

Predicted Sales for the Selected Data : 1837.1758322339815


### 6.3 Test Data

#### 6.2.1 Importing Dataset

In [17]:
te = pd.read_pickle('bms_FE_test_final.pkl')

print(f'Shape of the test dataset : {te.shape}')
te.head(5)

Xte = te.drop(columns='Item_Outlet_Sales')
yte = te['Item_Outlet_Sales']

Shape of the test dataset : (100, 11)


#### 6.2.2 Overall Result on Test Data

In [18]:
yte_pred = mdl_pipe.predict(Xte)

print(f'R2 Score on Test Data : {round(r2_score(yte, yte_pred),4)} \n')
print(f'Test Dataset RMSE     : {round(sqrt(mean_squared_error(yte, yte_pred)),4)}')

R2 Score on Test Data : 0.5689 

Test Dataset RMSE     : 1232.2241


#### 6.2.3 Predictions on Test Samples

In [19]:
samp = Xte.sample(10).index.values.tolist()
samp

te_df = pd.concat([Xte.loc[samp],yte.loc[samp]],axis=1)
te_df.head(10)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Category,Outlet_Age,Item_Outlet_Sales
1727,16.85,Low Fat,0.0661,Snack Foods,146.076,Small,Tier 2,Supermarket Type1,Foods,6,2636.568
339,7.895,Regular,0.0951,Breakfast,104.4332,Small,Tier 2,Supermarket Type1,Foods,6,2870.9296
7725,13.15,Low Fat,0.0565,Soft Drinks,143.4812,Medium,Tier 1,Supermarket Type1,Drinks,14,854.8872
6374,11.3,Non Edible,0.0479,Household,179.866,Medium,Tier 3,Supermarket Type2,Non Consumables,4,1258.362
1654,7.935,Non Edible,0.0317,Household,263.091,High,Tier 3,Supermarket Type1,Non Consumables,26,5522.811
6678,10.1,Non Edible,0.0939,Household,115.9492,Small,Tier 3,Grocery Store,Non Consumables,15,231.6984
8307,20.2,Non Edible,0.0661,Household,61.351,Small,Tier 2,Supermarket Type1,Non Consumables,11,1201.769
7969,11.65,Low Fat,0.0324,Hard Drinks,38.8164,Small,Tier 3,Grocery Store,Drinks,15,77.2328
257,8.645,Non Edible,0.1433,Health and Hygiene,96.341,High,Tier 3,Supermarket Type1,Non Consumables,26,193.082
2239,12.1,Low Fat,0.036,Snack Foods,148.0734,Small,Tier 1,Grocery Store,Foods,28,593.8936


In [20]:
#inp_data = (13.7,0.415,0.68,2.9,0.085,17.0,43.0,1.0014,3.06,0.80,10.0)
#inp_data = np.array(inp_data).reshape(1,-1)

idx = te_df.sample().index.values
inp_data = Xte.loc[idx].values

print(f'Test Data with Index : {idx[0]} is Selected for Prediction: \n')

for fea_name, fea_val in zip(Xte.columns.tolist(), inp_data[0]):
    print(f'"{fea_name}" ---:--- {fea_val}')

Test Data with Index : 1727 is Selected for Prediction: 

"Item_Weight" ---:--- 16.85
"Item_Fat_Content" ---:--- Low Fat
"Item_Visibility" ---:--- 0.06614347253994954
"Item_Type" ---:--- Snack Foods
"Item_MRP" ---:--- 146.076
"Outlet_Size" ---:--- Small
"Outlet_Location_Type" ---:--- Tier 2
"Outlet_Type" ---:--- Supermarket Type1
"Item_Category" ---:--- Foods
"Outlet_Age" ---:--- 6


In [21]:
print(f'------------- Prediction for Test Data with Index : {idx[0]} --------------------\n')

print(f'Actual Sales for the Selected Data    : {yte[idx[0]]} \n')

pred = mdl_pipe.predict(inp_data)

print(f'Predicted Sales for the Selected Data : {pred[0]}')

------------- Prediction for Test Data with Index : 1727 --------------------

Actual Sales for the Selected Data    : 2636.568 

Predicted Sales for the Selected Data : 2283.3200681659905
