# Step 1 Import necessary libraries    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Step 2 DataSet Load

In [None]:
BM_train = pd.read_csv('bigdatamart_Train.csv')

In [None]:
BM_test =  pd.read_csv('bigdatamart_Test.csv')

In [None]:
BM_train

In [None]:
BM_test

# Step 3 Null value check with sum

In [None]:
BM_train.isnull().sum()

In [None]:
BM_train.shape

'Item_Weight' has 17.16% null values

'Outlet_Size' has 28.27% null values

In [None]:
BM_test.isnull().sum()

In [None]:
BM_test.shape

'Item_Weight' has 17.18% null values

'Outlet_Size' has 28.26% null values



# Step 4 Zero Values check in dataset

In [None]:
(BM_train==0).sum()

'Item_Visibility' has 526 '0' values

In [None]:
(BM_test==0).sum()

'Item_Visibility' has 353 '0' values

# Step 5 Checking the number of unique values in each column

In [None]:
BM_train.nunique()

Item_Fat_Content has 5 unique categories

Item_Type has 16 unique categories

Outlet_Identifier has 10 unique categories

Outlet_Establishment_Year 9 discrete values

Outlet_Size has 3 unique categories

Outlet_Location_Type has 3 unique categories

Outlet_Type has 4 unique categories

In [None]:
BM_test.nunique()

In [None]:
# Skewness is only defined for numeric columns. numeric_only=True skips the
# object (string) columns explicitly; without it pandas >= 2.0 raises TypeError.
BM_test.skew(numeric_only=True)

Item_Fat_Content has 5 unique categories

Item_Type has 16 unique categories

Outlet_Identifier has 10 unique categories

Outlet_Establishment_Year 9 discrete values

Outlet_Size has 3 unique categories

Outlet_Location_Type has 3 unique categories

Outlet_Type has 4 unique categories

# Step 6 Getting the basic summary and statistical information of the data.

In [None]:
BM_train.info()

In [None]:
BM_test.info()

Item_Fat_Content, Item_Type,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type are object type meaning they are categorical in nature, while Outlet_Establishment_Year is discrete int type,

Item_Weight, Item_Visibility, Item_MRP, Item_Outlet_Sales are float type and continuous in nature.

In [None]:
BM_train.describe()

In [None]:
BM_test.describe()

From the above tables it is observed that the outlets have been operating from as early as 1985, on an average the outlets have been running since 1997, with last one having been established in 2009.

The average MRP of the items is 140.9 and 50% of the items are priced upto around 143.

The average sales made by the outlets is 2181.28

# Step 7 Filling Null Values in the datasets

In [None]:
BM_train.isnull().sum()

'Item_Weight' has 17.16% null values

'Outlet_Size' has 28.27% null values

In [None]:
BM_test.isnull().sum()

'Item_Weight' has 17.18% null values

'Outlet_Size' has 28.26% null values

'Item_Weight' is continuous in nature, imputing null values with either mean or median will be the ideal strategy.

In [None]:
BM_train['Item_Weight'].head(50) #Examining the null values in 'Item_Weight'

In [None]:
BM_test['Item_Weight'].head(50)  #Examining the null values in 'Item_Weight'

# Step 8 Checking for outliers

In [None]:
# Pass the series as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
sns.boxplot(x=BM_train['Item_Weight'])

No outliers exist

In [None]:
# Pass the series as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
sns.boxplot(x=BM_test['Item_Weight'])

No outliers exist

'Item_Weight' also has a relation to 'item type', for eg: items belonging to 'Meat' / 'Fruits and Vegetables' won't be in the same weight range as those items that belong to 'Soft Drinks' or 'Hard Drinks'. Therefore filling in the null values with the mean of Item_weight values belonging to their own respective 'item type'.

In [None]:
BM_train['Item_Type'].value_counts()

In [None]:
BM_test['Item_Type'].value_counts()

In [None]:
mean_wt = BM_test.groupby('Item_Type').agg({'Item_Weight':'mean'})
mean_wt1 = BM_train.groupby('Item_Type').agg({'Item_Weight':'mean'})

Creating a series of mean weights based on their item identifier type.

In [None]:
mean_wt1

In [None]:
mean_wt

In [None]:
mean_wt1.index

In [None]:
mean_wt.index

Creating a helper column in each dataframe that holds the mean weight of the row's 'Item_Type', so that it can be used to fill the NaN values in 'Item_Weight'.

In [None]:
meanwtdt = dict(zip(mean_wt.index,mean_wt.Item_Weight)) #dictionary with Item_Type values as keys and avg_wt values as values.

In [None]:
meanwt1dt = dict(zip(mean_wt1.index,mean_wt1.Item_Weight)) #dictionary with Item_Type values as keys and avg_wt values as values.

In [None]:
BM_test['meanwt'] = BM_test['Item_Type'].map(meanwtdt) # This creates a column "meanwt" and maps values into it based on corresponding values of 'Item_Type'

In [None]:
BM_train['meanwt1'] = BM_train['Item_Type'].map(meanwt1dt) 

In [None]:
BM_test

In [None]:
BM_train

In [None]:
BM_test['Item_Weight'] = BM_test['Item_Weight'].replace('None', np.nan) #replacing none type values with nan values

In [None]:
BM_test['Item_Weight'] = BM_test['Item_Weight'].fillna((BM_test['meanwt'])) #filling null values in 'Item_Weight' with values from 'meanwt'

In [None]:
BM_train['Item_Weight'] = BM_train['Item_Weight'].replace('None', np.nan) #replacing none type values with nan values

In [None]:
BM_train['Item_Weight'] = BM_train['Item_Weight'].fillna((BM_train['meanwt1'])) #filling null values in 'Item_Weight' with values from 'meanwt1'

In [None]:
BM_test.drop(columns='meanwt',inplace=True)

In [None]:
BM_test

In [None]:
BM_test.isnull().sum()

No more null values exist in Item_Weight

In [None]:
BM_train.isnull().sum()

No more null values exist in Item_Weight

In [None]:
BM_train.drop(columns = ['meanwt1'],inplace=True)

In [None]:
BM_test['Item_Weight'].head(50)

In [None]:
BM_train['Item_Weight'].head(50)

# Step 9 Filling out the null values in Outlet_Size


In [None]:
BM_test['Outlet_Size'].value_counts()

In [None]:
BM_train['Outlet_Size'].value_counts()

In [None]:
BM_train['Outlet_Size'] =BM_train['Outlet_Size'].fillna(BM_train['Outlet_Size'].mode()[0]) #filling null values in most frequently occuring value in Outlet_Size.

In [None]:
BM_test['Outlet_Size'] =BM_test['Outlet_Size'].fillna(BM_test['Outlet_Size'].mode()[0]) #filling null values in most frequently occuring value in Outlet_Size.

In [None]:
BM_test.isnull().sum()

In [None]:
BM_train.isnull().sum()

# Step 10 Dropping column 'Item_Identifier' since it doesn't contribute to building a good model for predicting the target variable values

In [None]:
BM_train.drop(columns = ['Item_Identifier'],inplace=True)
BM_test.drop(columns = ['Item_Identifier'],inplace=True)

# Step 11 Analysing Features with Continuous data type

In [None]:
BM_train.nunique()

In [None]:
sns.distplot(BM_train.Item_Weight )

Data looks normally distributed with mean,median and mode values falling within 10-15 range

In [None]:
sns.distplot(BM_train.Item_Visibility )

Data distribution is right skewed with values trailing off from 0.19 mark

In [None]:
BM_train.Item_Visibility.skew()

In [None]:
(BM_train.Item_Visibility==0).sum()

In [None]:
# Pass the series as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
sns.boxplot(x=BM_train.Item_Visibility)

Outliers exist

Removing outliers using z-score technique

In [None]:
df2 = BM_train.Item_Visibility.copy()

In [None]:
df2

In [None]:
from scipy.stats import zscore
# Absolute z-score of every visibility value in df2.
zscor = zscore(df2)
z_score_abs = np.abs(zscor)

# Keep only the values that lie strictly within 3 standard deviations of the mean.
inlier_mask = z_score_abs < 3 #taking 3 as threshold value
df3 = df2[inlier_mask]

In [None]:
df2.shape

In [None]:
df3.shape

# Step 11 (continued) Data Loss % Check

In [None]:
# Percentage of rows removed by the z-score filter, computed from the series
# themselves (df2 = before, df3 = after) so it stays correct if the data changes.
(len(df2) - len(df3)) / len(df2) * 100

1.11% Data loss is within range that is acceptable

In [None]:
dropindx = BM_train.index.difference(df3.index)

In [None]:
BM_train.drop(dropindx,inplace = True) #dropping the outliers from original features Dataframe

In [None]:
# Pass the series as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
sns.boxplot(x=BM_train.Item_Visibility)

Some amount of outliers has been removed.

In [None]:
# BM_train still contains object (string) columns at this point, so restrict the
# computation to numeric columns; pandas >= 2.0 raises TypeError otherwise.
BM_train.skew(numeric_only=True)

Imputing of 0 values using simple imputer and strategy = median

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Treat 0 as the "missing" marker and replace it with the column median.
# The `verbose` argument was dropped: it has been removed from SimpleImputer in
# current scikit-learn releases, and 0 was its default value anyway.
si = SimpleImputer(missing_values = 0,strategy = 'median')

In [None]:
si = si.fit(BM_train[['Item_Visibility']])

In [None]:
BM_train[['Item_Visibility']] = si.transform(BM_train[['Item_Visibility']])

In [None]:
(BM_train==0).sum()

There are no more 0 values.

In [None]:
# Treat 0 as the "missing" marker and replace it with the column median.
# The `verbose` argument was dropped: it has been removed from SimpleImputer in
# current scikit-learn releases, and 0 was its default value anyway.
si = SimpleImputer(missing_values = 0,strategy = 'median')

In [None]:
Si = si.fit(BM_test[['Item_Visibility']])

In [None]:
BM_test[['Item_Visibility']] = Si.transform(BM_test[['Item_Visibility']])

In [None]:
(BM_test==0).sum()

There are no more 0 values.

In [None]:
BM_test.Item_Visibility.skew() 

Removing outliers using z-score technique

In [None]:
df2 = BM_test.Item_Visibility.copy()

In [None]:
from scipy.stats import zscore
# Absolute z-score of every visibility value in df2.
zscor = zscore(df2)
z_score_abs = np.abs(zscor)

# Keep only the values that lie strictly within 3 standard deviations of the mean.
inlier_mask = z_score_abs < 3 #taking 3 as threshold value
df3 = df2[inlier_mask]

In [None]:
df2.shape

In [None]:
df3.shape

# Step 12 Data Loss % Check

In [None]:
# Percentage of rows removed by the z-score filter, computed from the series
# themselves (df2 = before, df3 = after) so it stays correct if the data changes.
(len(df2) - len(df3)) / len(df2) * 100

1.53% Data loss is within acceptable range

In [None]:
dropindx = BM_test.index.difference(df3.index)

In [None]:
BM_test.drop(dropindx,inplace = True) #dropping the outliers from original features Dataframe

In [None]:
BM_test.shape

In [None]:
BM_train.reset_index(drop=True,inplace=True)

In [None]:
BM_test.reset_index(drop=True,inplace=True)

# Step 13 Reducing skewness further using Power Transformer Method

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
powtrans= PowerTransformer(method='yeo-johnson', standardize=True)

In [None]:
df4 = BM_train[['Item_Visibility']]

In [None]:
transformed= powtrans.fit_transform(df4[['Item_Visibility']])

In [None]:
transformed = pd.DataFrame(transformed, columns=df4.columns) #to convert numpy array back into dataframe

In [None]:
transformed.skew()

In [None]:
transformed.index = BM_train.index

In [None]:
BM_train[['Item_Visibility']]= transformed[['Item_Visibility']]

In [None]:
BM_train.Item_Visibility.skew()

Skewness has been considerably reduced

In [None]:
df5 = BM_test[['Item_Visibility']]

In [None]:
# Apply the transformation that was already fitted on the training data.
# Re-fitting on the test set would give train and test different lambda/scaling
# parameters, so the model would see the feature on two different scales.
transformed= powtrans.transform(df5[['Item_Visibility']])

In [None]:
transformed = pd.DataFrame(transformed, columns=df5.columns) #to convert numpy array back into dataframe

In [None]:
transformed.skew()

In [None]:
transformed.index = BM_test.index

In [None]:
BM_test[['Item_Visibility']]= transformed[['Item_Visibility']]

In [None]:
# BM_test still contains object (string) columns at this point, so restrict the
# computation to numeric columns; pandas >= 2.0 raises TypeError otherwise.
BM_test.skew(numeric_only=True)

Skewness has been considerably reduced

In [None]:
sns.distplot(BM_train.Item_MRP )

Distribution appears to be multi modal. The mean MRP of the items is 140.9.

In [None]:
# Pass the series as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
sns.boxplot(x=BM_train.Item_MRP)

No outliers present

# Step 14 Analysing Outlet_Establishment_Year Feature

In [None]:
BM_train.Outlet_Establishment_Year.value_counts()

In [None]:
# x= makes the counted variable explicit; a bare positional series is
# interpreted as `data` by newer seaborn releases.
sns.countplot(x=BM_train.Outlet_Establishment_Year)

The highest number of outlets was established in 1985. With the exception of 1998, the remaining years (1987, 1997, 1999, 2002, 2004, 2007 and 2009) saw a similar number of new outlets being established.

# Step 15 Extracting the age of outlets from Outlet_Establishment_Year column will give a better insight into its relationship with target column data.

In [None]:
# Outlet age in years, measured against the fixed reference year 2021.
# Vectorised subtraction gives the same values as the per-row apply/lambda.
BM_train['Age'] = 2021 - BM_train['Outlet_Establishment_Year']

BM_test['Age'] = 2021 - BM_test['Outlet_Establishment_Year']

In [None]:
BM_test.drop(columns=['Outlet_Establishment_Year'],inplace=True)
BM_train.drop(columns=['Outlet_Establishment_Year'],inplace=True)

In [None]:
BM_test.head()

In [None]:
BM_train.head()

In [None]:
BM_train.Age.value_counts()

In [None]:
# x= makes the counted variable explicit; a bare positional series is
# interpreted as `data` by newer seaborn releases.
sns.countplot(x=BM_train.Age)

The largest group of outlets is 36 years old (i.e. established in 1985).

Item_Weight 431 Item_Fat_Content 5 Item_Visibility 7880 Item_Type 16 Item_MRP 5938 Outlet_Identifier 10 Outlet_Establishment_Year 9 Outlet_Size 3 Outlet_Location_Type 3 Outlet_Type 4 Item_Outlet_Sales 3493 dtype: int64

In [None]:
BM_train.Item_Fat_Content.unique()

'Item_Fat_Content' actually has only 2 unique categories : 'Low Fat' and 'Regular' while 'low fat' and 'LF' are infact 'Low Fat' while 'reg' is actually 'Regular.'

# Step 16 Converting the mistyped ones to the original categories

In [None]:
BM_train['Item_Fat_Content'] = BM_train['Item_Fat_Content'].replace({'reg':'Regular','low fat':'Low Fat','LF':'Low Fat'})

In [None]:
BM_test['Item_Fat_Content'] = BM_test['Item_Fat_Content'].replace({'reg':'Regular','low fat':'Low Fat','LF':'Low Fat'})

In [None]:
BM_test.Item_Fat_Content.value_counts()

In [None]:
BM_train.Item_Fat_Content.value_counts()

In [None]:
# x= makes the counted variable explicit; a bare positional series is
# interpreted as `data` by newer seaborn releases.
sns.countplot(x=BM_train.Item_Fat_Content)

Low fat products are more popular than Regular fat products

In [None]:
# Share of each Item_Fat_Content category as a pie chart.
# Labels come from the value_counts() index, so every wedge is paired with its own
# category instead of relying on a hand-typed order that happens to match the counts.
fat_counts = BM_train.Item_Fat_Content.value_counts()
labels = fat_counts.index
fig, ax = plt.subplots()
ax.pie(fat_counts,labels = labels,radius =1,autopct = '%1.2f%%', shadow=True,)
plt.show()

Low Fat products form 64.82% of total sold items while Regular forms 35.18%

In [None]:
BM_train.Item_Type.value_counts()

Item_Type has 16 unique categories.

In [None]:
# Wide figure so that all 16 Item_Type labels fit on the x axis.
plt.figure(figsize=(20,11))
# x= makes the counted variable explicit; a bare positional series is
# interpreted as `data` by newer seaborn releases.
sns.countplot(x=BM_train.Item_Type)

Most bought items are Fruits and Vegetables, Household items, Snack Food, and Frozen foods.

# Step 17 We can further club these categories into Foods, Beverages and Inedibles

In [None]:
BM_train['Item_Type'] = BM_train['Item_Type'].replace({'Dairy':'Foods','Soft Drinks':'Beverages','Meat':'Foods', 'Fruits and Vegetables':'Foods','Household':'Inedibles','Baking Goods':'Foods','Snack Foods':'Foods','Frozen Foods':'Foods','Breakfast':'Foods','Health and Hygiene':'Inedibles','Hard Drinks':'Beverages','Canned':'Foods','Breads':'Foods','Starchy Foods':'Foods','Others':'Inedibles','Seafood':'Foods'})

In [None]:
# Collapse the 16 raw Item_Type values of the TEST set into Foods / Beverages / Inedibles.
# The source column must be BM_test's own Item_Type: reading it from BM_train would copy
# index-aligned train values into the test frame (and NaN wherever the indexes differ).
BM_test['Item_Type'] = BM_test['Item_Type'].replace({'Dairy':'Foods','Soft Drinks':'Beverages','Meat':'Foods', 'Fruits and Vegetables':'Foods','Household':'Inedibles','Baking Goods':'Foods','Snack Foods':'Foods','Frozen Foods':'Foods','Breakfast':'Foods','Health and Hygiene':'Inedibles','Hard Drinks':'Beverages','Canned':'Foods','Breads':'Foods','Starchy Foods':'Foods','Others':'Inedibles','Seafood':'Foods'})

In [None]:
BM_train.head() 

In [None]:
BM_test.head()

Successfully converted the Item_type categories into 'Foods','Beverages' and 'Inedibles'

# Step 18 Item_fat_Content is not Applicable to 'Inedibles', therefore creating a separate category for them under 'Item_Fat_Content'

In [None]:
BM_train.loc[BM_train['Item_Type']=="Inedibles",'Item_Fat_Content'] = "Inedible" 
#assigns all those values in 'Item_Fat_Content' column to 'Inedible' category, which correspond to values in 'Item_Type' column

In [None]:
BM_test.loc[BM_test['Item_Type']=="Inedibles",'Item_Fat_Content'] = "Inedible" 
#assigns all those values in 'Item_Fat_Content' column to 'Inedible' category, which correspond to values in 'Item_Type' column

In [None]:
BM_train.head(10)

In [None]:
BM_test.head(10)

In [None]:
BM_train.Outlet_Size.value_counts()

In [None]:
# Share of each Outlet_Size category as a pie chart.
# Labels come from the value_counts() index, so every wedge is paired with its own
# category instead of relying on a hand-typed order that happens to match the counts.
size_counts = BM_train.Outlet_Size.value_counts()
labels = size_counts.index
fig, ax = plt.subplots()
ax.pie(size_counts,labels = labels,radius =1,autopct = '%1.2f%%', shadow=True,)
plt.show()

Medium Sized outlets form the majority with 61.24% of total outlets, Small outlets are 27.71% while High sized outlets are 11.06% of the total outlets.

In [None]:
BM_train.Outlet_Location_Type.value_counts()

In [None]:
# Share of each Outlet_Location_Type category as a pie chart.
# Labels come from the value_counts() index, so every wedge is paired with its own
# category instead of relying on a hand-typed order that happens to match the counts.
location_counts = BM_train.Outlet_Location_Type.value_counts()
labels = location_counts.index
fig, ax = plt.subplots()
ax.pie(location_counts,labels = labels,radius =1,autopct = '%1.2f%%', shadow=True,)
plt.show()

39.25% of total outlets are in Tier 3 cities, 33.04% of the outlets are in Tier 2 cities while 27.71% of the total outlets are in Tier 1 cities.

In [None]:
BM_train.Outlet_Type.value_counts()

In [None]:
# x= makes the counted variable explicit; a bare positional series is
# interpreted as `data` by newer seaborn releases.
sns.countplot(x=BM_train.Outlet_Type)

Highest number of outlets are of Supermarket type 1. while the rest are almost equally divided amongst Supermarket type 2,Supermarket type 3 and Grocery Store.

In [None]:
BM_train.isnull().sum()

# Step 19  Interpreting Relationship between Dependent Variable and Independent Variables

# 'Item_Outlet_Sales' vs MRP

# 'Item_Outlet_Sales' vs Continuous Data Columns

In [None]:
X = BM_train[['Item_MRP','Item_Weight','Item_Visibility']]
y = BM_train['Item_Outlet_Sales']

In [None]:
# One scatter plot of the target against every column of X, laid out on a 5x5 grid.
plt.figure(figsize=(20,25),facecolor='white')
plotnum=1
for col in X:
    if plotnum<=23:
        plt.subplot(5,5,plotnum)
        # x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
        sns.scatterplot(x=X[col],y=y)
        plt.xlabel(col,fontsize=20)
        plt.ylabel('Item_Outlet_Sales',fontsize=20)
    plotnum+=1
plt.tight_layout()

From the graph above, it is observed that there is a positive linear relationship between Item_Outlet_Sales and Item_MRP.

'Item_Outlet_Sales' vs Categorical/Discrete Data Columns

In [None]:
# One scatter plot of the target against every categorical/discrete column
# (everything except the three continuous features and the target itself).
plt.figure(figsize=(20,25),facecolor='white')
plotnum=1
y = BM_train['Item_Outlet_Sales']
X = BM_train.drop(columns=['Item_MRP','Item_Weight','Item_Visibility','Item_Outlet_Sales'])
for col in X:
    if plotnum<=23:
        plt.subplot(5,5,plotnum)
        # x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
        sns.scatterplot(x=X[col],y=y)
        plt.xlabel(col,fontsize=20)
        plt.ylabel('Item_Outlet_Sales',fontsize=20)
    plotnum+=1
plt.tight_layout()

From the above graphs it is observed that Outlet_type has a positive relation with Item_Outlet_sales.

Supermarket Type3 generates more sales than Supermarket Type1, Supermarket Type2 and Grocery Stores.

Tier 3 City Outlets generate a lot of sales. The age of the establishment also plays an important role when it is above 30 years. This may be because the outlets that have been running the longest sell the most.

In [None]:
sns.scatterplot(data=BM_train, x='Outlet_Location_Type', y='Item_Outlet_Sales', hue='Outlet_Type')

From the graph it is observed that Tier 3 Cities generate the most sales and have a good mix of Supermarket Type 2 and Type 3 along with Grocery stores.

Tier 2 and Tier 1 cities have more of Supermarket Type1 with a few Grocery stores.

Outlet_Location_Type has a good relation with Item_Outlet_sales

In [None]:
sns.scatterplot(data=BM_train, x='Outlet_Size', y='Item_Outlet_Sales', hue='Outlet_Type')

From the graph it is observed that Medium Sized Outlets generate the highest sales and are a mix of Supermarket Type1, Type2, Type3 and Grocery Stores.

High and small Sized Outlets comprise mostly of Supermarket Type1

# Step 20 Encoding Categorical Columns



In [None]:
dumm = pd.get_dummies(BM_train[['Outlet_Type','Outlet_Identifier']])

In [None]:
dumm

In [None]:
dumm2 = pd.get_dummies(BM_test[['Outlet_Type','Outlet_Identifier']],drop_first = False)

In [None]:
dumm2

In [None]:
dm_train = BM_train.copy()
dm_test = BM_test.copy()

In [None]:
BM_train = BM_train.join(dumm)

In [None]:
BM_train.drop(columns=['Outlet_Type','Outlet_Identifier'],inplace=True)

In [None]:
BM_train

In [None]:
BM_test = BM_test.join(dumm2)

In [None]:
BM_test.drop(columns=['Outlet_Type','Outlet_Identifier'],inplace=True)

In [None]:
BM_test

In [None]:
BM_train['Item_Fat_Content'] = BM_train.Item_Fat_Content.map({'Inedible':1,'Low Fat':2,'Regular':3})

In [None]:
BM_test['Item_Fat_Content'] = BM_test.Item_Fat_Content.map({'Inedible':1,'Low Fat':2,'Regular':3})

In [None]:
BM_train['Item_Type'] = BM_train.Item_Type.map({'Inedibles':1,'Foods':2,'Beverages':3})

In [None]:
BM_test['Item_Type'] = BM_test.Item_Type.map({'Inedibles':1,'Foods':2,'Beverages':3})

In [None]:
BM_train['Outlet_Size'] = BM_train.Outlet_Size.map({'Small':1,'Medium':2,'High':3})

In [None]:
BM_test['Outlet_Size'] = BM_test.Outlet_Size.map({'Small':1,'Medium':2,'High':3})

In [None]:
BM_train['Outlet_Location_Type'] = BM_train.Outlet_Location_Type.map({'Tier 1':1,'Tier 2':2,'Tier 3':3})

In [None]:
BM_test['Outlet_Location_Type'] = BM_test.Outlet_Location_Type.map({'Tier 1':1,'Tier 2':2,'Tier 3':3})

In [None]:
BM_train.head(10)

In [None]:
BM_test.head(10)

# Step 21 All categorical columns have been encoded

# 1) Finding Correlation

In [None]:
d_corr = BM_train.corr()

In [None]:
d_corr

In [None]:
plt.figure(figsize=(15,16))
sns.heatmap(d_corr,annot=True,linewidth=1)
plt.show()

# 2) Visualizing correlation of feature columns with label column.

In [None]:
plt.figure(figsize = (20,8))
BM_train.corr()['Item_Outlet_Sales'].sort_values(ascending = False).drop(['Item_Outlet_Sales']).plot(kind='bar',color = 'c')
plt.xlabel('Features',fontsize=15)
plt.ylabel('Item_Outlet_Sales',fontsize=15)
plt.title('correlation',fontsize = 18)
plt.show()

Item_MRP has the highest positive correlation with Item_Outlet_Sales, followed by Outlet_Type_Supermarket Type3 and Outlet_Identifier_OUT027.

Outlet_Type_Grocery Store has highest negative correlation with Item_Outlet_sales followed by Outlet_Identifier_OUT010 and Outlet_Identifier_OUT019


# 3) Feature Selection


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
X =BM_train.drop(columns=['Item_Outlet_Sales'])
y =BM_train['Item_Outlet_Sales']

In [None]:
scaler= StandardScaler()

In [None]:
scaled_X = scaler.fit_transform(X)

# 4) Checking for Multicollinearity using Variance Inflation Factor

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
vif = pd.DataFrame()

In [None]:
vif["Features"] = X.columns
vif['vif'] = [variance_inflation_factor(scaled_X,i) for i in range(scaled_X.shape[1])]

In [None]:
vif

There is no multicollinearity

In [None]:
# Scale the test features with the scaler that was fitted on the training features X.
# Fitting a fresh StandardScaler on the test set would centre/scale it with different
# statistics than the data the models are trained on.
# Selecting X.columns guarantees the same columns in the same order as during fit.
scaled_X_test = scaler.transform(BM_test[X.columns])

In [None]:
scaled_X_test

# 5) Selecting Kbest Features

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# The target (Item_Outlet_Sales) is continuous, so score features with the univariate
# regression F-test. f_classif is an ANOVA test that would treat every distinct sales
# value as its own class.
from sklearn.feature_selection import f_regression
bestfeat = SelectKBest(score_func = f_regression, k = 22)
fit = bestfeat.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

In [None]:
# Fit the selector on X/y and collect the per-feature scores and the column names.
fit = bestfeat.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# This expression is not the last one in the cell, so its result is not displayed.
dfcolumns.head()
# Put names and scores side by side and give the two columns readable headers.
featureScores = pd.concat([dfcolumns,dfscores],axis = 1)
featureScores.columns = ['Feature', 'Score']
# Show up to 22 features ordered from highest to lowest score.
print(featureScores.nlargest(22,'Score'))

In [None]:
xbest = X.drop(columns=['Outlet_Identifier_OUT018','Outlet_Identifier_OUT045'])

In [None]:
xbest

In [None]:
xbest_ss = scaler.fit_transform(xbest)

In [None]:
x_best_test = BM_test.drop(columns=['Outlet_Identifier_OUT018','Outlet_Identifier_OUT045'])

In [None]:
# Scale the selected test features with the scaler that was fitted on the selected
# training features (xbest). Creating and fitting a new StandardScaler on the test set
# would feed the saved model data on a different scale than it was trained on.
# Selecting xbest.columns guarantees the same columns in the same order as during fit.
scaled_X_test = scaler.transform(x_best_test[xbest.columns])

# The Data to be predicted in Target / Label column is continuous in nature since it is the total value of item Sales from each outlet. Therefore Regression Models will be used.

# Step 22 Regression Model Building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Try the train/test partitions produced by random_state 1..99 and remember the one
# on which a random forest reaches the highest hold-out R2 score.
# The forest itself is seeded with the same value so the search gives the same answer
# on every run; an unseeded forest makes the reported "best" state irreproducible.
# NOTE(review): choosing the split by its test score yields an optimistic estimate;
# treat the cross-validation results further down as the more reliable figure.
maxAcc = 0
maxRS=0
for i in range(1,100):
    x_train,x_test,y_train,y_test = train_test_split(xbest_ss,y,test_size = .33, random_state = i)
    modRF =  RandomForestRegressor(random_state = i)
    modRF.fit(x_train,y_train)
    pred = modRF.predict(x_test)
    acc  = r2_score(y_test,pred)
    if acc>maxAcc:
        maxAcc=acc
        maxRS=i
print(f"Best Accuracy is: {maxAcc} on random_state: {maxRS}")

In [None]:
x_train,x_test,y_train,y_test = train_test_split(xbest_ss,y,test_size = .33, random_state = 78)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_squared_error

In [None]:
rf = RandomForestRegressor()
xg = XGBRegressor()
SV= SVR()
r=Ridge()

# Step 23 Training the Models

In [None]:
rf.fit(x_train,y_train)
xg.fit(x_train,y_train)
SV.fit(x_train,y_train)
r.fit(x_train,y_train)

# Step 24 All models have been trained

# 1)Ridge Regression Model

In [None]:
y_r_pred = r.predict(x_test)

# 2) R2 Score

In [None]:
r2_score(y_test,y_r_pred)

In [None]:
# Mean Squared Error

mean_squared_error(y_test,y_r_pred)

# 3) Random Forest Regression Model

In [None]:
y_rf_pred = rf.predict(x_test)

In [None]:
# R2 Score

r2_score(y_test,y_rf_pred)

In [None]:
# Mean Squared Error

mean_squared_error(y_test,y_rf_pred)

# 4) XGB Regression Model

In [None]:
y_xg_pred = xg.predict(x_test)

In [None]:
# R2 Score

r2_score(y_test,y_xg_pred)

In [None]:
# Mean Squared Error

mean_squared_error(y_test,y_xg_pred)

# 5)Support Vector Regression Model

In [None]:
y_svr_pred = SV.predict(x_test)

In [None]:
# R2 Score

r2_score(y_test,y_svr_pred)

In [None]:
# Mean Squared Error

mean_squared_error(y_test,y_svr_pred)

# Step 25  Model Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 1) Ridge Regression

In [None]:
cross_val_score(r,xbest_ss,y,cv=5).mean()

# 2) Random Forest Regression

In [None]:
cross_val_score(rf,xbest_ss,y,cv=5).mean()

# 3) XGB Regression

In [None]:
cross_val_score(xg,xbest_ss,y,cv=5).mean()

# 4) SV Regression

In [None]:
cross_val_score(SV,xbest_ss,y,cv=5).mean()

# Based on comparing Accuracy Score results with Cross Validation results, it is determined that XGB Regressor is the best model.

# Step 26 Hyper Parameter Tuning

# XGB Regressor

In [None]:
parameter = {'booster':["gbtree","gblinear"],'eta': [0.01,0.1,0.2,0.3],'min_child_weight':np.arange(5),'max_depth':[10,20,40,60,80],'subsample':[0.5,1]}

In [None]:
GridCV = GridSearchCV(XGBRegressor(),parameter,cv=5,n_jobs = -1,verbose = 1)

In [None]:
GridCV.fit(x_train,y_train)

In [None]:
GridCV.best_params_

In [None]:
# Build the final model from the parameters the grid search actually selected.
# Hand-copied values silently go stale whenever the search above is re-run.
Best_mod = XGBRegressor(**GridCV.best_params_)
Best_mod.fit(x_train,y_train)

In [None]:
xgbpred = Best_mod.predict(x_test)

acc = r2_score(y_test,xgbpred)
print(acc*100)

The tuned XGB Regressor achieves an R² score of 57.72% on the hold-out split.

# Step 27 Saving The Model

In [None]:
import joblib
joblib.dump(Best_mod,"BestModelBM.pkl")

# Step 28 Loading The Model

In [None]:
mod=joblib.load("BestModelBM.pkl")

In [None]:
print(mod.predict(scaled_X_test)) #predicting on the scaled test features with the reloaded model