<a href="https://colab.research.google.com/github/nikitha-ramchander/sales-prediction/blob/main/SalesMachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [113]:
# Mount Google Drive into the Colab runtime at /content/drive so the CSV stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [114]:
# Imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import standard scaler and knn regressor 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
# Import linear regression model and regression metrics  
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [115]:
# Read the cleaned sales CSV from the mounted Drive folder, then preview the first five rows
sales = pd.read_csv(
    '/content/drive/MyDrive/Coding Dojo/Project/salesdatacleaning.csv'
)
sales.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,No_Weight_Flag,No_Size_Flag
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,0,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,0,0
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,0,0
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,High,Tier 3,Grocery Store,732.38,0,1
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,0,0


# Prepare Data Before Machine Learning Model
>Let's classify each variable as nominal, ordinal, or numerical so that the categorical ones can be encoded appropriately.

* Item Identifier: nominal
* Item Weight: numerical 
* Item Fat Content: nominal 
* Item Visibility: numerical 
* Item Type: nominal 
* Item MRP: numerical
* Outlet Identifier: nominal 
* Establishment Year: ordinal
* Outlet Size: ordinal
* Location Type: ordinal 
* Outlet Type: ordinal
* Item Sales: numerical  
* No Weight Flag: nominal
* No Size Flag: nominal

# Columns of Ordinal Variables.

In [117]:
# Count how many rows fall under each outlet establishment year
sales.loc[:, 'Outlet_Establishment_Year'].value_counts()

1985    1463
1987     932
1999     930
1997     930
2004     930
2002     929
2009     928
2007     926
1998     555
Name: Outlet_Establishment_Year, dtype: int64

In [118]:
# Rank the establishment years from oldest (0) to newest (8).
# NOTE(review): the ranks discard the real gaps between years (e.g. 1987 -> 1997 is one step).
years = {
    year: rank
    for rank, year in enumerate([1985, 1987, 1997, 1998, 1999, 2002, 2004, 2007, 2009])
}

In [119]:
# Replace each establishment year with its rank from `years`.
# The guard skips the mapping once every value is already a rank: without it, re-running
# this cell would look the ranks up as keys in `years`, find none, and turn the whole
# column into NaN.
if not sales['Outlet_Establishment_Year'].isin(list(years.values())).all():
    sales['Outlet_Establishment_Year'] = sales['Outlet_Establishment_Year'].map(years)
sales.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,No_Weight_Flag,No_Size_Flag
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,4,Medium,Tier 1,Supermarket Type1,3735.138,0,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,8,Medium,Tier 3,Supermarket Type2,443.4228,0,0


In [120]:
# Count how many rows fall under each outlet size
sales.loc[:, 'Outlet_Size'].value_counts()

Medium    4648
Small     2388
High      1487
Name: Outlet_Size, dtype: int64

In [121]:
# Order the outlet sizes from smallest (0) to largest (2)
outletsize = {size: rank for rank, size in enumerate(['Small', 'Medium', 'High'])}

In [122]:
# Replace each outlet size label with its rank from `outletsize`.
# The guard skips the mapping once every value is already a rank: without it, re-running
# this cell would look the ranks up as keys in `outletsize`, find none, and turn the
# whole column into NaN.
if not sales['Outlet_Size'].isin(list(outletsize.values())).all():
    sales['Outlet_Size'] = sales['Outlet_Size'].map(outletsize)
sales.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,No_Weight_Flag,No_Size_Flag
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,4,1,Tier 1,Supermarket Type1,3735.138,0,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,8,1,Tier 3,Supermarket Type2,443.4228,0,0


In [123]:
# Count how many rows fall under each outlet location tier
sales.loc[:, 'Outlet_Location_Type'].value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

In [124]:
# Encode the location tiers as consecutive integers, starting at 0 for 'Tier 1'
location = dict(zip(['Tier 1', 'Tier 2', 'Tier 3'], range(3)))

In [125]:
# Replace each location tier label with its integer code from `location`.
# The guard skips the mapping once every value is already a code: without it, re-running
# this cell would look the codes up as keys in `location`, find none, and turn the whole
# column into NaN.
if not sales['Outlet_Location_Type'].isin(list(location.values())).all():
    sales['Outlet_Location_Type'] = sales['Outlet_Location_Type'].map(location)
sales.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,No_Weight_Flag,No_Size_Flag
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,4,1,0,Supermarket Type1,3735.138,0,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,8,1,2,Supermarket Type2,443.4228,0,0


In [126]:
# Count how many rows fall under each outlet type
sales.loc[:, 'Outlet_Type'].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [127]:
# Integer code for each outlet type.
# NOTE(review): Grocery Store is coded 3, after the three supermarket types. It is not
# evident that outlet type has a natural order, so confirm this ranking is intended
# (otherwise one-hot encoding may be more appropriate).
outlettype = {
    kind: code
    for code, kind in enumerate(
        ['Supermarket Type1', 'Supermarket Type2', 'Supermarket Type3', 'Grocery Store']
    )
}

In [128]:
# Replace each outlet type label with its integer code from `outlettype`.
# The guard skips the mapping once every value is already a code: without it, re-running
# this cell would look the codes up as keys in `outlettype`, find none, and turn the
# whole column into NaN.
if not sales['Outlet_Type'].isin(list(outlettype.values())).all():
    sales['Outlet_Type'] = sales['Outlet_Type'].map(outlettype)
sales.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,No_Weight_Flag,No_Size_Flag
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,4,1,0,0,3735.138,0,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,8,1,2,1,443.4228,0,0


# Columns of Nominal Variables

In [129]:
# Count the distinct (non-missing) item identifiers. A very high count means the column
# behaves almost like a row label, which is the reason given for dropping it.
len(sales['Item_Identifier'].dropna().unique())

1559

In [130]:
# Count the distinct (non-missing) outlet identifiers.
# NOTE(review): this column has only 10 distinct values, so it is low-cardinality rather
# than "too unique". It is still dropped in the next cell; confirm that is intended
# (it could be one-hot encoded instead).
len(sales['Outlet_Identifier'].dropna().unique())

10

In [131]:
# Drop the two identifier columns before modelling.
# - Reassigning (instead of inplace=True) makes the data flow explicit.
# - errors='ignore' lets the cell be re-run after the columns are already gone; without it
#   a second run raises KeyError.
sales = sales.drop(columns=['Item_Identifier', 'Outlet_Identifier'], errors='ignore')
sales.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,No_Weight_Flag,No_Size_Flag
0,9.3,Low Fat,0.016047,Dairy,249.8092,4,1,0,0,3735.138,0,0
1,5.92,Regular,0.019278,Soft Drinks,48.2692,8,1,2,1,443.4228,0,0
2,17.5,Low Fat,0.01676,Meat,141.618,4,1,0,0,2097.27,0,0
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,3,2,2,3,732.38,0,1
4,8.93,Low Fat,0.0,Household,53.8614,1,2,2,0,994.7052,0,0


In [132]:
# One-hot encode the nominal columns; drop_first=True omits the first category of each
sales = pd.get_dummies(
    data=sales,
    columns=['Item_Fat_Content', 'Item_Type', 'No_Weight_Flag', 'No_Size_Flag'],
    drop_first=True,
)
sales.head(n=5)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,No_Weight_Flag_1,No_Size_Flag_1
0,9.3,0.016047,249.8092,4,1,0,0,3735.138,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,5.92,0.019278,48.2692,8,1,2,1,443.4228,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,17.5,0.01676,141.618,4,1,0,0,2097.27,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,19.2,0.0,182.095,3,2,2,3,732.38,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,8.93,0.0,53.8614,1,2,2,0,994.7052,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


Now that we have categorized each variable we can run this data frame through a few Machine Learning models! 

# Linear Regression 
>Linear regression models the relationship between a features matrix and a target vector by fitting a linear equation to the observed data. 

In [136]:
# Features matrix: every column except the target. Target vector: the sales column.
X = sales.drop('Item_Outlet_Sales', axis='columns')
y = sales.loc[:, 'Item_Outlet_Sales']

In [143]:
# Show (rows, columns) of the features matrix
np.shape(X)

(8523, 25)

In [144]:
# Create the linear regression model; fit_intercept=True makes it learn a constant term too
reg = LinearRegression(fit_intercept=True)

In [145]:
# Train the linear model on the full features matrix and target vector
reg.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [146]:
# Keep the linear model's predictions for the same rows it was fitted on
preds = reg.predict(X=X)

# R^2: Coefficient of Determination

In [147]:
# R^2 of the linear model on the data it was fitted on (the value reg.score(X, y) returns)
r2_score(y, reg.predict(X))

0.5093577284907114

>Interpretation: We're trying to find the best-fit line for our data. About 51% of the variability in the target (sales) can be explained by the features. R^2 is a measure of linear fit: how well this line captures the trend. 

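>Units: R^2 is unitless. Its maximum is 1 (a perfect fit); 0 means the model does no better than always predicting the mean, and it can be negative for a model that does worse than that. The closer to 1, the better the fit.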

# MAE: Mean Absolute Error 

In [148]:
# Mean absolute error between the actual sales and the saved predictions
mean_absolute_error(y_true=y, y_pred=preds)

890.8609547239795

>Interpretation: This is the most human interpretable metric because it shows on average how wrong the model is. In this case on average the model is wrong by 890 dollars. Since the spread of sales is from 0 to 10,000 it is not terrible that you are off by 890 dollars because it is such a large spread of sales. 

>Units: In dollars, on average it is off by so many dollars.

# MSE: Mean Squared Error 

In [149]:
# Mean squared error between the actual sales and the saved predictions
mean_squared_error(y_true=y, y_pred=preds)

1428651.8022370473

Interpretation: Not the most human interpretable metric. Essentially it "punishes" larger errors, which tends to be useful in the real world.

Units: It is in squared dollars.

# RMSE: Root Mean Squared Error

In [151]:
# Root mean squared error: the square root of the MSE, back in the target's own units
np.sqrt(mean_squared_error(y_true=y, y_pred=preds))

1195.262231578095

>Interpretation: This is human interpretable because taking the square root of the MSE brings it back into the original units that we can understand. The RMSE is 1195 dollars, which is more than the MAE of 890 dollars. The reason is that RMSE penalizes large errors more heavily than MAE does, so the gap between the two indicates that the model is significantly off on quite a few data points (outliers).

>Units: Dollar amount.

# Formula for Linear Regression Line

In [157]:
# Intercept (constant term) of the fitted linear regression
reg.intercept_

-220.52825451931676

In [158]:
# Fitted coefficients of the linear regression, one per feature column of X
reg.coef_

array([-4.24317195e-01, -1.12302442e+03,  1.55505950e+01,  1.95919126e+01,
       -1.60593527e+01,  4.56058642e+02, -9.06841542e+02,  4.99868755e+01,
        1.36946865e+01, -3.39914923e+00,  3.35251722e+01, -4.45382181e+01,
       -1.39064183e+01,  3.68117020e+01,  4.02258435e+01, -7.60108426e+00,
       -2.85457804e+01,  1.37726355e+01, -5.79598556e+01,  1.71461434e+02,
       -3.33485287e+00, -2.77199779e+01,  4.10159618e+01,  2.17193905e+03,
       -9.14735861e+01])

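Interpretation: The fitted line predicts sales as the intercept (about -220.53) plus the sum of each feature value multiplied by its coefficient; there is one coefficient per feature column, 25 in total.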

# KNN Regression 
>K-nearest neighbors is a model that uses the "K" most similar observations in order to make a prediction. So, it approximates the association between independent variables and the continuous outcome by averaging the observations in the same neighbourhood.

In [152]:
# Rebuild the features matrix (all columns except the target) and the target vector
X = sales.drop('Item_Outlet_Sales', axis='columns')
y = sales.loc[:, 'Item_Outlet_Sales']

In [153]:
# Standardize the features (zero mean, unit variance per column).
# X is the DataFrame built in the previous cell; the scaler returns the scaled values
# (a NumPy array by default), which replace X from here on.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [154]:
# Fit a k-nearest-neighbours regressor that predicts from the 3 closest training rows
knn_reg = KNeighborsRegressor(n_neighbors=3).fit(X, y)
knn_reg

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform')

In [155]:
# Predict sales with the KNN model for the same rows it was fitted on
preds = knn_reg.predict(X=X)

# R^2

In [156]:
# R^2 of the KNN model on the data it was fitted on (the value knn_reg.score(X, y) returns)
r2_score(y, knn_reg.predict(X))

0.7202013386878403

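> Interpretation: Measures the strength of this model. Each prediction is a local interpolation of the targets of the nearest neighbors in the training set, and the resulting score is 72%. This is higher than the score of the linear regression model, but both scores were computed on the same data the models were fitted on, so they are optimistic; a held-out test set is needed for a fair comparison.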

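> Units: R^2 is unitless. Its maximum is 1 (a perfect fit); 0 means the model does no better than always predicting the mean, and it can be negative for a model that does worse than that. The closer to 1, the better the fit.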