<a href="https://colab.research.google.com/github/paulf35/cd-ds-productsalespredictions/blob/main/ProductSalesPredictions_Modeling_Eval_PaulFoy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Modeling and Evaluation

---


Author: Paul Foy

# Data Modeling and Evaluation
In this notebook, I will finalize the sales predictions by creating and comparing different ML models. The goal is to help the retailer understand the properties of products and outlets that play crucial roles in predicting sales.



# Import required libraries

In [3]:
## Pandas
import pandas as pd
## Numpy
import numpy as np
## MatPlotLib
import matplotlib.pyplot as plt

## Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

## Models
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

## Regression Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


## Set global scikit-learn configuration
from sklearn import set_config
## Display estimators as a diagram when they are the output of a cell
set_config(display='diagram') # accepted values: 'text' or 'diagram'

from IPython.core.display import clear_output

# Warnings
import warnings

## Silence every warning for the rest of the session.
## NOTE(review): this also hides deprecation warnings raised by pandas and
## scikit-learn, so upcoming API changes can go unnoticed -- consider a narrower filter.
warnings.filterwarnings('ignore')

# Make transformers return pandas DataFrames instead of NumPy arrays
# (set_config is already imported above; the repeated import is harmless)
from sklearn import set_config
set_config(transform_output='pandas')



# Create custom functions

In [4]:
# Helper function
# Accepts the true targets and the predictions, plus an optional label.
# Calculates MAE, MSE, RMSE and R^2, prints them, and can return them as a dictionary.

def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for one set of regression predictions.

  Parameters
  ----------
  y_true : array-like
      Observed target values.
  y_pred : array-like
      Predicted target values, aligned with ``y_true``.
  label : str, default ''
      Text shown in the printed header and stored under the 'Label' key.
  verbose : bool, default True
      When truthy, print the four metrics under a labelled header.
  output_dict : bool, default False
      When truthy, return the metrics as a dictionary.

  Returns
  -------
  dict or None
      ``{'Label', 'MAE', 'MSE', 'RMSE', 'R^2'}`` when ``output_dict`` is
      truthy, otherwise ``None``.
  """
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Derive RMSE from the MSE already computed. The former
  # mean_squared_error(..., squared=False) call relies on an argument that
  # newer scikit-learn releases deprecate and then remove. For a single
  # target column the two are mathematically identical.
  rmse = np.sqrt(mse)
  r_squared = r2_score(y_true, y_pred)

  if verbose:
    # Print result with label and header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")

  if output_dict:
    return {'Label':label, 'MAE':mae,
            'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}

# Helper function
# Accepts the model, X_train, y_train, X_test, and y_test.
# Obtains the predictions from the model for both the training and test data.
# Passes the true and predicted values to regression_metrics to obtain all the metrics for both the training and test data.
# Prints the results (optional, default True).
# Returns the results as a dataframe (optional, default False).

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False,model_name =''):
  """Evaluate a fitted regressor on the training and the test data.

  Parameters
  ----------
  reg : fitted estimator
      Any object exposing ``predict(X)``.
  X_train, y_train : training features and targets.
  X_test, y_test : test features and targets.
  verbose : bool, default True
      When truthy, print the metrics of both splits.
  output_frame : bool, default False
      When truthy, return the metrics as a DataFrame.
  model_name : str, default ''
      Prefix for the row labels ('<model_name> Training Data' / '... Test Data').

  Returns
  -------
  pandas.DataFrame or None
      One row per split, indexed by label and rounded to 3 decimals, when
      ``output_frame`` is truthy; otherwise ``None``.
  """
  # strip() avoids a leading space in the label when no model_name is given
  train_label = f"{model_name} Training Data".strip()
  test_label = f"{model_name} Test Data".strip()

  # Get predictions and metrics for the training data
  y_train_pred = reg.predict(X_train)
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label=train_label)

  # Blank line between the two printed reports; stay silent when verbose is off
  if verbose:
    print()

  # Get predictions and metrics for the test data
  y_test_pred = reg.predict(X_test)
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                    output_dict=output_frame,
                                    label=test_label)

  # Store results in a dataframe if output_frame is set
  if output_frame:
    results_df = pd.DataFrame([results_train, results_test])
    # Use the label as the index and drop the index name for a cleaner display
    results_df = results_df.set_index('Label')
    results_df.index.name = None
    return results_df.round(3)


# Pandas Display Configurations


In [5]:
## Display all columns (None removes the column limit).
## The full option name is used; abbreviated keys only work while they match exactly one option.
pd.set_option('display.max_columns', None)

# Load the data

In [6]:
# Mount Google Drive so the dataset stored there can be read from this runtime
# (google.colab is only available when the notebook runs inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Load the sales data from the mounted Google Drive
# NOTE(review): this absolute Drive path is specific to the author's account;
# adjust fname when running the notebook elsewhere.
fname = "/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/Data/sales_predictions_2023.csv"
df = pd.read_csv(fname)

# Inspect and Fix Data Issues


In [8]:
# Preview the first rows to check that the file loaded as expected
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [9]:
# Work on a copy so the originally loaded df stays untouched
eda_ml = df.copy()

## Dropping Item_Identifier
Item_Identifier has high cardinality and isn't needed to make business decisions.


In [10]:
# Remove the Item_Identifier column.
# drop() returns a new frame and errors='ignore' keeps the cell safe to re-run
# (the in-place drop raised a KeyError on a second execution).
eda_ml = eda_ml.drop(columns='Item_Identifier', errors='ignore')

# Show summary statistics of the numeric columns that remain
eda_ml.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


## Check for Duplicated, Missing, or Inconsistent Data

In [11]:
# Count fully duplicated rows (0 means every row is unique)
eda_ml.duplicated().sum()

0

In [12]:
# Display the total number of missing values across all columns
eda_ml.isna().sum().sum()

3873

In [13]:
# Display descriptive statistics for the numeric columns
eda_ml.describe(include='number')

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [14]:
# Display descriptive statistics for the non-numeric (categorical) columns
eda_ml.describe(exclude='number')

Unnamed: 0,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,6113,8523,8523
unique,5,16,10,3,3,4
top,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,5089,1232,935,2793,3350,5577


## Fix data inconsistencies

Inconsistencies found and fixed:
- Item_Fat_Content
  - "LF" and "low fat" changed to "Low Fat"
  - "reg" changed to "Regular"
- Outlet_Size
  - "High" changed to "Large"

In [15]:
# Print the value counts of every column to spot inconsistent category labels
for col in eda_ml.columns:
  print(f'Count for {col} is: \n{eda_ml[col].value_counts()}\n')

Count for Item_Weight is: 
12.150    86
17.600    82
13.650    77
11.800    76
15.100    68
          ..
7.275      2
7.685      1
9.420      1
6.520      1
5.400      1
Name: Item_Weight, Length: 415, dtype: int64

Count for Item_Fat_Content is: 
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

Count for Item_Visibility is: 
0.000000    526
0.076975      3
0.162462      2
0.076841      2
0.073562      2
           ... 
0.013957      1
0.110460      1
0.124646      1
0.054142      1
0.044878      1
Name: Item_Visibility, Length: 7880, dtype: int64

Count for Item_Type is: 
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks          

In [16]:
# Fix inconsistencies in Item_Fat_Content
# First display the labels currently present in the column and their counts
eda_ml['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [17]:
# Standardize inconsistent labels with a dictionary keyed by column, so each
# replacement only touches the column it was meant for (a flat mapping is
# applied to every column of the frame).
eda_ml = eda_ml.replace({
    'Item_Fat_Content': {'low fat': 'Low Fat', 'reg': 'Regular', 'LF': 'Low Fat'},
    'Outlet_Size': {'High': 'Large'},
})

In [18]:
# Display the value counts again to confirm only the standardized labels remain
print(eda_ml['Item_Fat_Content'].value_counts())

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64


In [19]:
# Display the value counts from Outlet_Size ("High" should now appear as "Large")
print(eda_ml['Outlet_Size'].value_counts())

Medium    2793
Small     2388
Large      932
Name: Outlet_Size, dtype: int64


# Determine Ordinal features
Ordinal features: Outlet_Size, Item_Fat_Content, Outlet_Location_Type

Check ordinal categories

In [20]:
# Categories present in Outlet_Size (needed to define the ordinal order)
eda_ml['Outlet_Size'].value_counts()

Medium    2793
Small     2388
Large      932
Name: Outlet_Size, dtype: int64

In [21]:
# Categories present in Item_Fat_Content (needed to define the ordinal order)
eda_ml['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [22]:
# Categories present in Outlet_Location_Type (needed to define the ordinal order)
eda_ml['Outlet_Location_Type'].value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

# Split the Data (Validation Split)

In [23]:
# Split features (X) and target (y); we are predicting the sales of an item at an outlet
target = 'Item_Outlet_Sales'
X = eda_ml.drop(columns=[target]).copy()
y = eda_ml[target].copy()

# Hold out 30% of the rows as the test set; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30 , random_state=42)

In [24]:
# (rows, columns) of the training features
X_train.shape

(5966, 10)

In [25]:
# (rows, columns) of the test features
X_test.shape

(2557, 10)

In [26]:
# Column dtypes decide which preprocessing pipeline each feature is routed to
X_train.dtypes

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object

# Create Pipelines and Tuples for Groups of Columns

Pipeline types:
- Numeric: Item_Weight, Item_Visibility, Item_MRP, Outlet_Establishment_Year
- Nominal: Item_Type, Outlet_Identifier, Outlet_Type
- Ordinal: Outlet_Size, Item_Fat_Content, Outlet_Location_Type

## Numeric

In [27]:
# PREPROCESSING PIPELINE FOR NUMERIC DATA

# Save the names of the numeric columns
num_cols = X_train.select_dtypes('number').columns

# Transformers: fill missing values with the column median, then standardize
scaler = StandardScaler()
med_imputer = SimpleImputer(strategy = 'median')

# Pipeline: impute first, then scale
num_pipeline = make_pipeline(med_imputer, scaler)
num_pipeline

In [28]:
# Numeric tuple: (name, transformer, columns), later passed to ColumnTransformer
numeric_tuple = ('number',num_pipeline, num_cols)

## Ordinal

In [29]:
# PREPROCESSING PIPELINE FOR ORDINAL DATA

# Save list of ordinal column names
ordinal_cols = ['Outlet_Size', 'Item_Fat_Content','Outlet_Location_Type']

# Ordered category lists (lowest to highest), one per column in ordinal_cols

Outlet_Size_list = ['Small','Medium','Large']
Item_Fat_Content_list = ['Low Fat','Regular']
Outlet_Location_Type_list = ['Tier 1','Tier 2','Tier 3']

# Transformers

# Named ordinal_encoder (not `ord`) so Python's built-in ord() is not shadowed
# for the rest of the notebook. The category lists must follow the order of
# ordinal_cols.
ordinal_encoder = OrdinalEncoder(categories=[Outlet_Size_list,
                                             Item_Fat_Content_list,
                                             Outlet_Location_Type_list])
# Missing categories are filled with the most frequent value before encoding
freq_imputer = SimpleImputer(strategy='most_frequent')

# Scale the encoded ranks so they are on a scale comparable to the
# standardized numeric features
scaler2 = StandardScaler()

# Pipeline: impute -> encode -> scale
ord_pipeline = make_pipeline(freq_imputer, ordinal_encoder, scaler2)

ord_pipeline

In [30]:
# Ordinal tuple: (name, transformer, columns), later passed to ColumnTransformer
ord_tuple = ('ordinal',ord_pipeline, ordinal_cols)

## Nominal


In [31]:
# PREPROCESSING PIPELINE FOR ONE-HOT-ENCODED DATA

# Save list of nominal column names: every object column that is not ordinal
nominal_cols = X_train.select_dtypes('object').drop(columns=ordinal_cols).columns

# Transformers

# Missing values become their own 'missing' category instead of being guessed
missing_imputer = SimpleImputer(strategy='constant', fill_value='missing')
# Dense output; categories not seen during fit are ignored at transform time
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Pipeline: impute first, then one-hot encode
nom_pipeline = make_pipeline(missing_imputer , ohe)
nom_pipeline

In [32]:
# Nominal tuple: (name, transformer, columns), later passed to ColumnTransformer
ohe_tuple = ('categorical',nom_pipeline, nominal_cols)

## Create Column Transformer to Apply Different Preprocessing to Different Columns

In [33]:
# Instantiate the ColumnTransformer that routes each group of columns to its
# own pipeline; columns not listed in any tuple are dropped (remainder='drop')
col_transformer = ColumnTransformer([numeric_tuple,
                                       ord_tuple,
                                       ohe_tuple],
                                       remainder='drop', verbose_feature_names_out=False)
col_transformer

# Compare and Evaluate Different Models

# Linear Regression Model
1. Your first task is to build a linear regression model to predict sales.

  - Build a linear regression model.
  - Use the custom evaluation function to get the metrics for your model (on training and test data).
  - Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?

## Create and Evaluate the Linear Regression Model

In [34]:
# Instantiate a linear regression model
linreg = LinearRegression()

# Combine the preprocessing ColumnTransformer and the linear regression model in a Pipeline
linreg_pipe = make_pipeline(col_transformer, linreg)
# Display the (not yet fitted) pipeline
linreg_pipe

## Fit the Column Transformer on the Training Data Only

In [35]:
# Fit the whole pipeline (preprocessing + linear regression) on the training data only
linreg_pipe.fit(X_train, y_train)

## Evaluation metrics


In [56]:
# Obtain the model evaluation using the custom function
scores_lr = evaluate_regression(linreg_pipe, X_train, y_train, X_test, y_test, verbose= True, output_frame=True, model_name= 'Linear Regression')

------------------------------------------------------------
Regression Metrics: Linear Regression Training Data
------------------------------------------------------------
- MAE = 847.663
- MSE = 1,298,669.325
- RMSE = 1,139.592
- R^2 = 0.561

------------------------------------------------------------
Regression Metrics: Linear Regression Test Data
------------------------------------------------------------
- MAE = 810.386
- MSE = 1,210,321.266
- RMSE = 1,100.146
- R^2 = 0.568


## Learnings
 - Both R2 values are low (0.561 on the training data and 0.568 on the test data) and close to each other, which shows that this model slightly underfits the data.

# Default Random Forest Model
Your second task is to build a Random Forest model to predict sales.

- Build a default Random Forest model.
- Use the custom evaluation function to get the metrics for your model (on training and test data).
- Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?
- Compare this model's performance to the linear regression model: which model has the best test scores?

## Instantiate the default Random Forest Model

In [37]:
# Instantiate a random forest with default hyperparameters;
# random_state makes the fitted model reproducible
rf = RandomForestRegressor(random_state = 42)
# Model pipeline: shared preprocessing followed by the random forest
rf_pipe = make_pipeline(col_transformer, rf)

# Fit the model pipeline on the training data only
rf_pipe.fit(X_train, y_train)

## Evaluation metrics

In [55]:
# Use the custom function to evaluate the default model on training and test data

scores_rf = evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test, verbose= True, output_frame=True, model_name= 'Default Random Forest')

------------------------------------------------------------
Regression Metrics: Default Random Forest Training Data
------------------------------------------------------------
- MAE = 296.841
- MSE = 182,957.964
- RMSE = 427.736
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Default Random Forest Test Data
------------------------------------------------------------
- MAE = 778.649
- MSE = 1,249,723.025
- RMSE = 1,117.910
- R^2 = 0.554


## Learnings
- Based on the R2 values, this model is highly overfit (high variance): R2 is 0.938 on the training data but only 0.554 on the test data.
- Because the default Random Forest has such high variance, I believe the linear regression model performs better. However, neither model performs well, because both test R2 values are low.

# Tuned Random Forest Model

- Use GridSearchCV to tune at least two hyperparameters for a Random Forest model.
- After determining the best parameters from your GridSearch, fit and evaluate a final best model on the entire training set (no folds).
- Compare your tuned model to your default Random Forest: did the performance improve?

In [40]:
# List the pipeline's parameters to find the exact key names needed for tuning
rf_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('number',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
          'Outlet_Establishment_Year'],
         dtype='object')),
                                   ('ordinal',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ordinalencoder',
                                                     Ordi...
                                                     StandardScaler())]),
                           

## Define tuning parameter Values

In [41]:
# Define param grid with options to try
# Keys follow the '<pipeline step name>__<parameter>' pattern
params = {'randomforestregressor__max_depth': [None,10,15,20],
          'randomforestregressor__n_estimators':[10,100,150,200],
          }


##Instantiate GridSearchCV

In [42]:
# Instantiate the grid search: 3-fold cross-validation over every combination
# in params, running on all available cores (n_jobs=-1)
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 3, verbose=1)
# Fit the grid search on the training data only
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [43]:
# Obtain the parameter combination with the best cross-validated score
gridsearch.best_params_

{'randomforestregressor__max_depth': 10,
 'randomforestregressor__n_estimators': 200}

## Evaluation Metrics for Tuned Random Forest Model

In [57]:
# Retrieve the best model found by the grid search and evaluate it.
# No refit happens in this cell: with GridSearchCV's default refit=True,
# best_estimator_ has already been refit on the full training set.
best_rf = gridsearch.best_estimator_
rft_best_scores = evaluate_regression(best_rf, X_train, y_train, X_test, y_test, verbose= True, output_frame=True, model_name= 'Tuned Random Forest Model')

------------------------------------------------------------
Regression Metrics: Tuned Random Forest Model Training Data
------------------------------------------------------------
- MAE = 637.369
- MSE = 806,900.857
- RMSE = 898.277
- R^2 = 0.727

------------------------------------------------------------
Regression Metrics: Tuned Random Forest Model Test Data
------------------------------------------------------------
- MAE = 748.389
- MSE = 1,154,142.731
- RMSE = 1,074.310
- R^2 = 0.588


In [60]:
# Show the three score tables together for a side-by-side comparison
display(scores_lr, scores_rf, rft_best_scores)

Unnamed: 0,MAE,MSE,RMSE,R^2
Linear Regression Training Data,847.663,1298669.325,1139.592,0.561
Linear Regression Test Data,810.386,1210321.266,1100.146,0.568


Unnamed: 0,MAE,MSE,RMSE,R^2
Default Random Forest Training Data,296.841,182957.964,427.736,0.938
Default Random Forest Test Data,778.649,1249723.025,1117.91,0.554


Unnamed: 0,MAE,MSE,RMSE,R^2
Tuned Random Forest Model Training Data,637.369,806900.857,898.277,0.727
Tuned Random Forest Model Test Data,748.389,1154142.731,1074.31,0.588


## Learnings
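- The tuned model performs better than the default model on the test data (R2 of 0.588 vs. 0.554). It is also far less overfit: the gap between training and test R2 is much smaller than for the default model, at the cost of only slightly more bias (a lower training R2).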