<a href="https://colab.research.google.com/github/ob3dd/Prediction-of-Product-Sales/blob/main/Modeling_and_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Import Libraries**

In [1]:
## Typical Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Modeling & preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

##**Reloading the data**

In [2]:
fpath = "/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/sales_predictions_2023.csv"

In [3]:
df_sales = pd.read_csv(fpath)
df_sales.head()
df_sales.info

<bound method DataFrame.info of      Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0              FDA15        9.300          Low Fat         0.016047   
1              DRC01        5.920          Regular         0.019278   
2              FDN15       17.500          Low Fat         0.016760   
3              FDX07       19.200          Regular         0.000000   
4              NCD19        8.930          Low Fat         0.000000   
...              ...          ...              ...              ...   
8518           FDF22        6.865          Low Fat         0.056783   
8519           FDS36        8.380          Regular         0.046982   
8520           NCJ29       10.600          Low Fat         0.035186   
8521           FDN46        7.210          Regular         0.145221   
8522           DRG01       14.800          Low Fat         0.044878   

                  Item_Type  Item_MRP Outlet_Identifier  \
0                     Dairy  249.8092            OUT049 

In [4]:
df_model = df_sales.copy()

In [5]:
df_model.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


##Performing Preprocessing Steps

In [6]:
# Checking for Duplicates
df_model.duplicated().sum()

0

In [7]:
# Checking missing values
df_model.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [8]:
# Identifying missing values.
missing_values = df_model.isnull().any(axis=0)

In [9]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [10]:
# Finding and fixing any inconsistent categories of data.
# the "Item Fat Column" was inconsistent
item_fat_column = 'Item_Fat_Content'
item_counts = df_sales[item_fat_column].value_counts()

In [11]:
# replacing the "LF, low fat" inconsistencies with 'Low Fat' so they can be consistent.
df_sales['Item_Fat_Content'] = df_sales['Item_Fat_Content'].replace({'LF':"Low Fat",'low fat':'Low Fat'})
df_sales['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    2889
reg         117
Name: Item_Fat_Content, dtype: int64

In [12]:
# repeating the same process for the "regular"
df_sales['Item_Fat_Content'] = df_sales['Item_Fat_Content'].replace({'regular':"Regular",'reg':'Regular'})
df_sales['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [13]:
# printing the column
print(df_model['Item_Fat_Content'])

0       Low Fat
1       Regular
2       Low Fat
3       Regular
4       Low Fat
         ...   
8518    Low Fat
8519    Regular
8520    Low Fat
8521    Regular
8522    Low Fat
Name: Item_Fat_Content, Length: 8523, dtype: object


##Defining X and y

In [14]:
## Define X and y
target = 'Item_Outlet_Sales'

X = df_model.drop(columns=target).copy()
y = df_model[target].copy()
X.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


##Dropping Unwanted Columns

Here we are going to drop all "Outlet" columns due to these being in different units in the column.
To ensure consistency in the predictions, all columns need to have the same units for the entire column.

In [15]:
## Drop unwanted/inappropriate columns
bad_cols = ['Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size',	'Outlet_Location_Type',	'Outlet_Type']
X = X.drop(columns = bad_cols)
X.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095
4,NCD19,8.93,Low Fat,0.0,Household,53.8614


##Train-Test-Split

In [16]:
# Perfoming a train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

##Making a Preprocessing Pipeline

In [17]:
# Creating a categorical data selector
cat_selector = make_column_selector(dtype_include = 'object')
cat_selector(X_train)

['Item_Identifier', 'Item_Fat_Content', 'Item_Type']

In [18]:
# Creating a pipline for handling categorical data

impute_cat = SimpleImputer(strategy='most_frequent')
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

cat_pipe = make_pipeline(impute_cat,encoder)
cat_pipe

In [19]:
## test cat_pipe (optional)
cat_pipe.fit_transform(X_train[cat_selector(X_train)])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Creating a numeric data selector
num_selector = make_column_selector(dtype_include='number')
num_selector(X_train)

['Item_Weight', 'Item_Visibility', 'Item_MRP']

In [21]:
# Creating scaler object to scale the data
scaler = StandardScaler()

In [22]:
## test num_pipe (optional)
scaler.fit_transform(X_train[num_selector(X_train)])

array([[ 0.74311896, -0.71277507,  1.82810922],
       [ 0.50587592, -1.29105225,  0.60336888],
       [-0.11958298,  1.81331864,  0.24454056],
       ...,
       [ 1.01271331, -0.92052713,  1.52302674],
       [ 1.60582089, -0.2277552 , -0.38377708],
       [ 0.74311896, -0.95867683, -0.73836105]])

In [23]:
## Combine into 1 column transformer
preprocessor = make_column_transformer((cat_pipe,cat_selector),
                                       (scaler,num_selector))
preprocessor

In [24]:
## Test Col Transformer (optional)
preprocessor.fit_transform(X_train)

array([[ 0.        ,  0.        ,  0.        , ...,  0.74311896,
        -0.71277507,  1.82810922],
       [ 0.        ,  0.        ,  0.        , ...,  0.50587592,
        -1.29105225,  0.60336888],
       [ 0.        ,  0.        ,  0.        , ..., -0.11958298,
         1.81331864,  0.24454056],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.01271331,
        -0.92052713,  1.52302674],
       [ 0.        ,  0.        ,  0.        , ...,  1.60582089,
        -0.2277552 , -0.38377708],
       [ 0.        ,  0.        ,  0.        , ...,  0.74311896,
        -0.95867683, -0.73836105]])

In [25]:
type(preprocessor)

sklearn.compose._column_transformer.ColumnTransformer