<a href="https://colab.research.google.com/github/mikemenj/sales-predictions/blob/main/Project_1_Part_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1 Part 5
Student: Michael Menjares

# Task

- Before splitting your data, you can drop duplicates and fix inconsistencies in categorical data.* (*There is a way to do this after the split, but for this project, you may perform this step before the split)
- Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.
- Perform a train test split
- Create a preprocessing object to prepare the dataset for Machine Learning
- Make sure your imputation of missing values occurs after the train test split using SimpleImputer.

# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, \
OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

from sklearn import set_config
set_config(display='diagram')

# Load Data

In [7]:
def csv_link(link):
  """Convert a Google Drive share link into a direct-download URL.

  Rewrites ``/file/d/<id>`` to ``/uc?id=<id>`` and drops a trailing
  ``/view`` or ``/edit`` segment together with any query string or
  fragment that follows it (e.g. ``?usp=sharing``, ``?usp=drive_link``).

  Parameters
  ----------
  link : str
      Share link such as ``https://drive.google.com/file/d/<id>/view``.

  Returns
  -------
  str
      The rewritten URL, suitable for passing to ``pd.read_csv``.
  """
  # replace /file/d/ with /uc?id=
  link = link.replace('/file/d/', '/uc?id=')

  # remove a trailing /view or /edit, including whatever query string or
  # fragment comes after it, so leftovers like "?usp=drive_link" cannot
  # end up glued to the file id
  for marker in ('/view', '/edit'):
    base, found, tail = link.rpartition(marker)
    if found and (tail == '' or tail[0] in '?#'):
      link = base

  return link

# Shareable Google Drive link to the sales CSV
link = 'https://drive.google.com/file/d/1syH81TVrbBsdymLT_jl2JIf6IjPXtSQw/view'
# Rewrite the share link into a URL that pd.read_csv can download from
url = csv_link(link)
url

# Keep the freshly loaded frame untouched and do all cleaning on a copy
original_df = pd.read_csv(url)
df = original_df.copy()
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [15]:
# Column dtypes and non-null counts; columns with fewer non-null rows than entries need imputation later
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


# Duplicates, Inconsistencies, Missing Values

In [9]:
# Count rows that are exact duplicates of an earlier row
duplicate_count = df.duplicated().sum()
print(f'{duplicate_count} duplicates')

0 duplicates


In [12]:
# Print the value counts (NaN included) of every column to spot inconsistent labels
for column_name in df.columns:
  counts = df[column_name].value_counts(dropna = False)
  print(f'{column_name} Value Counts:')
  print(counts)
  print('\n')

Item_Identifier Value Counts:
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64


Item_Weight Value Counts:
NaN       1463
12.150      86
17.600      82
13.650      77
11.800      76
          ... 
7.275        2
7.685        1
9.420        1
6.520        1
5.400        1
Name: Item_Weight, Length: 416, dtype: int64


Item_Fat_Content Value Counts:
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64


Item_Visibility Value Counts:
0.000000    526
0.076975      3
0.162462      2
0.076841      2
0.073562      2
           ... 
0.013957      1
0.110460      1
0.124646      1
0.054142      1
0.044878      1
Name: Item_Visibility, Length: 7880, dtype: int64


Item_Type Value Counts:
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              

`Item_Fat_Content` has inconsistent values: `LF` and `low fat` should be `Low Fat`, and `reg` should be `Regular`.

In [14]:
# Map each variant spelling onto its canonical label
fat_content = {
  'LF': 'Low Fat',
  'low fat': 'Low Fat',
  'reg': 'Regular',
}

df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content)

# Confirm only the two canonical labels remain
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [16]:
# Report missing values per column and in total
missing_per_column = df.isna().sum()
print(f'Missing values for each column: \n{missing_per_column}\n')
print(f'{missing_per_column.sum()} total missing values in dataframe')

Missing values for each column: 
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

3873 total missing values in dataframe


# Define features (X) and target (y)

In [19]:
# Column the model will predict
target = 'Item_Outlet_Sales'

y = df[target]
# Item_Identifier is dropped along with the target: it has 1,559 distinct
# values, so it is presumably left out to avoid a very wide one-hot encoding
X = df.drop(columns = [target, 'Item_Identifier'])

print(X.head(),'\n')
print(y.head())

   Item_Weight Item_Fat_Content  Item_Visibility              Item_Type  \
0         9.30          Low Fat         0.016047                  Dairy   
1         5.92          Regular         0.019278            Soft Drinks   
2        17.50          Low Fat         0.016760                   Meat   
3        19.20          Regular         0.000000  Fruits and Vegetables   
4         8.93          Low Fat         0.000000              Household   

   Item_MRP Outlet_Identifier  Outlet_Establishment_Year Outlet_Size  \
0  249.8092            OUT049                       1999      Medium   
1   48.2692            OUT018                       2009      Medium   
2  141.6180            OUT049                       1999      Medium   
3  182.0950            OUT010                       1998         NaN   
4   53.8614            OUT013                       1987        High   

  Outlet_Location_Type        Outlet_Type  
0               Tier 1  Supermarket Type1  
1               Tier 3  Supe

# Train/Test Split

In [20]:
# Split before any imputation or scaling so test rows never influence the fitted statistics;
# the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# ID Features

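- Item_Weight, Item_Visibility, and Item_MRP are numeric (float)
- Item_Fat_Content, Item_Type, Outlet_Identifier, Outlet_Size, Outlet_Establishment_Year, Outlet_Location_Type, and Outlet_Type are treated as nominal
- Outlet_Establishment_Year is stored as an int, so the categorical column selector must include int columns as well as object columns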

# Instantiate Transformers

In [22]:
# Imputers: mean for the numeric pipeline, most frequent value for the categorical pipeline
mean_imp = SimpleImputer(strategy = 'mean')
freq_imp = SimpleImputer(strategy = 'most_frequent')

# Dense one-hot output so the processed arrays can be inspected directly with numpy.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed `sparse`,
# so try the new keyword first and fall back to the old one on older versions.
try:
  ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
except TypeError:
  ohe = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

scaler = StandardScaler()

# Create Pipelines

In [23]:
# Numeric columns: fill missing values with the mean, then standardize
num_pipe = make_pipeline(mean_imp, scaler)
# Categorical columns: fill missing values with the most frequent label, then one-hot encode
cat_pipe = make_pipeline(freq_imp, ohe)

# Create Tuples to Pair Pipelines with Columns

In [24]:
# Column selectors choose columns by dtype
num_cols = make_column_selector(dtype_include = 'float')
# 'int' is included so the integer Outlet_Establishment_Year column is handled as a category
cat_cols = make_column_selector(dtype_include = ['object', 'int'])

# Tuples with the transformer first, then the columns it applies to
num_tuple = (num_pipe, num_cols)
cat_tuple = (cat_pipe, cat_cols)

# Create Column Transformer to Apply Preprocessing

In [25]:
# Combine both pipelines; remainder='drop' discards any column matched by neither selector
preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder = 'drop')

# Fit the Column Transformer

In [26]:
# Fit on the training split only, so imputation values, scaling statistics and categories come from training data
preprocessor.fit(X_train)



# Transform Train and Test

In [27]:
# Apply the preprocessor fitted on X_train to both splits
processed_train = preprocessor.transform(X_train)
processed_test = preprocessor.transform(X_test)

In [28]:
# Peek at the first five processed training rows
processed_train[:5]

array([[ 0.81724868, -0.71277507,  1.82810922,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.5563395 , -1.29105225,  0.60336888,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.

In [29]:
# Peek at the first five processed test rows
processed_test[:5]

array([[ 3.31008853e-01, -7.76646248e-01, -9.98815536e-01,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [-1.17

In [30]:
# Verify that preprocessing left no NaNs in either split
for split_name, processed in (('X_train', processed_train), ('X_test', processed_test)):
  print(f'{np.isnan(processed).sum()} missing values in {split_name}')

0 missing values in X_train
0 missing values in X_test
