- [ ] Before splitting your data, you can drop duplicates and fix inconsistencies in categorical data.* (*There is a way to do this after the split, but for this project, you may perform this step before the split)
- [ ] Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.  
- [ ] Perform a train test split 
- [ ] Create a preprocessing object to prepare the dataset for Machine Learning
- [ ] Make sure your imputation of missing values occurs **AFTER** the train test split using SimpleImputer.  

In [1]:
# Mount to Google Drive
# Mounts the user's Drive at /content/drive inside the Colab runtime.
# NOTE(review): the CSV loaded further down is read from '/content/', not from
# '/content/drive' — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, \
OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
# Show scikit-learn estimators/pipelines as diagrams when they are displayed in the notebook.
set_config(display='diagram')

In [3]:
# Load the unfiltered 'sales_predictions.csv' a single time and preview it.
# Ending the cell with `df` renders the same truncated table without
# parsing the CSV from disk a second time just for display.
df = pd.read_csv('/content/sales_predictions.csv')
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [4]:
# Work on an independent (deep) copy so the raw `df` is not touched by the cleaning steps.
ml_df = df.copy(deep=True)

In [5]:
# Per-column count of missing values in the working copy.
ml_df.isna().sum(axis=0)

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
ml_df.duplicated(keep='first').sum()

0

### Drop Duplicates and Fix Inconsistencies

In [8]:
# Average Item_Weight over the non-missing entries, displayed to two decimals.
iw_mean = ml_df['Item_Weight'].mean(skipna=True)
np.round(iw_mean, 2)

12.86

In [9]:
# Item_Weight is deliberately left un-imputed at this stage.
# Filling its NaNs with the full-dataset mean (iw_mean) before train_test_split
# lets test-set rows influence the fill value, i.e. data leakage. The missing
# values must instead be filled by SimpleImputer(strategy='mean') fitted on
# X_train only, as part of the preprocessing object built after the split.
# For now, just report how many values that imputer will have to fill.
ml_df['Item_Weight'].isna().sum()

In [10]:
# Most frequent Outlet_Size value(s), ignoring NaN; mode() returns a Series, not a scalar.
os_mode = ml_df['Outlet_Size'].mode(dropna=True)
os_mode

0    Medium
dtype: object

In [11]:
# Outlet_Size is deliberately left un-imputed at this stage.
# 'Medium' is the mode of the *whole* dataset, so writing it into the NaNs before
# train_test_split leaks test-set information into training. The missing values
# must instead be filled by SimpleImputer(strategy='most_frequent') fitted on
# X_train only, as part of the preprocessing object built after the split.
# For now, just report how many values that imputer will have to fill.
ml_df['Outlet_Size'].isna().sum()

In [None]:
# Re-check the per-column missing-value counts at this point in the cleaning.
ml_df.isna().sum(axis=0)

In [13]:
# Normalise the inconsistent spellings in 'Item_Fat_Content' to the two canonical
# labels in a single pass, then show the resulting category counts.
ml_df['Item_Fat_Content'] = ml_df['Item_Fat_Content'].replace(
    {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
)
ml_df['Item_Fat_Content'].value_counts()


Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

- [x] Double-checking that there are still no duplicate rows, and reviewing the per-column missing-value counts


In [None]:
# Count of exact duplicate rows after the cleaning steps (first occurrences are not counted).
ml_df.duplicated(keep='first').sum()

In [None]:
# Per-column missing-value counts immediately before defining X and y.
ml_df.isna().sum(axis=0)

Assign the target (y = `Item_Outlet_Sales`) and the features matrix (X)

In [23]:
# Features: every column except the target, in their original order
# (Item_Identifier ... Outlet_Type — the same 11 columns as an explicit list).
# NOTE(review): Item_Identifier looks like an ID-style column — confirm it is
# meant to be kept as a feature.
X = ml_df.drop(columns='Item_Outlet_Sales')
# Target: the sales figure to be predicted.
y = ml_df['Item_Outlet_Sales']

Perform a Train/Test Split

In [24]:
# Hold out 25% of the rows (scikit-learn's default, spelled out here) for testing;
# the fixed random_state keeps the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)