<a href="https://colab.research.google.com/github/oscardominguez-ds/Outlet-Store-Sales-Predictions/blob/main/Outlet_Store_Sales_Predictions_Public.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
## Pandas
import pandas as pd
## Numpy
import numpy as np

## Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

## Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

## Regression Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## Set global scikit-learn configuration 
from sklearn import set_config
## Display estimators as a diagram
set_config(display='diagram') # 'text' or 'diagram'}


In [2]:
path = '/content/sales_predictions.csv'

df = pd.read_csv(path)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# -Before splitting your data, you can drop duplicates and fix inconsistencies in categorical data.* 
# (*There is a way to do this after the split, but for this project, you may perform this step before the split)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
# Checking for Duplicates
df.duplicated().sum()

0

In [5]:
# Checking missing values
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
# -Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.
# -Perform a train test split


X = df.drop(columns=['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year']) 
y = df['Item_Outlet_Sales']

X


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.300,Low Fat,0.016047,Dairy,249.8092,Medium,Tier 1,Supermarket Type1
1,5.920,Regular,0.019278,Soft Drinks,48.2692,Medium,Tier 3,Supermarket Type2
2,17.500,Low Fat,0.016760,Meat,141.6180,Medium,Tier 1,Supermarket Type1
3,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,,Tier 3,Grocery Store
4,8.930,Low Fat,0.000000,Household,53.8614,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...,...,...
8518,6.865,Low Fat,0.056783,Snack Foods,214.5218,High,Tier 3,Supermarket Type1
8519,8.380,Regular,0.046982,Baking Goods,108.1570,,Tier 2,Supermarket Type1
8520,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,Small,Tier 2,Supermarket Type1
8521,7.210,Regular,0.145221,Snack Foods,103.1332,Medium,Tier 3,Supermarket Type2


In [7]:
df['Outlet_Size'].value_counts()
# Ordinal Encoding 'Outlet_Size'
replacement_dictionary = {'High':2, 'Medium':1, 'Small':0}
df['Outlet_Size'].replace(replacement_dictionary, inplace=True)
df['Outlet_Size'].value_counts()

1.0    2793
0.0    2388
2.0     932
Name: Outlet_Size, dtype: int64

In [8]:
#Ordinal encoding Item_Fat_Content
df['Item_Fat_Content'].replace({'low fat':0,'LF':0,'Low Fat':0,'reg':1,'Regular':1},inplace=True)

In [9]:
# -Perform a train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [10]:
# -Create a preprocessing object to prepare the dataset for Machine Learning
# -Make sure your imputation of missing values occurs after the train test split using SimpleImputer.

display(X_train.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           5285 non-null   float64
 1   Item_Fat_Content      6392 non-null   object 
 2   Item_Visibility       6392 non-null   float64
 3   Item_Type             6392 non-null   object 
 4   Item_MRP              6392 non-null   float64
 5   Outlet_Size           4580 non-null   object 
 6   Outlet_Location_Type  6392 non-null   object 
 7   Outlet_Type           6392 non-null   object 
dtypes: float64(3), object(5)
memory usage: 449.4+ KB


None

In [11]:
# Selectors
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

train_cat_data = X_train[cat_selector(X_train)]
test_cat_data = X_test[cat_selector(X_test)]
train_cat_data

Unnamed: 0,Item_Fat_Content,Item_Type,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,Low Fat,Household,Medium,Tier 3,Supermarket Type2
7510,Regular,Snack Foods,Medium,Tier 3,Supermarket Type2
5828,Regular,Meat,Medium,Tier 1,Supermarket Type1
5327,Low Fat,Baking Goods,Small,Tier 2,Supermarket Type1
4810,Low Fat,Frozen Foods,,Tier 2,Supermarket Type1
...,...,...,...,...,...
5734,Regular,Fruits and Vegetables,,Tier 3,Grocery Store
5191,Low Fat,Frozen Foods,,Tier 2,Supermarket Type1
5390,Low Fat,Health and Hygiene,,Tier 2,Supermarket Type1
860,low fat,Snack Foods,,Tier 2,Supermarket Type1


In [12]:
# Imputers
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

# Scaler
scaler = StandardScaler()
# One-hot encoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)


In [13]:
# Numeric pipeline
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe



In [14]:
# Categorical pipeline
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe


In [15]:
# Tuples for Column Transformer
number_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)
# ColumnTransformer
preprocessor = make_column_transformer(number_tuple, category_tuple)
preprocessor


In [16]:
# fit on train
preprocessor.fit(X_train)



In [17]:
# transform train and test
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [18]:
# Check for missing values and that data is scaled and one-hot encoded
print(np.isnan(X_train_processed).sum().sum(), 'missing values in training data')
print(np.isnan(X_test_processed).sum().sum(), 'missing values in testing data')
print('\n')
print('All data in X_train_processed are', X_train_processed.dtype)
print('All data in X_test_processed are', X_test_processed.dtype)
print('\n')
print('shape of data is', X_train_processed.shape)
print('\n')
X_train_processed.round(2)



0 missing values in training data
0 missing values in testing data


All data in X_train_processed are float64
All data in X_test_processed are float64


shape of data is (6392, 34)




array([[ 0.82, -0.71,  1.83, ...,  0.  ,  1.  ,  0.  ],
       [ 0.56, -1.29,  0.6 , ...,  0.  ,  1.  ,  0.  ],
       [-0.13,  1.81,  0.24, ...,  1.  ,  0.  ,  0.  ],
       ...,
       [ 1.11, -0.92,  1.52, ...,  1.  ,  0.  ,  0.  ],
       [ 1.77, -0.23, -0.38, ...,  1.  ,  0.  ,  0.  ],
       [ 0.82, -0.96, -0.74, ...,  1.  ,  0.  ,  0.  ]])

#1. Your first task is to build a linear regression model to predict sales.

-Build a linear regression model.

-Evaluate the performance of your model based on r^2.

-Evaluate the performance of your model based on rmse.

In [19]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

from sklearn.pipeline import make_pipeline
reg_pipe = make_pipeline(preprocessor, reg)


In [20]:
reg_pipe.fit(X_train,y_train)



In [21]:
#Create Model Predictions to display Training and Testing Data.
train_pred = reg_pipe.predict(X_train)
test_pred = reg_pipe.predict(X_test)

In [22]:
predictions = reg_pipe.predict(X_test)


# Displaying sales predictions with original data

In [23]:
prediction_df = X_test.copy()
prediction_df['True Median Price'] = y_test
prediction_df['Predicted Median Price'] = predictions
prediction_df['Error'] = predictions - y_test
prediction_df.head()


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,True Median Price,Predicted Median Price,Error
7503,14.3,Low Fat,0.0263,Frozen Foods,79.4302,High,Tier 3,Supermarket Type1,1743.0644,1363.0,-380.0644
2957,7.93,Low Fat,0.071136,Health and Hygiene,42.7086,Small,Tier 1,Supermarket Type1,356.8688,799.0,442.1312
7031,14.5,Regular,0.041313,Canned,42.0454,Medium,Tier 1,Supermarket Type1,377.5086,855.0,477.4914
1084,,Regular,0.044767,Soft Drinks,173.7054,Medium,Tier 3,Supermarket Type3,5778.4782,4200.0,-1578.4782
856,10.195,Regular,0.012456,Meat,197.511,Small,Tier 2,Supermarket Type1,2356.932,3269.0,912.068


In [24]:
#-Evaluate the performance of your model based on r^2.

train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

print(f'Model Training R2: {train_r2}')
print(f'Model Training R2: {test_r2}')

Model Training R2: 0.5598659623940478
Model Training R2: 0.5616802564782515


In [25]:
#Need to get the MSE to get the RMSE
train_MSE = mean_squared_error(y_train, train_pred)
test_MSE = mean_squared_error(y_test, test_pred)

print(f'Model Training MSE: {train_MSE}')
print(f'Model Testing MSE: {test_MSE}')

Model Training MSE: 1302557.188349242
Model Testing MSE: 1209314.2171737424


In [26]:
#-Evaluate the performance of your model based on rmse.
train_RMSE = np.sqrt(train_MSE)
test_RMSE = np.sqrt(test_MSE)

print(f'Model Training RMSE: {train_RMSE}')
print(f'Model Testing RMSE: {test_RMSE}')

Model Training RMSE: 1141.296275447021
Model Testing RMSE: 1099.6882363532595


#2. Your second task is to build a regression tree model to predict sales.

-Build a simple regression tree model.

-Compare the performance of your model based on r^2.

-Compare the performance of your model based on rmse.

In [27]:
# By leaving the parenthesis empty, we are choosing to use all of the default parameters
# So you get the same results, let's set our random_state to the same number

dec_tree = DecisionTreeRegressor(random_state = 42)

#Create Pipeline
dec_tree_pipe = make_pipeline(preprocessor, dec_tree)

#Fit Pipeline
dec_tree_pipe.fit(X_train, y_train)



# Displaying sales predictions with original data

In [28]:

predictions = dec_tree_pipe.predict(X_test)
prediction_df = X_test.copy()
prediction_df['True Median Price'] = y_test
prediction_df['Predicted Median Price'] = predictions
prediction_df['Error'] = predictions - y_test
prediction_df.head()


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,True Median Price,Predicted Median Price,Error
7503,14.3,Low Fat,0.0263,Frozen Foods,79.4302,High,Tier 3,Supermarket Type1,1743.0644,805.618,-937.4464
2957,7.93,Low Fat,0.071136,Health and Hygiene,42.7086,Small,Tier 1,Supermarket Type1,356.8688,1249.0408,892.172
7031,14.5,Regular,0.041313,Canned,42.0454,Medium,Tier 1,Supermarket Type1,377.5086,479.376,101.8674
1084,,Regular,0.044767,Soft Drinks,173.7054,Medium,Tier 3,Supermarket Type3,5778.4782,5000.8238,-777.6544
856,10.195,Regular,0.012456,Meat,197.511,Small,Tier 2,Supermarket Type1,2356.932,5141.3076,2784.3756


In [29]:
# evaluate the model
train_score = dec_tree_pipe.score(X_train, y_train)
test_score = dec_tree_pipe.score(X_test, y_test)
print(train_score)
print(test_score)

1.0
0.13110528507769081


In [30]:
#-Compare the performance of your model based on r^2.
#-Evaluate the performance of your model based on r^2.

train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

print(f'Model Training R2: {train_r2}')
print(f'Model Test R2: {test_r2}')

Model Training R2: 0.5598659623940478
Model Test R2: 0.5616802564782515


In [31]:
#Need to get the MSE to get the RMSE
train_MSE = mean_squared_error(y_train, train_pred)
test_MSE = mean_squared_error(y_test, test_pred)

print(f'Model Training MSE: {train_MSE}')
print(f'Model Testing MSE: {test_MSE}')

Model Training MSE: 1302557.188349242
Model Testing MSE: 1209314.2171737424


In [32]:
#-Evaluate the performance of your model based on rmse.
train_RMSE = np.sqrt(train_MSE)
test_RMSE = np.sqrt(test_MSE)

print(f'Model Training RMSE: {train_RMSE}')
print(f'Model Testing RMSE: {test_RMSE}')

Model Training RMSE: 1141.296275447021
Model Testing RMSE: 1099.6882363532595


#3. You now have tried 2 different models on your data set. You need to determine which model to implement.

-Overall, which model do you recommend? I recommend the Linear regression model. The errors or the difference from the true median price vs predicted is smaller than the Decision Tree model. 