<a href="https://colab.research.google.com/github/nuraishasb/applied-ml/blob/main/Project_3_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<table class="table table-bordered">
    <tr>
        <th style="text-align:center; width:25%"><img src='https://www.nus.edu.sg/images/default-source/base/logo.png' style="width: 250px; height: 125px; "></th>
        <th style="text-align:center;"><h1>Applied Machine Learning</h1><h2>Project 3 - Machine Learning Pipeline </h2><h3></h3></th>
    </tr>
</table>

In this project, you are required to build a machine learning pipeline. Part of the code is already provided; please follow the instructions to fill in the rest.

In [1]:
# Import the required packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model, neighbors, tree, svm, ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

In [2]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/Applied_ML/Projects' # Please adjust the path accordingly
    os.chdir(path_to_file)
    !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/Applied_ML/Projects


In [3]:
# Load dataset into a DataFrame (the CSV is read relative to the current
# working directory) and preview the first rows
df = pd.read_csv('supermarket.csv')
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Summarize column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8337 entries, 0 to 8336
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                8337 non-null   float64
 1   Item_Fat_Content           8337 non-null   object 
 2   Item_Visibility            8337 non-null   float64
 3   Item_Type                  8337 non-null   object 
 4   Item_MRP                   8337 non-null   float64
 5   Outlet_Identifier          8337 non-null   object 
 6   Outlet_Establishment_Year  8337 non-null   int64  
 7   Outlet_Size                5955 non-null   object 
 8   Outlet_Location_Type       8337 non-null   object 
 9   Outlet_Type                8337 non-null   object 
 10  Item_Outlet_Sales          8337 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 716.6+ KB


In [5]:
# "Item_Outlet_Sales" is the target (model output); every other column is a model input
target_col = 'Item_Outlet_Sales'
X = df.drop(columns=[target_col])
y = df[target_col]

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

### Build a Machine Learning Pipeline
* imputer: to handle missing values
* encoder: to encode categorical data
* scaler: to standardize the numerical features
* model: choose the best model you build from Project 2

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Split the feature columns by dtype so each group gets its own preprocessing
num_attribs = list(X.select_dtypes(['int64','float64']))
cat_attribs = list(X.select_dtypes(['object']))

# Categorical pipeline: fill missing values with the most frequent category,
# then map every category to an integer code.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories that were not seen during fit are encoded as -1 instead of
    # raising an error, so the saved pipeline can still predict on new data
    # (requires scikit-learn >= 0.24).
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Numerical pipeline: fill missing values with the median, then standardize
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [8]:
# Route each column group through its matching sub-pipeline
column_routes = [
    ('cat', cat_pipe, cat_attribs),
    ('num', num_pipe, num_attribs),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [9]:
# Assemble the full pipeline: preprocessing followed by the regressor.
# Nothing is trained here; fitting happens when pipe.fit(...) is called.
from sklearn import ensemble
pipe = Pipeline([
    ('preprocessor', preprocessor),
    # random_state is fixed (same seed as the train/test split) so that
    # repeated runs produce the same model and the same metrics.
    ('model', ensemble.GradientBoostingRegressor(
        n_estimators=5000, learning_rate=0.001, random_state=2)),
])

### Train the pipeline and evaluate the pipeline performance

In [10]:
# Fit the preprocessing steps and the regressor on the training split
pipe.fit(X_train, y_train)

In [None]:

# Predict on both splits first, then report the mean absolute error of each
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print('train_mae:', train_mae)
print('test_mae:', test_mae)


train_mae: 692.603337679004
test_mae: 690.865847472706


In [None]:
# Additional test-set metrics: mean squared error and coefficient of determination
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('Test MSE:', test_mse)
print('Test R²:', test_r2)

Test MSE: 879645.7202482693
Test R²: 0.5782407395444744


### Save the pipeline model

In [None]:
import joblib

# Persist the fitted pipeline (preprocessing + model) to a single file
model_path = "pipe_best_clf.pkl"
joblib.dump(pipe, model_path)

['pipe_best_clf.pkl']

In [None]:
# Take the first five test rows (by position) as sample input for the reloaded pipeline
test_data = X_test.iloc[:5]
test_data

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
3164,12.15,Regular,0.132059,Fruits and Vegetables,187.5872,OUT035,2004,Small,Tier 2,Supermarket Type1
424,20.7,Regular,0.047565,Dairy,213.4876,OUT049,1999,Medium,Tier 1,Supermarket Type1
7106,15.25,Low Fat,0.089999,Frozen Foods,215.1192,OUT045,2002,,Tier 2,Supermarket Type1
701,20.0,Regular,0.0,Frozen Foods,127.3678,OUT046,1997,Small,Tier 1,Supermarket Type1
1195,20.6,Low Fat,0.071137,Household,76.3696,OUT045,2002,,Tier 2,Supermarket Type1


In [None]:
# Reload the saved pipeline from disk and predict on the sample rows.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files you created or otherwise trust.
my_pipeline_loaded = joblib.load("pipe_best_clf.pkl")
my_pipeline_loaded.predict(test_data)

array([3099.30497107, 3473.212301  , 3418.78910004, 2082.62044108,
       1301.8658204 ])

In [None]:
# Actual sales for the same five test rows, for comparison with the predictions
y_test.iloc[:5]

Unnamed: 0,Item_Outlet_Sales
3164,4349.0056
424,1929.4884
7106,2157.192
701,1780.3492
1195,372.848
