In [1]:
#import necessary libraries for the model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer



import warnings
warnings.filterwarnings('ignore')

In [14]:
#reading in the data file using pandas

cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('auto-mpg.data', names=cols, na_values='?',
                comment= '\t',
                sep=' ',
                skipinitialspace=True)

data = df.copy()

#splitting the data using StratifiedShuffleSplit from sklearn
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [15]:
#Separating the target variable from the features
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [16]:
#Preprocessing the Origin column to contain actual labels rather than their numerical representations
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

In [17]:
#Creation of custom attribute adder class, to automate the creation of different features, such as the 
#correlation of acceleration to horsepower and number of cylinders
acc_ix, hpower_ix, cyl_ix = 4,2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

In [18]:
def num_pipeline_transformer(data):
    '''
    Function that processes numerical transformations
    Argument:
        data: original dataframe
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
    '''

    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")), #Uses median values to fill in null/missing values
        ('attrs_adder', CustomAttrAdder()), #Adds additional custom attributes from the custom class created prior
        ('std_scaler', StandardScaler()), #Scales data appropriately to use the model
        ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    '''
    Completes the transformation pipeline for both numerical and categorical data.
    
    Argument:
        data: original dataframe
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data) #Call to previous function to handle numerical data
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs), #One Hot encodes the categorical data, in this case, the origin column
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data


## From raw data to processed data in 2 steps

In [19]:
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [20]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

## Training 4 different models
1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM Regressor

## Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [23]:
#Creating sample data to test predictions

sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of Samples: ", lin_reg.predict(sample_data_prepared))
print("Actual Labels of Samples: ", list(sample_labels))

Prediction of Samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]
Actual Labels of Samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [24]:
from sklearn.metrics import mean_squared_error

#Calculating the Mean Squared Error of the predictions


mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions) #Calculating MSE by plugging the actual values with predictions
lin_rmse = np.sqrt(lin_mse) #Converting to the root MSE with numpy
lin_rmse

2.9590402225760872

## Decision Tree

In [25]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [26]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

The decision tree predictions seemingly indicate that the root mean squared error is 0, or a perfect prediction. This is unlikely, and it is likely that the model has been **overfit**

To check out what is happening, we will use **Cross Validation** to take K-folds and randomly split the data. The result are K evaluation scores

In [27]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg,
                        prepared_data,
                        data_labels,
                        scoring='neg_mean_squared_error',
                        cv = 10)
tree_reg_rmse_scores = np.sqrt(-scores)
tree_reg_rmse_scores

array([2.96642672, 2.98862427, 3.42057196, 3.5       , 2.2192341 ,
       2.98092897, 2.87326034, 3.68400258, 4.16962905, 2.64123627])

In [28]:
tree_reg_rmse_scores.mean()

3.1443914257175143

In [29]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring="neg_mean_squared_error", cv=10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [30]:
lin_reg_rmse_scores.mean()

3.0757081793709324

## Random Forest

In [32]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                      prepared_data,
                                      data_labels,
                                      scoring="neg_mean_squared_error",
                                      cv = 10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.6098723114018525

## Support Vector Machine Regressor

In [33]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg,
                               prepared_data,
                               data_labels,
                               scoring='neg_mean_squared_error',
                               cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.08659162080283

## Fine Tuning Hyperparameters with GridSearchCV

A model parameter is a configuration variable that is internal to the model and whose value can be estimated from the data. They are similar to the parameters passed into a function in programming, but in this case, parameters passed into the model.

Some examples of parameters:
 - Weights in an artificial neural network
 - The support vectors in a support vector machine
 - The coefficients in a linear regression or a logistic regression 

A model hyperparameter is a configuration external to the model and whose vlaue cannot be estimated from the data. They are often used in processes to help estimate model parameters, specified by the practionoer, and used to tune for a given predictive modeling problem. 

 - The learning rate for training a neural network
 - The C and sigma hyperparameters for support vector machines
 - The k in k-nearest-neighbors

In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                          scoring="neg_mean_squared_error",
                          return_train_score=True,
                          cv=10,
                          )

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [36]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [38]:
cv_scores = grid_search.cv_results_

#Print all parameters along with their scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(np.sqrt(-mean_score), params)

3.5187844773484875 {'max_features': 2, 'n_estimators': 3}
3.211778715939392 {'max_features': 2, 'n_estimators': 10}
2.9105214634652867 {'max_features': 2, 'n_estimators': 30}
3.315262103306475 {'max_features': 4, 'n_estimators': 3}
2.755952110667078 {'max_features': 4, 'n_estimators': 10}
2.759392668661999 {'max_features': 4, 'n_estimators': 30}
3.1999721380955513 {'max_features': 6, 'n_estimators': 3}
2.7242450807665315 {'max_features': 6, 'n_estimators': 10}
2.7363377021006157 {'max_features': 6, 'n_estimators': 30}
2.9871191513518105 {'max_features': 8, 'n_estimators': 3}
2.807615802052603 {'max_features': 8, 'n_estimators': 10}
2.7043668758829225 {'max_features': 8, 'n_estimators': 30}
3.3492312082595803 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.9496329733408113 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.3778952469375296 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.926125667392118 {'bootstrap': False, 'max_features': 3, 'n_estim

In [39]:
#Checking feature importance
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.19231428, 0.37095856, 0.11334791, 0.15238502, 0.01082228,
       0.10766943, 0.02626149, 0.01823942, 0.00336293, 0.00153805,
       0.00310062])

In [40]:
extra_attrs = ['acc_on_power', 'acc_on_cyl']
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.026261494778480495),
 ('acc_on_cyl', 0.018239422657943385),
 ('Weight', 0.1523850226431066),
 ('Model Year', 0.10766943133400218),
 ('Horsepower', 0.11334790982496806),
 ('Displacement', 0.370958560857639),
 ('Cylinders', 0.19231427606021886),
 ('Acceleration', 0.010822279312477564)]

## Evaluating the system on test data

In [41]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('MPG', axis=1)
y_test = strat_test_set['MPG'].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

3.052067423166723

## Creating a function to initiate the entire process

In [44]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
        
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [45]:
#Checking the function on a random sample
vehicle_config = {
    'Cylinders': [4,6,8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([33.71      , 17.85666667, 19.20333333])

## Saving the model

In [46]:
import pickle

In [48]:
with open('model.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [49]:
#Loading the model from the saved file
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)
    
predict_mpg(vehicle_config, model)

array([33.71      , 17.85666667, 19.20333333])

In [51]:
import requests

url = "http://localhost:5000/predict"
r = requests.post(url, json = vehicle_config)
r.text.strip()

'{\n  "mpg_prediction": [\n    34.080000000000005, \n    18.936666666666667, \n    18.683333333333334\n  ]\n}'