# Data preparation using Sklearn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Defining the column names based on the data description
cols = ['MPG', 'Cylinders', 'Displacements', 'Horsepower', 'Weight', 
        'Acceleration', 'Model Year', 'Origin']
# Reading the .data file using pandas
df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                 comment = '\t', sep =" ", skipinitialspace=True)

# Making copy of the dataframe
data = df.copy()
data.head()

Unnamed: 0,MPG,Cylinders,Displacements,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [3]:
# Stratified split of the data
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    train_set = data.loc[train_index]
    test_set = data.loc[test_index]

In [4]:
# Separating the target variable from the data
data = train_set.drop("MPG", axis=1)
data_labels = train_set["MPG"].copy()




### Preprocessing the Origin Column



In [5]:
# function to map the numbers in the origin column to the country
def preprocess_origin_col(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA",3: "Germany"})
    return df
data_tr = preprocess_origin_col(data)
data_tr.head()



Unnamed: 0,Cylinders,Displacements,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA


In [6]:
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Cylinders      318 non-null    int64  
 1   Displacements  318 non-null    float64
 2   Horsepower     314 non-null    float64
 3   Weight         318 non-null    float64
 4   Acceleration   318 non-null    float64
 5   Model Year     318 non-null    int64  
 6   Origin         318 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 19.9+ KB


In [7]:
data_cat = data_tr[['Origin','Cylinders']]
data_cat.head()

Unnamed: 0,Origin,Cylinders
145,Germany,4
151,USA,4
388,India,4
48,India,6
114,USA,4


In [8]:
#One hot encoding categorical values
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
data_cat_1hot = cat_encoder.fit_transform(data_cat)
data_cat_1hot

<318x8 sparse matrix of type '<class 'numpy.float64'>'
	with 636 stored elements in Compressed Sparse Row format>

### Missing values

In [9]:
# Segregating numerical data
num_data = data.iloc[:,:-1]
num_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Cylinders      318 non-null    int64  
 1   Displacements  318 non-null    float64
 2   Horsepower     314 non-null    float64
 3   Weight         318 non-null    float64
 4   Acceleration   318 non-null    float64
 5   Model Year     318 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 17.4 KB


In [10]:
# Getting the missing values in Horsepower using the media
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
imputer.fit(num_data)

# Filing the missing values
X = imputer.transform(num_data)

# Converting the array back to a dataframe
data_tr = pd.DataFrame(X, columns = num_data.columns, index = num_data.index)

data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Cylinders      318 non-null    float64
 1   Displacements  318 non-null    float64
 2   Horsepower     318 non-null    float64
 3   Weight         318 non-null    float64
 4   Acceleration   318 non-null    float64
 5   Model Year     318 non-null    float64
dtypes: float64(6)
memory usage: 17.4 KB


### Adding new atributes 

In [11]:
num_data.head()

Unnamed: 0,Cylinders,Displacements,Horsepower,Weight,Acceleration,Model Year
145,4,83.0,61.0,2003.0,19.0,74
151,4,79.0,67.0,2000.0,16.0,74
388,4,156.0,92.0,2585.0,14.5,82
48,6,250.0,88.0,3139.0,14.5,71
114,4,98.0,90.0,2265.0,15.5,73


In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

# Index of the colums given their position in the dataframe
acceleration_ind = 4
horsepower_ind = 2
cylinders_ind = 0

#Attributes adder
class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acceleration_on_horsepower = True):
        self.acceleration_on_horsepower = acceleration_on_horsepower
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        acceleration_on_cylinders = X[:, acceleration_ind] / X[:, cylinders_ind]
        if self.acceleration_on_horsepower:
            acceleration_on_horsepower = X[:, acceleration_ind] / X[:, horsepower_ind]
            #np.c_ method concatenate the arrays
            return np.c_[X, acceleration_on_horsepower, acceleration_on_cylinders]
        
        return np.c_[X, acceleration_on_cylinders]

attr_adder = CustomAttrAdder(acceleration_on_horsepower = True)
data_tr_extra_attrs = attr_adder.transform(data_tr.values)
data_tr_extra_attrs[0]
    
    


array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

# Creating Pipeline

In [13]:
from sklearn.pipeline import Pipeline
## Scaling numerical atributes
from sklearn.preprocessing import StandardScaler

numerics = ['float64', 'int64']

num_data = data_tr.select_dtypes(include = numerics)

## Pipeline for numerical attributes
# Impute missing values
# Add atributes
# Scale the data
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attrs_adder', CustomAttrAdder()),
    ('std_scaler', StandardScaler())
])

num_data_tr = num_pipeline.fit_transform(num_data)
num_data_tr[0]


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

## Transforming numerical and categorical

In [14]:
from sklearn.compose import ColumnTransformer
num_attrs = list(num_data)
cat_attrs = ['Origin']

##complete pipeline to transform
##both numerical and categorical attributes

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attrs),
    ('cat', OneHotEncoder(), cat_attrs),
])

prepared_data = full_pipeline.fit_transform(data)
prepared_data[0]


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

# Machine Learning Models

## Selecting and training models
1. Try different algorithms
2. Evaluata (RMS)
3. Model evaluation using Cross Validation
4. Hypeparameter Tuning (GridSearch CV)
5. Check feature importance
6. Evaluate on the test data
7. Save the model

### 1. Try different algorithms

Linear Regression

In [15]:
# Linear regression
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() #instance the lin reg class
lin_reg.fit(prepared_data, data_labels)


LinearRegression()

Mean Square Error (MSE)


In [16]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


2.959040222576087

Decision Tree

In [17]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [18]:
mpg_predictions = tree_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.0

It give us zero error thats too good to be true. Overfitting. 
## Cross Validation

In [19]:
from sklearn.model_selection import cross_val_score

tree_reg_scores = cross_val_score(tree_reg,
                        prepared_data,
                        data_labels,
                        scoring="neg_mean_squared_error",
                        cv = 10)
tree_reg_rmse_scores = np.sqrt(-tree_reg_scores)
tree_reg_rmse_scores

array([3.78690177, 3.04220315, 2.86383702, 3.5970474 , 2.20276814,
       3.05900311, 3.50949605, 4.46996924, 4.19907824, 2.5516598 ])

In [20]:
tree_reg_rmse_scores.mean()

3.3281963918659856

In [21]:
## Cross validation for lineal regression
lin_reg_scores = cross_val_score(lin_reg,
                        prepared_data,
                        data_labels,
                        scoring="neg_mean_squared_error",
                        cv = 10)
lin_reg_rmse_scores = np.sqrt(-lin_reg_scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [22]:
lin_reg_rmse_scores.mean()

3.075708179370932

Random forest regressor model

In [23]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_scores = cross_val_score(forest_reg,
                                    prepared_data,
                                    data_labels,
                                    scoring="neg_mean_squared_error",
                                    cv = 10)
forest_reg_rmse_scores = np.sqrt(-forest_reg_scores)
forest_reg_rmse_scores.mean()

2.5997355166533302

Support Vector Machine Regressor

In [24]:
from sklearn.svm import SVR

svm_reg = SVR(kernel = 'linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg,
                                prepared_data,
                                data_labels,
                                scoring="neg_mean_squared_error",
                                cv = 10)

svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.0865916208027717

## Hyperparameter for random forest

In [25]:
from sklearn.model_selection import GridSearchCV

# set of values for the parameters to try
param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2, 4,6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          cv=10,
                          )
grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [26]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [27]:
cv_scores = grid_search.cv_results_

# just to see the result for every group of parameters
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(np.sqrt(-mean_score), params)

3.57717537894863 {'max_features': 2, 'n_estimators': 3}
2.97728284431974 {'max_features': 2, 'n_estimators': 10}
2.8884970345713703 {'max_features': 2, 'n_estimators': 30}
3.4715459000825453 {'max_features': 4, 'n_estimators': 3}
2.937956908528607 {'max_features': 4, 'n_estimators': 10}
2.7251385647582427 {'max_features': 4, 'n_estimators': 30}
3.327647132595056 {'max_features': 6, 'n_estimators': 3}
2.970561813852618 {'max_features': 6, 'n_estimators': 10}
2.7437175017057913 {'max_features': 6, 'n_estimators': 30}
3.376550287775696 {'max_features': 8, 'n_estimators': 3}
2.895890924042481 {'max_features': 8, 'n_estimators': 10}
2.680595111808164 {'max_features': 8, 'n_estimators': 30}
3.41744578711723 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
3.0136377084684924 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.3317376388972235 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.9311985173276596 {'bootstrap': False, 'max_features': 3, 'n_estimators'

## Checking feature importance

In [28]:
#getting the feature importances
feature_importances = grid_search.best_estimator_.feature_importances_

extra_attrs = ['acc_on_power', 'acc_on_cyl']
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include = numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse = True)


[('acc_on_power', 0.02695464313370326),
 ('acc_on_cyl', 0.03278598058043975),
 ('Weight', 0.19092865843159573),
 ('Model Year', 0.11599147182557036),
 ('Horsepower', 0.12600084991688376),
 ('Displacements', 0.27442645158396595),
 ('Cylinders', 0.21173307317327497),
 ('Acceleration', 0.014743546407134581)]

## Evaluating the system on Test Data

In [29]:
final_model = grid_search.best_estimator_

## for_test 
test = test_set.drop("MPG", axis = 1)
test_labels = test_set["MPG"].copy()

prepared_test = full_pipeline.fit_transform(test)

final_predictions = final_model.predict(prepared_test)
final_mse = mean_squared_error(test_labels, final_predictions)
final_rmse = np.sqrt(final_mse)


In [30]:
final_rmse

3.049730748588946

## Saving the model

In [31]:
import pickle

In [33]:
with open('model_mpg.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()