# Importing Libraries and Data

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn import metrics

import pickle

In [2]:
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data', na_values = ['NA', '?'], names = columns, delim_whitespace=True)

# Data Cleaning

In [3]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [4]:
data.isnull().sum()

# 6 rows missing data

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [5]:
data['cylinders'].max()

8

In [6]:
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].median())

# Filling missing data with median

In [7]:
data.isnull().sum()

# No missing data

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [26]:
numeric_list = data.select_dtypes(include=[np.number]).columns
data[numeric_list] = data[numeric_list].astype(np.float32)

# Converting data to float32

In [28]:
x = data[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']].values
y = data['mpg'].values

# Creating two arrays where x = featues and y = target

In [10]:
type(y)

# Confirming that y is an array

numpy.ndarray

In [29]:
type(x)

# Confirming that x is an array

numpy.ndarray

# Test, Train, Split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 101)

# Splitting data for training and testing

In [13]:
x_train = x_train.astype(np.float32)

# Setting x_train as float32

In [14]:
print(f'The shape of the data is: \nx_train: \t{x_train.shape} \nx_test: \t{x_test.shape} \ny_train: \t{y_train.shape} \ny_test: \t{y_test.shape}')

# Checking the shape of the data

The shape of the data is: 
x_train: 	(318, 7) 
x_test: 	(80, 7) 
y_train: 	(318,) 
y_test: 	(80,)


In [15]:
LinearRegression_model = LinearRegression()
DecisionTree_model = DecisionTreeRegressor()
RandomForest_model = RandomForestRegressor()
XGBRegressor_model = XGBRegressor()

# Creating four models

In [16]:
models = [LinearRegression_model, DecisionTree_model, RandomForest_model, XGBRegressor_model]
for model in models:
    model.fit(x_train, y_train)
    print(f'{model} is trained!')

# Training the models

LinearRegression() is trained!
DecisionTreeRegressor() is trained!
RandomForestRegressor() is trained!
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...) is trained!


In [17]:
accuracy_LinearRegression = LinearRegression_model.score(x_test, y_test)
accuracy_DecisionTree = DecisionTree_model.score(x_test, y_test)
accuracy_RandomForest = RandomForest_model.score(x_test, y_test)
accuracy_XGBoost = XGBRegressor_model.score(x_test, y_test)

# Creating accuracy scores for all models

In [18]:
models = {'LinearRegression_model': accuracy_LinearRegression, 'DecisionTree_model': accuracy_DecisionTree, 'RandomForest_model': accuracy_RandomForest,
          'XGBRegressor_model': accuracy_XGBoost}
for model, score in models.items():
    print(f'The accuracy score for the {model} is {round(score*100, 2)}%')

# Printing accuracy scores for all models

The accuracy score for the LinearRegression_model is 80.01%
The accuracy score for the DecisionTree_model is 73.38%
The accuracy score for the RandomForest_model is 90.79%
The accuracy score for the XGBRegressor_model is 86.67%


# Testing the Model

In [19]:
test_x = np.zeros((1, 7))
test_x

# Creating an array with all zeros

array([[0., 0., 0., 0., 0., 0., 0.]])

In [20]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino


In [21]:
test_x[0, 0] = 8.0
test_x[0, 1] = 307.0
test_x[0, 2] = 130.0
test_x[0, 3] = 3504.0
test_x[0, 4] = 12.0
test_x[0, 5] = 70.0
test_x[0, 6] = 1.0
test_x

# Filling empty array with values to test

array([[8.000e+00, 3.070e+02, 1.300e+02, 3.504e+03, 1.200e+01, 7.000e+01,
        1.000e+00]])

In [22]:
prediction = LinearRegression_model.predict(test_x)
round(float(prediction[0]), 2)

# Predicting the MPG output of the test input using the linear regression model

15.09

In [23]:
models = [LinearRegression_model, DecisionTree_model, RandomForest_model, XGBRegressor_model]

# Creating a list of all models

In [24]:
for model in models:
    prediction = model.predict(test_x)
    print(f'The model {model} predicts an mpg of {round(float(prediction[0]), 2)}')

# Predicting the MPG output of the test input using all models

The model LinearRegression() predicts an mpg of 15.09
The model DecisionTreeRegressor() predicts an mpg of 18.0
The model RandomForestRegressor() predicts an mpg of 17.33
The model XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...) predicts an mpg of 17.99


# Export the Model

In [25]:
with open('pkl_file', 'wb') as file:
    model = pickle.dump(LinearRegression_model, file)