In [1]:
import pandas as pd
import numpy as np

In [2]:
path = r"data\automobiles.csv"
df = pd.read_csv(path)

In [3]:
df.describe(include='all')

Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,205.0,205.0,205,205,205,205,205,205,205,205,...,205.0,205,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205
unique,,,52,22,2,2,3,5,3,2,...,,8,39.0,37.0,,60.0,24.0,,,187
top,,,?,toyota,gas,std,four,sedan,fwd,front,...,,mpfi,3.62,3.4,,68.0,5500.0,,,?
freq,,,41,32,185,168,114,96,120,202,...,,94,23.0,20.0,,19.0,37.0,,,4
mean,102.0,0.834146,,,,,,,,,...,126.907317,,,,10.142537,,,25.219512,30.75122,
std,59.322565,1.245307,,,,,,,,,...,41.642693,,,,3.97204,,,6.542142,6.886443,
min,0.0,-2.0,,,,,,,,,...,61.0,,,,7.0,,,13.0,16.0,
25%,51.0,0.0,,,,,,,,,...,97.0,,,,8.6,,,19.0,25.0,
50%,102.0,1.0,,,,,,,,,...,120.0,,,,9.0,,,24.0,30.0,
75%,153.0,2.0,,,,,,,,,...,141.0,,,,9.4,,,30.0,34.0,


In [4]:
## Data Pre-Processing

df.replace("?", np.nan, inplace = True) #replace the missing values("?") with nan

In [5]:
missing_data = df.isnull()

In [6]:
# this loop prints the number of missing values in each coloumn
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

Unnamed: 0
False    205
Name: Unnamed: 0, dtype: int64

symboling
False    205
Name: symboling, dtype: int64

normalized-losses
False    164
True      41
Name: normalized-losses, dtype: int64

make
False    205
Name: make, dtype: int64

fuel-type
False    205
Name: fuel-type, dtype: int64

aspiration
False    205
Name: aspiration, dtype: int64

num-of-doors
False    203
True       2
Name: num-of-doors, dtype: int64

body-style
False    205
Name: body-style, dtype: int64

drive-wheels
False    205
Name: drive-wheels, dtype: int64

engine-location
False    205
Name: engine-location, dtype: int64

wheel-base
False    205
Name: wheel-base, dtype: int64

length
False    205
Name: length, dtype: int64

width
False    205
Name: width, dtype: int64

height
False    205
Name: height, dtype: int64

curb-weight
False    205
Name: curb-weight, dtype: int64

engine-type
False    205
Name: engine-type, dtype: int64

num-of-cylinders
False    205
Name: num-of-cylinders, dtype: int64

engine-size
Fals

In [7]:
# replacing the missing values of some columns with their respective mean depending upon their type
avg_norm_loss = df["normalized-losses"].astype("float").mean(axis=0)
df["normalized-losses"].replace(np.nan, avg_norm_loss, inplace=True)

avg_bore = df["bore"].astype("float").mean(axis=0)
df["bore"].replace(np.nan, avg_bore, inplace=True)

avg_stroke = df["stroke"].astype("float").mean(axis=0)
df["stroke"].replace(np.nan, avg_stroke, inplace=True)

avg_horsepower = df["horsepower"].astype("float").mean(axis=0)
df["horsepower"].replace(np.nan, avg_horsepower, inplace=True)

avg_peakrpm = df["peak-rpm"].astype("float").mean(axis=0)
df["peak-rpm"].replace(np.nan, avg_peakrpm, inplace=True)

In [8]:
# replacing the missing values of columns with their respective frequency depending upon their type
max_freq=df["num-of-doors"].value_counts().idxmax()
df["num-of-doors"].replace(np.nan, max_freq, inplace=True) # replacing nan with value having maximum frequency in the coloumn

In [9]:
# droping the rows that do not have price values
df.dropna(subset=["price"], axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [10]:
# changing the data types of some columns to proper data types
df[["normalized-losses"]] = df[["normalized-losses"]].astype("int")
df["bore"] = df["bore"].astype("float")
df["stroke"] = df["stroke"].astype("float")
df["peak-rpm"] = df["peak-rpm"].astype("float")
df["price"] = df["price"].astype("float")
df["horsepower"]=df["horsepower"].astype("int")

In [11]:
## Data Analysis

df.corr()

Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
Unnamed: 0,1.0,-0.161862,-0.241034,0.125728,0.162374,0.044613,0.251376,0.065501,-0.046752,0.246473,-0.162926,0.14477,-0.022466,-0.19843,0.027644,0.020049,-0.117889
symboling,-0.161862,1.0,0.466264,-0.535987,-0.365404,-0.242423,-0.55016,-0.233118,-0.110581,-0.140019,-0.008153,-0.182196,0.07581,0.27974,-0.035527,0.036233,-0.082391
normalized-losses,-0.241034,0.466264,1.0,-0.056661,0.019424,0.086802,-0.373737,0.099404,0.11236,-0.029862,0.055045,-0.114713,0.2173,0.239543,-0.225016,-0.181877,0.133999
wheel-base,0.125728,-0.535987,-0.056661,1.0,0.876024,0.814507,0.590742,0.782097,0.572027,0.493244,0.158018,0.250313,0.371178,-0.360305,-0.470606,-0.543304,0.584642
length,0.162374,-0.365404,0.019424,0.876024,1.0,0.85717,0.492063,0.880665,0.685025,0.608971,0.123952,0.159733,0.579795,-0.28597,-0.665192,-0.698142,0.690628
width,0.044613,-0.242423,0.086802,0.814507,0.85717,1.0,0.306002,0.866201,0.729436,0.544885,0.188822,0.189867,0.615056,-0.2458,-0.633531,-0.680635,0.751265
height,0.251376,-0.55016,-0.373737,0.590742,0.492063,0.306002,1.0,0.307581,0.074694,0.180449,-0.060663,0.259737,-0.087001,-0.309974,-0.0498,-0.104812,0.135486
curb-weight,0.065501,-0.233118,0.099404,0.782097,0.880665,0.866201,0.307581,1.0,0.849072,0.64406,0.167438,0.156433,0.757981,-0.279361,-0.749543,-0.794889,0.834415
engine-size,-0.046752,-0.110581,0.11236,0.572027,0.685025,0.729436,0.074694,0.849072,1.0,0.572609,0.205928,0.028889,0.822668,-0.256733,-0.650546,-0.679571,0.872335
bore,0.246473,-0.140019,-0.029862,0.493244,0.608971,0.544885,0.180449,0.64406,0.572609,1.0,-0.05539,0.001263,0.566903,-0.267392,-0.582027,-0.591309,0.543155


In [12]:
# We now have a better idea of which variables are important to take into account when predicting the car price.
# We have narrowed it down to the following variables: Length, Width, Curb-weight, Engine-size, Horsepower, City-mpg, Highway-mpg, Wheel-base, Bore and Drive-wheels
# We can further verify this by calculating Pearson Correlation and performing ANOVA test.

from scipy import stats

#Pearson Correlation
pearson_coef, p_value = stats.pearsonr(df['wheel-base'], df['price'])
print("The Pearson Correlation Coefficient is between wheel-base and price is", pearson_coef, " with a P-value of P =", p_value)

pearson_coef, p_value = stats.pearsonr(df['horsepower'], df['price'])
print("The Pearson Correlation Coefficient between horsepower and price is", pearson_coef, " with a P-value of P = ", p_value)

pearson_coef, p_value = stats.pearsonr(df['length'], df['price'])
print("The Pearson Correlation Coefficient between length and price is", pearson_coef, " with a P-value of P = ", p_value)

pearson_coef, p_value = stats.pearsonr(df['width'], df['price'])
print("The Pearson Correlation Coefficient between width and price is", pearson_coef, " with a P-value of P =", p_value)

pearson_coef, p_value = stats.pearsonr(df['curb-weight'], df['price'])
print( "The Pearson Correlation Coefficient between curb-weight and price is", pearson_coef, " with a P-value of P = ", p_value)

pearson_coef, p_value = stats.pearsonr(df['engine-size'], df['price'])
print("The Pearson Correlation Coefficient between engine-size and price is", pearson_coef, " with a P-value of P =", p_value)

pearson_coef, p_value = stats.pearsonr(df['bore'], df['price'])
print("The Pearson Correlation Coefficient between bore and price is", pearson_coef, " with a P-value of P =  ", p_value)

pearson_coef, p_value = stats.pearsonr(df['city-mpg'], df['price'])
print("The Pearson Correlation Coefficient between city-mpg and price is", pearson_coef, " with a P-value of P = ", p_value)

pearson_coef, p_value = stats.pearsonr(df['highway-mpg'], df['price'])
print( "The Pearson Correlation Coefficient between highway-mpg and price is", pearson_coef, " with a P-value of P = ", p_value)

The Pearson Correlation Coefficient is between wheel-base and price is 0.584641822265508  with a P-value of P = 8.076488270733218e-20
The Pearson Correlation Coefficient between horsepower and price is 0.8096068016571054  with a P-value of P =  6.273536270650504e-48
The Pearson Correlation Coefficient between length and price is 0.6906283804483642  with a P-value of P =  8.016477466158759e-30
The Pearson Correlation Coefficient between width and price is 0.7512653440522676  with a P-value of P = 9.2003355104806e-38
The Pearson Correlation Coefficient between curb-weight and price is 0.8344145257702843  with a P-value of P =  2.189577238894065e-53
The Pearson Correlation Coefficient between engine-size and price is 0.8723351674455182  with a P-value of P = 9.265491622200232e-64
The Pearson Correlation Coefficient between bore and price is 0.5431553832626603  with a P-value of P =   8.049189483935261e-17
The Pearson Correlation Coefficient between city-mpg and price is -0.686571006784467

In [13]:
# ANOVA test for drive-wheels

grouped_test = df[['drive-wheels', 'price']].groupby(['drive-wheels'])

f_val, p_val = stats.f_oneway(grouped_test.get_group('fwd')['price'], grouped_test.get_group('rwd')['price'], grouped_test.get_group('4wd')['price'])
print( "ANOVA results: F=", f_val, ", P =", p_val)

ANOVA results: F= 67.95406500780399 , P = 3.3945443577151245e-23


In [14]:
## Model Development

from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [15]:
# Linear Regression
X = df[['horsepower']] # We can use other variables to preidict the price of car
Y = df['price']
lm.fit(X, Y)
Yhat_lr = lm.predict(X)
Yhat_lr[0:5]

array([14515.26092092, 14515.26092092, 21919.43979421, 12965.54906372,
       15204.02174634])

In [16]:
# Multiple Linear Regression
Z = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']] # as these variables can be good predictor
lm.fit(Z, df['price'])
Yhat_mlr = lm.predict(Z)
Yhat_mlr[0:5]

array([13699.07700462, 13699.07700462, 19052.71346719, 10620.61524404,
       15520.90025344])

In [17]:
# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures

pr = PolynomialFeatures(degree=2)
X_pr=pr.fit_transform(X)
lm.fit(X_pr, Y)
Yhat_pr = lm.predict(X_pr)
Yhat_pr[0:5]

array([14176.02870825, 14176.02870825, 21719.32128836, 12693.19690932,
       14845.7313834 ])

In [18]:
# Model Evaluation using R-squared and Mean Squared Error(MSE)

from sklearn.metrics import mean_squared_error
# for linear regression
lm.fit(X, Y)
print('The R-square is: ', lm.score(X, Y))
mse = mean_squared_error(Y, Yhat_lr)
print('The mean square error of price and predicted value is: ', mse)

The R-square is:  0.6554631732894476
The mean square error of price and predicted value is:  21651264.45659664


In [19]:
# for multiple linear regression
lm.fit(Z, Y)
print('The R-square is: ', lm.score(Z, Y))
mse = mean_squared_error(Y, Yhat_mlr)
print('The mean square error of price and predicted value is: ', mse)

The R-square is:  0.8093732522175299
The mean square error of price and predicted value is:  11979300.349818882


In [20]:
# for polynomial regression
lm.fit(X_pr, Y)
print('The R-square is: ', lm.score(X_pr, Y))
mse = mean_squared_error(Y, Yhat_pr)
print('The mean square error of price and predicted value is: ', mse)

The R-square is:  0.6579997817691481
The mean square error of price and predicted value is:  21491859.781220734


In [21]:
# When comparing models, the model with the higher R-squared value and smaller MSE value is a better fit for the data. In our case it is Multiple Linear Regression.

In [22]:
## Model Evaluation and Refinement

# Split the data into training and test data

from sklearn.model_selection import train_test_split

x_data = df.drop('price', axis=1)
y_data = df['price']
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=0)

In [23]:
pr = PolynomialFeatures(degree=2)
x_train_pr=pr.fit_transform(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','wheel-base','bore']])
x_test_pr=pr.fit_transform(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','wheel-base','bore']])

In [24]:
# Ridge Regression

from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=0.1)
ridge_model.fit(x_train_pr, y_train)

  return linalg.solve(A, Xy, sym_pos=True,


Ridge(alpha=0.1)

In [25]:
yhat_ridge = ridge_model.predict(x_test_pr)

In [26]:
print('predicted:', yhat_ridge[0:4])
print('test set :', y_test[0:4].values)

predicted: [ 5737.20756836 10366.84601068 17137.25841682 18362.68492156]
test set : [ 6295. 10698. 13860. 13499.]


In [27]:
print('The R-square is: ', ridge_model.score(x_test_pr, y_test))

The R-square is:  0.6160010574062894


In [28]:
# We can use Grid Search class to find the best hyperparameter alpha

from sklearn.model_selection import GridSearchCV

parameter = [{'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
RR = Ridge()
GS = GridSearchCV(RR, parameter, cv=4)
GS.fit(x_data[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','wheel-base','bore']], y_data)
BestRR=GS.best_estimator_
BestRR

Ridge(alpha=10000)

In [29]:
# We now test our model using test data
print('The R-square is: ', BestRR.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','wheel-base','bore']], y_test))
final_predictions = BestRR.predict(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','wheel-base','bore']])
final_mse = mean_squared_error(y_test, final_predictions)
print('The mean square error of price and predicted value is: ', final_mse)

The R-square is:  0.8412808156514981
The mean square error of price and predicted value is:  11860347.920439804


In [30]:
BestRR.predict(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','wheel-base','bore']])

array([ 5460.5457777 , 10094.98551049, 19081.8192099 , 22332.69733646,
       21359.00440434,  9571.33374124, 13983.32117118,  5733.51824317,
       17942.68307124,  7041.30943076, 11237.91577917, 24538.57911641,
        7009.83747242,  8167.28884815, 19186.73152843, 16215.70780062,
        6241.40971654, 13807.20074192,  9746.31429552,  5803.48040518,
        8058.04495775, 12530.85117151, 34225.29120041,  7425.18676569,
       10372.81968263, 24328.12840757,  9255.65767798,  9649.51572333,
       18017.22281661, 34225.29120041, 28078.86151088, 10873.63560949,
        7149.98751502, 19344.88259595,  8811.13334224, 11131.2082442 ,
       13748.8411579 , 15728.0029557 , 11305.29624845, 15787.56788118,
        6728.49346958, 23048.14450836,  9112.46049353, 18179.71663866,
       17293.36717441, 12996.8783932 , 17954.22580559,  6090.4586445 ,
        5866.65111598, 10788.21001748, 36414.54372529, 24328.12840757,
       10437.56479521, 11142.11442118,  7132.39836423,  5866.65111598,
      

In [34]:
import pickle 

#creating/opening the file where we want to store our model 
file = open('model.pkl', 'wb')

#dump inofrmation into that file
pickle.dump(BestRR, file)

In [35]:
# Loading model to test the results
model = pickle.load(open('model.pkl','rb'))
print(model.predict([[200, 2548, 130, 27, 88.6, 3.47]]))