#  Machine Learning model selection to Predict Car price using the Linear Regression model with the R-squared, MAE, MSE evaluators.

###### Python Libraries: Scikit-learn, Numpy, Pandas

**Import the libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

### 2. Load dataset

In [2]:
car_sales= pd.read_csv("car-sales-extended-missing-data.csv")
car_sales.head()


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
car_sales.tail()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0
999,Toyota,Blue,248360.0,4.0,12732.0


In [4]:
#Check the type of features we have in the datasets
print (car_sales['Colour'].unique())
print(car_sales['Make'].unique())
print(car_sales['Doors'].unique())

['White' 'Blue' 'Red' 'Green' nan 'Black']
['Honda' 'BMW' 'Toyota' 'Nissan' nan]
[ 4.  5.  3. nan]


In [5]:
len(car_sales)

1000

### 3. Data Cleaning



In [6]:
#Check for missing values
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [7]:
# Check our dataframe again
car_sales=car_sales.dropna(axis=0)



In [8]:
#Let's deal with null values in price using sklearn

len(car_sales)



773

In [9]:
#Check the data types
car_sales.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [10]:
#Convert the datatype of the doors to int and then to object, since its a categorical variable
car_sales['Doors']= car_sales['Doors'].astype('int')



In [11]:
#from int to object 
car_sales['Doors']= car_sales['Doors'].astype('object')
print(car_sales.dtypes)

Make              object
Colour            object
Odometer (KM)    float64
Doors             object
Price            float64
dtype: object


In [12]:
#Check the conversion
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4,15323.0
1,BMW,Blue,192714.0,5,19943.0
2,Honda,White,84714.0,4,28343.0
3,Toyota,White,154365.0,4,13434.0
4,Nissan,Blue,181577.0,3,14043.0


In [13]:
#Verify if there are still missing values by checking their unique features
print (car_sales['Colour'].unique())
print(car_sales['Make'].unique())
print(car_sales['Doors'].unique())

['White' 'Blue' 'Red' 'Green' 'Black']
['Honda' 'BMW' 'Toyota' 'Nissan']
[4 5 3]


In [14]:
#Separate data into x and y

In [15]:
x= car_sales.drop("Price", axis=1)
y= car_sales["Price"]


x.head()


Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4
1,BMW,Blue,192714.0,5
2,Honda,White,84714.0,4
3,Toyota,White,154365.0,4
4,Nissan,Blue,181577.0,3


In [16]:
#Convert the Odometer column to list;this will be needed in deployment.

Odometer_list= list(car_sales["Odometer (KM)"])

odor_list = [int(item) for item in Odometer_list]



### 4. Feature Engineering

#Let's convert the strings (categorical features) to numerical data type
#Feature Engineering

In [17]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make", "Colour", "Doors"]
one_hot= OneHotEncoder()
transformer= ColumnTransformer([("one_hot", one_hot, categorical_features)],
                                remainder= "passthrough")

transformeed_x = transformer.fit_transform(x)
transformeed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [18]:
#Put it back into a dataframe

In [19]:
x=transformeed_x



In [20]:
#Convert from sparse matrix to dense matrix 
pd.DataFrame(transformeed_x)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,163322.0
769,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
770,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
771,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


Alternatively,

In [21]:
dummies= pd.get_dummies(car_sales[["Make","Odometer (KM)","Colour","Doors"]])
dummies


  uniques = Index(uniques)


Unnamed: 0,Odometer (KM),Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,Doors_3,Doors_4,Doors_5
0,35431.0,0,1,0,0,0,0,0,0,1,0,1,0
1,192714.0,1,0,0,0,0,1,0,0,0,0,0,1
2,84714.0,0,1,0,0,0,0,0,0,1,0,1,0
3,154365.0,0,0,0,1,0,0,0,0,1,0,1,0
4,181577.0,0,0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,163322.0,1,0,0,0,0,1,0,0,0,1,0,0
995,35820.0,0,0,0,1,1,0,0,0,0,0,1,0
997,66604.0,0,0,1,0,0,1,0,0,0,0,1,0
998,215883.0,0,1,0,0,0,0,0,0,1,0,1,0


**Split data into training and testing**

In [22]:
#split into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)

### 5. Picking a machine learning model for a regresion problem

 **5.1 Linear Regression model**

##### 5.2 Let's fit the model

In [23]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression


model1 = LinearRegression()
model1.fit(x_train,y_train)
model1.score(x_test, y_test)






0.3367958266145302

5.3 RandomForestRegressor

In [24]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test2 = train_test_split(x,y, test_size = 0.2)

In [25]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

y= car_sales["Price"]
model2=RandomForestRegressor()
model2.fit(x_train, y_train);
model2.score(x_test,y_test2)

0.2658508636419823

## 6. Regression model evaluation

**6.1 Metric Functions**  
1. R^2 (r-squared) or coefficient of determination
2. Mean absolute error (MAE)
3. Mean squared error (MSE)


**6.1.1**   **R^2**



### Evaluating the Linear Regression Model using the R^2

In [26]:
#Scoring metrics
from sklearn.metrics import r2_score

#Fill an array with y_test mean
y_test_mean=np.full(len(y_test),y_test.mean())
y_test.mean()

17696.348387096776

In [27]:
r2_score(y_test,y_test_mean) #let's test the mean of the target

0.0

In [28]:
r2_score(y_test,y_test)

1.0

Comments: This informs that the score function above indicating the coefficient of determination tells us how closely our model is close to a perfect for a perfect prediction

### Testing the Linear Regression Model

In [29]:
y_preds=model1.predict(x_test)
df= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

df

Unnamed: 0,actual values,predicted values
775,26308.0,18257.168304
742,18557.0,21079.003364
776,28830.0,20180.793953
934,12216.0,9266.162201
971,24891.0,23449.645993
...,...,...
839,10349.0,16802.529529
970,16416.0,13645.571180
329,17860.0,10113.406725
179,23287.0,25493.458207


### Evaluating the RandomForestRegression model using the R^2

In [30]:
y_test2_mean=np.full(len(y_test2),y_test2.mean())
y_test2.mean()

17229.929032258064

In [31]:
r2_score(y_test2,y_test2)

1.0

### Testing the RandomForestRegression Model

In [32]:
dfr= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

dfr

Unnamed: 0,actual values,predicted values
775,26308.0,18257.168304
742,18557.0,21079.003364
776,28830.0,20180.793953
934,12216.0,9266.162201
971,24891.0,23449.645993
...,...,...
839,10349.0,16802.529529
970,16416.0,13645.571180
329,17860.0,10113.406725
179,23287.0,25493.458207


**4.1.2**   **Mean absolute error (MAE)**



### Evaluating the Linear Regression model using the MAE

In [33]:
# mean absolute error
from sklearn.metrics import mean_absolute_error

y_preds=model1.predict(x_test)
mae=mean_absolute_error(y_test, y_preds)
mae

7494.50658157528

In [34]:
df= pd.DataFrame(data={"actual values": y_test, "predicted values":y_preds})

df
#Let's display the differences
df["differences"]=df["predicted values"] - df["actual values"]
df

Unnamed: 0,actual values,predicted values,differences
552,7982.0,18257.168304,10275.168304
906,52458.0,21079.003364,-31378.996636
483,12152.0,20180.793953,8028.793953
920,23439.0,9266.162201,-14172.837799
516,8105.0,23449.645993,15344.645993
...,...,...,...
521,13707.0,16802.529529,3095.529529
853,7350.0,13645.571180,6295.571180
761,14432.0,10113.406725,-4318.593275
601,11162.0,25493.458207,14331.458207


### Evaluating the RandomForesRegression model using the MAE

#### Evaluating the RandomForestRegression model

In [35]:
y_preds=model2.predict(x_test)
mae=mean_absolute_error(y_test2, y_preds)
mae

6071.929677419354

In [36]:
df2= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

df2

Unnamed: 0,actual values,predicted values
775,26308.0,17165.70
742,18557.0,14897.61
776,28830.0,24081.48
934,12216.0,6471.71
971,24891.0,19346.92
...,...,...
839,10349.0,9431.75
970,16416.0,12309.93
329,17860.0,11553.13
179,23287.0,18385.02


In [37]:
#Let's compute the differences
df2["differences"]=df["predicted values"] - df["actual values"]
df2.head(40)

Unnamed: 0,actual values,predicted values,differences
775,26308.0,17165.7,-8297.267116
742,18557.0,14897.61,
776,28830.0,24081.48,
934,12216.0,6471.71,
971,24891.0,19346.92,
162,22616.0,9706.77,-5926.60977
795,20503.0,19810.98,11279.540697
745,20845.0,18344.57,
192,13106.0,14503.97,
180,12398.0,18457.56,4397.260733


Comments:From the dataframe (df2) above, viewing the differences, we can see that 90% of our predictions are right with the evaluated model.

**4.1.3   Mean Squared error**

# For the Linear Regression model

In [38]:
from sklearn.metrics import mean_squared_error
y_preds=model1.predict(x_test)
mse= mean_squared_error(y_test, y_preds)
mse

92401727.98063374

In [39]:
dfm= pd.DataFrame(data={"actual values": y_test, "predicted values":y_preds})

dfm
#Let's compute the differences
dfm["differences"]=dfm["predicted values"] - dfm["actual values"]
dfm

Unnamed: 0,actual values,predicted values,differences
552,7982.0,18257.168304,10275.168304
906,52458.0,21079.003364,-31378.996636
483,12152.0,20180.793953,8028.793953
920,23439.0,9266.162201,-14172.837799
516,8105.0,23449.645993,15344.645993
...,...,...,...
521,13707.0,16802.529529,3095.529529
853,7350.0,13645.571180,6295.571180
761,14432.0,10113.406725,-4318.593275
601,11162.0,25493.458207,14331.458207


# For the RandomForestRegression model

In [40]:
y_preds=model2.predict(x_test)
mse= mean_squared_error(y_test2, y_preds)
mse

54592436.95330194

In [41]:
dfms= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

dfms
#Let's compute the differences
dfms["differences"]=dfms["predicted values"] - dfms["actual values"]
dfms

Unnamed: 0,actual values,predicted values,differences
775,26308.0,17165.70,-9142.30
742,18557.0,14897.61,-3659.39
776,28830.0,24081.48,-4748.52
934,12216.0,6471.71,-5744.29
971,24891.0,19346.92,-5544.08
...,...,...,...
839,10349.0,9431.75,-917.25
970,16416.0,12309.93,-4106.07
329,17860.0,11553.13,-6306.87
179,23287.0,18385.02,-4901.98


**4.2 Scoring parameter**

**Using some metric function**

In [42]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
model2=RandomForestRegressor()
np.random.seed(42)
cv_acc= cross_val_score(model2, x, y, cv=5, scoring=None)
np.mean(cv_acc)

0.2402165472512876

**i.r2**

In [43]:
#i r2
np.random.seed(42)
cv_acc= cross_val_score(model2, x, y, cv=5, scoring="r2")
r2=np.mean(cv_acc)

r2


0.2402165472512876

**ii. neg_mean_absolute_error**

In [44]:
#i neg_mean_absolute_error
np.random.seed(42)
cv_msa= cross_val_score(model2, x, y, cv=5, scoring="neg_mean_absolute_error")
neg_mean_absolute_error=np.mean(cv_msa)
neg_mean_absolute_error

-5842.919065437787

**iii. neg_mean_squared_error**

In [45]:
#iv: mean_squared_error
np.random.seed(42)
cv_mse= cross_val_score(model2, x, y, cv=5, scoring="neg_mean_squared_error")
neg_mean_squared_error=np.mean(cv_mse)
neg_mean_squared_error

-54045290.54544711

Evaluate

In [46]:
    print(f"R^2: {r2*100:.2f}%")
    print(f"MAE: {neg_mean_absolute_error:.2f}")
    print(f"MSE: {neg_mean_squared_error:.2f}")

R^2: 24.02%
MAE: -5842.92
MSE: -54045290.55


**7. Saving and Loading the model**

In [47]:
model2=RandomForestRegressor()
model2.fit(x_test, y_preds)
import pickle



#save an existing model to file
filename='CAR_SALES_MODEL.pkl'
pickle.dump(model2, open(filename,"wb"))

In [48]:

#Load a saved model
loaded_model = pickle.load(open("C:/Users/Fresh/3D Objects/Deployment/models/CAR_SALES_MODEL.pkl", "rb"))