## **PREDICTING THE PRICE OF BMW USED CARS**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [5]:
# read the data

bmw = pd.read_csv(r"C:\Users\Michelle\Documents\Datasets\Datasets I can Use\Used Car Dataset\bmw.csv")
bmw.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0


In [7]:
# Find if there are any columns with one unique value i.e no predictive power

bmw.nunique()

model             24
year              25
price           3777
transmission       3
mileage         8086
fuelType           5
tax               38
mpg              102
engineSize        17
dtype: int64

In [9]:
# Check for missing data

bmw.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [10]:
missing_values = [np.nan, "", " ", None]

bmw.isin(missing_values).mean().sort_values(ascending=False) * 100

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [15]:
# Separate out our target column

X = bmw.drop(columns="price")
y = bmw["price"]

In [17]:
# Dictionary for model types

car_type = {'5 Series':'sedan',
 '6 Series':'coupe',
 '1 Series':'coupe',
 '7 Series':'sedan',
 '2 Series':'coupe',
 '4 Series':'coupe',
 'X3':'suv',
 '3 Series':'sedan',
 'X5':'suv',
 'X4':'suv',
 'i3':'electric',
 'X1':'suv',
 'M4':'sports',
 'X2':'suv',
 'X6':'suv',
 '8 Series':'coupe',
 'Z4':'convertible',
 'X7':'suv',
 'M5':'sports',
 'i8':'electric',
 'M2':'sports',
 'M3':'sports',
 'M6':'sports',
 'Z3':'convertible'}

### Feature Engineering

In [18]:
# We're going to add a classification that I manually put together 

X["model"] = X["model"].str.strip()
X["car_type"] = X["model"].map(car_type)
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,car_type
0,5 Series,2014,Automatic,67068,Diesel,125,57.6,2.0,sedan
1,6 Series,2018,Automatic,14827,Petrol,145,42.8,2.0,coupe
2,5 Series,2016,Automatic,62794,Diesel,160,51.4,3.0,sedan
3,1 Series,2017,Automatic,26676,Diesel,145,72.4,1.5,coupe
4,7 Series,2014,Automatic,39554,Diesel,160,50.4,3.0,sedan
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,Automatic,40818,Diesel,150,54.3,2.0,suv
10777,5 Series,2016,Automatic,42947,Diesel,125,60.1,2.0,sedan
10778,3 Series,2017,Manual,25468,Petrol,200,42.8,2.0,sedan
10779,1 Series,2014,Automatic,45000,Diesel,30,64.2,2.0,coupe


In [29]:
# view the distribution of car types

print((X['car_type'].value_counts()/ len(X) * 100).round(1).astype(str) + '%')

coupe          40.3%
sedan          33.4%
suv            22.7%
sports          1.9%
convertible     1.1%
electric        0.6%
Name: car_type, dtype: object


In [30]:
# Encode the data using one hot encoding with drop_first to avoid the dummy variable trap

X = pd.get_dummies(X, drop_first=True)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,car_type_coupe,car_type_electric,car_type_sedan,car_type_sports,car_type_suv
0,2014,67068,125,57.6,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2018,14827,145,42.8,2.0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
2,2016,62794,160,51.4,3.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,2017,26676,145,72.4,1.5,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2014,39554,160,50.4,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,40818,150,54.3,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10777,2016,42947,125,60.1,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
10778,2017,25468,200,42.8,2.0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
10779,2014,45000,30,64.2,2.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [32]:
# Scale the data to allow our algorithm to perform well by limiting the scale to 0-1

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,car_type_coupe,car_type_electric,car_type_sedan,car_type_sports,car_type_suv
0,0.750000,0.313399,0.215517,0.111971,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.916667,0.069281,0.250000,0.080163,0.303030,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.833333,0.293427,0.275862,0.098646,0.454545,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.875000,0.124650,0.250000,0.143778,0.227273,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.750000,0.184828,0.275862,0.096497,0.454545,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,0.833333,0.190735,0.258621,0.104879,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10777,0.833333,0.200683,0.215517,0.117344,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10778,0.875000,0.119005,0.344828,0.080163,0.303030,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
10779,0.750000,0.210277,0.051724,0.126155,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### LINEAR REGRESSION

In [40]:
# Train - Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

## Import our ML Algorithm
from sklearn.linear_model import LinearRegression

## Import our metric
from sklearn.metrics import mean_absolute_error

## Create our model object
linear_regressor = LinearRegression()

In [41]:
## Training phase
## Fit the model object to our data 
linear_regressor.fit(X_train, y_train)

## Create predictions with out newly trained model
linear_predictions = linear_regressor.predict(X_test)

## Measure the efficacy of our algortihm using the metric
mean_absolute_error(y_test, linear_predictions)

2699.3441657855533

In [48]:
## Percentage of Error of the Mean Value
(mean_absolute_error(y_test, linear_predictions) / y_test.mean() *100).round(1)

12.0

In [39]:
## RANDOM FOREST

In [42]:
### Import our Machine Learning Algorithm
from sklearn.ensemble import RandomForestRegressor
### Import our metric
from sklearn.metrics import mean_absolute_error

# Create a model object
random_forest_regressor = RandomForestRegressor(n_estimators=1000)

# Fit the object to our data (this is the training phase)
random_forest_regressor.fit(X_train, y_train)

# Create predictions with your newly trained model
random_forest_predictions = random_forest_regressor.predict(X_test)

# Measure the efficacy of your algorithm using your metric
mean_absolute_error(y_test, random_forest_predictions)

1523.4088206972344

In [47]:
## Percentage of Error of the Mean Value
(mean_absolute_error(y_test, random_forest_predictions) / y_test.mean() * 100).round(1)

6.8

In [None]:
## XG Boost

In [53]:
### Import our Machine Learning Algorithm
from xgboost import XGBRegressor
### Import our metric
from sklearn.metrics import mean_absolute_error


# Create a model object
boost_model = XGBRegressor()

# Fit the object to our data (this is the training phase)
boost_model.fit(X_train, y_train)

# Create predictions with your newly trained model
boost_predictions = boost_model.predict(X_test)

# Measure the efficacy of your algorithm using your metric
mean_absolute_error(y_test, boost_predictions)


1512.168397807226

In [54]:
## Percentage of Error of the Mean Value
(mean_absolute_error(y_test, boost_predictions) / y_test.mean() * 100).round(1)

6.7

In [61]:
# Hyperparameter tuning

from sklearn.model_selection import GridSearchCV
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

n_estimators = [1500, 1600]
# Number of features to consider at every split
max_features = [1.0]
# Maximum number of levels in tree
max_depth = [80, 90]
# Minimum number of samples required to split a node
min_samples_split = [5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [1500, 1600],
 'max_features': [1.0],
 'max_depth': [80, 90],
 'min_samples_split': [5],
 'min_samples_leaf': [1],
 'bootstrap': [True]}

In [67]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, cv = 3, verbose=2, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

Fitting 3 folds for each of 4 candidates, totalling 12 fits


{'bootstrap': True,
 'max_depth': 80,
 'max_features': 1.0,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 1600}

In [68]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

perfect_random_forest = RandomForestRegressor(n_estimators=1600, min_samples_split=5, min_samples_leaf=1, max_features=1.0, max_depth=90, bootstrap=True)
perfect_random_forest.fit(X_train, y_train)

perfect_random_forest_predictions = perfect_random_forest.predict(X_test)

mean_absolute_error(y_test, perfect_random_forest_predictions)

1501.3088271724266

In [70]:
## Percentage of Error of the Mean Value
(mean_absolute_error(y_test, perfect_random_forest_predictions) / y_test.mean() * 100).round(1)

6.7

Small improvement to the % of error