# Modelling

In [3]:
%matplotlib inline 
# Make plots appear inside the notebook

# EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import missingno as msno 

# Models from Scikit-Learn
import sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

# Model evaluation 
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

## Evaluation Methods

In [4]:
def rmsle(y_true, y_preds):
    """
    Root mean squared log error (RMSLE) between true labels and predictions.

    Takes the square root of scikit-learn's `mean_squared_log_error`.
    """
    squared_log_error = mean_squared_log_error(y_true, y_preds)
    return np.sqrt(squared_log_error)

# Create function to evaluate the model on a few different levels
def show_scores(model, include_rmsle=True):
    """
    Score a fitted model on the current train and test splits.

    Reads the notebook-level `X_train`, `X_test`, `y_train` and `y_test`, so it
    always reports on whichever split was created most recently.

    Parameters
    ----------
    model : fitted estimator exposing `predict`
    include_rmsle : bool, default True
        When False, the two RMSLE entries are left out of the result.

    Returns
    -------
    dict
        MAE, (optionally) RMSLE and R^2 for the training and the test split.
    """
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # A test error that is clearly HIGHER than the training error (or a test
    # R^2 clearly lower than the training R^2) is the sign of overfitting.
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Test MAE": mean_absolute_error(y_test, test_preds)}
    if include_rmsle:
        scores["Training RMSLE"] = rmsle(y_train, train_preds)
        scores["Test RMSLE"] = rmsle(y_test, test_preds)
    scores["Training R^2"] = r2_score(y_train, train_preds)
    scores["Test R^2"] = r2_score(y_test, test_preds)
    return scores

# Create function to evaluate the model on a few different levels (NO RMSLE)
def show_scores1(model):
    """
    Same report as `show_scores` but without the RMSLE entries.

    NOTE(review): presumably meant for models that can predict negative prices,
    which `mean_squared_log_error` rejects - confirm.
    """
    return show_scores(model, include_rmsle=False)

def get_scores_dataframe(model, X=None, y=None):
    """
    Build a per-row comparison of actual and predicted values.

    Parameters
    ----------
    model : fitted estimator exposing `predict`
    X : features to predict on, optional
        Defaults to the notebook-level `X_test`.
    y : true target values matching `X`, optional
        Defaults to the notebook-level `y_test`.

    Returns
    -------
    pandas.DataFrame
        Columns "actual values", "predicted values" and "differences"
        (the absolute error of each prediction).

    Raises
    ------
    ValueError
        If only one of `X` / `y` is given; mixing an explicit argument with the
        notebook global would silently compare mismatched data.
    """
    if (X is None) != (y is None):
        raise ValueError("Pass both X and y, or neither.")
    if X is None:
        # Fall back to the split currently held in the notebook namespace
        X, y = X_test, y_test

    preds = model.predict(X)

    df = pd.DataFrame(data={"actual values": y,
                           "predicted values": preds})

    df["differences"] = np.abs(df["predicted values"] - df["actual values"])

    return df

## Import Clean Datasets

In [5]:
# Load the cleaned datasets; each one feeds a modelling section below.
# low_memory=False has pandas parse each file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
# "Dropped Missing Values & Outliers" dataset (sections 1 and 4)
df_dropped_empty_outliers = pd.read_csv("df_dropped_empty_outliers.csv", low_memory=False)
# "Filled Missing Values & Outliers (General Median)" dataset (section 3)
df_filled_empty_outliers = pd.read_csv("df_filled_empty_outliers.csv", low_memory=False)
# "Filled Missing Values & Outliers (Dynamic Median)" dataset (section 2)
df_filled_empty_outliers_dyn = pd.read_csv("df_filled_empty_outliers_dyn.csv", low_memory=False)
# "Dropped Missing Values & Outliers" dataset limited to cars from 2010 and after (section 5)
df_2010_filtered = pd.read_csv("df_2010_filtered.csv", low_memory=False)

## 1) Dropped Missing Values & Outliers Dataset

### Split Dataset

In [6]:
# Separate the target column ("price") from the feature columns
y = df_dropped_empty_outliers["price"]
X = df_dropped_empty_outliers.drop(columns="price")

In [7]:
X.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,17,2014,16,21358,3,7,3,57923.0,1,3,0,0,9,11,2
1,17,2010,9,21668,3,7,3,71229.0,1,3,0,0,9,2,2
2,17,2020,9,21712,3,7,3,19160.0,1,3,0,0,9,9,2
3,17,2017,41,24411,3,7,3,41124.0,1,3,0,0,9,9,2
4,17,2013,15,12674,1,6,3,128000.0,1,1,3,2,11,1,2


In [8]:
y.head()

0    33590
1    22590
2    39590
3    30990
4    15000
Name: price, dtype: int64

In [9]:
# Split data into train & test sets (80% / 20%)
# The global NumPy seed is still set for any later code that draws from it,
# but the split gets its own random_state so it does not depend on the state
# of the global random stream.
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
336782,376,2011,43,25510,1,6,3,126129.0,1,1,1,0,0,0,47
323491,97,2000,22,15139,4,7,1,143500.0,1,1,1,0,1,11,44
214453,242,2007,9,23297,1,7,3,252180.0,1,1,1,0,0,1,30
95102,309,2019,36,1798,3,0,1,32119.0,1,3,1,0,9,1,10
74004,233,2003,18,9154,0,0,3,119000.0,1,1,0,0,0,0,7


In [11]:
y_train.head()

336782     8900
323491     4400
214453     9900
95102     52990
74004      4650
Name: price, dtype: int64

### Random Forest Regressor Model

In [216]:
%%time

# Random forest with 100 trees and a fixed seed, fit on the section-1 split.
# `rf_model` is rebound in every later section, so it always refers to the latest fit.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

CPU times: total: 2min 9s
Wall time: 2min 9s


In [217]:
show_scores(rf_model)

{'Training MAE': 697.9195481852872,
 'Test MAE': 1831.6168106188886,
 'Training RMSLE': 0.2183472120271417,
 'Test RMSLE': 0.3742718993530793,
 'Training R^2': 0.987665013411039,
 'Test R^2': 0.9152470704793563}

In [None]:
# create dataframe containing actual values, predictions and the differences between them
df_rf_scores = get_scores_dataframe(rf_model)
df_rf_scores.sort_values(by="differences").tail(100)

In [220]:
rf_model.feature_importances_

array([0.015, 0.43 , 0.044, 0.081, 0.017, 0.068, 0.066, 0.121, 0.01 ,
       0.01 , 0.075, 0.004, 0.027, 0.014, 0.017])

In [13]:
# check column names
X_train.columns

Index(['region', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size',
       'type', 'paint_color', 'state'],
      dtype='object')

**The feature importances show that year and odometer are the two most important features for the model.**

### Gradient Boosting Regressor Model

In [221]:
%%time

# Gradient boosting with 100 boosting stages and a fixed seed, fit on the section-1 split.
gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr_model.fit(X_train, y_train)

CPU times: total: 30 s
Wall time: 30.2 s


In [222]:
show_scores1(gbr_model)

{'Training MAE': 4519.369483524598,
 'Test MAE': 4486.561953282086,
 'Training R^2': 0.7322294812812037,
 'Test R^2': 0.7319879327283059}

In [223]:
# create dataframe containing actual values, predictions and the differences between them
df_gbr_scores = get_scores_dataframe(gbr_model)
df_gbr_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
131721,39000,2878.339307,36121.660693
86684,576,36747.154505,36171.154505
206450,743,36952.304904,36209.304904
124776,531,36760.847793,36229.847793
363501,531,36760.847793,36229.847793
...,...,...,...
278344,745,43958.519780,43213.519780
273735,840,44203.095443,43363.095443
186456,54500,9704.739785,44795.260215
275368,1095,46297.057407,45202.057407


### Lasso Regression Model

In [224]:
%%time

# Lasso (L1-penalised linear regression) with penalty strength alpha=0.1.
# NOTE(review): the features are passed in as-is (no scaling step in this notebook);
# the categorical columns look like integer codes - confirm a linear model on them is intended.
las_model = Lasso(alpha=0.1, random_state=42)
las_model.fit(X_train, y_train)

CPU times: total: 156 ms
Wall time: 92.8 ms


In [225]:
show_scores1(las_model)

{'Training MAE': 7536.884155437472,
 'Test MAE': 7466.536173909178,
 'Training R^2': 0.4009491877173812,
 'Test R^2': 0.4023523636155324}

In [226]:
# create dataframe containing actual values, predictions and the differences between them
df_las_scores = get_scores_dataframe(las_model)
df_las_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
122923,41200,2750.685188,38449.314812
334179,55995,17502.770540,38492.229460
293541,55000,16507.646075,38492.353925
253728,49988,11482.421666,38505.578334
216259,54980,16470.428548,38509.571452
...,...,...,...
68194,54999,7291.655108,47707.344892
139821,54995,7108.718679,47886.281321
357415,54880,6795.697280,48084.302720
305047,57500,8762.299088,48737.700912


### Conclusion

**Of the three models, the Random Forest Regressor performed best on this dataset. It achieved a Training MAE of 697.92 and a Test MAE of 1831.62.**

## 2) Filled Missing Values & Outliers Dataset (Dynamic Median)

In [14]:
df_filled_empty_outliers_dyn.head(10)

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,median_price,median_odometer
0,17,33590.0,106,16,22705,3,7,3,57923.0,1,3,0,0,9,11,2,25990.0,87149.0
1,17,22590.0,102,9,23067,3,7,3,71229.0,1,3,0,0,9,2,2,10975.0,126050.0
2,17,39590.0,112,9,23111,3,7,3,19160.0,1,3,0,0,9,9,2,37990.0,15191.0
3,17,30990.0,109,41,25915,3,7,3,41124.0,1,3,0,0,9,9,2,24995.0,42314.0
4,17,15000.0,105,15,13545,1,6,3,128000.0,1,1,3,2,11,1,2,14990.0,116700.0
5,17,27990.0,104,16,22817,3,7,3,68696.0,1,3,1,0,9,1,2,17337.0,118948.0
6,17,34590.0,108,9,23124,3,6,3,29499.0,1,3,1,0,9,10,2,26990.0,66244.5
7,17,35000.0,111,41,24544,1,6,3,43000.0,1,1,1,0,11,6,2,34900.0,22990.5
8,17,29990.0,108,9,10306,3,6,3,17302.0,1,3,1,0,9,9,2,26990.0,66244.5
9,17,38590.0,103,9,10708,3,7,3,30237.0,1,3,3,0,8,9,2,11950.0,126946.0


### Split Dataset

In [15]:
# Separate the target column ("price") from the feature columns
y = df_filled_empty_outliers_dyn["price"]
X = df_filled_empty_outliers_dyn.drop(columns="price")

In [16]:
X.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,median_price,median_odometer
0,17,106,16,22705,3,7,3,57923.0,1,3,0,0,9,11,2,25990.0,87149.0
1,17,102,9,23067,3,7,3,71229.0,1,3,0,0,9,2,2,10975.0,126050.0
2,17,112,9,23111,3,7,3,19160.0,1,3,0,0,9,9,2,37990.0,15191.0
3,17,109,41,25915,3,7,3,41124.0,1,3,0,0,9,9,2,24995.0,42314.0
4,17,105,15,13545,1,6,3,128000.0,1,1,3,2,11,1,2,14990.0,116700.0


In [17]:
y.head()

0    33590.0
1    22590.0
2    39590.0
3    30990.0
4    15000.0
Name: price, dtype: float64

In [18]:
# Split data into train & test sets (80% / 20%)
# The global NumPy seed is still set for any later code that draws from it,
# but the split gets its own random_state so it does not depend on the state
# of the global random stream.
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
X_train.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,median_price,median_odometer
293682,297,99,30,10507,1,0,3,81081.0,1,1,2,0,5,0,38,6300.0,92145.0
324174,231,107,1,28,1,0,3,129000.0,5,1,0,0,0,11,43,20590.0,83771.0
156301,199,104,15,19087,4,6,3,149672.0,1,1,3,0,0,9,18,14990.0,119120.0
263474,60,96,15,13009,3,6,3,169000.0,1,1,1,0,1,2,36,7422.5,155000.0
102707,340,110,9,10302,3,6,3,12941.0,1,3,1,0,9,1,10,28990.0,39029.5


In [20]:
y_train.head()

293682     7995.0
324174    10882.0
156301    11995.0
263474     4500.0
102707    39990.0
Name: price, dtype: float64

### Random Forest Regressor Model

In [238]:
%%time

# Same random-forest configuration as in section 1 (100 trees, fixed seed),
# now fit on the dynamic-median dataset split; rebinds `rf_model`.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

CPU times: total: 2min 15s
Wall time: 2min 16s


In [239]:
show_scores(rf_model)

{'Training MAE': 728.9362949248923,
 'Test MAE': 1932.4347689449285,
 'Training RMSLE': 0.21858401502514713,
 'Test RMSLE': 0.3833004672361087,
 'Training R^2': 0.9868580246334404,
 'Test R^2': 0.9073897670202533}

In [240]:
# create dataframe containing actual values, predictions and the differences between them
df_rf_scores = get_scores_dataframe(rf_model)
df_rf_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
372904,57400.0,25361.390000,32038.610000
197812,47500.0,15461.100000,32038.900000
84137,44522.5,12480.645000,32041.855000
79691,2000.0,34099.010000,32099.010000
378993,39995.0,7796.850000,32198.150000
...,...,...,...
370552,784.0,45307.920000,44523.920000
292514,724.0,46081.860000,45357.860000
79929,55000.0,9302.213333,45697.786667
28385,549.0,47331.810000,46782.810000


### Gradient Boosting Regressor Model

In [241]:
X_train.isna().sum()

region          0
year            0
manufacturer    0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
state           0
dtype: int64

In [242]:
%%time

# Same gradient-boosting configuration as in section 1 (100 stages, fixed seed),
# now fit on the dynamic-median dataset split; rebinds `gbr_model`.
gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr_model.fit(X_train, y_train)

CPU times: total: 31.6 s
Wall time: 31.8 s


In [243]:
show_scores1(gbr_model)

{'Training MAE': 4589.609520857405,
 'Test MAE': 4590.489316953101,
 'Training R^2': 0.7290031092182032,
 'Test R^2': 0.7279538055168722}

In [244]:
# create dataframe containing actual values, predictions and the differences between them
df_gbr_scores = get_scores_dataframe(gbr_model)
df_gbr_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
137134,39000.0,3015.692628,35984.307372
287202,954.0,36951.285763,35997.285763
121862,954.0,36951.285763,35997.285763
368994,669.0,36686.965709,36017.965709
369387,580.0,36627.087172,36047.087172
...,...,...,...
1567,55000.0,11001.339448,43998.660552
322178,49000.0,4292.003948,44707.996052
205767,54000.0,8363.232699,45636.767301
79929,55000.0,9189.266334,45810.733666


### Lasso Regression Model

In [245]:
%%time

# Same Lasso configuration as in section 1 (alpha=0.1),
# now fit on the dynamic-median dataset split; rebinds `las_model`.
las_model = Lasso(alpha=0.1, random_state=42)
las_model.fit(X_train, y_train)

CPU times: total: 203 ms
Wall time: 62.5 ms


In [246]:
show_scores1(las_model)

{'Training MAE': 7566.134785826512,
 'Test MAE': 7586.195084847863,
 'Training R^2': 0.40692442158172404,
 'Test R^2': 0.40470812482164786}

In [247]:
# create dataframe containing actual values, predictions and the differences between them
df_las_scores = get_scores_dataframe(las_model)
df_las_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
21474,44900.0,6020.558472,38879.441528
273490,49988.0,11105.713682,38882.286318
19074,50980.0,12067.809742,38912.190258
221763,50350.0,11410.414830,38939.585170
4049,50980.0,12031.898588,38948.101412
...,...,...,...
302242,54000.0,2231.664876,51768.335124
164363,55000.0,3117.611551,51882.388449
222932,56900.0,4954.982423,51945.017577
43717,54000.0,1941.696932,52058.303068


### Conclusion

**Of the three models, the Random Forest Regressor performed best on this dataset. It achieved a Training MAE of 728.94 and a Test MAE of 1932.43.**

## 3) Filled Missing Values & Outliers Dataset (General Median)

In [21]:
df_filled_empty_outliers.head(10)

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,17,33590.0,106,15,22705,3,7,3,57923.0,1,3,0,0,9,11,2
1,17,22590.0,102,8,23067,3,7,3,71229.0,1,3,0,0,9,2,2
2,17,39590.0,112,8,23111,3,7,3,19160.0,1,3,0,0,9,9,2
3,17,30990.0,109,40,25915,3,7,3,41124.0,1,3,0,0,9,9,2
4,17,15000.0,105,14,13545,1,6,3,128000.0,1,1,3,2,11,1,2
5,17,27990.0,104,15,22817,3,7,3,68696.0,1,3,1,0,9,1,2
6,17,34590.0,108,8,23124,3,6,3,29499.0,1,3,1,0,9,10,2
7,17,35000.0,111,40,24544,1,6,3,43000.0,1,1,1,0,11,6,2
8,17,29990.0,108,8,10306,3,6,3,17302.0,1,3,1,0,9,9,2
9,17,38590.0,103,8,10708,3,7,3,30237.0,1,3,3,0,8,9,2


### Split Dataset

In [22]:
# Separate the target column ("price") from the feature columns
y = df_filled_empty_outliers["price"]
X = df_filled_empty_outliers.drop(columns="price")

In [23]:
X.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,17,106,15,22705,3,7,3,57923.0,1,3,0,0,9,11,2
1,17,102,8,23067,3,7,3,71229.0,1,3,0,0,9,2,2
2,17,112,8,23111,3,7,3,19160.0,1,3,0,0,9,9,2
3,17,109,40,25915,3,7,3,41124.0,1,3,0,0,9,9,2
4,17,105,14,13545,1,6,3,128000.0,1,1,3,2,11,1,2


In [24]:
y.head()

0    33590.0
1    22590.0
2    39590.0
3    30990.0
4    15000.0
Name: price, dtype: float64

In [25]:
# Split data into train & test sets (80% / 20%)
# The global NumPy seed is still set for any later code that draws from it,
# but the split gets its own random_state so it does not depend on the state
# of the global random stream.
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_train.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
293682,297,99,29,10507,1,0,3,81081.0,1,1,2,0,5,0,38
324174,231,107,0,28,1,0,3,129000.0,5,1,0,0,0,11,43
156301,199,104,14,19087,4,6,3,149672.0,1,1,3,0,0,9,18
263474,60,96,14,13009,3,6,3,169000.0,1,1,1,0,1,2,36
102707,340,110,8,10302,3,6,3,12941.0,1,3,1,0,9,1,10


In [27]:
y_train.head()

293682     7995.0
324174    10882.0
156301    11995.0
263474     4500.0
102707    39990.0
Name: price, dtype: float64

### Random Forest Regressor Model

In [255]:
%%time

# Same random-forest configuration as in section 1 (100 trees, fixed seed),
# now fit on the general-median dataset split; rebinds `rf_model`.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

CPU times: total: 2min 15s
Wall time: 2min 15s


In [256]:
show_scores(rf_model)

{'Training MAE': 755.2921518193015,
 'Test MAE': 2010.4903903917038,
 'Training RMSLE': 0.2172180052969636,
 'Test RMSLE': 0.38419821384051794,
 'Training R^2': 0.9849765039017367,
 'Test R^2': 0.8921277808211747}

In [257]:
# create dataframe containing actual values, predictions and the differences between them
df_rf_scores = get_scores_dataframe(rf_model)
df_rf_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
289139,50000.0,17418.640000,32581.360000
205670,8000.0,40599.620000,32599.620000
212956,529.0,33158.196667,32629.196667
217777,757.0,33573.415000,32816.415000
125037,550.0,33410.400000,32860.400000
...,...,...,...
214114,621.0,44713.410000,44092.410000
292247,789.0,47205.740000,46416.740000
79929,55000.0,8050.230000,46949.770000
212950,766.0,48526.220000,47760.220000


### Gradient Boosting Regressor Model

In [258]:
%%time

# Same gradient-boosting configuration as in section 1 (100 stages, fixed seed),
# now fit on the general-median dataset split; rebinds `gbr_model`.
gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr_model.fit(X_train, y_train)

CPU times: total: 30.6 s
Wall time: 30.8 s


In [259]:
show_scores1(gbr_model)

{'Training MAE': 4834.259118213068,
 'Test MAE': 4839.898554433496,
 'Training R^2': 0.6841754707853704,
 'Test R^2': 0.6826325108438909}

In [260]:
# create dataframe containing actual values, predictions and the differences between them
df_gbr_scores = get_scores_dataframe(gbr_model)
df_gbr_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
262196,45000.0,10651.669707,34348.330293
348371,50000.0,15596.184494,34403.815506
282163,655.0,35090.763986,34435.763986
290729,856.0,35350.240503,34494.240503
291008,556.0,35090.763986,34534.763986
...,...,...,...
205767,54000.0,10200.638715,43799.361285
79929,55000.0,10506.865478,44493.134522
1567,55000.0,10333.182138,44666.817862
322178,49000.0,4311.219156,44688.780844


### Lasso Regression Model

In [261]:
%%time

# Same Lasso configuration as in section 1 (alpha=0.1),
# now fit on the general-median dataset split; rebinds `las_model`.
las_model = Lasso(alpha=0.1, random_state=42)
las_model.fit(X_train, y_train)

CPU times: total: 281 ms
Wall time: 85 ms


In [262]:
show_scores1(las_model)

{'Training MAE': 7591.406155218309,
 'Test MAE': 7618.1857634561275,
 'Training R^2': 0.377997199314847,
 'Test R^2': 0.37423654883424917}

In [263]:
# create dataframe containing actual values, predictions and the differences between them
df_las_scores = get_scores_dataframe(las_model)
df_las_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
92845,46998.0,8015.731620,38982.268380
344526,45995.0,6973.547902,39021.452098
384273,49999.0,10925.674670,39073.325330
116669,40000.0,886.213294,39113.786706
93354,49998.0,10834.398753,39163.601247
...,...,...,...
27315,57995.0,8504.153918,49490.846082
205767,54000.0,4219.180574,49780.819426
164363,55000.0,3427.640511,51572.359489
203573,56900.0,4350.596080,52549.403920


In [28]:
X.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,17,106,15,22705,3,7,3,57923.0,1,3,0,0,9,11,2
1,17,102,8,23067,3,7,3,71229.0,1,3,0,0,9,2,2
2,17,112,8,23111,3,7,3,19160.0,1,3,0,0,9,9,2
3,17,109,40,25915,3,7,3,41124.0,1,3,0,0,9,9,2
4,17,105,14,13545,1,6,3,128000.0,1,1,3,2,11,1,2


In [29]:
X.isna().sum()

region          0
year            0
manufacturer    0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
state           0
dtype: int64

### Conclusion

**Of the three models, the Random Forest Regressor performed best on this dataset. It achieved a Training MAE of 755.29 and a Test MAE of 2010.49.**

## 4) Random Forest With Less Features

In [30]:
# Reduce the section-1 dataset to price, year, odometer, transmission and size
# by dropping every other column.
# NOTE(review): going by the X_train column order and the feature_importances_
# printed in section 1, transmission (~0.010) and size (~0.004) are among the
# least important features, while model, drive, cylinders and fuel rank higher
# and are dropped here - confirm this selection is intended.
df_selected_features = df_dropped_empty_outliers.drop(columns=["region", "manufacturer", "model", "condition", "cylinders", "fuel", "title_status", "drive", "type", "paint_color", "state"])

In [31]:
df_selected_features

Unnamed: 0,price,year,odometer,transmission,size
0,33590,2014,57923.0,3,0
1,22590,2010,71229.0,3,0
2,39590,2020,19160.0,3,0
3,30990,2017,41124.0,3,0
4,15000,2013,128000.0,1,2
...,...,...,...,...,...
370457,23590,2019,32226.0,3,0
370458,30590,2020,12029.0,3,0
370459,34990,2020,4174.0,3,0
370460,28990,2018,30112.0,3,0


### Split Dataset

In [34]:
# Separate the target column ("price") from the feature columns
y = df_selected_features["price"]
X = df_selected_features.drop(columns="price")

In [35]:
X.head()

Unnamed: 0,year,odometer,transmission,size
0,2014,57923.0,3,0
1,2010,71229.0,3,0
2,2020,19160.0,3,0
3,2017,41124.0,3,0
4,2013,128000.0,1,2


In [36]:
y.head()

0    33590
1    22590
2    39590
3    30990
4    15000
Name: price, dtype: int64

In [37]:
# Split data into train & test sets (80% / 20%)
# The global NumPy seed is still set for any later code that draws from it,
# but the split gets its own random_state so it does not depend on the state
# of the global random stream.
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
X_train.head()

Unnamed: 0,year,odometer,transmission,size
336782,2011,126129.0,1,0
323491,2000,143500.0,1,0
214453,2007,252180.0,1,0
95102,2019,32119.0,3,0
74004,2003,119000.0,1,0


In [39]:
y_train.head()

336782     8900
323491     4400
214453     9900
95102     52990
74004      4650
Name: price, dtype: int64

### Random Forest Regressor Model

In [275]:
%%time

# Same random-forest configuration as in section 1 (100 trees, fixed seed),
# now fit on the four selected features only; rebinds `rf_model`.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

CPU times: total: 38.8 s
Wall time: 39.1 s


In [276]:
show_scores(rf_model)

{'Training MAE': 1851.8868611667701,
 'Test MAE': 3748.4856450198135,
 'Training RMSLE': 0.36709588382725317,
 'Test RMSLE': 0.5431129696236174,
 'Training R^2': 0.9213295749106821,
 'Test R^2': 0.7132316280667008}

In [277]:
# create dataframe containing actual values, predictions and the differences between them
df_rf_scores = get_scores_dataframe(rf_model)
df_rf_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
32977,46500,8735.340000,37764.660000
211633,800,38613.866583,37813.866583
316159,13995,51823.280000,37828.280000
195516,50000,12108.830000,37891.170000
230163,5500,43564.750000,38064.750000
...,...,...,...
236578,560,48351.850000,47791.850000
51741,54165,5181.945818,48983.054182
277895,660,51995.171190,51335.171190
272714,760,55435.840000,54675.840000


### Conclusion

**The Random Forest Regressor Model achieved a Training MAE of 1851.9 and a Test MAE of 3748.5, roughly double the test error of the full-feature model in section 1.**

## 5) Dropped Missing Values & Outliers Dataset (smaller set - cars from 2010 and after)

In [40]:
df_2010_filtered.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,17,33590,5,15,12235,3,7,3,57923.0,1,3,0,0,9,11,2
1,17,22590,1,9,12448,3,7,3,71229.0,1,3,0,0,9,2,2
2,17,39590,11,9,12486,3,7,3,19160.0,1,3,0,0,9,9,2
3,17,30990,8,39,14244,3,7,3,41124.0,1,3,0,0,9,9,2
4,17,15000,4,14,6700,1,6,3,128000.0,1,1,3,2,11,1,2


### Split Dataset

In [41]:
# Separate the target column ("price") from the feature columns
y = df_2010_filtered["price"]
X = df_2010_filtered.drop(columns="price")

In [42]:
X.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,17,5,15,12235,3,7,3,57923.0,1,3,0,0,9,11,2
1,17,1,9,12448,3,7,3,71229.0,1,3,0,0,9,2,2
2,17,11,9,12486,3,7,3,19160.0,1,3,0,0,9,9,2
3,17,8,39,14244,3,7,3,41124.0,1,3,0,0,9,9,2
4,17,4,14,6700,1,6,3,128000.0,1,1,3,2,11,1,2


In [43]:
y.head()

0    33590
1    22590
2    39590
3    30990
4    15000
Name: price, dtype: int64

In [44]:
# Split data into train & test sets (80% / 20%)
# The global NumPy seed is still set for any later code that draws from it,
# but the split gets its own random_state so it does not depend on the state
# of the global random stream.
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
X_train.head()

Unnamed: 0,region,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
78419,34,10,14,9329,0,0,3,19097.0,1,1,1,0,11,11,14
43814,65,3,21,4648,1,4,3,90867.0,1,1,1,3,1,11,6
149423,194,12,14,6209,5,4,3,1.0,1,1,1,1,1,0,30
79461,34,8,17,4387,4,4,3,31706.0,1,1,2,1,10,11,14
255356,217,7,9,4620,3,6,3,17302.0,1,3,1,0,9,9,49


In [46]:
y_train.head()

78419     46999
43814      8995
149423    33640
79461     17995
255356    29990
Name: price, dtype: int64

### Random Forest Regressor Model

In [47]:
%%time

# Same random-forest configuration as in section 1 (100 trees, fixed seed),
# now fit on the 2010-and-later dataset split.
# This is the last assignment to `rf_model` in the notebook, so when the cells
# run top to bottom this is the model the export cell saves.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

CPU times: total: 1min 17s
Wall time: 1min 18s


In [48]:
show_scores(rf_model)

{'Training MAE': 653.6663188315678,
 'Test MAE': 1717.4499167702274,
 'Training RMSLE': 0.21219606601536276,
 'Test RMSLE': 0.3344054849209445,
 'Training R^2': 0.9871241320507795,
 'Test R^2': 0.9119945305098588}

In [49]:
# create dataframe containing actual values, predictions and the differences between them
df_rf_scores = get_scores_dataframe(rf_model)
df_rf_scores.sort_values(by="differences").tail(100)

Unnamed: 0,actual values,predicted values,differences
247329,1030,28858.754286,27828.754286
38750,1900,29818.730000,27918.730000
67993,511,28448.613333,27937.613333
245226,46858,18859.480000,27998.520000
142538,833,28854.043333,28021.043333
...,...,...,...
30901,1030,47618.633333,46588.633333
77423,837,49719.372500,48882.372500
189597,753,50004.530000,49251.530000
164272,560,51050.330000,50490.330000


In [50]:
rf_model.feature_importances_

array([0.017, 0.275, 0.057, 0.105, 0.014, 0.088, 0.08 , 0.143, 0.014,
       0.012, 0.121, 0.005, 0.034, 0.017, 0.02 ])

In [51]:
df_2010_filtered.columns

Index(['region', 'price', 'year', 'manufacturer', 'model', 'condition',
       'cylinders', 'fuel', 'odometer', 'title_status', 'transmission',
       'drive', 'size', 'type', 'paint_color', 'state'],
      dtype='object')

### Conclusion

**The Random Forest Regressor Model achieved a Training MAE of 653.7 and a Test MAE of 1717.4.**

## Hyperparameter tuning with RandomizedSearchCV

The model with the best performance was trained on the Dropped Missing Values & Outliers Dataset (with cars from 2010 and after), so the hyperparameter tuning is performed on that model.

In [19]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# Candidate RandomForestRegressor hyperparameters to sample from
rf_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Instantiate the RandomizedSearchCV model: 20 sampled candidates x 5-fold CV = 100 fits.
# random_state on the search itself fixes WHICH candidates are sampled, so
# best_params_ is repeatable across runs; the estimator's random_state only
# controls how each forest is grown.
# No `scoring` is passed, so candidates are ranked by the estimator's default
# score (R^2 for regressors), not by the MAE quoted in the conclusions.
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                    random_state=42),
                              param_distributions=rf_grid,
                              n_iter=20,
                              cv=5,
                              verbose=True,
                              random_state=42)

# Fit the RandomizedSearchCV model
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: total: 5h 46min 23s
Wall time: 23min 11s


In [20]:
# Find the best params
rs_model.best_params_

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_depth': 20,
 'bootstrap': True}

In [21]:
show_scores(rs_model)

{'Training MAE': 1238.5773903486206,
 'Test MAE': 2036.9085903074501,
 'Training RMSLE': 0.2677127835711585,
 'Test RMSLE': 0.35347914230541805,
 'Training R^2': 0.9679923003566601,
 'Test R^2': 0.9017968718974356}

In [52]:
# Wide RandomForestRegressor hyperparameter grid.
# Note: this rebinds `rf_grid` from the randomized-search cell and is NOT passed
# to the GridSearchCV below, which uses `rf_grid_simple`.
rf_grid = {
    "n_estimators": np.arange(10, 100, 10),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2),
    "max_features": [0.5, 1, "sqrt", "log2"],
    "max_samples": [100000]
}

# Simplified hyperparameter grid: 3 * 3 * 2 * 2 * 2 * 1 = 72 combinations
rf_grid_simple = {
    "n_estimators": [10, 50, 100],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 5],
    "max_features": ["log2", "sqrt"],
    "max_samples": [100000]
}

# Setup grid hyperparameter search for RandomForestRegressor
# (5-fold CV over every combination in rf_grid_simple)
gs_rf_reg = GridSearchCV(RandomForestRegressor(n_jobs=-1,
                                                random_state=42),
                              param_grid=rf_grid_simple,
                              cv=5,
                              verbose=True)

# Fit grid hyperparameter search model
gs_rf_reg.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [53]:
show_scores(gs_rf_reg)

{'Training MAE': 1385.1283959014893,
 'Test MAE': 2214.110270405585,
 'Training RMSLE': 0.2967832143475053,
 'Test RMSLE': 0.3686168338677035,
 'Training R^2': 0.958167553849143,
 'Test R^2': 0.89396303411757}

### Conclusion

**After hyperparameter tuning using Random Search, the Random Forest Regressor Model achieved a Training MAE of 1238.6 and a Test MAE of 2036.9.**

**After hyperparameter tuning using Grid Search, the Random Forest Regressor Model achieved a Training MAE of 1385.1 and a Test MAE of 2214.1.**

**Neither search improved on the untuned model's Test MAE of 1717.4.**

## Final Conclusion

**The model with the best results is the Random Forest Regressor trained on the Dropped Missing Values & Outliers Dataset (with cars from 2010 and after), with a Test MAE (mean absolute error) of 1717.4.  
The results are not satisfactory, as an average error of about `$1700` is a large amount when predicting used-car prices, especially when the car is not expensive.**

The models are performing poorly mostly due to unreliable data:
* The dataset is missing a posting date or a sale date column.
* Since Craigslist is not specialized in selling cars, many of the features are free text instead of dropdown lists.

**Using better datasets from sites that specialize in selling cars, with cleaner and more recent data, will most likely improve the results of the models.**

## Export the model

In [56]:
import os

import joblib

# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs('models', exist_ok=True)

# Export model: saves whatever `rf_model` currently refers to. When the cells
# run top to bottom, that is the random forest fit on the 2010-and-later dataset.
joblib.dump(rf_model, 'models/car_pred_model.pkl')

['models/car_pred_model.pkl']