In [12]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import  MissingIndicator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import joblib

import os
import warnings

warnings.filterwarnings("ignore")

### Import data

In [13]:
link = r"../data/clean_df.csv"
df = pd.read_csv(link)

df.head()

Unnamed: 0,maker,model,mileage,manufacture_year,engine_displacement,engine_power,body_type,stk_year,transmission,door_count,seat_count,fuel_type,date_created,date_last_seen,price_eur
0,ford,galaxy,151000.0,2011.0,2000.0,138.12506,compact,,man,5.0,7.0,diesel,2015-11-14 18:10:06.838319+00,2016-01-27 20:40:15.46361+00,10584.75
1,skoda,octavia,143476.0,2012.0,2000.0,108.62262,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.853411+00,2016-01-27 20:40:15.46361+00,8882.31
2,bmw,,97676.0,2010.0,1995.0,113.9867,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.861792+00,2016-01-27 20:40:15.46361+00,12065.06
3,skoda,fabia,111970.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.872313+00,2016-01-27 20:40:15.46361+00,2960.77
4,skoda,fabia,128886.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.880335+00,2016-01-27 20:40:15.46361+00,2738.71


### Create man_period, stk_period

In [14]:
df['date_created'] = pd.to_datetime(df['date_created'])
df['year_created'] = df['date_created'].dt.year

df['man_period'] = df['year_created'] - df['manufacture_year']
df['stk_period'] = df['year_created'] - df['stk_year']

### Convert columns

In [15]:
numerical_columns = ['engine_power', 'mileage', 'engine_displacement']
conv_to_string_columns = ['door_count', 'seat_count', 'man_period', 'stk_period']
categorical_columns = ['maker', 'model', 'body_type', 'transmission', 'fuel_type']+conv_to_string_columns

In [16]:
def datatype_converter(x):
    for i in conv_to_string_columns:
        x[i] = x[i].apply(str)
    
    for i in categorical_columns:
        x[i] = x[i].astype('category')
    
    x[numerical_columns] = x[numerical_columns].apply(pd.to_numeric, downcast="float")
    x['price_eur'] = x['price_eur'].apply(pd.to_numeric, downcast='float')

    return x

df[categorical_columns] = df[categorical_columns].fillna('unknown')
df = datatype_converter(df)

In [17]:
class Indicator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        nonnull_X = np.nan_to_num(X.astype(float), nan=0).astype(int)

        missing_indicator = MissingIndicator()
        indicator_values = missing_indicator.fit_transform(X).astype(int)

        return np.c_[nonnull_X, indicator_values]

In [18]:
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
indicator = Indicator()

full_pipeline = ColumnTransformer([
    ("num", indicator, numerical_columns),
    ("cat", one_hot_encoder , categorical_columns),
])

### Split into train, val, test sets

In [19]:
X = df.drop('price_eur', axis=1)
y = df['price_eur']

full_pipeline.fit(X)
X_trans = full_pipeline.transform(X)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=0.2, random_state=42)

## Model

In [31]:
eva_df = pd.DataFrame(columns = ["Model", "RMSE", "MAE", "R2" ])     

### Decision tree

In [29]:
dt_reg = DecisionTreeRegressor(random_state=42)

dt_param_grid = {
    'splitter': ['best', 'random'],     # Strategy for choosing splits
    'max_depth': [10, 20, 30],     # Maximum depth of the tree
    'min_weight_fraction_leaf': [0.01, 0.05, 0.1],      # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider for the best split
}

dt_rs = RandomizedSearchCV(
    estimator=dt_reg,
    param_distributions=dt_param_grid,
    scoring = 'neg_mean_squared_error',
    n_iter=10,  # Number of parameter candidate settings to sample
    verbose=2,  # The higher this is, the more messages are outputed
    random_state=42,
    refit = True,
    return_train_score=True
)

In [30]:
# Fit
dt_rs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=20, max_features=auto, min_weight_fraction_leaf=0.01, splitter=random; total time=  10.2s
[CV] END max_depth=20, max_features=auto, min_weight_fraction_leaf=0.01, splitter=random; total time=  10.4s
[CV] END max_depth=20, max_features=auto, min_weight_fraction_leaf=0.01, splitter=random; total time=  10.5s
[CV] END max_depth=20, max_features=auto, min_weight_fraction_leaf=0.01, splitter=random; total time=  10.7s
[CV] END max_depth=20, max_features=auto, min_weight_fraction_leaf=0.01, splitter=random; total time=  10.7s
[CV] END max_depth=30, max_features=log2, min_weight_fraction_leaf=0.01, splitter=random; total time=   0.3s
[CV] END max_depth=30, max_features=log2, min_weight_fraction_leaf=0.01, splitter=random; total time=   0.3s
[CV] END max_depth=30, max_features=log2, min_weight_fraction_leaf=0.01, splitter=random; total time=   0.3s
[CV] END max_depth=30, max_features=log2, min_weight_fraction_leaf=

RandomizedSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                   param_distributions={'max_depth': [10, 20, 30],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_weight_fraction_leaf': [0.01, 0.05,
                                                                     0.1],
                                        'splitter': ['best', 'random']},
                   random_state=42, return_train_score=True,
                   scoring='neg_mean_squared_error', verbose=2)

In [53]:
dt_rs.best_params_

{'splitter': 'random',
 'min_weight_fraction_leaf': 0.01,
 'max_features': 'auto',
 'max_depth': 20}

In [35]:
best_dt_reg = dt_rs.best_estimator_
y_pred = best_dt_reg.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
R2 = r2_score(y_test,y_pred)

eva_df = eva_df.append({"Model": "Decision Tree", "RMSE" : mse**0.5, "MAE":mae ,"R2" : R2},ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear regression with SVD,7751.070399,4893.32651,0.626433


In [37]:
joblib.dump(best_dt_reg, './model/decision_tree.joblib')

['./model/decision_tree.joblib']

### Random forest

In [27]:
rf_reg = RandomForestRegressor()

rf_param_grid = {
    "n_estimators": [10, 15, 20, 40],
    "max_depth": [10, 15, 20, 25],
    'min_samples_split': [5,7,10, 12, 15],
    'min_samples_leaf': [5,7,10, 12, 15],
    "ccp_alpha": [0, 0.1, 0.01, 0.001],
    "max_samples": [0.1, 0.2]
}

rf_rs = RandomizedSearchCV(
    estimator=rf_reg,
    param_distributions=rf_param_grid,
    scoring = 'neg_mean_squared_error',
    n_iter=30,  # Number of parameter candidate settings to sample
    verbose=2,  # The higher this is, the more messages are outputed
    random_state=42,
    refit = True,
    return_train_score=True
)

In [28]:
# Fit
X_sample, _, y_sample, _ = train_test_split(X_train, y_train, test_size=0.95)
rf_rs.fit(X_sample, y_sample)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ccp_alpha=0.001, max_depth=25, max_samples=0.2, min_samples_leaf=12, min_samples_split=12, n_estimators=20; total time=  11.1s
[CV] END ccp_alpha=0.001, max_depth=25, max_samples=0.2, min_samples_leaf=12, min_samples_split=12, n_estimators=20; total time=  11.3s
[CV] END ccp_alpha=0.001, max_depth=25, max_samples=0.2, min_samples_leaf=12, min_samples_split=12, n_estimators=20; total time=  10.5s
[CV] END ccp_alpha=0.001, max_depth=25, max_samples=0.2, min_samples_leaf=12, min_samples_split=12, n_estimators=20; total time=  10.5s
[CV] END ccp_alpha=0.001, max_depth=25, max_samples=0.2, min_samples_leaf=12, min_samples_split=12, n_estimators=20; total time=  10.6s
[CV] END ccp_alpha=0.1, max_depth=10, max_samples=0.1, min_samples_leaf=12, min_samples_split=5, n_estimators=10; total time=   1.6s
[CV] END ccp_alpha=0.1, max_depth=10, max_samples=0.1, min_samples_leaf=12, min_samples_split=5, n_estimators=10; total time=

RandomizedSearchCV(estimator=RandomForestRegressor(), n_iter=30,
                   param_distributions={'ccp_alpha': [0, 0.1, 0.01, 0.001],
                                        'max_depth': [10, 15, 20, 25],
                                        'max_samples': [0.1, 0.2],
                                        'min_samples_leaf': [5, 7, 10, 12, 15],
                                        'min_samples_split': [5, 7, 10, 12, 15],
                                        'n_estimators': [10, 15, 20, 40]},
                   random_state=42, return_train_score=True,
                   scoring='neg_mean_squared_error', verbose=2)

In [32]:
rf_rs.best_params_

{'n_estimators': 40,
 'min_samples_split': 12,
 'min_samples_leaf': 5,
 'max_samples': 0.2,
 'max_depth': 25,
 'ccp_alpha': 0.1}

In [33]:
best_rf_reg = rf_rs.best_estimator_
y_pred = best_rf_reg.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
R2 = r2_score(y_test,y_pred)

eva_df = eva_df.append({"Model": "Random Forest", "RMSE" : mse**0.5, "MAE":mae ,"R2" : R2},ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Random Forest,5164.351175,2872.686599,0.834165


In [34]:
joblib.dump(best_rf_reg, './model/random_forest.joblib')

['./model/random_forest.joblib']

In [63]:
params = {
    'n_estimators': 18,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'max_samples': 0.1,
    'max_depth': 11,
    'ccp_alpha': 0.1,
    "random_state": 10,
    "n_jobs": -1,
    "bootstrap": True,
}

rf = RandomForestRegressor(**params)

rf = rf.fit(X_train, y_train)

In [64]:
y_pred = rf.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
R2 = r2_score(y_test,y_pred)

mse**0.5

5227.843560670883

In [65]:
R2

0.8300621727565209