In [1]:
import pandas as pd
import numpy as np

import eli5
from eli5.sklearn import PermutationImportance

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold



In [2]:
# Load the object stored in the HDF5 file; the rest of the notebook uses it
# as a DataFrame. The path is relative to the notebook's working directory.
carDf = pd.read_hdf('../data/car.h5')

# Keep the column index and the frame dimensions around for quick inspection.
columns = carDf.columns
rowsCount, columnsCount = len(carDf.index), len(columns)

## Feature Engineering

In [3]:
SUFFIX_CAT = '__cat'

# Integer-encode the columns with Series.factorize(). A column whose name
# already carries SUFFIX_CAT is overwritten in place; any other column keeps
# its raw values and gets a sibling column named '<name>__cat'.
for feature in list(carDf.columns):
    # Lists are unhashable, so factorize() cannot handle columns that hold them.
    # The whole column is scanned instead of label-indexing row 0: `series[0]`
    # raises KeyError when the index has no label 0, and looking at a single
    # row misses lists that appear further down the column.
    holdsLists = carDf[feature].map(lambda value: isinstance(value, list)).any()
    if holdsLists:
        continue

    # factorize() returns (codes, uniques); only the integer codes are needed.
    factorizedValues = carDf[feature].factorize()[0]

    if SUFFIX_CAT in feature:
        carDf[feature] = factorizedValues
    else:
        column = f'{feature}{SUFFIX_CAT}'
        carDf[column] = factorizedValues

In [4]:
# Candidate model inputs: every column whose name contains SUFFIX_CAT, except
# those whose name contains 'price' (the target is 'price_value').
catFeatures = [
    column
    for column in carDf.columns
    if SUFFIX_CAT in column and 'price' not in column
]

In [5]:
def runModel(aModel, aFeatures, aCv=3, aScoring='neg_mean_absolute_error', aTarget='price_value'):
    """Cross-validate ``aModel`` on columns of the global ``carDf``.

    Parameters
    ----------
    aModel : estimator
        Unfitted scikit-learn compatible estimator; it is handed to
        ``cross_val_score`` as is.
    aFeatures : list of str
        Names of the ``carDf`` columns used as model inputs.
    aCv : int or cross-validation splitter, default 3
        Forwarded to ``cross_val_score`` as ``cv``.
    aScoring : str or callable, default 'neg_mean_absolute_error'
        Forwarded to ``cross_val_score`` as ``scoring``.
    aTarget : str, default 'price_value'
        Name of the ``carDf`` column used as the regression target.

    Returns
    -------
    tuple of (float, float)
        Mean and standard deviation of the per-fold scores. With the default
        scoring the scores are negated mean absolute errors, so values closer
        to zero are better.
    """
    xData = carDf[aFeatures].values
    yData = carDf[aTarget].values

    scores = cross_val_score(aModel, xData, yData, cv=aCv, scoring=aScoring)
    return np.mean(scores), np.std(scores)

In [6]:
# Single-tree baseline on all factorized features. random_state is pinned
# because scikit-learn permutes the features at every split, so equally good
# splits can otherwise be resolved differently from run to run; the other
# models in this notebook are seeded with 0 as well.
runModel(DecisionTreeRegressor(max_depth = 5, random_state = 0), catFeatures)

(-19695.13091100928, 148.72570644015792)

## Random Forest

In [7]:
# Random forest of 50 trees with the same depth limit as the single-tree
# baseline, seeded so the score is repeatable.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=0,
)
runModel(model, catFeatures)

(-18718.657185256638, 64.5424578125788)

## XGBoost

In [8]:
# Shared XGBoost settings, reused by every XGBRegressor built in this notebook.
xgbParams = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    # `random_state` is the documented scikit-learn-API name for the seed;
    # `seed` is only accepted through the wrapper's **kwargs passthrough,
    # which XGBoost does not guarantee to interact properly with scikit-learn.
    'random_state': 0
}

runModel(xgb.XGBRegressor(**xgbParams), catFeatures)

(-13108.379065811214, 74.32158265003798)

In [10]:
# Full feature matrix and target, taken from the whole data set.
xData = carDf[catFeatures].values
yData = carDf['price_value'].values

# Fit one XGBoost model on all rows (no hold-out split in this cell).
xgbModel = xgb.XGBRegressor(**xgbParams)
xgbModel.fit(xData, yData)

# Permutation importance of the fitted model.
# NOTE(review): it is computed on the same rows the model was trained on,
# so the weights show what the model relies on in-sample - confirm this is
# intended rather than a held-out estimate.
imp = PermutationImportance(xgbModel, random_state = 0)
imp.fit(xData, yData)
eli5.show_weights(imp, feature_names = catFeatures)

Weight,Feature
0.1194  ± 0.0031,param_napęd__cat
0.1132  ± 0.0032,param_rok-produkcji__cat
0.1090  ± 0.0025,param_stan__cat
0.0619  ± 0.0025,param_skrzynia-biegów__cat
0.0568  ± 0.0016,param_faktura-vat__cat
0.0489  ± 0.0014,param_moc__cat
0.0273  ± 0.0007,param_marka-pojazdu__cat
0.0242  ± 0.0013,feature_kamera-cofania__cat
0.0212  ± 0.0008,param_typ__cat
0.0174  ± 0.0008,param_pojemność-skokowa__cat


In [11]:
# Hand-picked subset of 20 factorized features; the order of this list is the
# column order of the model input.
features = [
    # The ten features listed in the saved permutation-importance output.
    'param_napęd__cat',
    'param_stan__cat',
    'param_rok-produkcji__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_skrzynia-biegów__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    # Ten more features; presumably the next entries of the same ranking
    # (they are not part of the saved output) - verify.
    'seller_name__cat',
    'param_wersja__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'feature_system-start-stop__cat',
    'param_kod-silnika__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_światła-led__cat',
    'feature_czujniki-parkowania-przednie__cat',
]

In [12]:
# Score XGBoost again, this time restricted to the selected feature subset.
subsetModel = xgb.XGBRegressor(**xgbParams)
runModel(subsetModel, features)

(-13371.749129325122, 120.74632178909155)

In [13]:
def _parseYear(rawYear):
    """Convert one 'param_rok-produkcji' entry to int; -1 marks a missing value.

    An entry counts as missing when its string form is 'None'.
    """
    if str(rawYear) == 'None':
        return -1
    return int(rawYear)


# Replace the raw column with its integer form so the model can use it as a
# number instead of the factorized code.
carDf['param_rok-produkcji'] = carDf['param_rok-produkcji'].map(_parseYear)

# Same subset as before, with 'param_rok-produkcji__cat' swapped for the
# numeric 'param_rok-produkcji' column.
features = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_rok-produkcji',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_skrzynia-biegów__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'param_wersja__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'feature_system-start-stop__cat',
    'param_kod-silnika__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_światła-led__cat',
    'feature_czujniki-parkowania-przednie__cat',
]

runModel(xgb.XGBRegressor(**xgbParams), features)

(-11386.828140541147, 58.212963494024464)

In [14]:
def _parsePower(rawPower):
    """Convert one 'param_moc' entry to int; -1 marks a missing value.

    The integer is taken from the text before the first space. An entry
    counts as missing when its string form is 'None'.
    """
    text = str(rawPower)
    if text == 'None':
        return -1
    # Work on the string form: once this cell has run, the column already
    # holds ints, and calling .split() on an int raised AttributeError when
    # the cell was executed a second time.
    return int(text.split(' ')[0])


# Replace the raw column with its integer form so the model can use it as a
# number instead of the factorized code.
carDf['param_moc'] = carDf['param_moc'].map(_parsePower)

# Same subset as before, with 'param_moc__cat' swapped for the numeric
# 'param_moc' column.
features = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_rok-produkcji',
    'param_faktura-vat__cat',
    'param_moc',
    'param_skrzynia-biegów__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'param_wersja__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'feature_system-start-stop__cat',
    'param_kod-silnika__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_światła-led__cat',
    'feature_czujniki-parkowania-przednie__cat',
]

runModel(xgb.XGBRegressor(**xgbParams), features)

(-9725.904859228242, 80.8769815926293)

In [15]:
def _parseCapacity(rawCapacity):
    """Convert one 'param_pojemność-skokowa' entry to int; -1 marks a missing value.

    The integer is built from the text before 'cm3' with all spaces removed.
    An entry counts as missing when its string form is 'None'.
    """
    text = str(rawCapacity)
    if text == 'None':
        return -1
    # int() was missing here: the column ended up holding digit strings mixed
    # with the integer -1, although it is used as a numeric feature like
    # 'param_rok-produkcji' and 'param_moc'.
    return int(text.split('cm3')[0].replace(' ', ''))


# Replace the raw column with its integer form so the model can use it as a
# number instead of the factorized code.
carDf['param_pojemność-skokowa'] = carDf['param_pojemność-skokowa'].map(_parseCapacity)

# Same subset as before, with 'param_pojemność-skokowa__cat' swapped for the
# numeric 'param_pojemność-skokowa' column.
features = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_rok-produkcji',
    'param_faktura-vat__cat',
    'param_moc',
    'param_skrzynia-biegów__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa',
    'seller_name__cat',
    'param_wersja__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'feature_system-start-stop__cat',
    'param_kod-silnika__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_światła-led__cat',
    'feature_czujniki-parkowania-przednie__cat',
]

runModel(xgb.XGBRegressor(**xgbParams), features)

(-9556.21383364917, 101.81252798368146)