In [1]:
import pandas as pd
import numpy as np

import eli5
from eli5.sklearn import PermutationImportance

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score



## Load data

In [2]:
carDf = pd.read_hdf('../data/car.h5')
rowsCount, columnsCount = carDf.shape

columns = carDf.columns
print(columns)

Index(['breadcrumb', 'created_at', 'price_currency', 'price_details',
       'price_value', 'seller_address', 'seller_name', 'seller_type',
       'feature_czujniki-parkowania-przednie',
       'feature_poduszka-powietrzna-chroniąca-kolana',
       ...
       'param_pearl', 'param_stan', 'param_wersja', 'param_emisja-co2',
       'param_body-type', 'param_matowy', 'param_bezwypadkowy',
       'param_akryl-(niemetalizowany)', 'param_monthly-payment-value',
       'car_id'],
      dtype='object', length=155)


## Dummy model

In [3]:
numberColumns = carDf.select_dtypes(np.number).columns

In [4]:
features = ['car_id']
xData = carDf[features].values
yData = carDf['price_value'].values

model = DummyRegressor()
model.fit(xData, yData)
predictions = model.predict(xData)

mae(yData, predictions)

39465.934630440985

In [5]:
carDf = carDf[carDf['price_currency'] != 'EUR']

## Features

In [6]:
SUFFIX_CAT = '__cat'
for feature in carDf.columns:
    if isinstance(carDf[feature][0], list):
        continue
    
    factorizedValues = carDf[feature].factorize()[0]
    
    if SUFFIX_CAT in feature:
        carDf[feature] = factorizedValues
    else:
        column = f'{feature}{SUFFIX_CAT}'
        carDf[column] = factorizedValues

In [7]:
catFeatures = [column for column in carDf.columns if SUFFIX_CAT in column]
catFeatures = [column for column in catFeatures if 'price' not in column]

In [8]:
xData = carDf[catFeatures].values
yData = carDf['price_value'].values

model = DecisionTreeRegressor(max_depth = 5)
scores = cross_val_score(model, xData, yData, cv = 3, scoring = 'neg_mean_absolute_error')
np.mean(scores)

-19566.588937368324

In [9]:
nextModel =  DecisionTreeRegressor(max_depth = 5)
nextModel.fit(xData, yData)

imp = PermutationImportance(nextModel, random_state = 0).fit(xData, yData)
eli5.show_weights(nextModel, feature_names = catFeatures)

Weight,Feature
0.3567,param_faktura-vat__cat
0.2785,param_napęd__cat
0.0782,param_rok-produkcji__cat
0.0666,feature_kamera-cofania__cat
0.0562,param_stan__cat
0.0526,param_moc__cat
0.0313,param_skrzynia-biegów__cat
0.0191,feature_łopatki-zmiany-biegów__cat
0.0182,param_pojemność-skokowa__cat
0.0170,feature_bluetooth__cat


In [10]:
features = [
    'param_faktura-vat__cat', 
    'param_napęd__cat', 
    'param_rok-produkcji__cat',
    'feature_kamera-cofania__cat',
    'param_stan__cat',
    'param_moc__cat',
    'param_skrzynia-biegów__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'param_pojemność-skokowa__cat',
    'feature_bluetooth__cat',
    'param_marka-pojazdu__cat'
]

In [11]:
xData = carDf[features].values
yData = carDf['price_value'].values

model = DecisionTreeRegressor(max_depth = 5)
scores = cross_val_score(model, xData, yData, cv = 3, scoring = 'neg_mean_absolute_error')
np.mean(scores)

-19802.35306460076