In [0]:
# !pip install --upgrade tables
# !pip install eli5
# !pip install xgboost

In [1]:
# Project root; the relative paths used later (e.g. 'data/car.h5') are resolved
# against it once the %cd below has switched the working directory.
# NOTE(review): absolute path that looks Colab/Google-Drive specific — presumably
# the Drive must be mounted before this cell runs; confirm and consider making it configurable.
HOME = '/content/drive/My Drive/Colab Notebooks/matrix/dataworkshop_matrix_car'
%cd $HOME

/content/drive/My Drive/Colab Notebooks/matrix/dataworkshop_matrix_car


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error as mae

import xgboost as xgb

import eli5
from eli5.sklearn import PermutationImportance

sns.set()

Using TensorFlow backend.


In [3]:
# Load the dataset from the HDF5 file (path is relative to the working
# directory) and show its (rows, columns) shape.
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

In [4]:
# Suffix that marks a column holding integer category codes.
SUFFIX = '__cat'

# Label-encode every scalar-valued column with pandas' factorize():
#   * a column whose name already contains SUFFIX is overwritten in place,
#   * any other column gets a sibling column named '<column><SUFFIX>'.
# Columns whose cells are Python lists are skipped.
# The column names are snapshotted first because the loop adds columns to `df`.
for feat in list(df.columns):
    # Probe the first row *positionally*. `df[feat][0]` looks up the index
    # label 0, which raises KeyError (or inspects a different row) whenever
    # the frame does not carry a default 0..n-1 index. The length guard keeps
    # an empty frame from failing on the probe.
    if len(df) > 0 and isinstance(df[feat].iloc[0], list):
        continue

    # factorize() returns (codes, uniques); only the integer codes are kept.
    factorized_value = df[feat].factorize()[0]
    if SUFFIX in feat:
        df[feat] = factorized_value
    else:
        df[feat + SUFFIX] = factorized_value

df.shape

(106494, 309)

In [5]:
# Model inputs: every encoded (SUFFIX-bearing) column, minus anything derived
# from the price so the target cannot leak into the features.
cat_feat = [
    column
    for column in df.columns
    if 'price' not in column and SUFFIX in column
]
len(cat_feat)

151

In [0]:
# Feature matrix (the encoded columns listed in cat_feat) and the regression
# target (the 'price_value' column).
X = df[cat_feat]
y = df['price_value']

In [7]:
# Baseline: a shallow decision tree scored with 3-fold cross-validation on
# negative mean absolute error (closer to zero is better).
# random_state is pinned so that the choice between equally good splits is
# reproducible from run to run, matching the seeded forest / XGBoost cells.
model = DecisionTreeRegressor(max_depth=5, random_state=0)
score = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
np.mean(score), np.std(score)

(-19695.13091100928, 148.72570644015792)

In [8]:
def run_model(model, feats, cv=3, scoring='neg_mean_absolute_error', target='price_value'):
    """Cross-validate `model` on selected columns of the notebook-level `df`.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor; it is handed to
        `cross_val_score` together with the data.
    feats : list of str
        Names of the `df` columns used as features.
    cv : int or CV splitter, default 3
        Forwarded to `cross_val_score`.
    scoring : str, default 'neg_mean_absolute_error'
        Forwarded to `cross_val_score`.
    target : str, default 'price_value'
        Name of the `df` column used as the regression target.

    Returns
    -------
    tuple
        (mean, standard deviation) of the per-fold scores.
    """
    X = df[feats]
    y = df[target]
    score = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return np.mean(score), np.std(score)

# Sanity check: the helper on the baseline tree. The tree is seeded so the
# score is reproducible across runs.
run_model(DecisionTreeRegressor(max_depth=5, random_state=0), cat_feat)

(-19695.13091100928, 148.72570644015792)

In [9]:
# Random-forest candidate: 50 depth-limited trees with a fixed seed, scored
# through the shared cross-validation helper.
rnd_forrest = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=0,
)
run_model(rnd_forrest, cat_feat)

(-18718.657185256638, 64.5424578125788)

In [10]:
# Hyper-parameters shared by every XGBRegressor built in this notebook
# (later cells unpack the same dict with **xgboost_param).
xgboost_param = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    # `random_state` is the seed argument of the scikit-learn wrapper;
    # the older `seed` spelling is a deprecated alias for it.
    'random_state': 0,
}

xgboost_model = xgb.XGBRegressor(**xgboost_param)
run_model(xgboost_model, cat_feat)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




(-13108.379065811214, 74.32158265003798)

In [22]:
# Fit one XGBoost model on the full X / y defined earlier in the notebook.
xgboost_model = xgb.XGBRegressor(**xgboost_param)
xgboost_model.fit(X, y)

# Permutation importance of the fitted model, seeded for repeatability.
# NOTE(review): the importances are measured on the very same X / y the model
# was just trained on, so they describe the fit to the training rows; consider
# evaluating on a held-out split instead.
imp = PermutationImportance(xgboost_model, random_state=0).fit(X, y)
# feature_names is cat_feat — presumably X is still df[cat_feat] at this point
# so the labels line up with X's columns; verify when cells are run out of order.
eli5.show_weights(imp, feature_names=cat_feat)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




Weight,Feature
0.1194  ± 0.0031,param_napęd__cat
0.1132  ± 0.0032,param_rok-produkcji__cat
0.1090  ± 0.0025,param_stan__cat
0.0619  ± 0.0025,param_skrzynia-biegów__cat
0.0568  ± 0.0016,param_faktura-vat__cat
0.0489  ± 0.0014,param_moc__cat
0.0273  ± 0.0007,param_marka-pojazdu__cat
0.0242  ± 0.0013,feature_kamera-cofania__cat
0.0212  ± 0.0008,param_typ__cat
0.0174  ± 0.0008,param_pojemność-skokowa__cat


In [11]:
# Reduced feature set, one name per line. The first ten entries match the ten
# rows of the permutation-importance table printed above, in the same order.
new_cat_feats = [
    'param_napęd__cat',
    'param_rok-produkcji__cat',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
len(new_cat_feats)

20

In [12]:
# Score a fresh XGBoost model (same hyper-parameters) on the reduced feature
# list only, for comparison with the run on every encoded column.
xgboost_model = xgb.XGBRegressor(**xgboost_param)
run_model(xgboost_model, new_cat_feats)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




(-13375.230420852275, 65.40441107118909)

In [0]:
def map_rp(x):
    """Convert a raw production-year value to an int, using -1 for "missing".

    Missing means `None`, the literal string 'None', or a float NaN (the
    marker pandas uses for absent values, which `int()` would reject with a
    ValueError). Every other value is passed to `int()`, so values that are
    already integers come back unchanged.
    """
    is_nan = isinstance(x, float) and x != x  # NaN is the only float unequal to itself
    if is_nan or str(x) == 'None':
        return -1
    return int(x)

# Replace the raw production-year column with its integer form (-1 = missing).
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(map_rp)

In [14]:
# Same feature list as the previous experiment, except that the production
# year is now the numeric 'param_rok-produkcji' column rather than its
# factorized '__cat' version.
new_cat_feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
run_model(xgboost_model, new_cat_feats)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




(-11308.885890938496, 27.868488259630677)

In [0]:
def map_m(x):
    """Convert a raw engine-power value to an int, using -1 for "missing".

    * `None` or a float NaN -> -1
    * a string -> the integer before the first space (e.g. '150 KM' -> 150)
    * anything else -> `int(x)`, so values that were already converted pass
      through unchanged and the conversion cell can be re-run safely.
    """
    if x is None or (isinstance(x, float) and x != x):  # x != x is true only for NaN
        return -1
    if isinstance(x, str):
        # NOTE(review): only the first space-separated token is parsed, so a
        # value written with a space as thousands separator would be
        # truncated — presumably the data never contains one; confirm.
        return int(x.split(' ')[0])
    return int(x)

# Replace the raw engine-power column with its integer form (-1 = missing).
df['param_moc'] = df['param_moc'].map(map_m)

In [16]:
# Same feature list as the previous experiment, except that engine power is
# now the numeric 'param_moc' column rather than its factorized '__cat' version.
new_cat_feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
run_model(xgboost_model, new_cat_feats)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




(-9716.450230340148, 62.2164408321879)

In [0]:
def map_ps(x):
    """Convert a raw engine-displacement value to an int, using -1 for "missing".

    * a string -> the digits before ' cm', with inner spaces removed
      (e.g. '1 598 cm3' -> 1598)
    * a non-string that `int()` accepts -> that integer, so values that were
      already converted survive a re-run of the conversion cell instead of
      all collapsing to -1
    * anything else (`None`, NaN, other objects) -> -1
    """
    if isinstance(x, str):
        return int(x.split(' cm')[0].replace(' ', ''))
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # None -> TypeError, NaN -> ValueError, inf -> OverflowError.
        return -1

# Replace the raw engine-displacement column with its integer form (-1 = missing).
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(map_ps)

In [18]:
# Same feature list as the previous experiment, except that engine
# displacement is now the numeric 'param_pojemność-skokowa' column rather than
# its factorized '__cat' version.
new_cat_feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
run_model(xgboost_model, new_cat_feats)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




(-9569.227198767323, 72.83561801421891)