In [1]:
# Environment setup: shell commands that install the packages this notebook needs.
# NOTE(review): no versions are pinned, so a re-run may resolve different releases
# than the ones reported in the captured output below.
# `tables` (PyTables) is upgraded — presumably required by pd.read_hdf; confirm.
!pip install --upgrade tables
# eli5 is imported later in this notebook (PermutationImportance, show_weights).
!pip install eli5
# xgboost is imported later in this notebook as `xgb`.
!pip install xgboost

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 2.8MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 2.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

Using TensorFlow backend.


In [3]:
# Switch the working directory to the project folder; the data file is later
# opened through the relative path 'data/car.h5'.
# The explicit `%cd` magic is used so the cell does not depend on automagic
# being enabled or on no Python variable named `cd` existing.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# the mount step is not among the visible cells.
%cd "/content/drive/My Drive/Colab Notebooks/matrix_two/dw_matrix_car"

/content/drive/My Drive/Colab Notebooks/matrix_two/dw_matrix_car


In [4]:
# Load the dataset stored in data/car.h5 (path relative to the working
# directory) and display its (rows, columns) shape.
df = pd.read_hdf(path_or_buf='data/car.h5')
df.shape

(106494, 155)

In [0]:
# Keep only the rows whose price is not quoted in EUR (presumably so that
# 'price_value' is expressed in a single currency — confirm).
# .copy() detaches the result from the unfiltered frame: later cells add and
# overwrite columns on `df`, which can otherwise trigger SettingWithCopyWarning.
df = df.loc[df['price_currency'] != 'EUR'].copy()

## Feature engineering

In [0]:
SUFFIX_CAT = '__cat'

# For every column, add an integer-coded twin named '<column>__cat'.
for feat in df.columns:
  # Columns holding lists are skipped (lists are unhashable and cannot be
  # factorized). Only the first row is inspected. .iloc[0] is positional, so it
  # works even when index label 0 is no longer present after row filtering.
  if isinstance(df[feat].iloc[0], list): continue

  # Columns that are already encoded are skipped, so re-running the cell does
  # not create '__cat__cat' columns.
  if SUFFIX_CAT in feat: continue

  # factorize() returns (codes, uniques); only the codes are kept.
  # Missing values are coded as -1.
  df[feat + SUFFIX_CAT] = df[feat].factorize()[0]

In [7]:
# Model inputs: every encoded ('__cat') column, with the price-related
# columns excluded.
cat_feats = [
    col for col in df.columns
    if SUFFIX_CAT in col and 'price' not in col
]
len(cat_feats)

151

In [8]:
# Baseline: a depth-5 decision tree on all encoded features, scored with
# 3-fold cross-validation using negated mean absolute error.
# X and y stay available as globals for later cells.
X = df[cat_feats].values
y = df['price_value'].values

model = DecisionTreeRegressor(max_depth=5)
scores = cross_val_score(
    estimator=model, X=X, y=y, scoring='neg_mean_absolute_error', cv=3
)
scores.mean()

-19566.588937368324

In [0]:
def run_model(model, feats, cv=3, scoring='neg_mean_absolute_error'):
  """Cross-validate `model` on selected columns of the global `df`.

  The global DataFrame `df` is read at call time, so column changes made in
  other cells are picked up by the next call. The target is always
  df['price_value'].

  Parameters
  ----------
  model : estimator passed to cross_val_score.
  feats : list of column names in `df` used as the feature matrix.
  cv : value forwarded to cross_val_score's `cv` argument (default 3).
  scoring : value forwarded to cross_val_score's `scoring` argument
      (default 'neg_mean_absolute_error').

  Returns
  -------
  tuple (mean, std) of the per-fold scores. With the default scoring these
  are negated mean absolute errors, so values closer to zero are better.
  """
  X = df[feats].values
  y = df['price_value'].values
  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return np.mean(scores), np.std(scores)

## Decision Tree

In [12]:
# Depth-5 decision tree evaluated through run_model, which returns the mean
# and standard deviation of the cross-validation scores.
run_model(model=DecisionTreeRegressor(max_depth=5), feats=cat_feats)

(-19566.588937368324, 90.6181486516617)

## Random Forest

In [14]:
# Random forest of 50 depth-5 trees with a fixed seed, evaluated on all
# encoded features.
model = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=0)
run_model(model=model, feats=cat_feats)

(-18668.68348676454, 100.47250903832975)

## XGBoost

In [15]:
# Hyper-parameters used for the XGBoost experiments in this notebook.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    seed=0,
    learning_rate=0.1,
)

# XGBoost evaluated on all encoded features.
model = xgb.XGBRegressor(**xgb_params)
run_model(model=model, feats=cat_feats)



(-13039.290196724838, 109.36715375706265)

In [16]:
# Fit a single XGBoost model on the full X / y arrays (built from cat_feats
# earlier in the notebook).
m = xgb.XGBRegressor(**xgb_params)
m.fit(X, y)

# Permutation importance is computed on the same X / y the model was fitted on.
imp = PermutationImportance(m, random_state=0)
imp.fit(X, y)

# Column names are passed so the table shows feature names instead of indices.
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1209  ± 0.0019,param_napęd__cat
0.1175  ± 0.0030,param_rok-produkcji__cat
0.1113  ± 0.0013,param_stan__cat
0.0625  ± 0.0019,param_skrzynia-biegów__cat
0.0527  ± 0.0016,param_faktura-vat__cat
0.0461  ± 0.0015,param_moc__cat
0.0275  ± 0.0008,param_marka-pojazdu__cat
0.0230  ± 0.0004,param_typ__cat
0.0227  ± 0.0007,feature_kamera-cofania__cat
0.0191  ± 0.0007,param_pojemność-skokowa__cat


In [17]:
# Hand-picked subset of 20 encoded features. The first ten names match the
# permutation-importance table shown above.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji__cat',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'param_typ__cat',
    'feature_kamera-cofania__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'param_kod-silnika__cat',
    'param_model-pojazdu__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_wersja__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_regulowane-zawieszenie__cat',
    'feature_system-start-stop__cat',
    'feature_światła-led__cat',
]

len(feats)

20

In [18]:
# xgb_params is declared again here with the same values as in the earlier
# XGBoost cell.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    seed=0,
    learning_rate=0.1,
)

# XGBoost evaluated on the 20-feature subset only.
model = xgb.XGBRegressor(**xgb_params)
run_model(model=model, feats=feats)



(-13240.835942843716, 95.7039217631258)

In [0]:
# Replace the raw 'param_rok-produkcji' values with integers; entries whose
# string form is 'None' become -1. The '__cat' twin created earlier is untouched.
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(
    lambda year: int(year) if str(year) != 'None' else -1
)

In [20]:
# Same 20 features as before, except that the numeric 'param_rok-produkcji'
# column replaces its '__cat' encoding.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'param_typ__cat',
    'feature_kamera-cofania__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'param_kod-silnika__cat',
    'param_model-pojazdu__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_wersja__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_regulowane-zawieszenie__cat',
    'feature_system-start-stop__cat',
    'feature_światła-led__cat',
]

model = xgb.XGBRegressor(**xgb_params)
run_model(model=model, feats=feats)




(-11197.83713694348, 98.22041147876314)

In [0]:
# Replace the raw 'param_moc' text with an integer: the token before the first
# space is parsed, and entries whose string form is 'None' become -1.
# str(x) keeps the cell re-runnable: once the column holds ints, calling
# .split on them directly would raise AttributeError.
# NOTE(review): only the first space-separated token is used, so a value
# written with a space as thousands separator would be truncated — confirm
# against the data.
df['param_moc'] = df['param_moc'].map(
    lambda x: -1 if str(x) == 'None' else int(str(x).split(' ')[0])
)

In [24]:
# Same feature set as the previous experiment, except that the numeric
# 'param_moc' column replaces its '__cat' encoding.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',
    'param_marka-pojazdu__cat',
    'param_typ__cat',
    'feature_kamera-cofania__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'param_kod-silnika__cat',
    'param_model-pojazdu__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_wersja__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_regulowane-zawieszenie__cat',
    'feature_system-start-stop__cat',
    'feature_światła-led__cat',
]

model = xgb.XGBRegressor(**xgb_params)
run_model(model=model, feats=feats)




(-9602.94111071797, 57.96672683246094)

In [0]:
# Replace the raw 'param_pojemność-skokowa' text with an integer: everything
# before the first 'cm' is kept, spaces are removed, and the rest is parsed.
# Entries whose string form is 'None' become -1.
# str(x) keeps the cell re-runnable: once the column holds ints, calling
# .split on them directly would raise AttributeError.
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(
    lambda x: -1 if str(x) == 'None' else int(str(x).split('cm')[0].replace(' ', ''))
)

In [28]:
# Same feature set as the previous experiment, except that the numeric
# 'param_pojemność-skokowa' column replaces its '__cat' encoding.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',
    'param_marka-pojazdu__cat',
    'param_typ__cat',
    'feature_kamera-cofania__cat',
    'param_pojemność-skokowa',
    'seller_name__cat',
    'param_kod-silnika__cat',
    'param_model-pojazdu__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_wersja__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_regulowane-zawieszenie__cat',
    'feature_system-start-stop__cat',
    'feature_światła-led__cat',
]

model = xgb.XGBRegressor(**xgb_params)
run_model(model=model, feats=feats)



(-9449.513980284812, 81.47168211987172)