In [0]:
import os

!pip install --upgrade tables
!pip install eli5
!pip install xgboost
!pip install hyperopt

os.kill(os.getpid(), 9)

In [0]:
from google.colab import drive
ROOT_DIR = "/content/drive"
drive.mount(ROOT_DIR)

In [0]:
MATRIX_TWO_PATH = ROOT_DIR + "/My Drive/Colab Notebooks/matrix/matrix_two"

import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

from hyperopt import hp, fmin, tpe, STATUS_OK

import eli5
from eli5.sklearn import PermutationImportance

In [0]:
cd "{MATRIX_TWO_PATH}/dw_matrix_cars/"

## Importing data

In [0]:
df = pd.read_hdf('data/car.h5')

In [0]:
df.columns
# df = df[df['price_currency'] == 'PLN']

## Feature engineering

In [0]:
suffix_cat = '__cat'

for feat in df.columns:
  if isinstance(df[feat][0], list): continue
  factorized_values = df[feat].factorize()[0]
  if suffix_cat in feat:
    df[feat] = factorized_values
  else:
    df[feat + suffix_cat] = factorized_values

In [9]:
cat_feats = [x for x in df.columns if suffix_cat in x]
cat_feats = [x for x in cat_feats if 'price' not in x]

len(cat_feats)

151

In [0]:
def run_model(model, feats):
  X = df[feats].values
  y = df['price_value'].values

  scores = cross_val_score(model, X, y , cv=3, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

## Decision tree

In [0]:
run_model(DecisionTreeRegressor(max_depth=5), cat_feats)

(-19566.588937368328, 90.61814865166)

## Random forest

In [0]:
model = RandomForestRegressor(max_depth=5, n_estimators=50, random_state=0)
run_model(model, cat_feats)

(-18718.657185256638, 64.5424578125788)

## XGBoost

In [13]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0
}

xgb_model = xgb.XGBRegressor(**xgb_params)
run_model(xgb_model, cat_feats)



(-13108.379065811214, 74.32158265003798)

In [0]:
xgb_model.fit(X,y)

imp = PermutationImportance(xgb_model, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats, top=80)



Weight,Feature
0.1194  ± 0.0031,param_napęd__cat
0.1132  ± 0.0032,param_rok-produkcji__cat
0.1090  ± 0.0025,param_stan__cat
0.0619  ± 0.0025,param_skrzynia-biegów__cat
0.0568  ± 0.0016,param_faktura-vat__cat
0.0489  ± 0.0014,param_moc__cat
0.0273  ± 0.0007,param_marka-pojazdu__cat
0.0242  ± 0.0013,feature_kamera-cofania__cat
0.0212  ± 0.0008,param_typ__cat
0.0174  ± 0.0008,param_pojemność-skokowa__cat


In [0]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x) == 'None' else int(x))
df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x) == 'None' else int(x.split(' ')[0]))
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x) == 'None' else int(x.split('cm')[0].replace(' ', '')))
#df['param_liczba-miejsc'] = df['param_liczba-miejsc'].map(lambda x: -1 if str(x) == 'None' else int(x))

In [14]:
feats = [
  'param_napęd__cat',
  'param_rok-produkcji',
  'param_stan__cat',
  'param_skrzynia-biegów__cat',
  'param_faktura-vat__cat',
  'param_moc',
  'param_marka-pojazdu__cat',
  'feature_kamera-cofania__cat',
  'param_typ__cat',
  'param_pojemność-skokowa',
  'seller_name__cat',
  'feature_wspomaganie-kierownicy__cat',
  'param_model-pojazdu__cat',
  'param_wersja__cat',
  'param_kod-silnika__cat',
  'feature_system-start-stop__cat',
  'feature_asystent-pasa-ruchu__cat',
  'feature_czujniki-parkowania-przednie__cat',
  'feature_łopatki-zmiany-biegów__cat',
  'feature_regulowane-zawieszenie__cat',
  # 'feature_światła-led__cat',
  # 'param_uszkodzony__cat',
  # 'feature_klimatyzacja-czterostrefowa__cat',
  # 'feature_hud-(wyświetlacz-przezierny)__cat',
  # 'param_rodzaj-paliwa__cat',
  # 'param_liczba-miejsc',
]

run_model(xgb_model, feats)



(-9569.227198767323, 72.83561801421891)

In [16]:
df['param_liczba-miejsc'].unique()

array([None, '5', '7', '4', '2', '6', '8', '9', '3', '1'], dtype=object)

## Hyperopt

In [15]:
def obj_func(params):
    print("Training with params:")
    print(params)
    try:
      mean_mae, score_std = run_model(xgb.XGBRegressor(**params), feats)
      return{'loss': np.abs(mean_mae), 'status': STATUS_OK}
    except:
      return{'loss': np.abs(mean_mae), 'status': STATUS_FAIL}

xgb_reg_params = {
    "learning_rate": hp.choice("learning_rate", np.arange(0.05, 0.31, 0.05)),
    "max_depth": hp.choice("max_depth", np.arange(5, 16, 1, dtype=int)),
    "subsample": hp.quniform("subsample", 0.5, 1, 0.05),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.5, 1, 0.05),
    "objective": 'reg:squarederror',
    "n_estimators": 100,
    "seed": 0
}

best = fmin(obj_func, xgb_reg_params, algo=tpe.suggest, max_evals=25)
best

Training with params:
{'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}
Training with params:
{'colsample_bytree': 0.55, 'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}
Training with params:
{'colsample_bytree': 0.75, 'learning_rate': 0.15000000000000002, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9500000000000001}
Training with params:
{'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}
Training with params:
{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}
Training with p

{'colsample_bytree': 0.9500000000000001,
 'learning_rate': 2,
 'max_depth': 9,
 'subsample': 0.8500000000000001}