### Imports

In [0]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score

from hyperopt import hp, fmin, tpe, STATUS_OK

import eli5
from eli5.sklearn import PermutationImportance

### Data Loading

In [7]:
# Read the dataset from the HDF5 file shipped next to the notebook.
df = pd.read_hdf(path_or_buf='data/car.h5')
# Show (rows, columns) as a quick sanity check of what was loaded.
df.shape

(106494, 155)

### Feature Engineering

In [0]:
SUFFIX_CAT = "_cat"

# Integer-encode every column with `factorize()`.
# A column whose name already contains the suffix is overwritten in place;
# any other column keeps its original values and gains a `<name>_cat` sibling.
for feat in df.columns:
  # Columns holding Python lists are skipped (lists are unhashable and cannot
  # be factorized). Only the first row is inspected. `.iloc[0]` is positional,
  # unlike `[0]`, which is a label lookup and fails when the index has no
  # label 0 (e.g. after filtering or with a non-integer index).
  if isinstance(df[feat].iloc[0], list): continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [9]:
# Candidate features: every encoded column, except anything whose name
# mentions the price (the target is `price_value`, so those would leak it).
cat_feats = [
    col for col in df.columns
    if SUFFIX_CAT in col and 'price' not in col
]
len(cat_feats)

151

In [0]:
def run_model(model, feats, cv=3, scoring='neg_mean_absolute_error', data=None):
  """Cross-validate `model` on the given feature columns against `price_value`.

  Args:
    model: estimator accepted by `cross_val_score`.
    feats: list of column names used to build the feature matrix.
    cv: cross-validation setting forwarded to `cross_val_score`
      (default 3, the value previously hard-coded).
    scoring: scoring name forwarded to `cross_val_score`. With the default
      the returned scores are negated mean absolute errors.
    data: DataFrame to read from. When omitted, the notebook-level `df`
      is used, which preserves the original behaviour.

  Returns:
    Tuple `(mean, std)` of the per-fold scores.
  """
  # Fall back to the global frame only when the caller did not supply one, so
  # existing two-argument calls keep working unchanged.
  frame = df if data is None else data
  X = frame[feats].values
  y = frame['price_value'].values
  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return np.mean(scores), np.std(scores)

### XGBoost

In [11]:
# Baseline XGBoost settings, reused by the experiments in the following cells.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.1,
    seed=0,
)

# Baseline score using all encoded columns.
run_model(xgb.XGBRegressor(**xgb_params), cat_feats)



(-13108.379065811214, 74.32158265003798)

In [0]:
# Feature matrix (all encoded columns) and target vector as NumPy arrays.
X, y = df[cat_feats].values, df['price_value'].values

In [13]:
# Reduced feature set: 20 of the integer-encoded columns.
# The order is kept as-is because it determines the column order of the
# feature matrix built in `run_model`.
feats = [
    'param_napęd_cat',
    'param_rok-produkcji_cat',
    'param_stan_cat',
    'param_skrzynia-biegów_cat',
    'param_faktura-vat_cat',
    'param_moc_cat',
    'param_marka-pojazdu_cat',
    'feature_kamera-cofania_cat',
    'param_typ_cat',
    'param_pojemność-skokowa_cat',
    'seller_name_cat',
    'feature_wspomaganie-kierownicy_cat',
    'param_model-pojazdu_cat',
    'param_wersja_cat',
    'param_kod-silnika_cat',
    'feature_system-start-stop_cat',
    'feature_asystent-pasa-ruchu_cat',
    'feature_czujniki-parkowania-przednie_cat',
    'feature_łopatki-zmiany-biegów_cat',
    'feature_regulowane-zawieszenie_cat',
]
len(feats)

20

In [14]:
# Score the baseline configuration on the reduced feature set.
run_model(model=xgb.XGBRegressor(**xgb_params), feats=feats)



(-13375.230420852275, 65.40441107118909)

In [15]:
# Replace the raw production-year values with real integers so the model sees
# their numeric ordering; entries whose string form is 'None' become -1.
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(
    lambda year: int(year) if str(year) != 'None' else -1
)
df['param_rok-produkcji'].unique()

array([2018, 2011, 2015, 2009, 2017, 2012, 2013, 2007, 2001, 2016, 2006,
       2008, 2004, 1999, 2000, 2010, 2005, 2002, 1998, 2014, 2003, 1982,
       1995, 1997, 1992, 1993, 1994, 1996, 1989, 1988, 1967, 1987, 1970,
       1959, 1990, 1991, 1974,   -1, 1975, 1973, 1953, 1985, 1984, 1986,
       1981, 1979, 1960, 1983, 1978, 1964, 1980, 1972, 1969, 1956, 1966,
       1977, 1962, 1965, 1971, 1963, 1961, 1952, 1949, 1976, 1937, 1968,
       1958, 1955, 1933, 1929, 1957, 1944, 1954, 1932, 1936, 1947, 1948])

In [16]:
# Same feature set as before, but the production year is now the numeric
# column ('param_rok-produkcji') instead of its factorized '_cat' version.
feats = [
    'param_napęd_cat',
    'param_rok-produkcji',
    'param_stan_cat',
    'param_skrzynia-biegów_cat',
    'param_faktura-vat_cat',
    'param_moc_cat',
    'param_marka-pojazdu_cat',
    'feature_kamera-cofania_cat',
    'param_typ_cat',
    'param_pojemność-skokowa_cat',
    'seller_name_cat',
    'feature_wspomaganie-kierownicy_cat',
    'param_model-pojazdu_cat',
    'param_wersja_cat',
    'param_kod-silnika_cat',
    'feature_system-start-stop_cat',
    'feature_asystent-pasa-ruchu_cat',
    'feature_czujniki-parkowania-przednie_cat',
    'feature_łopatki-zmiany-biegów_cat',
    'feature_regulowane-zawieszenie_cat',
]

run_model(xgb.XGBRegressor(**xgb_params), feats)



(-11308.885890938496, 27.868488259630677)

In [0]:
# Convert `param_moc` to an integer; entries whose string form is 'None'
# become -1. Values are presumably strings such as '90 KM' (TODO confirm): the
# text before the first 'K' is kept and embedded spaces are removed.
# Parsing goes through str(x) so the cell can be re-run safely: once the column
# already holds ints, calling x.split(...) directly raised AttributeError.
df['param_moc'] = df['param_moc'].map(
    lambda x: -1 if str(x) == 'None' else int(str(x).split('K')[0].replace(" ", "")))

In [18]:
# Inspect the distinct values after the conversion.
df['param_moc'].unique()

array([    90,    115,    262,    110,    310,    105,    140,    175,
          125,    185,    190,    440,    141,    200,    224,     75,
           99,    184,    109,    233,    116,     68,    286,    126,
          160,    135,    120,    272,     -1,    150,    180,    136,
          102,    131,    218,    245,    170,    112,    250,    252,
           73,    100,    313,    101,    285,     70,    383,    174,
          277,    132,    130,    215,     60,    330,    163,    177,
           98,     78,    189,    156,    143,     69,    113,     65,
          122,     82,    251,     95,    197,    235,    238,    171,
          381,    400,    178,     80,    165,     85,    258,    142,
          204,    124,     55,    144,    231,    248,    152,    181,
          210,    340,    129,    147,     50,     54,    290,    306,
          193,     77,    164,     96,    194,    111,    166,    206,
          118,    360,    211,    271,    455,    280,    106,    114,
      

In [19]:
# Feature set with two numeric columns: production year and 'param_moc' now
# replace their factorized '_cat' counterparts.
feats = [
    'param_napęd_cat',
    'param_rok-produkcji',
    'param_stan_cat',
    'param_skrzynia-biegów_cat',
    'param_faktura-vat_cat',
    'param_moc',
    'param_marka-pojazdu_cat',
    'feature_kamera-cofania_cat',
    'param_typ_cat',
    'param_pojemność-skokowa_cat',
    'seller_name_cat',
    'feature_wspomaganie-kierownicy_cat',
    'param_model-pojazdu_cat',
    'param_wersja_cat',
    'param_kod-silnika_cat',
    'feature_system-start-stop_cat',
    'feature_asystent-pasa-ruchu_cat',
    'feature_czujniki-parkowania-przednie_cat',
    'feature_łopatki-zmiany-biegów_cat',
    'feature_regulowane-zawieszenie_cat',
]

run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9754.814038220027, 78.76301552059402)

In [20]:
# Convert `param_pojemność-skokowa` to an integer; entries whose string form is
# 'None' become -1. Values are presumably strings such as '1 598 cm3'
# (TODO confirm): the text before the first 'cm' is kept and embedded spaces
# are removed.
# Parsing goes through str(x) so the cell can be re-run safely: once the column
# already holds ints, calling x.split(...) directly raised AttributeError.
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(
    lambda x: -1 if str(x) == 'None' else int(str(x).split('cm')[0].replace(" ", "")))

# Feature set with three numeric columns (production year, 'param_moc',
# 'param_pojemność-skokowa'); everything else stays factorized.
feats = ['param_napęd_cat',
'param_rok-produkcji',
'param_stan_cat',
'param_skrzynia-biegów_cat',
'param_faktura-vat_cat',
'param_moc',
'param_marka-pojazdu_cat',
'feature_kamera-cofania_cat',
'param_typ_cat',
'param_pojemność-skokowa',
'seller_name_cat',
'feature_wspomaganie-kierownicy_cat',
'param_model-pojazdu_cat',
'param_wersja_cat',
'param_kod-silnika_cat',
'feature_system-start-stop_cat',
'feature_asystent-pasa-ruchu_cat',
'feature_czujniki-parkowania-przednie_cat',
'feature_łopatki-zmiany-biegów_cat',
'feature_regulowane-zawieszenie_cat']

run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9621.119663721702, 100.59307679134791)

### Hyperopt

In [21]:
def obj_func(params):
  """Objective for hyperopt: cross-validated error of XGBoost with `params`.

  Args:
    params: dict of keyword arguments passed to `xgb.XGBRegressor`.

  Returns:
    Dict with `loss` (absolute value of the mean score from `run_model`,
    evaluated on the notebook-level `feats`) and `status` set to STATUS_OK.
  """
  print('Training with params: ')
  print(params)

  # Only the mean is used; the spread across folds is discarded.
  cv_mean, _ = run_model(xgb.XGBRegressor(**params), feats)

  # `run_model` scores are negated errors, so take the magnitude as the loss.
  return dict(loss=np.abs(cv_mean), status=STATUS_OK)

# Search space: two discrete choices, two quantized uniform ranges (step 0.05),
# and three fixed settings passed through unchanged to every trial.
xgb_reg_params = dict(
  learning_rate=hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
  max_depth=hp.choice('max_depth', np.arange(5,16,1, dtype=int)),
  subsample=hp.quniform('subsample', 0.5, 1, 0.05),
  colsample_bytree=hp.quniform('colsample_bytree', 0.5, 1, 0.05),
  objective='reg:squarederror',
  n_estimators=100,
  seed=0,
)

# Run 25 TPE-guided trials over the search space.
# NOTE: for `hp.choice` dimensions (learning_rate, max_depth) the dict returned
# by `fmin` holds the index of the chosen option, not the option itself; map it
# back through the space (e.g. with hyperopt's `space_eval`) before reuse.
# NOTE(review): no `rstate` is passed, so the search is presumably not
# reproducible from run to run — confirm whether that matters here.
best = fmin(
    fn=obj_func,
    space=xgb_reg_params,
    algo=tpe.suggest,
    max_evals=25,
)

Training with params: 
{'colsample_bytree': 1.0, 'learning_rate': 0.3, 'max_depth': 14, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.55}
Training with params: 
{'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 12, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 1.0}
Training with params: 
{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.55}
Training with params: 
{'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}
Training with params: 
{'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}
Training with params: 
{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators'