In [1]:
import pandas as pd
import numpy as np

import eli5
from eli5.sklearn import PermutationImportance

import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold



In [2]:
carDf = pd.read_hdf('../data/car.h5')
rowsCount, columnsCount = carDf.shape

columns = carDf.columns

## Feature Engineering

In [3]:
SUFFIX_CAT = '__cat'
for feature in carDf.columns:
    if isinstance(carDf[feature][0], list):
        continue
    
    factorizedValues = carDf[feature].factorize()[0]
    
    if SUFFIX_CAT in feature:
        carDf[feature] = factorizedValues
    else:
        column = f'{feature}{SUFFIX_CAT}'
        carDf[column] = factorizedValues

In [4]:
catFeatures = [column for column in carDf.columns if SUFFIX_CAT in column]
catFeatures = [column for column in catFeatures if 'price' not in column]

carDf['param_rok-produkcji'] = carDf['param_rok-produkcji'].map(lambda x: -1 if str(x) == 'None' else int(x))
carDf['param_moc'] = carDf['param_moc'].map(lambda x: -1 if str(x) == 'None' else int(x.split(' ')[0]))
carDf['param_pojemność-skokowa'] = carDf['param_pojemność-skokowa'].map(lambda x: -1 if str(x) == 'None' else str(x).split('cm3')[0].replace(' ', ''))

features = [
    'param_napęd__cat',
    'param_stan__cat',
    'param_rok-produkcji',
    'param_faktura-vat__cat',
    'param_moc',
    'param_skrzynia-biegów__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa',
    'seller_name__cat',
    'param_wersja__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'feature_system-start-stop__cat',
    'param_kod-silnika__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_światła-led__cat',
    'feature_czujniki-parkowania-przednie__cat'
    ]

In [7]:
def runModel(aModel, aFeatures):
    xData = carDf[aFeatures].values
    yData = carDf['price_value'].values

    scores = cross_val_score(aModel, xData, yData, cv = 3, scoring = 'neg_mean_absolute_error')
    return np.mean(scores), np.std(scores)

In [8]:
xgbParams = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0
}

In [10]:
runModel(xgb.XGBRegressor(**xgbParams), features)

(-9556.21383364917, 101.81252798368146)

## Hyperopt

In [21]:
def objFunc(aParams):
    print(f'Training with params: {aParams}')
    
    meanMae, scoreStd = runModel(xgb.XGBRegressor(**aParams), features)
    
    return {'loss': np.abs(meanMae), 'status': STATUS_OK}

In [22]:
xgbRegParams = {
    'learning_rate': hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    'max_depth': hp.choice('max_depth', np.arange(5, 16, 1, dtype = int)),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'seed': 0
}

In [23]:
best = fmin(objFunc, xgbRegParams, algo = tpe.suggest, max_evals = 25)

Training with params: {'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.25, 'max_depth': 10, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}
Training with params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.6000000000000001}
Training with params: {'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}
Training with params: {'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.3, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.8500000000000001}
Training with params: {'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.70

In [24]:
best

{'colsample_bytree': 0.9, 'learning_rate': 1, 'max_depth': 9, 'subsample': 0.8}