In [0]:
# Manual git workflow (set identity, pull, stage, commit, push).
# Everything is left commented out, so running this cell does nothing;
# uncomment individual lines and run them by hand when needed.
# !git config --global user.email "me@example.com"
# !git config --global user.name "nervuzz"
# !git status
# !git pull
# !git add .
# !git commit -m 'Day 3'
# !git status
# !git push

In [1]:
cd "drive/My Drive/Colab Notebooks/data_workshop_car/"

/content/drive/My Drive/Colab Notebooks/data_workshop_car


In [0]:
# Optional environment setup; uncomment on a runtime where these packages are missing.
# NOTE(review): consider `%pip install` with pinned versions so the install targets
# the running kernel and stays reproducible.
# !pip install --upgrade tables
# !pip install eli5
# !pip install xgboost

In [4]:
import eli5
import numpy as np
import pandas as pd
import xgboost as xgb
from eli5.sklearn import PermutationImportance
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold


Using TensorFlow backend.


# Data loading section

In [44]:
# Read the dataset from the HDF5 file (path is relative to the working directory)
# and display its (rows, columns) shape as the cell output.
df = pd.read_hdf(path_or_buf='data/car.h5')
df.shape

(106494, 155)

# Feature engineering

In [0]:
CAT_SUFFIX = '__cat'

# Encode every non-list column as integer codes using pandas `factorize`.
# A column whose name already contains the suffix is overwritten in place;
# any other column gets a sibling column named `<column>__cat`.
# Iterate over a snapshot of the names because the loop adds columns to `df`.
for feat in list(df.columns):
  # Positional access: `df[feat][0]` looks up the *label* 0 and breaks when the
  # index is not the default 0..n-1 range. Only the first row is inspected to
  # decide whether the column holds lists.
  if isinstance(df[feat].iloc[0], list):
    continue  # list-valued columns are left untouched

  factorized_values = df[feat].factorize()[0]
  if CAT_SUFFIX in feat:
    df[feat] = factorized_values
  else:
    df[feat + CAT_SUFFIX] = factorized_values

In [46]:
# Candidate model inputs: every column whose name contains the category suffix,
# except columns with 'price' in the name (case-insensitive) -- the models below
# predict `price_value`, so price-derived columns must not be used as features.
cat_feats = list(filter(
  lambda name: CAT_SUFFIX in name and 'price' not in str(name).lower(),
  df.columns,
))
len(cat_feats)

151

# run_model() function

In [0]:
def run_model(model, feats, target='price_value', cv=3, scoring='neg_mean_absolute_error'):
  """Cross-validate `model` on the selected columns of the global `df`.

  Args:
    model: estimator passed to `cross_val_score`.
    feats: list of `df` column names used as the feature matrix.
    target: name of the `df` column used as the target (default 'price_value').
    cv: cross-validation setting forwarded to `cross_val_score` (default 3).
    scoring: scoring name forwarded to `cross_val_score`. With the default
      'neg_mean_absolute_error' the scores are negated errors, so values
      closer to zero are better.

  Returns:
    Tuple `(mean, std)` of the per-fold scores.
  """
  # The current state of the global `df` is read at call time, so columns
  # changed by earlier cells are picked up here.
  X = df[feats].values
  y = df[target].values
  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return np.mean(scores), np.std(scores)

# DecisionTree

In [10]:
# Single shallow decision tree evaluated on all factorized features.
# random_state is fixed (as in the RandomForest and XGBoost cells) because
# scikit-learn breaks ties between equally good splits at random, so the
# score could otherwise differ between runs.
model = DecisionTreeRegressor(max_depth=5, random_state=0)

run_model(model, cat_feats)

(-19695.13091100928, 148.72570644015792)

# RandomForest

In [11]:
# Random forest of 50 depth-5 trees with a fixed seed, scored on the same
# feature set through run_model.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=0,
)
run_model(model=model, feats=cat_feats)

(-18718.657185256638, 64.5424578125788)

# XGBoost

In [14]:
# Gradient-boosted trees with the same depth and tree count as the random
# forest above. The parameters are kept in a dict and unpacked into the
# estimator; `model` stays bound to this regressor for the following cells.
xgb_params = dict(max_depth=5, n_estimators=50, learning_rate=0.1, seed=0)
model = xgb.XGBRegressor(**xgb_params)
run_model(model=model, feats=cat_feats)



(-13108.379065811214, 74.32158265003798)

# Feature evaluation by XGBoost

In [17]:
# Fit the current `model` on the full data set, then rank features by
# permutation importance computed on that same data.
X, y = df[cat_feats].values, df['price_value'].values
model.fit(X, y)

# fit() returns the estimator itself, so `imp` is the fitted importance object.
imp = PermutationImportance(model, random_state=0)
imp.fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1194  ± 0.0031,param_napęd__cat
0.1132  ± 0.0032,param_rok-produkcji__cat
0.1090  ± 0.0025,param_stan__cat
0.0619  ± 0.0025,param_skrzynia-biegów__cat
0.0568  ± 0.0016,param_faktura-vat__cat
0.0489  ± 0.0014,param_moc__cat
0.0273  ± 0.0007,param_marka-pojazdu__cat
0.0242  ± 0.0013,feature_kamera-cofania__cat
0.0212  ± 0.0008,param_typ__cat
0.0174  ± 0.0008,param_pojemność-skokowa__cat


# XGBoost attempt 2 (only 20 feats)

In [0]:
# Hand-picked subset of 20 factorized features used for the next experiment.
# Editor tip: multi-cursor selection (select a character, Ctrl+D, then move
# with the arrow keys) makes editing a list like this easier.
feats20 = [
    'param_napęd__cat',
    'param_rok-produkcji__cat',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [49]:
# Cross-validate the current `model` using only the 20-feature subset.
run_model(model=model, feats=feats20)



(-13375.230420852275, 65.40441107118909)

# XGBoost attempt 3 (production year used as a number instead of category codes)

In [0]:
# Turn the production year into an integer column; entries whose string form
# is 'None' become the sentinel -1.
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(
  lambda year: int(year) if str(year) != 'None' else -1
)

In [0]:
# Same 20 features as before, but the production year is taken from the raw
# 'param_rok-produkcji' column instead of its factorized '__cat' counterpart.
feats20_year = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [54]:
# Cross-validate the current `model` with the integer production-year column.
run_model(model=model, feats=feats20_year)



(-11308.885890938496, 27.868488259630677)

# XGBoost attempt 4 (horsepower parsed into a numeric feature)

In [0]:
# Parse the power column into an integer: spaces are removed and everything
# from 'KM' onward is dropped; entries whose string form is 'None' become -1.
# The value is passed through str() first so the cell can be re-run safely:
# after the first run the column already holds ints, which have no .replace().
df['param_moc'] = df['param_moc'].map(
  lambda x: -1 if str(x) == 'None' else int(str(x).replace(' ', '').split('KM')[0])
)

In [0]:
# Feature list for attempt 4: the raw 'param_rok-produkcji' and 'param_moc'
# columns come first, followed by the remaining factorized features.
feats20_year_hp = [
    'param_rok-produkcji',
    'param_moc',
    'param_napęd__cat',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [62]:
# Cross-validate the current `model` with integer year and horsepower columns.
run_model(model=model, feats=feats20_year_hp)



(-9754.638210376661, 78.48987960994316)

# XGBoost attempt 5 (engine size parsed into a numeric feature)

In [0]:
# Parse the engine-size column into an integer: spaces are removed and
# everything from 'cm' onward is dropped; entries whose string form is 'None'
# become -1.
# The value is passed through str() first so the cell can be re-run safely:
# after the first run the column already holds ints, which have no .replace().
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(
  lambda x: -1 if str(x) == 'None' else int(str(x).replace(' ', '').split('cm')[0])
)

In [0]:
# Feature list for attempt 5: the raw year, power and engine-size columns come
# first, followed by the remaining factorized features.
feats20_year_hp_cap = [
    'param_rok-produkcji',
    'param_moc',
    'param_pojemność-skokowa',
    'param_napęd__cat',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

In [67]:
# Cross-validate the current `model` with integer year, horsepower and engine-size columns.
run_model(model=model, feats=feats20_year_hp_cap)



(-9620.550798000319, 100.27045195136738)