In [77]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost

Requirement already up-to-date: tables in /usr/local/lib/python3.6/dist-packages (3.6.1)


In [0]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

In [79]:
cd '/content/drive/My Drive/Colab Notebooks/dw_matrix'

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
df = pd.read_hdf('data/car.h5')

In [0]:
# Keep only offers that are not priced in EUR. Take an explicit copy: the frame is
# modified in later cells (renames, new columns), and doing that on a boolean-mask
# slice raises SettingWithCopyWarning and leaves view/copy semantics ambiguous.
df = df[df['price_currency'] != 'EUR'].copy()

In [0]:
# Replace the hyphen with an underscore in the two column names listed below.
renamed_columns = {
    'param_model-pojazdu': 'param_model_pojazdu',
    'param_liczba-drzwi': 'param_liczba_drzwi',
}
df = df.rename(columns=renamed_columns)

In [0]:
# Missing door counts become -1 so the column can be stored as int8.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment and may only change a temporary object.
df['param_liczba_drzwi'] = df['param_liczba_drzwi'].fillna(-1).astype(np.int8)

In [0]:
# Suffix marking columns that hold factorized (integer-coded) values.
suffix = '__cat'

# Integer-encode every column. Columns already carrying the suffix are
# re-encoded in place; all others get a sibling column named <column>__cat.
for feat in df.columns:
  # Columns whose cells are lists are skipped. The first row is probed by
  # position (.iloc) rather than by label 0, because the row labelled 0 is not
  # guaranteed to exist after rows have been filtered out of the frame.
  if isinstance(df[feat].iloc[0], list):
    continue
  factorized_vals = df[feat].factorize()[0]
  if suffix in feat:
    df[feat] = factorized_vals
  else:
    df[feat + suffix] = factorized_vals

In [0]:
# All factorized columns except those with 'price' in the name
# (presumably to keep the price target out of the model inputs).
cat_feats = [col for col in df.columns if suffix in col and 'price' not in col]

In [0]:
def run_model(model, feats, target='price_value', cv=3,
              scoring='neg_mean_absolute_error', data=None):
  """Cross-validate ``model`` and summarise the fold scores.

  Parameters
  ----------
  model : estimator handed unchanged to ``cross_val_score``.
  feats : list of column names that form the feature matrix.
  target : name of the column to predict (default ``'price_value'``).
  cv : forwarded to ``cross_val_score`` (default 3).
  scoring : forwarded to ``cross_val_score``
      (default ``'neg_mean_absolute_error'``).
  data : frame to read ``feats`` and ``target`` from. When omitted, the
      notebook-level ``df`` is used, which matches the original behaviour.

  Returns
  -------
  tuple
      ``(mean, std)`` of the per-fold scores.
  """
  # Resolve the frame at call time so the current notebook-level df is picked up
  # unless the caller passes one explicitly.
  frame = df if data is None else data
  X = frame[feats].values
  y = frame[target].values

  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return np.mean(scores), np.std(scores)

### Decision Tree Model

In [86]:
# Seed the tree like the other models in this notebook: scikit-learn permutes the
# features at every split, so equally good splits can otherwise be resolved
# differently from run to run.
run_model(DecisionTreeRegressor(max_depth=5, random_state=2020), cat_feats)

(-19566.588937368324, 90.6181486516617)

### Random Forest Model

In [87]:
# 50 depth-limited trees with a fixed seed, scored on the factorized features.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=2020,
)
run_model(model, cat_feats)

(-18801.110116978012, 138.10063905479723)

### XGBoost Model

In [88]:
# Shared XGBoost settings; later cells reuse this dict.
# `random_state` is the seed argument of the scikit-learn style wrapper (and the
# name used for RandomForestRegressor in this notebook); `seed` is the legacy
# spelling.
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'random_state': 2020,
}

model = xgb.XGBRegressor(**xgb_params)
run_model(model, cat_feats)



(-13039.290196724838, 109.36715375706265)

In [89]:
# Fit one booster on every row and rank the factorized features by permutation
# importance.
# NOTE(review): the importances are measured on the same rows the model was
# fitted on, not on held-out data.
X, y = df[cat_feats].values, df['price_value'].values

m = xgb.XGBRegressor(**xgb_params).fit(X, y)

imp = PermutationImportance(m, random_state=2020)
imp.fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1206  ± 0.0012,param_napęd__cat
0.1182  ± 0.0019,param_rok-produkcji__cat
0.1112  ± 0.0017,param_stan__cat
0.0639  ± 0.0021,param_skrzynia-biegów__cat
0.0532  ± 0.0014,param_faktura-vat__cat
0.0457  ± 0.0014,param_moc__cat
0.0273  ± 0.0009,param_marka-pojazdu__cat
0.0234  ± 0.0004,param_typ__cat
0.0231  ± 0.0007,feature_kamera-cofania__cat
0.0191  ± 0.0023,param_pojemność-skokowa__cat


In [0]:
# Hand-picked feature set. Names are listed without the '__cat' suffix; at this
# stage every column is used in its factorized form, so the suffix is appended
# to all of them.
base_cols = [
    'param_napęd', 'param_rok-produkcji', 'param_stan', 'param_skrzynia-biegów',
    'param_faktura-vat', 'param_moc', 'param_marka-pojazdu', 'param_typ',
    'feature_kamera-cofania', 'param_pojemność-skokowa', 'seller_name',
    'param_kod-silnika', 'param_model_pojazdu', 'feature_wspomaganie-kierownicy',
    'param_wersja', 'feature_czujniki-parkowania-przednie',
    'feature_asystent-pasa-ruchu', 'feature_system-start-stop',
    'feature_regulowane-zawieszenie', 'feature_światła-led',
]
feats = [col + '__cat' for col in base_cols]

In [91]:
run_model(model, feats)



(-13240.835942843716, 95.7039217631258)

In [92]:
df['param_rok-produkcji'].unique()

array(['2018', '2011', '2015', '2009', '2017', '2012', '2013', '2007',
       '2001', '2016', '2006', '2008', '2004', '1999', '2000', '2010',
       '2005', '2002', '1998', '2014', '2003', '1982', '1995', '1997',
       '1992', '1993', '1994', '1996', '1989', '1988', '1967', '1987',
       '1959', '1990', '1991', '1974', None, '1975', '1973', '1985',
       '1984', '1986', '1981', '1979', '1960', '1983', '1978', '1964',
       '1980', '1972', '1969', '1956', '1966', '1977', '1971', '1963',
       '1953', '1961', '1952', '1949', '1976', '1965', '1937', '1968',
       '1958', '1962', '1955', '1970', '1933', '1929', '1957', '1944',
       '1954', '1932', '1936', '1947', '1948'], dtype=object)

In [0]:
def _parse_year(raw):
  """Convert a production-year value to int; a missing value (None) becomes -1."""
  if str(raw) == 'None':
    return -1
  return int(raw)

df['param_rok-produkcji'] = df['param_rok-produkcji'].map(_parse_year)

In [94]:
df['param_rok-produkcji'].unique()

array([2018, 2011, 2015, 2009, 2017, 2012, 2013, 2007, 2001, 2016, 2006,
       2008, 2004, 1999, 2000, 2010, 2005, 2002, 1998, 2014, 2003, 1982,
       1995, 1997, 1992, 1993, 1994, 1996, 1989, 1988, 1967, 1987, 1959,
       1990, 1991, 1974,   -1, 1975, 1973, 1985, 1984, 1986, 1981, 1979,
       1960, 1983, 1978, 1964, 1980, 1972, 1969, 1956, 1966, 1977, 1971,
       1963, 1953, 1961, 1952, 1949, 1976, 1965, 1937, 1968, 1958, 1962,
       1955, 1970, 1933, 1929, 1957, 1944, 1954, 1932, 1936, 1947, 1948])

In [0]:
# Same feature set as before, except that the production year is now used as a
# parsed number instead of its factorized '__cat' column.
numeric_cols = {'param_rok-produkcji'}
base_cols = [
    'param_napęd', 'param_rok-produkcji', 'param_stan', 'param_skrzynia-biegów',
    'param_faktura-vat', 'param_moc', 'param_marka-pojazdu', 'param_typ',
    'feature_kamera-cofania', 'param_pojemność-skokowa', 'seller_name',
    'param_kod-silnika', 'param_model_pojazdu', 'feature_wspomaganie-kierownicy',
    'param_wersja', 'feature_czujniki-parkowania-przednie',
    'feature_asystent-pasa-ruchu', 'feature_system-start-stop',
    'feature_regulowane-zawieszenie', 'feature_światła-led',
]
feats = [col if col in numeric_cols else col + '__cat' for col in base_cols]

In [96]:
run_model(model, feats)



(-11197.83713694348, 98.22041147876314)

In [97]:
df['param_moc'].unique()

array(['90 KM', '115 KM', '262 KM', '110 KM', '310 KM', '105 KM',
       '140 KM', '175 KM', '125 KM', '185 KM', '190 KM', '440 KM',
       '141 KM', '200 KM', '224 KM', '75 KM', '99 KM', '184 KM', '109 KM',
       '233 KM', '116 KM', '68 KM', '286 KM', '126 KM', '160 KM',
       '135 KM', '120 KM', '272 KM', None, '150 KM', '180 KM', '136 KM',
       '102 KM', '131 KM', '218 KM', '245 KM', '170 KM', '112 KM',
       '250 KM', '252 KM', '73 KM', '100 KM', '313 KM', '101 KM',
       '285 KM', '70 KM', '383 KM', '174 KM', '277 KM', '132 KM',
       '130 KM', '215 KM', '60 KM', '330 KM', '163 KM', '177 KM', '98 KM',
       '78 KM', '189 KM', '156 KM', '143 KM', '69 KM', '113 KM', '65 KM',
       '122 KM', '82 KM', '251 KM', '95 KM', '197 KM', '235 KM', '238 KM',
       '171 KM', '381 KM', '400 KM', '178 KM', '80 KM', '165 KM', '85 KM',
       '258 KM', '142 KM', '204 KM', '124 KM', '55 KM', '144 KM',
       '231 KM', '248 KM', '152 KM', '181 KM', '210 KM', '340 KM',
       '129 KM', '147 

In [0]:
def _parse_power(raw):
  """Turn a power label such as '90 KM' into an int; a missing value (None) becomes -1."""
  if str(raw) == 'None':
    return -1
  # Go through str() so the cell can be re-run: values that were already parsed
  # to int have no .split() and would otherwise raise AttributeError.
  return int(str(raw).split(' ')[0])

df['param_moc'] = df['param_moc'].map(_parse_power)

In [0]:
# Same feature set as before, with production year and engine power used as
# parsed numbers instead of their factorized '__cat' columns.
numeric_cols = {'param_rok-produkcji', 'param_moc'}
base_cols = [
    'param_napęd', 'param_rok-produkcji', 'param_stan', 'param_skrzynia-biegów',
    'param_faktura-vat', 'param_moc', 'param_marka-pojazdu', 'param_typ',
    'feature_kamera-cofania', 'param_pojemność-skokowa', 'seller_name',
    'param_kod-silnika', 'param_model_pojazdu', 'feature_wspomaganie-kierownicy',
    'param_wersja', 'feature_czujniki-parkowania-przednie',
    'feature_asystent-pasa-ruchu', 'feature_system-start-stop',
    'feature_regulowane-zawieszenie', 'feature_światła-led',
]
feats = [col if col in numeric_cols else col + '__cat' for col in base_cols]

In [100]:
run_model(model, feats)



(-9602.94111071797, 57.96672683246094)

In [101]:
df['param_pojemność-skokowa'].unique()

array(['898 cm3', '1 560 cm3', '3 000 cm3', ..., '5 992 cm3', '1 966 cm3',
       '142 280 cm3'], dtype=object)

In [0]:
def _parse_displacement(raw):
  """Turn a label such as '1 560 cm3' into the int 1560; a missing value (None) becomes -1."""
  if str(raw) == 'None':
    return -1
  # Keep the part before the unit and drop the spaces inside the number.
  digits = str(raw).split('cm')[0].replace(' ', '')
  return int(digits)

df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(_parse_displacement)

In [103]:
df['param_pojemność-skokowa'].unique()

array([   898,   1560,   3000, ...,   5992,   1966, 142280])

In [0]:
# Same feature set as before, with production year, engine power and engine
# displacement used as parsed numbers instead of their factorized '__cat' columns.
numeric_cols = {'param_rok-produkcji', 'param_moc', 'param_pojemność-skokowa'}
base_cols = [
    'param_napęd', 'param_rok-produkcji', 'param_stan', 'param_skrzynia-biegów',
    'param_faktura-vat', 'param_moc', 'param_marka-pojazdu', 'param_typ',
    'feature_kamera-cofania', 'param_pojemność-skokowa', 'seller_name',
    'param_kod-silnika', 'param_model_pojazdu', 'feature_wspomaganie-kierownicy',
    'param_wersja', 'feature_czujniki-parkowania-przednie',
    'feature_asystent-pasa-ruchu', 'feature_system-start-stop',
    'feature_regulowane-zawieszenie', 'feature_światła-led',
]
feats = [col if col in numeric_cols else col + '__cat' for col in base_cols]

In [105]:
run_model(model, feats)



(-9449.513980284812, 81.47168211987172)

In [0]:
# Some cleaning
# df.loc[df.param_typ == 'compact/sedan', 'param_typ'] = 'Sedan'

In [143]:
!pwd

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [144]:
cd '/content/drive/My Drive/Colab Notebooks/dw_matrix/data'

/content/drive/My Drive/Colab Notebooks/dw_matrix/data


In [172]:
df.to_hdf('car_data.h5', key='df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['breadcrumb', 'created_at', 'price_currency', 'price_details', 'price_value', 'seller_address', 'seller_name', 'seller_type', 'param_liczba-miejsc', 'param_uszkodzony', 'param_marka-pojazdu', 'param_model', 'param_engine-code', 'param_first-registration', 'param_country-of-origin', 'param_pierwsza-rejestracja', 'param_gearbox', 'param_przebieg', 'param_faktura-vat', 'param_vin', 'param_perłowy', 'param_homologacja-ciężarowa', 'param_service-record', 'param_metallic', 'param_leasing-concession', 'param_color', 'param_financing-option', 'param_original-owner', 'param_vat-marża', 'param_kategoria', 'param_co2-emissions', 'param_leasing', 'param_mileage', 'param_zarejestrowany-jako-zabytek', 'param_napęd', 'param_wartość-wykupu', 'param_typ', 'param_metalik', 'param_pierwszy-właściciel', 'param_skrzynia-biegów', 'param_vat-discount', 'pa

In [0]:
df = pd.read_hdf('car_data.h5')

In [0]:
# Recompute the integer codes for the body-type column on the reloaded frame.
typ_codes = df['param_typ'].factorize()[0]
df['param_typ__cat'] = typ_codes

In [0]:
# Final feature set: production year, engine power and engine displacement as
# parsed numbers, every other column in its factorized '__cat' form.
numeric_cols = {'param_rok-produkcji', 'param_moc', 'param_pojemność-skokowa'}
base_cols = [
    'param_napęd', 'param_rok-produkcji', 'param_stan', 'param_skrzynia-biegów',
    'param_faktura-vat', 'param_moc', 'param_marka-pojazdu', 'param_typ',
    'feature_kamera-cofania', 'param_pojemność-skokowa', 'seller_name',
    'param_kod-silnika', 'param_model_pojazdu', 'feature_wspomaganie-kierownicy',
    'param_wersja', 'feature_czujniki-parkowania-przednie',
    'feature_asystent-pasa-ruchu', 'feature_system-start-stop',
    'feature_regulowane-zawieszenie', 'feature_światła-led',
]
feats = [col if col in numeric_cols else col + '__cat' for col in base_cols]

In [179]:
run_model(model, feats)



(-9442.720742888154, 31.435447493587677)

In [180]:
# Refit the booster on the hand-picked feature set and rank those features by
# permutation importance.
# NOTE(review): the importances are measured on the same rows the model was
# fitted on, not on held-out data.
X, y = df[feats].values, df['price_value'].values

m = xgb.XGBRegressor(**xgb_params).fit(X, y)

imp = PermutationImportance(m, random_state=2020)
imp.fit(X, y)
eli5.show_weights(imp, feature_names=feats)



Weight,Feature
0.6460  ± 0.0114,param_rok-produkcji
0.3067  ± 0.0023,param_moc
0.0613  ± 0.0015,param_pojemność-skokowa
0.0231  ± 0.0009,param_marka-pojazdu__cat
0.0174  ± 0.0006,param_napęd__cat
0.0164  ± 0.0012,seller_name__cat
0.0132  ± 0.0008,param_wersja__cat
0.0117  ± 0.0001,param_stan__cat
0.0108  ± 0.0004,param_typ__cat
0.0106  ± 0.0005,param_kod-silnika__cat


In [0]:
!git config --global user.email "m.zajac1988@gmail.com"
!git config --global user.name "Mateusz"

In [182]:
!git add matrix_two/day4_xgboost.ipynb

fatal: pathspec 'matrix_two/day4_xgboost.ipynb' did not match any files


In [66]:
!git commit -m 'Run XGBoost model'

[master c88fce7] Run XGBoost model
 1 file changed, 1 insertion(+)
 create mode 100644 matrix_two/day4_xgboost.ipynb


In [67]:
!git push -u origin master

Counting objects: 1   Counting objects: 4, done.
Delta compression using up to 2 threads.
Compressing objects:  25% (1/4)   Compressing objects:  50% (2/4)   Compressing objects:  75% (3/4)   Compressing objects: 100% (4/4)   Compressing objects: 100% (4/4), done.
Writing objects:  25% (1/4)   Writing objects:  50% (2/4)   Writing objects:  75% (3/4)   Writing objects: 100% (4/4)   Writing objects: 100% (4/4), 7.77 KiB | 1.94 MiB/s, done.
Total 4 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/mattzajac/dw_matrix.git
   6f8c05d..c88fce7  master -> master
Branch 'master' set up to track remote branch 'master' from 'origin'.
