In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder


from tqdm.notebook import tqdm

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier

# ustawia domyślną wielkość wykresów
plt.rcParams['figure.figsize'] = (12,8)
# to samo tylko dla tekstu
plt.rcParams['font.size'] = 16

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


In [2]:
train = pd.read_csv('E:/data/car_prices_poland/Car_Prices_Poland.csv')

## Obróbka danych

In [3]:
train.isnull().sum()

Unnamed: 0             0
mark                   0
model                  0
generation_name    30085
year                   0
mileage                0
vol_engine             0
fuel                   0
city                   0
province               0
price                  0
dtype: int64

In [4]:
train = train.drop(columns=['generation_name', 'Unnamed: 0'])

In [5]:
ord_enc = OrdinalEncoder()

train["mark"] = ord_enc.fit_transform(train[["mark"]])
train["model"] = ord_enc.fit_transform(train[["model"]])
train["fuel"] = ord_enc.fit_transform(train[["fuel"]])
train["province"] = ord_enc.fit_transform(train[["province"]])
train["city"] = ord_enc.fit_transform(train[["city"]])


## Podział danych

In [6]:
y = train['price']
X = train.drop("price", axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=420, train_size=0.66)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, random_state=420, train_size=0.66)

In [7]:
X_train

Unnamed: 0,mark,model,year,mileage,vol_engine,fuel,city,province
61271,11.0,109.0,2020,26553,1950,1.0,4290.0,20.0
68673,17.0,249.0,2010,259985,1900,1.0,425.0,3.0
103703,14.0,200.0,2013,150000,1386,3.0,3770.0,6.0
22973,1.0,226.0,2022,10,2967,1.0,2882.0,6.0
70720,20.0,51.0,2006,373000,2231,1.0,4154.0,20.0
...,...,...,...,...,...,...,...,...
117363,22.0,321.0,2020,25600,1969,1.0,2426.0,16.0
62239,11.0,181.0,2007,253000,3996,1.0,3819.0,20.0
21055,1.0,32.0,2012,143000,2967,1.0,3693.0,18.0
62536,11.0,306.0,2019,54800,1950,1.0,3132.0,6.0


In [8]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

print(X_val.shape)
print(y_val.shape)

(77831, 8)
(77831,)
(26463, 8)
(26463,)
(13633, 8)
(13633,)


## Random Forest

In [9]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train).score(X_test, y_test)

0.938151970859524

In [10]:
#przetestujmy jak działa to dla różnych parametrów
scores = []
best = [0,0,0,0]
for i in [1, 2, 4, 8, 16, 32, 64, 128]:
    for j in  [1, 2, 4, 8, 16, 32, 64, 128]:
        for k in  [2, 4, 8, 16, 32, 64, 128]: 
            rf = RandomForestRegressor(max_depth = i, n_estimators = j, max_leaf_nodes = k)
            score = rf.fit(X_train, y_train).score(X_test, y_test)
            print (f"max_depth: {i},  n_estimators: {j}, max_leaf_nodes: {k}, score: {score}")
            if (score > best[3]):
                best = [i, j, k, score]
            scores.append(score)

max_depth: 1,  n_estimators: 1, max_leaf_nodes: 2, score: 0.35043999894348465
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 4, score: 0.3504347288686881
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 8, score: 0.35043692779555735
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 16, score: 0.3504408666819372
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 32, score: 0.35039090475413615
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 64, score: 0.3503853513512881
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 128, score: 0.3503054557507309
max_depth: 1,  n_estimators: 2, max_leaf_nodes: 2, score: 0.3503651850645134
max_depth: 1,  n_estimators: 2, max_leaf_nodes: 4, score: 0.35042425102211927
max_depth: 1,  n_estimators: 2, max_leaf_nodes: 8, score: 0.3503992761093082
max_depth: 1,  n_estimators: 2, max_leaf_nodes: 16, score: 0.3503734481949651
max_depth: 1,  n_estimators: 2, max_leaf_nodes: 32, score: 0.3503667745370046
max_depth: 1,  n_estimators: 2, max_leaf_nodes: 64, score: 0.3504

max_depth: 2,  n_estimators: 128, max_leaf_nodes: 2, score: 0.3612999664119544
max_depth: 2,  n_estimators: 128, max_leaf_nodes: 4, score: 0.6213132552910625
max_depth: 2,  n_estimators: 128, max_leaf_nodes: 8, score: 0.6181347488148418
max_depth: 2,  n_estimators: 128, max_leaf_nodes: 16, score: 0.618077300335558
max_depth: 2,  n_estimators: 128, max_leaf_nodes: 32, score: 0.6197504525373505
max_depth: 2,  n_estimators: 128, max_leaf_nodes: 64, score: 0.6204937662689891
max_depth: 2,  n_estimators: 128, max_leaf_nodes: 128, score: 0.6188995606266484
max_depth: 4,  n_estimators: 1, max_leaf_nodes: 2, score: 0.35043795326374116
max_depth: 4,  n_estimators: 1, max_leaf_nodes: 4, score: 0.6138476134050137
max_depth: 4,  n_estimators: 1, max_leaf_nodes: 8, score: 0.7384303360410036
max_depth: 4,  n_estimators: 1, max_leaf_nodes: 16, score: 0.7912894750020094
max_depth: 4,  n_estimators: 1, max_leaf_nodes: 32, score: 0.7893402390683744
max_depth: 4,  n_estimators: 1, max_leaf_nodes: 64, sco

max_depth: 8,  n_estimators: 64, max_leaf_nodes: 2, score: 0.358537912925589
max_depth: 8,  n_estimators: 64, max_leaf_nodes: 4, score: 0.620554570190866
max_depth: 8,  n_estimators: 64, max_leaf_nodes: 8, score: 0.7441176656885928
max_depth: 8,  n_estimators: 64, max_leaf_nodes: 16, score: 0.7969651570144232
max_depth: 8,  n_estimators: 64, max_leaf_nodes: 32, score: 0.8423066280692362
max_depth: 8,  n_estimators: 64, max_leaf_nodes: 64, score: 0.8681380702007909
max_depth: 8,  n_estimators: 64, max_leaf_nodes: 128, score: 0.8879941725024839
max_depth: 8,  n_estimators: 128, max_leaf_nodes: 2, score: 0.3575493618966751
max_depth: 8,  n_estimators: 128, max_leaf_nodes: 4, score: 0.6284270330485803
max_depth: 8,  n_estimators: 128, max_leaf_nodes: 8, score: 0.7413030592216299
max_depth: 8,  n_estimators: 128, max_leaf_nodes: 16, score: 0.7958628973805946
max_depth: 8,  n_estimators: 128, max_leaf_nodes: 32, score: 0.8433366828167606
max_depth: 8,  n_estimators: 128, max_leaf_nodes: 64, 

max_depth: 32,  n_estimators: 16, max_leaf_nodes: 128, score: 0.8937138871688001
max_depth: 32,  n_estimators: 32, max_leaf_nodes: 2, score: 0.35455497951154225
max_depth: 32,  n_estimators: 32, max_leaf_nodes: 4, score: 0.6267989895503918
max_depth: 32,  n_estimators: 32, max_leaf_nodes: 8, score: 0.7442781667900598
max_depth: 32,  n_estimators: 32, max_leaf_nodes: 16, score: 0.7972044207034389
max_depth: 32,  n_estimators: 32, max_leaf_nodes: 32, score: 0.840938394360545
max_depth: 32,  n_estimators: 32, max_leaf_nodes: 64, score: 0.8703490973121117
max_depth: 32,  n_estimators: 32, max_leaf_nodes: 128, score: 0.8958510283777045
max_depth: 32,  n_estimators: 64, max_leaf_nodes: 2, score: 0.36035449919370954
max_depth: 32,  n_estimators: 64, max_leaf_nodes: 4, score: 0.6298379728985215
max_depth: 32,  n_estimators: 64, max_leaf_nodes: 8, score: 0.7411989080932833
max_depth: 32,  n_estimators: 64, max_leaf_nodes: 16, score: 0.7951288980787273
max_depth: 32,  n_estimators: 64, max_leaf_

max_depth: 128,  n_estimators: 8, max_leaf_nodes: 64, score: 0.8687377390240356
max_depth: 128,  n_estimators: 8, max_leaf_nodes: 128, score: 0.8931918080970112
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 2, score: 0.3504284398165587
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 4, score: 0.6270509761359528
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 8, score: 0.7362444408176614
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 16, score: 0.7942686144771138
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 32, score: 0.8429228782760155
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 64, score: 0.8705971197706746
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 128, score: 0.8945661232976826
max_depth: 128,  n_estimators: 32, max_leaf_nodes: 2, score: 0.36215513614400263
max_depth: 128,  n_estimators: 32, max_leaf_nodes: 4, score: 0.6137663780379319
max_depth: 128,  n_estimators: 32, max_leaf_nodes: 8, score: 0.7444694015994902
max_depth: 128,  n_estimators: 32

In [11]:
print (f"mediana: {np.median(scores)}")
print (f"srednia: {np.mean(scores)}")
print (f"najlepszy: max_depth: {best[0]}, n_estimators: {best[1]}, max_leaf_nodes: {best[2]}, score: {best[3]}")

mediana: 0.7387879403496593
srednia: 0.660447463668583
najlepszy: max_depth: 128, n_estimators: 128, max_leaf_nodes: 128, score: 0.8961917990594395


## XGBoost

In [12]:
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train, eval_metric="rmse", eval_set=[(X_val, y_val)])
xgb_reg.score(X_test, y_test)

[0]	validation_0-rmse:78978.28906
[1]	validation_0-rmse:59608.76953
[2]	validation_0-rmse:46810.71875
[3]	validation_0-rmse:38731.90625
[4]	validation_0-rmse:33747.10547
[5]	validation_0-rmse:30314.96875
[6]	validation_0-rmse:28424.15430
[7]	validation_0-rmse:27180.24609
[8]	validation_0-rmse:26231.63477
[9]	validation_0-rmse:25595.12305
[10]	validation_0-rmse:25271.23438
[11]	validation_0-rmse:25030.88672
[12]	validation_0-rmse:24915.65039
[13]	validation_0-rmse:24529.78516
[14]	validation_0-rmse:24393.43555
[15]	validation_0-rmse:24330.83203
[16]	validation_0-rmse:24275.04492
[17]	validation_0-rmse:23980.49219
[18]	validation_0-rmse:23853.24023
[19]	validation_0-rmse:23647.44727
[20]	validation_0-rmse:23517.19336
[21]	validation_0-rmse:23445.61523
[22]	validation_0-rmse:23114.34375
[23]	validation_0-rmse:22995.70117
[24]	validation_0-rmse:22901.15820
[25]	validation_0-rmse:22852.85156
[26]	validation_0-rmse:22843.56836
[27]	validation_0-rmse:22747.86133
[28]	validation_0-rmse:22629.4

0.9380997897184561

In [13]:
#przetestujmy jak działa to dla różnych parametrów
scores = []
best = [0,0,0,0,0]
for i in [1, 4, 16, 64, 128]:
    for j in  [1, 4, 16, 64, 128]:
        for k in  [4, 16, 64, 128]:
            for l in [1, 4, 16, 64, 128]:
                
                xgb_reg = xgb.XGBRegressor(objective="reg:linear", 
                                           seed = 42, 
                                           use_label_encoder=False, 
                                           max_depth = i, 
                                           n_estimators = j, 
                                           max_leaf_nodes = k, 
                                           max_leaves = l, 
                                           verbocity = 0,
                                           silent = True,
                                           verbosity = 0,
                                           verbose=0)

                score = xgb_reg.fit(X_train, y_train, 
                                    verbose=False, 
                                    early_stopping_rounds=10, 
                                    eval_metric="rmse", eval_set=[(X_val, y_val)]).score(X_test, y_test)
                                                                  
                print (f"max_depth: {i},  n_estimators: {j}, max_leaf_nodes: {k}, max_leaves: {l}, score: {score}")
                if (score > best[4]):
                    best = [i, j, k, l, score]
                scores.append(score)

max_depth: 1,  n_estimators: 1, max_leaf_nodes: 4, max_leaves: 1, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 4, max_leaves: 4, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 4, max_leaves: 16, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 4, max_leaves: 64, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 4, max_leaves: 128, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 16, max_leaves: 1, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 16, max_leaves: 4, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 16, max_leaves: 16, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 16, max_leaves: 64, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes: 16, max_leaves: 128, score: -0.15225544970847582
max_depth: 1,  n_estimators: 1, max_leaf_nodes:

max_depth: 1,  n_estimators: 128, max_leaf_nodes: 16, max_leaves: 16, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 16, max_leaves: 64, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 16, max_leaves: 128, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 64, max_leaves: 1, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 64, max_leaves: 4, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 64, max_leaves: 16, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 64, max_leaves: 64, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 64, max_leaves: 128, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 128, max_leaves: 1, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_leaf_nodes: 128, max_leaves: 4, score: 0.7047183227116642
max_depth: 1,  n_estimators: 128, max_le

max_depth: 4,  n_estimators: 64, max_leaf_nodes: 64, max_leaves: 128, score: 0.913534475751984
max_depth: 4,  n_estimators: 64, max_leaf_nodes: 128, max_leaves: 1, score: 0.913534475751984
max_depth: 4,  n_estimators: 64, max_leaf_nodes: 128, max_leaves: 4, score: 0.913534475751984
max_depth: 4,  n_estimators: 64, max_leaf_nodes: 128, max_leaves: 16, score: 0.913534475751984
max_depth: 4,  n_estimators: 64, max_leaf_nodes: 128, max_leaves: 64, score: 0.913534475751984
max_depth: 4,  n_estimators: 64, max_leaf_nodes: 128, max_leaves: 128, score: 0.913534475751984
max_depth: 4,  n_estimators: 128, max_leaf_nodes: 4, max_leaves: 1, score: 0.924646078077104
max_depth: 4,  n_estimators: 128, max_leaf_nodes: 4, max_leaves: 4, score: 0.924646078077104
max_depth: 4,  n_estimators: 128, max_leaf_nodes: 4, max_leaves: 16, score: 0.924646078077104
max_depth: 4,  n_estimators: 128, max_leaf_nodes: 4, max_leaves: 64, score: 0.924646078077104
max_depth: 4,  n_estimators: 128, max_leaf_nodes: 4, max_

max_depth: 16,  n_estimators: 64, max_leaf_nodes: 4, max_leaves: 1, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 4, max_leaves: 4, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 4, max_leaves: 16, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 4, max_leaves: 64, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 4, max_leaves: 128, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 16, max_leaves: 1, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 16, max_leaves: 4, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 16, max_leaves: 16, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 16, max_leaves: 64, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_nodes: 16, max_leaves: 128, score: 0.9374639224278547
max_depth: 16,  n_estimators: 64, max_leaf_node

max_depth: 64,  n_estimators: 16, max_leaf_nodes: 16, max_leaves: 4, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 16, max_leaves: 16, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 16, max_leaves: 64, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 16, max_leaves: 128, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 64, max_leaves: 1, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 64, max_leaves: 4, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 64, max_leaves: 16, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 64, max_leaves: 64, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 64, max_leaves: 128, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_leaf_nodes: 128, max_leaves: 1, score: 0.9362602048383744
max_depth: 64,  n_estimators: 16, max_lea

max_depth: 128,  n_estimators: 4, max_leaf_nodes: 64, max_leaves: 16, score: 0.8207589975474425
max_depth: 128,  n_estimators: 4, max_leaf_nodes: 64, max_leaves: 64, score: 0.8207589975474425
max_depth: 128,  n_estimators: 4, max_leaf_nodes: 64, max_leaves: 128, score: 0.8207589975474425
max_depth: 128,  n_estimators: 4, max_leaf_nodes: 128, max_leaves: 1, score: 0.8207589975474425
max_depth: 128,  n_estimators: 4, max_leaf_nodes: 128, max_leaves: 4, score: 0.8207589975474425
max_depth: 128,  n_estimators: 4, max_leaf_nodes: 128, max_leaves: 16, score: 0.8207589975474425
max_depth: 128,  n_estimators: 4, max_leaf_nodes: 128, max_leaves: 64, score: 0.8207589975474425
max_depth: 128,  n_estimators: 4, max_leaf_nodes: 128, max_leaves: 128, score: 0.8207589975474425
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 4, max_leaves: 1, score: 0.9362602048383744
max_depth: 128,  n_estimators: 16, max_leaf_nodes: 4, max_leaves: 4, score: 0.9362602048383744
max_depth: 128,  n_estimators: 16, ma

In [14]:
print (f"mediana: {np.median(scores)}")
print (f"srednia: {np.mean(scores)}")
print (f"najlepszy: max_depth: {best[0]}, n_estimators: {best[1]}, max_leaf_nodes: {best[2]}, max_leaves: {best[3]}, score: {best[4]}")

mediana: 0.8210226567852289
srednia: 0.6806235611224338
najlepszy: max_depth: 16, n_estimators: 16, max_leaf_nodes: 4, max_leaves: 1, score: 0.9375138581886048


## Wnioski 
* Z XGBoost jest równie dobry (różnica na poziomie 0.001 na korzyść RF) z parametrami domyślnymi jak Random Forest (także z domyślnymi).

* XGBoost okazuje się być średnio (średnia i mediana) lepszy niż Random Forest.

* W przypadku obu modeli dobranie większych parametrów pozytywnie wpływa na score.

* Im więcej estymatorów oraz im większa głębokość, tym wolniej się trenuje (oba modele).

* Najlepsze u mnie okazały się parametry:
    * RF: max_depth: 128, n_estimators: 128, max_leaf_nodes: 128 || (score: 0.8961917990594395)
    * XGB: max_depth: 16, n_estimators: 16, max_leaf_nodes: 4, max_leaves: 1 || (score: 0.9375138581886048)