# extra_3.3.4_weitere_ensemble_regressoren.ipynb

In [2]:
## for scikit-learn 1.4.2, to silence warnings regarding physical cores
import os
os.environ['LOKY_MAX_CPU_COUNT'] = '4' ## depending on the hardware used

In [3]:
## prepare environment and data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

datapath = '../3_data'
from os import chdir; chdir(datapath)

from bfh_cas_pml import prep_data, prep_demo_data
X_train, X_test, y_train, y_test = prep_data('melb_data_prep.csv', 'Price', seed = 1234)

from bfh_cas_pml import test_regression_model

names = []
scores = []

**AdaBoostRegressor**

In [5]:
from sklearn.ensemble import AdaBoostRegressor
this_model = test_regression_model(
    AdaBoostRegressor(random_state=1234), 
    X_train, y_train, X_test, y_test,
    show_plot=False)
names.append(this_model.__class__.__name__)
scores.append(this_model.score(X_test, y_test))

R2 = -0.3023


**GradientBoostingRegressor**

In [7]:
from sklearn.ensemble import GradientBoostingRegressor
this_model = test_regression_model(
    GradientBoostingRegressor(random_state=1234), 
    X_train, y_train, X_test, y_test,
    show_plot=False)
names.append(this_model.__class__.__name__)
scores.append(this_model.score(X_test, y_test))

R2 = 0.7250


**HistGradientBoostingRegressor**

"This estimator is much faster than GradientBoostingRegressor for big datasets (n_samples >= 10 000)." [https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor]

In [9]:
from sklearn.ensemble import HistGradientBoostingRegressor
this_model = test_regression_model(
    HistGradientBoostingRegressor(), 
    X_train, y_train, X_test, y_test,
    show_plot=False)
names.append(this_model.__class__.__name__)
scores.append(this_model.score(X_test, y_test))

R2 = 0.7878


**CatBoostRegressor**

In [11]:
from catboost import CatBoostRegressor
this_model = test_regression_model(
    CatBoostRegressor(logging_level='Silent'), 
    X_train, y_train, X_test, y_test,
    show_plot=False)
names.append(this_model.__class__.__name__)
scores.append(this_model.score(X_test, y_test))

R2 = 0.8003


**LGBMRegressor**

In [13]:
from lightgbm import LGBMRegressor
this_model = test_regression_model(
    LGBMRegressor(), 
    X_train, y_train, X_test, y_test,
    show_plot=False)
names.append(this_model.__class__.__name__)
scores.append(this_model.score(X_test, y_test))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1630
[LightGBM] [Info] Number of data points in the train set: 12262, number of used features: 23
[LightGBM] [Info] Start training from score 1055902.695237
R2 = 0.7882


In [14]:
## synthesis
print(pd.DataFrame({'models': names, 'scores': scores}))

                          models    scores
0              AdaBoostRegressor -0.302314
1      GradientBoostingRegressor  0.724983
2  HistGradientBoostingRegressor  0.787850
3              CatBoostRegressor  0.800349
4                  LGBMRegressor  0.788166
