In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import phik
from phik.report import plot_correlation_matrix
from phik import report

import shap
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from typing import Literal
import os
from IPython.display import display, Markdown

In [8]:
cpu_count = os.cpu_count() - 1

### Подготовка данных

In [10]:
cols_to_drop = ['car_id', 'target_class', 'deviation_normal_count']
cols_cats = ['model', 'car_type', 'fuel_type']

def prepare_data(df: pd.DataFrame, cols_to_drop: list() = [], cols_cats: list() = [], type_df: Literal['train', 'test'] = 'train'):
    cols_to_drop = cols_to_drop
    cols_cats = cols_cats
    df_copy = df.copy()
    df_copy.drop(columns=cols_to_drop, inplace=True)
    
    le = LabelEncoder()
    for col in cols_cats:
        df_copy[col] = le.fit_transform(df_copy[col])
    
    df_copy[cols_cats] = df_copy[cols_cats].astype('category')
    if type_df == 'test':
        return df_copy
    X = df_copy.drop('target_reg', axis=1)
    y = df_copy.target_reg
    return X, y

In [12]:
df_train = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_train.csv')
df_train.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,109.99,another_bug,4.737759,12141310.0,0.1,180.855726,0.023174,174,170
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,34.48,electro_bug,4.480517,18039090.0,0.0,187.862734,12.306011,174,174
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,34.93,gear_stick,4.768391,15883660.0,0.1,102.382857,2.513319,174,173
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,32.22,engine_fuel,3.88092,16518830.0,0.1,172.793237,-5.029476,174,170
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,27.51,engine_fuel,4.181149,13983170.0,0.1,203.462289,-14.260456,174,171


In [14]:
df_test = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_test.csv')
df_test.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,P17494612l,Skoda Rapid,economy,petrol,4.8,2013,42269,2019,gear_stick,3.746207,14075390.0,0.1,195.454152,10.56622,174,170
1,N-1530212S,Renault Sandero,standart,petrol,4.32,2015,90014,2016,engine_overheat,4.318966,19703900.0,0.0,181.538685,11.807941,174,174
2,B-1154399t,Smart ForTwo,economy,petrol,4.46,2015,82684,2017,electro_bug,5.134655,9314946.0,0.1,118.440645,14.862538,174,172
3,F12725233R,Smart ForFour,economy,petrol,2.8,2014,68833,2021,engine_check,4.617356,9336838.0,0.83,112.829785,20.088904,174,172
4,l-1139189J,Skoda Rapid,economy,petrol,6.56,2013,42442,2021,another_bug,4.287471,11962500.0,0.0,187.846088,3.69846,174,172


In [16]:
X, y = prepare_data(df_train, cols_to_drop, cols_cats)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

### <center> **Catboost**

#### Обучение CatBoost с параметрами по умолчанию

In [None]:
cat_model_default = CatBoostRegressor(thread_count=-1,
                                      random_seed=42,
                                      cat_features=cols_cats)
cat_model_default.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    verbose=200,
    use_best_model=True,
    plot=False,
    early_stopping_rounds=100
)

Learning rate set to 0.056174
0:	learn: 17.0065561	test: 17.7387430	best: 17.7387430 (0)	total: 56.2ms	remaining: 56.1s
200:	learn: 9.3813500	test: 12.2560321	best: 12.1641768 (128)	total: 235ms	remaining: 933ms
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12.16417676
bestIteration = 128

Shrink model to first 129 iterations.


<catboost.core.CatBoostRegressor at 0x78174e9c3e00>

In [15]:
display(Markdown(f"#### Значение `RMSE` с default параметрами CatBoost: {round(cat_model_default.best_score_['validation']['RMSE'], 2)}"))

#### Значение `RMSE` с default параметрами CatBoost: 12.16

### Подбор гиперпараметров
#### Используем встроенный **grid_search** Catboost

In [117]:
param_grid = {
    'learning_rate': [.01, .03, .04],
    'max_bin': [40, 50, 60], 
}
model_cat = CatBoostRegressor(
    random_seed=42,
    cat_features=cols_cats,
    thread_count=cpu_count,
    verbose=False,
)

rand_search_res = model_cat.grid_search(param_grid, X, y, verbose=False, plot=False,)
print(model_cat.best_score_)

params_best_cat = rand_search_res['params']
display(Markdown(f"<br> #### Выбранные гиперпараметры для `CatBoost`:<br> `{params_best_cat}`"))


bestTest = 10.73227337
bestIteration = 999


bestTest = 10.7365755
bestIteration = 323


bestTest = 10.71436759
bestIteration = 239


bestTest = 10.7808992
bestIteration = 936


bestTest = 10.80901949
bestIteration = 337


bestTest = 10.81892195
bestIteration = 299


bestTest = 10.72167277
bestIteration = 970


bestTest = 10.7488679
bestIteration = 325


bestTest = 10.76114647
bestIteration = 242

Training on fold [0/3]

bestTest = 11.7431856
bestIteration = 208

Training on fold [1/3]

bestTest = 11.7577536
bestIteration = 238

Training on fold [2/3]

bestTest = 11.3622793
bestIteration = 246

{'learn': {'RMSE': 7.230686518563849}}


#### Выбранные гиперпараметры для `CatBoost`:<br> `{'border_count': 40, 'learning_rate': 0.04}`

#### Обучение по фолдам

In [188]:
def pull_best_model(estimator, X, y, kwargs_fit, *, n_splits = 3, random_state=42):
    n_splits = n_splits
    models = []
    scores = []
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = estimator
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            **kwargs_fit
        )
        if isinstance(model, CatBoostRegressor):
            score = model.best_score_['validation']['RMSE']
        elif isinstance(model, lgb.LGBMRegressor):
            score = model.best_score_['valid_0']['rmse']
        elif isinstance(model, xgb.XGBRegressor):
            score = model.best_score
        else:
            score = 0
        scores.append(score)
        models.append(model)

    best_model = models[np.argmin(scores)]
    return best_model, min(scores)

In [196]:
model_cat = CatBoostRegressor(
    **params_best_cat,
    iterations=2000,
    cat_features=cols_cats,
    random_seed=7575,
    early_stopping_rounds=100,)

cat_fit = dict(
    verbose=200,
    use_best_model=True,
    plot=False,
    early_stopping_rounds=100
)

b_m, rmse = pull_best_model(model_cat, X, y, cat_fit)
rmse

0:	learn: 17.5448536	test: 16.9309868	best: 16.9309868 (0)	total: 5.34ms	remaining: 10.7s
200:	learn: 9.8768638	test: 12.0412599	best: 12.0132694 (113)	total: 205ms	remaining: 1.84s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12.01326945
bestIteration = 113

Shrink model to first 114 iterations.
0:	learn: 16.8713527	test: 18.1447932	best: 18.1447932 (0)	total: 1.4ms	remaining: 2.79s
200:	learn: 10.0181547	test: 11.8354254	best: 11.8354254 (200)	total: 165ms	remaining: 1.48s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.82282234
bestIteration = 279

Shrink model to first 280 iterations.
0:	learn: 17.5396387	test: 16.9023884	best: 16.9023884 (0)	total: 885us	remaining: 1.77s
200:	learn: 10.2551450	test: 11.4095143	best: 11.4095143 (200)	total: 113ms	remaining: 1.01s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.33389497
bestIteration = 296

Shrink model to first 297 iterations.


11.333894966792219

In [158]:
n_splits = 3
models = []
scores = []

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    train_dataset = Pool(data=X_train, label=y_train, cat_features=cols_cats)
    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cols_cats)

    model_cat = CatBoostRegressor(
        **params_best_cat,
        iterations=2000,
        cat_features=cols_cats,
        random_seed=7575,
        early_stopping_rounds=100,)
    
    models.append(model_cat)
    
    model_cat.fit(
        train_dataset,
        eval_set=eval_dataset,
        verbose=500,
        use_best_model=True,
        plot=False
    )
    scores.append(model_cat.best_score_['validation']['RMSE'])

0:	learn: 17.5448536	test: 16.9309868	best: 16.9309868 (0)	total: 2.35ms	remaining: 4.69s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12.01326945
bestIteration = 113

Shrink model to first 114 iterations.
0:	learn: 16.8713527	test: 18.1447932	best: 18.1447932 (0)	total: 841us	remaining: 1.68s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.82282234
bestIteration = 279

Shrink model to first 280 iterations.
0:	learn: 17.5396387	test: 16.9023884	best: 16.9023884 (0)	total: 838us	remaining: 1.68s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.33389497
bestIteration = 296

Shrink model to first 297 iterations.


In [124]:
display(Markdown(f"#### Лучший ``CatBoost RMSE`` по {n_splits} фолдам: `{round(min(scores), 2)}`"))

#### Лучший ``CatBoost RMSE`` по 3 фолдам: `11.72`

Сохраним лучшую модель `CatBoost`

In [22]:
best_model_cat = models[np.argmin(scores)]

### <center> **LightGBM**

Посмотрим `RMSE` для **GOSS** `baseline` модели

In [87]:
model_lgb = lgb.LGBMRegressor(
    boosting_type='goss',
    objective='regression',
    random_state=42,
    cat_feature=[0, 1, 2],
    verbose=-1)

model_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    categorical_feature=[0, 1, 2],
    callbacks=[lgb.early_stopping(10)]
)

display(Markdown(f"#### <br>`LightGBM` `RMSE` по дефолту: {round(model_lgb.best_score_['valid_0']['rmse'], 3)}"))

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[41]	valid_0's rmse: 12.0408	valid_0's l2: 144.982


#### <br>`LightGBM` `RMSE` по дефолту: 12.041

#### <center> Выполним поиск гиперпараметров **`LightGBV`**

In [85]:
param_lgb = {
    'learning_rate': [.05, ],
    'max_bin': [40, 50, 60],
    'top_rate': [.2, .25, .3],
    'other_rate': [.1, .2, .25]
}

cv_kf = KFold(n_splits=5, random_state=42, shuffle=True)
estimator_lgb = lgb.LGBMRegressor(
    boosting_type='gbdt',
    data_sample_strategy='goss',
    random_state=42,
    force_row_wise=True,
    verbose=-1,
)

model_rand_lgb = GridSearchCV(
    estimator_lgb,
    param_lgb,
    cv=cv_kf,
    scoring='neg_root_mean_squared_error',
    n_jobs=cpu_count,
    verbose=0,
)

model_rand_lgb.fit(X, y)

params_best_lgb = model_rand_lgb.best_params_
display(Markdown(f"#### Выбранные гиперпараметры для `LightGBM`: `{params_best_lgb}` <br> Best `RMSE`: {round(abs(model_rand_lgb.best_score_), 3)}"))

#### Выбранные гиперпараметры для `LightGBM`: `{'learning_rate': 0.05, 'max_bin': 40, 'other_rate': 0.2, 'top_rate': 0.25}` <br> Best `RMSE`: 11.758

#### <center> Обучение по фолдам для `LightGBM`

In [115]:
n_splits = 3
models = []
scores = []

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = lgb.LGBMRegressor(
        **params_best_lgb,
        boosting_type='gbdt',
        data_sample_strategy='goss',
        cat_feature=[0, 1, 2],
        random_state=42,
        force_row_wise=True,
        n_jobs=cpu_count,
        verbose=-1
    )
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(10)],
        categorical_feature=[0, 1, 2],
    )
    models.append(model)
    scores.append(model.best_score_['valid_0']['rmse'])

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[74]	valid_0's rmse: 12.2875	valid_0's l2: 150.982
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[97]	valid_0's rmse: 12.0945	valid_0's l2: 146.277
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[71]	valid_0's rmse: 11.7204	valid_0's l2: 137.368


In [122]:
display(Markdown(f"#### Лучший `LightGBM` `RMSE` по {n_splits} фолдам: `{round(min(scores), 3)}`"))

#### Лучший `LightGBM` `RMSE` по 3 фолдам: `11.72`

In [126]:
best_model_lgb = models[np.argmin(scores)]

### <center> **XGBoost**

Построим базовую модель XGBoost

In [23]:
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='dart',
    enable_categorical=True,
    random_state=42,
    nthread=cpu_count,
    early_stopping_rounds=10,
)
model_xgb.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=3
)

display(Markdown(f"#### <br>`XGBoost` `RMSE` по дефолту: {round(model_xgb.best_score, 3)}"))

[0]	validation_0-rmse:15.73818
[3]	validation_0-rmse:13.04271
[6]	validation_0-rmse:12.64687
[9]	validation_0-rmse:12.65160
[12]	validation_0-rmse:12.67780
[15]	validation_0-rmse:12.75195
[17]	validation_0-rmse:12.83047


#### <br>`XGBoost` `RMSE` по дефолту: 12.621

#### <center> Настройка гиперпараметров **`XGBoost`**

In [100]:
param_grid ={
    'eta': np.arange(.04, .06, .01),
    'sample_type': ['weighted'], #['uniform', 'weighted'],
    'normalize_type': ['tree'], #['tree', 'forest'],
    'rate_drop': np.arange(.0, .4, .1),
    'skip_drop': np.arange(.0,.4, .1),
    #'min_child_weight': [1, 2]
    'subsample': [.9],
    #'colsample_bytree': [.9]
}

cv_kf = KFold(n_splits=5, random_state=42, shuffle=True)

estimator_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='dart',
    enable_categorical=True,
    random_state=42,
    n_estimators=100
)

model_tune_xgb = GridSearchCV(
    estimator_xgb,
    param_grid,
    cv=cv_kf,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=cpu_count,
)

model_tune_xgb.fit(X, y)

params_best_xgb = model_tune_xgb.best_params_
display(Markdown(f"#### Выбранные гиперпараметры для `XGBoost`: <br> `{params_best_xgb}` <br> Best `RMSE`: {round(abs(model_tune_xgb.best_score_), 3)}"))

Fitting 5 folds for each of 32 candidates, totalling 160 fits


#### Выбранные гиперпараметры для `XGBoost`: <br> `{'eta': 0.04, 'normalize_type': 'tree', 'rate_drop': 0.0, 'sample_type': 'weighted', 'skip_drop': 0.30000000000000004, 'subsample': 0.9}` <br> Best `RMSE`: 11.723

In [102]:
train_set = xgb.DMatrix(X_train, label=y_train, enable_categorical=True, nthread=cpu_count)
test_set = xgb.DMatrix(X_test, label=y_test, enable_categorical=True, nthread=cpu_count)

params ={'objective':'reg:squarederror', 'booster': 'dart', 'seed': 42}
params_update = params.update(params_best_xgb)

model = xgb.train(params_update, 
                  dtrain=train_set, 
                  num_boost_round=100, 
                  evals=[(train_set, 'dtrain'), (test_set, 'dtest')],
                  early_stopping_rounds=10,
                  verbose_eval=5)

[0]	dtrain-rmse:14.51967	dtest-rmse:15.73818
[5]	dtrain-rmse:9.49716	dtest-rmse:12.65256
[10]	dtrain-rmse:7.94051	dtest-rmse:12.71816
[15]	dtrain-rmse:7.23939	dtest-rmse:12.75195
[17]	dtrain-rmse:6.69989	dtest-rmse:12.83047


#### <center> Обучение по фолдам для `XGBoost`

In [114]:
n_splits = 3
models = []
scores = []

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = xgb.XGBRegressor(
        **params_best_xgb,
        objective='reg:squarederror',
        booster='dart',
        enable_categorical=True,
        random_state=42,
        n_jobs=cpu_count,
        early_stopping_rounds=10,
    )
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=0
    )
    models.append(model)
    scores.append(model.best_score)

print(scores)
display(Markdown(f"#### Лучший `XGBoost` `RMSE` по {n_splits} фолдам: `{round(min(scores), 3)}`"))

best_model_xgb = models[np.argmin(scores)]

[12.13894015585641, 12.249269035927506, 11.91715643210969]


#### Лучший `XGBoost` `RMSE` по 3 фолдам: `11.917`