In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import phik
from phik.report import plot_correlation_matrix
from phik import report

import shap

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

In [20]:
from typing import Literal
from sklearn.model_selection import train_test_split
from IPython.display import display, Markdown

### Подготовка данных

In [3]:
# Columns handed to prepare_data() to be removed from the frame before modelling.
cols_to_drop = [
    'car_id',
    'target_class',
    'deviation_normal_count',
]

# Columns that are label-encoded and later passed to CatBoost as `cat_features`.
cols_cats = [
    'model',
    'car_type',
    'fuel_type',
]

def prepare_data(df: pd.DataFrame, cols_to_drop: list = None, cols_cats: list = None, type_df: Literal['train', 'test'] = 'train'):
    """Build a model-ready copy of ``df`` without modifying the input frame.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data. For ``type_df='train'`` it must contain a ``target_reg`` column.
    cols_to_drop : list, optional
        Column names to remove. ``None`` (the default) drops nothing.
    cols_cats : list, optional
        Column names to label-encode and cast to the pandas ``category`` dtype.
        ``None`` (the default) encodes nothing.
    type_df : {'train', 'test'}
        ``'train'`` splits the target off and returns ``(X, y)``;
        ``'test'`` returns the prepared frame as is.

    Returns
    -------
    tuple[pd.DataFrame, pd.Series] or pd.DataFrame
        ``(X, y)`` with ``y = target_reg`` for ``'train'``, otherwise the
        prepared frame.

    Notes
    -----
    A new ``LabelEncoder`` is fitted on every call, using only the values present
    in ``df``. The integer codes produced for two different frames therefore line
    up only if each encoded column contains the same set of values in both.
    """
    # None defaults replace the original mutable `[]` defaults.
    cols_to_drop = [] if cols_to_drop is None else list(cols_to_drop)
    cols_cats = [] if cols_cats is None else list(cols_cats)

    # Explicit copy so the column assignments below can never reach the caller's frame.
    prepared = df.drop(columns=cols_to_drop).copy()

    for col in cols_cats:
        # One fresh encoder per column; nothing is instantiated when there are no
        # categorical columns to encode.
        prepared[col] = LabelEncoder().fit_transform(prepared[col])

    if cols_cats:
        prepared[cols_cats] = prepared[cols_cats].astype('category')

    if type_df == 'test':
        return prepared

    X = prepared.drop('target_reg', axis=1)
    y = prepared.target_reg
    return X, y

In [4]:
# Training split of the quickstart dataset; it includes the `target_reg` column.
df_train = pd.read_csv(
    'https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_train.csv'
)
df_train.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,109.99,another_bug,4.737759,12141310.0,0.1,180.855726,0.023174,174,170
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,34.48,electro_bug,4.480517,18039090.0,0.0,187.862734,12.306011,174,174
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,34.93,gear_stick,4.768391,15883660.0,0.1,102.382857,2.513319,174,173
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,32.22,engine_fuel,3.88092,16518830.0,0.1,172.793237,-5.029476,174,170
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,27.51,engine_fuel,4.181149,13983170.0,0.1,203.462289,-14.260456,174,171


In [5]:
# Test split of the quickstart dataset; it has no `target_reg` column.
df_test = pd.read_csv(
    'https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_test.csv'
)
df_test.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,P17494612l,Skoda Rapid,economy,petrol,4.8,2013,42269,2019,gear_stick,3.746207,14075390.0,0.1,195.454152,10.56622,174,170
1,N-1530212S,Renault Sandero,standart,petrol,4.32,2015,90014,2016,engine_overheat,4.318966,19703900.0,0.0,181.538685,11.807941,174,174
2,B-1154399t,Smart ForTwo,economy,petrol,4.46,2015,82684,2017,electro_bug,5.134655,9314946.0,0.1,118.440645,14.862538,174,172
3,F12725233R,Smart ForFour,economy,petrol,2.8,2014,68833,2021,engine_check,4.617356,9336838.0,0.83,112.829785,20.088904,174,172
4,l-1139189J,Skoda Rapid,economy,petrol,6.56,2013,42442,2021,another_bug,4.287471,11962500.0,0.0,187.846088,3.69846,174,172


In [15]:
# Feature matrix and regression target built from the training frame.
X, y = prepare_data(
    df=df_train,
    cols_to_drop=cols_to_drop,
    cols_cats=cols_cats,
)

### **CatBoost**

#### Обучение CatBoost с параметрами по умолчанию

In [17]:
# Hold out 20% of the rows; the hold-out is used as the eval set for early stopping
# and is the set on which the reported RMSE is measured.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
)

# CatBoost with its default hyperparameters; only the seed, the thread count and
# the categorical feature list are set explicitly.
cat_model_default = CatBoostRegressor(
    cat_features=cols_cats,
    random_seed=42,
    thread_count=-1,
)

cat_model_default.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=100,
    use_best_model=True,
    verbose=200,
    plot=False,
)

Learning rate set to 0.056174
0:	learn: 17.0065561	test: 17.7387430	best: 17.7387430 (0)	total: 3.4ms	remaining: 3.39s
200:	learn: 9.3813500	test: 12.2560321	best: 12.1641768 (128)	total: 631ms	remaining: 2.51s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12.16417676
bestIteration = 128

Shrink model to first 129 iterations.


<catboost.core.CatBoostRegressor at 0x793cb9d835b0>

In [44]:
# Render the hold-out RMSE of the default model as a markdown heading.
display(Markdown(
    "#### Значение `RMSE` с default параметрами CatBoost: "
    f"{round(cat_model_default.best_score_['validation']['RMSE'], 2)}"
))

#### Значение `RMSE` с default параметрами CatBoost: 12.16

### Подбор гиперпараметров
#### CatBoost

In [27]:
# Exhaustive search over a small grid of learning rates and `max_bin` values.
param_grid = {
    'learning_rate': [.01, .03, .04],
    'max_bin': [40, 50, 60],
}
model_cat = CatBoostRegressor(random_seed=42, cat_features=cols_cats, thread_count=-1, verbose=False)
rand_search_res = model_cat.grid_search(param_grid, X, y, verbose=False, plot=False)

# After grid_search the model's best_score_ holds only the 'learn' metric, i.e. the
# error on the data it was trained on. It is not comparable with validation RMSE values.
print(model_cat.best_score_)

# Cross-validated test RMSE of the selected configuration (mean over folds, best
# iteration) is the number to compare against other models.
cv_test_rmse = rand_search_res['cv_results']['test-RMSE-mean']
print(f"CV test RMSE of the selected parameters: {min(cv_test_rmse):.4f}")

# The selected parameters are returned under 'params'; `max_bin` comes back
# under the name `border_count`.
params_best_cat = rand_search_res['params']
display(Markdown(f"Выбранные гиперпараметры для `CatBoost`: {params_best_cat}"))


bestTest = 10.73227337
bestIteration = 999


bestTest = 10.7365755
bestIteration = 323


bestTest = 10.71436759
bestIteration = 239


bestTest = 10.7808992
bestIteration = 936


bestTest = 10.80901949
bestIteration = 337


bestTest = 10.81892195
bestIteration = 299


bestTest = 10.72167277
bestIteration = 970


bestTest = 10.7488679
bestIteration = 325


bestTest = 10.76114647
bestIteration = 242

Training on fold [0/3]

bestTest = 11.7431856
bestIteration = 208

Training on fold [1/3]

bestTest = 11.7577536
bestIteration = 238

Training on fold [2/3]

bestTest = 11.3622793
bestIteration = 246

{'learn': {'RMSE': 7.230686518563849}}


{'border_count': 40, 'learning_rate': 0.04}

#### Обучение по фолдам

In [33]:
n_splits = 3
models = []  # fitted model of each fold
scores = []  # validation RMSE of each fold, same order as `models`

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# NOTE: the loop rebinds X_train / X_test / y_train / y_test, so once it finishes
# these names refer to the last fold and no longer to the earlier hold-out split.
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_dataset = Pool(data=X_train, label=y_train, cat_features=cols_cats)
    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cols_cats)

    # Hyperparameters selected by the grid search, with a larger iteration budget
    # that is cut short by early stopping on the fold's validation part.
    model_cat = CatBoostRegressor(
        **params_best_cat,
        iterations=2000,
        cat_features=cols_cats,
        random_seed=7575,
        early_stopping_rounds=100,
    )

    model_cat.fit(
        train_dataset,
        eval_set=eval_dataset,
        verbose=500,
        use_best_model=True,
        plot=False,
    )

    # Register the model only after a successful fit, so that models[i] and
    # scores[i] always describe the same fold even if a fit raises.
    models.append(model_cat)
    scores.append(model_cat.best_score_['validation']['RMSE'])

0:	learn: 17.5448536	test: 16.9309868	best: 16.9309868 (0)	total: 2.57ms	remaining: 5.14s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12.01326945
bestIteration = 113

Shrink model to first 114 iterations.
0:	learn: 16.8713527	test: 18.1447932	best: 18.1447932 (0)	total: 8.35ms	remaining: 16.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.82282234
bestIteration = 279

Shrink model to first 280 iterations.
0:	learn: 17.5396387	test: 16.9023884	best: 16.9023884 (0)	total: 5.1ms	remaining: 10.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.33389497
bestIteration = 296

Shrink model to first 297 iterations.


[12.013269447178565, 11.82282234183772, 11.333894966792219]

In [45]:
# Show the lowest per-fold validation RMSE (the minimum, not the mean over folds).
display(Markdown(
    f"#### Лучший ``RMSE`` по {n_splits} фолдам: "
    f"{round(min(scores), 2)}"
))

#### Лучший ``RMSE`` по 3 фолдам: 11.33

Сохраним лучшую модель `CatBoost`

In [36]:
# Keep the fold model whose validation RMSE is the lowest.
best_fold_idx = np.argmin(scores)
best_model_cat = models[best_fold_idx]

### **LightGBM**

Посмотрим `RMSE` для `baseline` модели

In [None]:
model_lgb = lgb.LGBMRegressor()