In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import phik
from phik.report import plot_correlation_matrix
from phik import report

import shap
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from typing import Literal
import os
from IPython.display import display, Markdown, HTML

In [7]:
cpu_count = os.cpu_count() - 1
n_splits = 3

### <center> Подготовка данных

In [8]:
cols_to_drop = ['car_id', 'target_class', 'deviation_normal_count']
cols_cats = ['model', 'car_type', 'fuel_type']

def prepare_data(df: pd.DataFrame, cols_to_drop: list() = [], cols_cats: list() = [], type_df: Literal['train', 'test'] = 'train'):
    cols_to_drop = cols_to_drop
    cols_cats = cols_cats
    df_copy = df.copy()
    df_copy.drop(columns=cols_to_drop, inplace=True)
    
    le = LabelEncoder()
    for col in cols_cats:
        df_copy[col] = le.fit_transform(df_copy[col])
    
    df_copy[cols_cats] = df_copy[cols_cats].astype('category')
    if type_df == 'test':
        return df_copy
    X = df_copy.drop('target_reg', axis=1)
    y = df_copy.target_reg
    return X, y

In [10]:
df_train = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_train.csv')
df_train.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,109.99,another_bug,4.737759,12141310.0,0.1,180.855726,0.023174,174,170
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,34.48,electro_bug,4.480517,18039090.0,0.0,187.862734,12.306011,174,174
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,34.93,gear_stick,4.768391,15883660.0,0.1,102.382857,2.513319,174,173
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,32.22,engine_fuel,3.88092,16518830.0,0.1,172.793237,-5.029476,174,170
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,27.51,engine_fuel,4.181149,13983170.0,0.1,203.462289,-14.260456,174,171


In [11]:
df_test = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_test.csv')
df_test.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,P17494612l,Skoda Rapid,economy,petrol,4.8,2013,42269,2019,gear_stick,3.746207,14075390.0,0.1,195.454152,10.56622,174,170
1,N-1530212S,Renault Sandero,standart,petrol,4.32,2015,90014,2016,engine_overheat,4.318966,19703900.0,0.0,181.538685,11.807941,174,174
2,B-1154399t,Smart ForTwo,economy,petrol,4.46,2015,82684,2017,electro_bug,5.134655,9314946.0,0.1,118.440645,14.862538,174,172
3,F12725233R,Smart ForFour,economy,petrol,2.8,2014,68833,2021,engine_check,4.617356,9336838.0,0.83,112.829785,20.088904,174,172
4,l-1139189J,Skoda Rapid,economy,petrol,6.56,2013,42442,2021,another_bug,4.287471,11962500.0,0.0,187.846088,3.69846,174,172


In [12]:
X, y = prepare_data(df_train, cols_to_drop, cols_cats)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

> Сделаем корреляцию фич используя пакет `Phik`, чтобы найти фичи с нулевой корреляцией

In [13]:
corr_features = pd.concat([X, y], axis=1).phik_matrix().round(2)['target_reg'].sort_values(ascending=False)
corr_features

interval columns not set, guessing: ['car_rating', 'year_to_start', 'riders', 'year_to_work', 'mean_rating', 'distance_sum', 'rating_min', 'speed_max', 'user_ride_quality_median', 'user_uniq', 'target_reg']


target_reg                  1.00
speed_max                   0.49
mean_rating                 0.47
user_uniq                   0.42
fuel_type                   0.36
model                       0.29
car_type                    0.18
rating_min                  0.08
car_rating                  0.00
year_to_start               0.00
riders                      0.00
year_to_work                0.00
distance_sum                0.00
user_ride_quality_median    0.00
Name: target_reg, dtype: float64

`cols_no_corr` - список фич с нулевой корреляцией.

In [14]:
cols_no_corr = corr_features[corr_features == 0].index.tolist()
cols_no_corr

['car_rating',
 'year_to_start',
 'riders',
 'year_to_work',
 'distance_sum',
 'user_ride_quality_median']

`X_ncor` - датафрейм без фич с нулевой корреляцией

In [15]:
X_ncor = X.drop(columns=cols_no_corr)

#### `Helper` функция для обучения модели по фолдам и извлечеие модели с лучшей `метрикой`

In [16]:
def pull_best_model(estimator, X, y, kwargs_fit, *, n_splits = 3, random_state=42, scoring='RMSE'):
    models = []
    scores = []
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = estimator.__class__(**estimator.get_params())
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            **kwargs_fit
        )
        if isinstance(model, CatBoostRegressor):
            score = model.best_score_['validation'][scoring]
        elif isinstance(model, lgb.LGBMRegressor):
            score = model.best_score_['valid_0'][scoring.lower()]
        elif isinstance(model, xgb.XGBRegressor):
            score = model.best_score
        else:
            score = model.score(X_test, y_test)
            
        scores.append(score)
        models.append(model)

    best_model = models[np.argmin(scores)]
    best_index = np.argmin(scores)
    
    return best_model, scores[best_index], np.std(scores)

### <center> **Catboost**

#### Обучение `CatBoost` с параметрами по умолчанию

In [17]:
cat_model_default = CatBoostRegressor(
    thread_count=cpu_count,
    random_seed=42,
    cat_features=cols_cats
)
cat_model_default.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    verbose=200,
    use_best_model=True,
    plot=False,
    early_stopping_rounds=100
)

display(Markdown(f"#### Значение `RMSE` для `CatBoost` с default параметрами: `{round(cat_model_default.best_score_['validation']['RMSE'], 3)}`"))

Learning rate set to 0.056174
0:	learn: 17.0065561	test: 17.7387430	best: 17.7387430 (0)	total: 70.4ms	remaining: 1m 10s
200:	learn: 9.3813500	test: 12.2560321	best: 12.1641768 (128)	total: 654ms	remaining: 2.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12.16417676
bestIteration = 128

Shrink model to first 129 iterations.


#### Значение `RMSE` для `CatBoost` с default параметрами: `12.164`

### <center> Подбор гиперпараметров
#### Используем встроенный **grid_search** Catboost

In [18]:
param_grid = {
    'learning_rate': [.01, .03, .04],
    'max_bin': [40, 50, 60], 
}
model_cat = CatBoostRegressor(
    random_seed=42,
    cat_features=cols_cats,
    thread_count=cpu_count,
    verbose=0,
)

rand_search_res = model_cat.grid_search(param_grid, X, y, verbose=False, plot=False,)
print(model_cat.best_score_)

params_best_cat = rand_search_res['params']
display(Markdown(f"#### Выбранные гиперпараметры для `CatBoost`:<br> `{params_best_cat}`"))


bestTest = 10.73227337
bestIteration = 999


bestTest = 10.7365755
bestIteration = 323


bestTest = 10.71436759
bestIteration = 239


bestTest = 10.7808992
bestIteration = 936


bestTest = 10.80901949
bestIteration = 337


bestTest = 10.81892195
bestIteration = 299


bestTest = 10.72167277
bestIteration = 970


bestTest = 10.7488679
bestIteration = 325


bestTest = 10.76114647
bestIteration = 242

Training on fold [0/3]

bestTest = 11.7431856
bestIteration = 208

Training on fold [1/3]

bestTest = 11.7577536
bestIteration = 238

Training on fold [2/3]

bestTest = 11.3622793
bestIteration = 246

{'learn': {'RMSE': 7.230686518563849}}


#### Выбранные гиперпараметры для `CatBoost`:<br> `{'border_count': 40, 'learning_rate': 0.04}`

* #### Обучение по фолдам

In [19]:
model_cat = CatBoostRegressor(
    **params_best_cat,
    iterations=2000,
    cat_features=cols_cats,
    random_seed=7575,
    early_stopping_rounds=100,
    thread_count=cpu_count,
)

cat_fit = dict(
    verbose=200,
    use_best_model=True,
    plot=False,
    early_stopping_rounds=100
)

best_model_cat, rmse_cat, std_cat = pull_best_model(model_cat, X, y, cat_fit)
display(Markdown(f"#### Лучший ``CatBoost RMSE`` по {n_splits} фолдам: `{round(rmse_cat, 3)}` <br> `STD`: {round(std_cat, 2)}"))

0:	learn: 17.5448536	test: 16.9309868	best: 16.9309868 (0)	total: 3.46ms	remaining: 6.91s
200:	learn: 9.8768638	test: 12.0412599	best: 12.0132694 (113)	total: 390ms	remaining: 3.49s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 12.01326945
bestIteration = 113

Shrink model to first 114 iterations.
0:	learn: 16.8713527	test: 18.1447932	best: 18.1447932 (0)	total: 1.99ms	remaining: 3.98s
200:	learn: 10.0181547	test: 11.8354254	best: 11.8354254 (200)	total: 361ms	remaining: 3.23s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.82282234
bestIteration = 279

Shrink model to first 280 iterations.
0:	learn: 17.5396387	test: 16.9023884	best: 16.9023884 (0)	total: 5.66ms	remaining: 11.3s
200:	learn: 10.2551450	test: 11.4095143	best: 11.4095143 (200)	total: 366ms	remaining: 3.27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.33389497
bestIteration = 296

Shrink model to first 297 iterations.


#### Лучший ``CatBoost RMSE`` по 3 фолдам: `11.334` <br> `STD`: 0.29

* Сделаем обучение по фолдам для датафрейма без фич с нулевой корреляцией

In [20]:
ncor_best_cat, rmse_cat_ncor, std_cat_ncor = pull_best_model(model_cat, X_ncor, y, cat_fit)
display(Markdown(f"#### X_ncor: Лучший ``CatBoost RMSE`` по {n_splits} фолдам: `{round(rmse_cat_ncor, 3)}` <br> `STD`: {round(std_cat_ncor, 2)}"))

0:	learn: 17.4948581	test: 16.8742293	best: 16.8742293 (0)	total: 4.05ms	remaining: 8.1s
200:	learn: 10.6346110	test: 11.9182650	best: 11.9126277 (169)	total: 313ms	remaining: 2.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.91262774
bestIteration = 169

Shrink model to first 170 iterations.
0:	learn: 16.8709847	test: 18.1329689	best: 18.1329689 (0)	total: 3.79ms	remaining: 7.57s
200:	learn: 10.7227348	test: 11.6563935	best: 11.6544598 (190)	total: 280ms	remaining: 2.51s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.63476217
bestIteration = 239

Shrink model to first 240 iterations.
0:	learn: 17.5199882	test: 16.8938932	best: 16.8938932 (0)	total: 2.16ms	remaining: 4.32s
200:	learn: 10.7970690	test: 11.4583216	best: 11.4583216 (200)	total: 337ms	remaining: 3.02s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 11.40363744
bestIteration = 280

Shrink model to first 281 iterations.


#### X_ncor: Лучший ``CatBoost RMSE`` по 3 фолдам: `11.404` <br> `STD`: 0.21

### <center> **LightGBM GOSS**

* Посмотрим `RMSE` для **GOSS** `baseline` модели

In [22]:
model_lgb = lgb.LGBMRegressor(
    boosting_type='goss',
    objective='regression',
    random_state=42,
    cat_feature=[0, 1, 2],
    verbose=-1)

model_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    categorical_feature=[0, 1, 2],
    callbacks=[lgb.early_stopping(10)]
)

display(Markdown(f"#### <br>`LightGBM` `RMSE` по дефолту: {round(model_lgb.best_score_['valid_0']['rmse'], 3)}"))

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 11.9649	valid_0's l2: 143.159


#### <br>`LightGBM` `RMSE` по дефолту: 11.965

#### <center> Выполним поиск гиперпараметров **`LightGBV`**

In [23]:
param_lgb = {
    'learning_rate': [.05, ],
    'max_bin': [40, 50,],
    'top_rate': [.2, .25, .3],
    'other_rate': [.1, .2, .25]
}

cv_kf = KFold(n_splits=5, random_state=42, shuffle=True)

estimator_lgb = lgb.LGBMRegressor(
    boosting_type='gbdt',
    data_sample_strategy='goss',
    random_state=42,
    force_row_wise=True,
    verbose=-1,
)

model_rand_lgb = GridSearchCV(
    estimator_lgb,
    param_lgb,
    cv=cv_kf,
    scoring='neg_root_mean_squared_error',
    n_jobs=cpu_count,
    verbose=0,
)
n_splits
model_rand_lgb.fit(X, y)

params_best_lgb = model_rand_lgb.best_params_
display(Markdown(f"#### Выбранные гиперпараметры для `LightGBM`:<br> `{params_best_lgb}` <br> Best `RMSE`: {round(abs(model_rand_lgb.best_score_), 3)}"))

#### Выбранные гиперпараметры для `LightGBM`:<br> `{'learning_rate': 0.05, 'max_bin': 40, 'other_rate': 0.2, 'top_rate': 0.25}` <br> Best `RMSE`: 11.758

#### <center> Обучение по фолдам для `LightGBM`

In [24]:
model_lgb = lgb.LGBMRegressor(
    **params_best_lgb,
    boosting_type='gbdt',
    data_sample_strategy='goss',
    cat_feature=[0, 1, 2],
    random_state=42,
    force_row_wise=True,
    n_jobs=cpu_count,
    verbose=-1
)
params_fit_lgb = dict(
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(10)],
    categorical_feature=[0, 1, 2],
)

best_model_lgb, rmse_lgb, std_lgb = pull_best_model(model_lgb, X, y, params_fit_lgb)
display(Markdown(f"#### Лучший `LightGBM` `RMSE` по {n_splits} фолдам: `{round(rmse_lgb, 3)}`<br> `STD`: {round(std_lgb, 2)}"))

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[74]	valid_0's rmse: 12.2875	valid_0's l2: 150.982
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[97]	valid_0's rmse: 12.0945	valid_0's l2: 146.277
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[71]	valid_0's rmse: 11.7204	valid_0's l2: 137.368


#### Лучший `LightGBM` `RMSE` по 3 фолдам: `11.72`<br> `STD`: 0.24

* Обучение по фолдам для `X_ncor`

In [25]:
ncor_best_lgb, rmse_lgb_ncor, std_lgb_ncor = pull_best_model(model_lgb, X.drop(columns=cols_no_corr), y, params_fit_lgb)
display(Markdown(f"#### X_ncor: Лучший ``LightGBM RMSE`` по {n_splits} фолдам: `{round(rmse_lgb_ncor, 3)}` <br> `STD`: {round(std_lgb_ncor, 2)}"))

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[68]	valid_0's rmse: 12.0781	valid_0's l2: 145.881
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[95]	valid_0's rmse: 11.9362	valid_0's l2: 142.474
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[68]	valid_0's rmse: 11.7827	valid_0's l2: 138.832


#### X_ncor: Лучший ``LightGBM RMSE`` по 3 фолдам: `11.783` <br> `STD`: 0.12

### <center> **XGBoost**

* Построим базовую модель *XGBoost*

In [27]:
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='dart',
    enable_categorical=True,
    random_state=42,
    nthread=cpu_count,
    early_stopping_rounds=10,
)
model_xgb.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=3
)

display(Markdown(f"#### <br>`XGBoost` `RMSE` по дефолту: {round(model_xgb.best_score, 3)}"))

[0]	validation_0-rmse:15.73818
[3]	validation_0-rmse:13.04271
[6]	validation_0-rmse:12.64687
[9]	validation_0-rmse:12.65160
[12]	validation_0-rmse:12.67780
[15]	validation_0-rmse:12.75195
[17]	validation_0-rmse:12.83047


#### <br>`XGBoost` `RMSE` по дефолту: 12.621

#### <center> Настройка гиперпараметров **`XGBoost`**

In [28]:
param_grid ={
    'eta': np.arange(.04, .06, .01),
    'sample_type': ['weighted'], #['uniform', 'weighted'],
    'normalize_type': ['tree'], #['tree', 'forest'],
    'rate_drop': np.arange(.0, .4, .1),
    'skip_drop': np.arange(.0,.4, .1),
    #'min_child_weight': [1, 2]
    'subsample': [.9],
    #'colsample_bytree': [.9]
}

cv_kf = KFold(n_splits=5, random_state=42, shuffle=True)

estimator_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='dart',
    enable_categorical=True,
    random_state=42,
    n_estimators=100
)

model_tune_xgb = GridSearchCV(
    estimator_xgb,
    param_grid,
    cv=cv_kf,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=cpu_count,
)

model_tune_xgb.fit(X, y)

params_best_xgb = model_tune_xgb.best_params_
display(Markdown(f"#### Выбранные гиперпараметры для `XGBoost`: <br> `{params_best_xgb}` <br> Best `RMSE`: {round(abs(model_tune_xgb.best_score_), 3)}"))

Fitting 5 folds for each of 32 candidates, totalling 160 fits


#### Выбранные гиперпараметры для `XGBoost`: <br> `{'eta': 0.04, 'normalize_type': 'tree', 'rate_drop': 0.0, 'sample_type': 'weighted', 'skip_drop': 0.30000000000000004, 'subsample': 0.9}` <br> Best `RMSE`: 11.723

In [29]:
train_set = xgb.DMatrix(X_train, label=y_train, enable_categorical=True, nthread=cpu_count)
test_set = xgb.DMatrix(X_test, label=y_test, enable_categorical=True, nthread=cpu_count)

params ={'objective': 'reg:squarederror', 'booster': 'dart', 'seed': 42}
params_update = params.update(params_best_xgb)

model = xgb.train(params_update, 
                  dtrain=train_set, 
                  num_boost_round=100, 
                  evals=[(train_set, 'dtrain'), (test_set, 'dtest')],
                  early_stopping_rounds=10,
                  verbose_eval=5)

[0]	dtrain-rmse:14.51967	dtest-rmse:15.73818
[5]	dtrain-rmse:9.49716	dtest-rmse:12.65256
[10]	dtrain-rmse:7.94051	dtest-rmse:12.71816
[15]	dtrain-rmse:7.23939	dtest-rmse:12.75195
[17]	dtrain-rmse:6.69989	dtest-rmse:12.83047


#### <center> Обучение по фолдам для `XGBoost`

In [30]:
model_xgb = xgb.XGBRegressor(
    **params_best_xgb,
    objective='reg:squarederror',
    booster='dart',
    enable_categorical=True,
    random_state=42,
    n_jobs=cpu_count,
    early_stopping_rounds=10,
)
params_fit_xgb = dict(
    verbose=0,
)

best_model_xgb, rmse_xgb, std_xgb = pull_best_model(model_xgb, X, y, params_fit_xgb)
display(Markdown(f"#### Лучший `XGBoost` `RMSE` по {n_splits} фолдам: `{round(rmse_xgb, 3)}`<br> `STD`: `{round(std_xgb, 2)}`"))

#### Лучший `XGBoost` `RMSE` по 3 фолдам: `11.917`<br> `STD`: `0.14`

* Обучение по фолдам для `X_ncor`

In [31]:
ncor_best_xgb, rmse_xgb_ncor, std_xgb_ncor = pull_best_model(model_xgb, X_ncor, y, params_fit_xgb)
display(Markdown(f"#### X_ncor: Лучший ``XGBoost RMSE`` по {n_splits} фолдам: `{round(rmse_xgb_ncor, 3)}` <br> `STD`: {round(std_xgb_ncor, 2)}"))

#### X_ncor: Лучший ``XGBoost RMSE`` по 3 фолдам: `12.054` <br> `STD`: 0.08

### <center> Итоговое `Предсказание` на тестовов датасете

* Сводная табличка для полного датафрейма `X` и датафрейма без фич с нулевой корреляцией `X_ncor`

In [2]:
# display(Markdown("""
# <style>
#     .dataframe td, .dataframe th {
#         font-size: 20px;
#     }
# </style>
# """))

In [75]:
columns_index = [['X', 'X', 'X_ncor', 'X_ncor'], ['rmse','std','rmse','std']]
data_total = [
    [rmse_cat, std_cat, rmse_cat_ncor, std_cat_ncor],
    [rmse_lgb, std_lgb, rmse_lgb_ncor, std_lgb_ncor],
    [rmse_xgb, std_xgb, rmse_xgb_ncor, std_xgb_ncor]
]

total_metrics = pd.DataFrame(data=data_total, index=['CatBoost', 'LightGBM', 'XGBoost'], columns=pd.MultiIndex.from_arrays(columns_index))\
.apply(lambda x: round(x, 2))

display(Markdown(total_metrics.to_html(classes='dataframe')))

Unnamed: 0_level_0,X,X,X_ncor,X_ncor
Unnamed: 0_level_1,rmse,std,rmse,std
CatBoost,11.33,0.29,11.4,0.21
LightGBM,11.72,0.24,11.78,0.12
XGBoost,11.92,0.14,12.05,0.08


#### Выбор модели для предсказания
* Из таблицы видно, что модели обученные на полном датасете `X` имеют более низкий `rmse`. 
* В тоже время модели обученные на датафрейме с исключенными фичами с нулевой корреляцией **X_ncor** - имеют меньшее стандартоное отклонение, что может привести к более точным прогнозам. <br>

Выберем модели с более низким `std`,

Создадим переменную `models_best` с лучшими моделями.

In [35]:
models_best = [best_model_cat, best_model_lgb, best_model_xgb]

Подготовим тестовый набор `df_test`

In [36]:
test_data = prepare_data(df_test, cols_to_drop, cols_cats, type_df='test')

In [85]:
models_best_ncor = [ncor_best_cat, ncor_best_lgb, ncor_best_xgb]
models_name = ['pred_cat', 'pred_lgb', 'pred_xgb']
df_pred_ncor = pd.DataFrame()
for name, model in zip(models_name, models_best_ncor):
    if isinstance(model, xgb.XGBRegressor):
        df_pred_ncor[name] = model.predict(test_data.drop(columns=cols_no_corr), iteration_range=(0, model.best_iteration + 1))
    else: 
        df_pred_ncor[name] = model.predict(test_data.drop(columns=cols_no_corr))
        
target_reg_ncor = df_pred_ncor.apply(np.mean, axis=1)
pd.DataFrame({'car_id': df_test.car_id, 'target_reg': target_reg_ncor}).to_csv('submission.csv', index=False)