In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor, early_stopping

import optuna

In [4]:
def r2_adjusted(y_true: np.ndarray, y_pred: np.ndarray,
                X_test: np.ndarray) -> float:
    """Adjusted coefficient of determination (multiple regression).

    Computes ``1 - (1 - R2) * (n - 1) / (n - p - 1)`` where ``n`` is
    ``len(y_true)`` and ``p`` is the number of columns of ``X_test``.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted target values.
    X_test : 2-D feature matrix; only ``X_test.shape[1]`` is used.

    Returns
    -------
    The adjusted R2 score.
    """
    n_samples, n_predictors = len(y_true), X_test.shape[1]
    unexplained = 1 - r2_score(y_true, y_pred)
    # Evaluation order (multiply, then divide) matches the textbook formula.
    return 1 - unexplained * (n_samples - 1) / (n_samples - n_predictors - 1)


def wape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Weighted Absolute Percentage Error, expressed in percent.

    ``WAPE = sum(|y_pred - y_true|) / sum(|y_true|) * 100``

    Parameters
    ----------
    y_true : observed target values (array-like).
    y_pred : predicted target values (array-like of the same length).

    Returns
    -------
    WAPE in percent. If ``sum(|y_true|)`` is zero the result is ``inf`` or
    ``nan`` (NumPy division semantics); no exception is raised.
    """
    # Convert to plain arrays so the subtraction is positional; pandas objects
    # with different indexes would otherwise be aligned by label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Normalise by the total absolute magnitude so negative targets cannot
    # shrink (or flip the sign of) the denominator.
    return np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(y_true)) * 100


def get_metrics_regression(y_test: np.ndarray,
                           y_pred: np.ndarray,
                           X_test: np.ndarray,
                           name: str = None,
                           delta: float = 1.345):
    """Build a one-row table of regression metrics for a model.

    Parameters
    ----------
    y_test : observed target values.
    y_pred : predicted target values.
    X_test : feature matrix the predictions were made on (its column count
        is used for the adjusted R2).
    name : label stored in the ``model`` column.
    delta : not used by this function; kept in the signature for callers.

    Returns
    -------
    pandas.DataFrame with a single row and the columns
    ``model``, ``MAE``, ``RMSE``, ``R2 adjusted`` and ``WAPE_%``.
    """
    scores = {
        'model': [name],
        'MAE': [mean_absolute_error(y_test, y_pred)],
        'RMSE': [np.sqrt(mean_squared_error(y_test, y_pred))],
        'R2 adjusted': [r2_adjusted(y_test, y_pred, X_test)],
        'WAPE_%': [wape(y_test, y_pred)],
    }
    return pd.DataFrame(scores)

In [5]:
def train_lgbm_cv(**params: dict) -> tuple[list, float]:
    """
    Train LGBMRegressor with patient-wise cross-validation.

    Folds are built from the module-level ``train`` frame: every patient listed
    in the module-level ``patients`` forms its own validation fold, and one
    extra fold holds patients 'p06' and 'p01' together. Each fold model is
    fitted with early stopping (100 rounds) on its validation fold and is then
    used to predict the module-level hold-out features ``X_test``.

    Prints the validation MAE of every fold, the mean validation and train MAE
    over the folds, and the relative gap between them in percent.

    Parameters
    ----------
    **params
        Keyword arguments forwarded to ``LGBMRegressor``.

    Returns
    -------
    predicts_test : list
        One array of hold-out predictions per fold.
    avg_mae_train : float
        Mean train MAE over the folds (used by callers to compute the gap
        between train and hold-out metrics).
    """
    # One group of patient ids per validation fold; the merged fold goes last.
    fold_groups = [[patient] for patient in patients] + [['p06', 'p01']]

    # Loop-invariant: the feature/target split of the full training frame.
    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Sized from the folds so no uninitialised slot can leak into the means.
    mae_oof = np.empty(len(fold_groups))
    mae_train_oof = np.empty(len(fold_groups))
    predicts_test = []

    for idx, group in enumerate(fold_groups):
        is_val = train.p_num.isin(group)
        X_train_, y_train_ = features.loc[~is_val], target.loc[~is_val]
        X_val, y_val = features.loc[is_val], target.loc[is_val]

        model = LGBMRegressor(**params)
        model.fit(X_train_,
                  y_train_,
                  eval_set=[(X_val, y_val)],
                  eval_metric="mae",
                  callbacks=[early_stopping(stopping_rounds=100)])

        fold_mae = mean_absolute_error(y_val, model.predict(X_val))
        mae_oof[idx] = fold_mae
        mae_train_oof[idx] = mean_absolute_error(y_train_,
                                                 model.predict(X_train_))

        predicts_test.append(model.predict(X_test))

        print(f"MAE fold {idx + 1} = {fold_mae:.3f}")
        print('---')

    avg_mae_val = np.mean(mae_oof)
    avg_mae_train = np.mean(mae_train_oof)
    print(f"Mean MAE val = {avg_mae_val}")
    print(f"Mean MAE train = {avg_mae_train}")
    print(
        f"Delta between train and val = {(abs(avg_mae_train - avg_mae_val) / avg_mae_train * 100):.1f} %"
    )

    return predicts_test, avg_mae_train

In [6]:
def train_catboost_cv(**params: dict) -> tuple[list, float]:
    """
    Train CatBoostRegressor with patient-wise cross-validation.

    Folds are built from the module-level ``train`` frame: every patient listed
    in the module-level ``patients`` forms its own validation fold, and one
    extra fold holds patients 'p06' and 'p01' together. Each fold model is
    fitted with ``early_stopping_rounds=100`` on its validation fold and is
    then used to predict the module-level hold-out features ``X_test``.

    Prints the validation MAE of every fold, the mean validation and train MAE
    over the folds, and the relative gap between them in percent.

    Parameters
    ----------
    **params
        Keyword arguments forwarded to ``CatBoostRegressor``.

    Returns
    -------
    predicts_test : list
        One array of hold-out predictions per fold.
    avg_mae_train : float
        Mean train MAE over the folds (used by callers to compute the gap
        between train and hold-out metrics).
    """
    # One group of patient ids per validation fold; the merged fold goes last.
    fold_groups = [[patient] for patient in patients] + [['p06', 'p01']]

    # Loop-invariant: the feature/target split of the full training frame.
    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Sized from the folds so no uninitialised slot can leak into the means.
    mae_oof = np.empty(len(fold_groups))
    mae_train_oof = np.empty(len(fold_groups))
    predicts_test = []

    for idx, group in enumerate(fold_groups):
        is_val = train.p_num.isin(group)
        X_train_, y_train_ = features.loc[~is_val], target.loc[~is_val]
        X_val, y_val = features.loc[is_val], target.loc[is_val]

        model = CatBoostRegressor(**params)
        model.fit(X_train_,
                  y_train_,
                  eval_set=[(X_val, y_val)],
                  verbose=False,
                  early_stopping_rounds=100)

        fold_mae = mean_absolute_error(y_val, model.predict(X_val))
        mae_oof[idx] = fold_mae
        mae_train_oof[idx] = mean_absolute_error(y_train_,
                                                 model.predict(X_train_))

        predicts_test.append(model.predict(X_test))

        print(f"MAE fold {idx + 1} = {fold_mae:.3f}")
        print('---')

    avg_mae_val = np.mean(mae_oof)
    avg_mae_train = np.mean(mae_train_oof)
    print(f"Mean MAE val = {avg_mae_val}")
    print(f"Mean MAE train = {avg_mae_train}")
    print(
        f"Delta between train and val = {(abs(avg_mae_train - avg_mae_val) / avg_mae_train * 100):.1f} %")

    return predicts_test, avg_mae_train

In [7]:
# Load the training data and the hold-out data; later cells read the
# 'p_num' and 'target' columns from both frames.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [8]:
# Separate features from the target. 'p_num' is dropped together with
# 'target', so it is never used as a feature.
X_test, y_test = test.drop(['p_num', 'target'], axis=1), test['target']
X_train, y_train = train.drop(['p_num', 'target'], axis=1), train['target']

In [9]:
# Patients that each form their own validation fold; the CV helpers add one
# more fold that merges patients 'p06' and 'p01'.
patients = ['p02', 'p10', 'p12', 'p04', 'p11']

In [10]:
# Previously saved metrics table (the baseline models shown below); results of
# the tuned models are appended to it later in the notebook.
metrics = pd.read_csv("metrics.csv")

In [11]:
# Display the loaded metrics table for reference.
metrics

Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197


# Tuning LGBMRegressor

## LGBM Tune 1

In [None]:
def objective_1_lgb(trial):
    """Optuna objective for LGBM study 1: mean validation MAE over patient folds.

    Samples LGBMRegressor hyperparameters from ``trial``, fits one model per
    fold on the module-level ``train`` frame and returns the mean validation
    MAE. Folds: one per patient in the module-level ``patients`` plus one fold
    that merges patients 'p06' and 'p01'.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.

    Returns
    -------
    Mean of the per-fold validation MAE (the value the study minimises).
    """
    lgb_params = {
        # Fixed settings, passed through single-choice categoricals (which
        # also registers them among the trial's parameters).
        "random_state": trial.suggest_categorical("random_state", [42]),
        "verbose": trial.suggest_categorical("verbose", [-1]),
        "n_estimators": trial.suggest_categorical("n_estimators", [100]),
        "objective": trial.suggest_categorical("objective", ['mae']),
        "learning_rate": trial.suggest_categorical("learning_rate",
                                                   [0.05621533613556329]),
        # Earlier variants of the search space, kept for reference:
        #   objective: suggest_categorical(['mae', 'rmse', None])
        #   learning_rate: suggest_float(0.045, 0.06, log=True)
        # Tuned settings.
        "num_leaves": trial.suggest_int("num_leaves", 16, 25),
        "max_depth": trial.suggest_int("max_depth", 6, 10),
        "max_bin": trial.suggest_int("max_bin", 100, 250, step=10),
        "min_child_samples": trial.suggest_int("min_child_samples", 100, 1000,
                                               step=100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 0.9,
                                              step=0.1),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 3),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100)
    }

    # One group of patient ids per validation fold; the merged fold goes last.
    fold_groups = [[patient] for patient in patients] + [['p06', 'p01']]
    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Sized from the folds so no uninitialised slot can leak into the mean.
    mae_oof = np.empty(len(fold_groups))
    for idx, group in enumerate(fold_groups):
        is_val = train.p_num.isin(group)
        X_val, y_val = features.loc[is_val], target.loc[is_val]

        # No early stopping and no pruning callback: every model is trained
        # for the full n_estimators.
        model = LGBMRegressor(**lgb_params)
        model.fit(features.loc[~is_val],
                  target.loc[~is_val],
                  eval_set=[(X_val, y_val)],
                  eval_metric="mae")

        mae_oof[idx] = mean_absolute_error(y_val, model.predict(X_val))

    return np.mean(mae_oof)

In [None]:
# Seed the (default, TPE) sampler so that re-running the search proposes the
# same sequence of trials. Full reproducibility also requires the objective
# itself to be deterministic.
study_1_lgb = optuna.create_study(
    direction="minimize",
    study_name="LGB_1",
    sampler=optuna.samplers.TPESampler(seed=42))
study_1_lgb.optimize(objective_1_lgb, n_trials=20, show_progress_bar=True)

In [None]:
# study_1_lgb.best_params

In [None]:
# study_1_lgb.best_value

In [12]:
# Here and below, the best Optuna parameters of each study are stored in a
# separate variable, because re-running the search cell does not reproduce
# the same result.
lgbm_1_params = {
    # fixed settings
    "random_state": 42,
    "verbose": -1,
    "n_estimators": 100,
    "objective": "mae",
    "learning_rate": 0.05621533613556329,
    # tuned settings
    "num_leaves": 21,
    "max_depth": 7,
    "max_bin": 230,
    "min_child_samples": 1000,
    "min_split_gain": 0.2,
    "subsample": 0.9479574040260543,
    "subsample_freq": 1,
    "colsample_bytree": 0.9489394374238439,
    "reg_alpha": 1,
    "reg_lambda": 12,
}

**Проверяем подобранные параметры на CV**

In [13]:
%%time

# Re-fit with the stored study-1 parameters on the patient-wise CV; keeps the
# per-fold hold-out predictions and the mean train MAE for the next cell.
predicts_test, avg_mae_train = train_lgbm_cv(**lgbm_1_params)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.65099
MAE fold 1 = 1.651
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[81]	valid_0's l1: 1.18964
MAE fold 2 = 1.190
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[95]	valid_0's l1: 1.29595
MAE fold 3 = 1.296
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.49862
MAE fold 4 = 1.499
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's l1: 1.75268
MAE fold 5 = 1.753
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.98058
MAE fold 6 = 1.981
---
Mean MAE val = 1.5614111845742074
Mea

In [14]:
# Average the per-fold predictions into a single hold-out prediction.
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
print(f"MAE holdout = {mae_holdout}")
print(
    f"Delta between train and holdout = {(abs(avg_mae_train - mae_holdout) / avg_mae_train * 100):.1f} %")

# Replace any row left by a previous run of this cell instead of appending a
# duplicate, and renumber the index so row labels stay unique.
metrics = pd.concat([
    metrics[metrics['model'] != 'lgbm tune 1'],
    get_metrics_regression(y_test, holdout_preds, X_test, 'lgbm tune 1')
], ignore_index=True)
metrics

MAE holdout = 1.629734136399079
Delta between train and holdout = 19.3 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901


Подбор гиперпараметров способствовал значительному снижению переобучения: разница между метриками на трейне и валидации, а также между метриками на трейне и на отложенной выборке значительно уменьшилась (у baseline LGBM с кросс-валидацией разница между метриками на трейне и валидации была 19.1%, а между метриками на трейне и отложенной выборке - 23.1%). \
Однако переобучение все еще большое, поэтому попробуем несколько раз подобрать гиперпараметры для LGBM на кросс-валидации, чтобы найти лучшую комбинацию гиперпараметров (затем можно провести стэкинг лучших моделей)

## LGBM Tune 2

In [19]:
def objective_2_lgb(trial):
    """Optuna objective for LGBM study 2: mean validation MAE over patient folds.

    Same procedure as study 1 but with LightGBM's default training objective
    (no ``objective`` key is set), a different fixed learning rate and wider
    ``num_leaves`` / ``max_depth`` ranges. Folds: one per patient in the
    module-level ``patients`` plus one fold that merges patients 'p06' and
    'p01', all taken from the module-level ``train`` frame.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.

    Returns
    -------
    Mean of the per-fold validation MAE (the value the study minimises).
    """
    lgb_params = {
        # Fixed settings, passed through single-choice categoricals (which
        # also registers them among the trial's parameters).
        "random_state": trial.suggest_categorical("random_state", [42]),
        "verbose": trial.suggest_categorical("verbose", [-1]),
        "n_estimators": trial.suggest_categorical("n_estimators", [100]),
        "learning_rate": trial.suggest_categorical("learning_rate",
                                                   [0.053402226219538856]),
        # Earlier variants of the search space, kept for reference:
        #   learning_rate: suggest_float(0.05, 0.06, log=True)
        #   objective: suggest_categorical(['mae', 'rmse', None])
        # Tuned settings.
        "num_leaves": trial.suggest_int("num_leaves", 16, 30),
        "max_depth": trial.suggest_int("max_depth", 6, 12),
        "max_bin": trial.suggest_int("max_bin", 100, 250, step=10),
        "min_child_samples": trial.suggest_int("min_child_samples", 100, 1000,
                                               step=100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 0.9,
                                              step=0.1),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 3),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100)
    }

    # One group of patient ids per validation fold; the merged fold goes last.
    fold_groups = [[patient] for patient in patients] + [['p06', 'p01']]
    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Sized from the folds so no uninitialised slot can leak into the mean.
    mae_oof = np.empty(len(fold_groups))
    for idx, group in enumerate(fold_groups):
        is_val = train.p_num.isin(group)
        X_val, y_val = features.loc[is_val], target.loc[is_val]

        # No early stopping and no pruning callback: every model is trained
        # for the full n_estimators.
        model = LGBMRegressor(**lgb_params)
        model.fit(features.loc[~is_val],
                  target.loc[~is_val],
                  eval_set=[(X_val, y_val)],
                  eval_metric="mae")

        mae_oof[idx] = mean_absolute_error(y_val, model.predict(X_val))

    return np.mean(mae_oof)

In [None]:
# Seed the (default, TPE) sampler so that re-running the search proposes the
# same sequence of trials. Full reproducibility also requires the objective
# itself to be deterministic.
study_2_lgb = optuna.create_study(
    direction="minimize",
    study_name="LGB_2",
    sampler=optuna.samplers.TPESampler(seed=42))
study_2_lgb.optimize(objective_2_lgb, n_trials=20, show_progress_bar=True)

In [None]:
# study_2_lgb.best_params

In [None]:
# study_2_lgb.best_value

In [15]:
# Best parameters of the second LGBM study, stored explicitly because
# re-running the search does not reproduce them.
lgbm_2_params = {
    # fixed settings
    "random_state": 42,
    "verbose": -1,
    "n_estimators": 100,
    "learning_rate": 0.053402226219538856,
    # tuned settings
    "num_leaves": 19,
    "max_depth": 11,
    "max_bin": 200,
    "min_child_samples": 800,
    "min_split_gain": 0.0,
    "subsample": 0.9388008791697994,
    "subsample_freq": 3,
    "colsample_bytree": 0.9709495705457634,
    "reg_alpha": 30,
    "reg_lambda": 33,
}

**Проверяем подобранные параметры на CV**

In [16]:
# Re-fit with the stored study-2 parameters on the patient-wise CV; keeps the
# per-fold hold-out predictions and the mean train MAE for the next cell.
predicts_test, avg_mae_train = train_lgbm_cv(**lgbm_2_params)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[94]	valid_0's l1: 1.63002	valid_0's l2: 5.10138
MAE fold 1 = 1.630
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.29186	valid_0's l2: 2.63065
MAE fold 2 = 1.292
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[81]	valid_0's l1: 1.31681	valid_0's l2: 3.19354
MAE fold 3 = 1.317
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[98]	valid_0's l1: 1.543	valid_0's l2: 4.1296
MAE fold 4 = 1.543
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[98]	valid_0's l1: 1.70939	valid_0's l2: 5.11418
MAE fold 5 = 1.709
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. B

In [17]:
# Average the per-fold predictions into a single hold-out prediction.
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
print(f"MAE holdout = {mae_holdout}")
print(
    f"Delta between train and holdout = {(abs(avg_mae_train - mae_holdout) / avg_mae_train * 100):.1f} %")

# Replace any row left by a previous run of this cell instead of appending a
# duplicate, and renumber the index so row labels stay unique.
metrics = pd.concat([
    metrics[metrics['model'] != 'lgbm tune 2'],
    get_metrics_regression(y_test, holdout_preds, X_test, 'lgbm tune 2')
], ignore_index=True)
metrics

MAE holdout = 1.6322344422563935
Delta between train and holdout = 18.2 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403


Данная комбинация гиперпараметров привела к еще большему снижению переобучения. При этом метрика на Holdout немного ухудшилась, но незначительно. 

## LGBM Tune 3

In [23]:
def objective_3_lgb(trial):
    """Optuna objective for LGBM study 3: mean validation MAE over patient folds.

    Uses the 'mae' training objective with 500 estimators, a smaller fixed
    learning rate and early stopping (100 rounds) on each validation fold.
    Folds: one per patient in the module-level ``patients`` plus one fold that
    merges patients 'p06' and 'p01', all taken from the module-level ``train``
    frame.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.

    Returns
    -------
    Mean of the per-fold validation MAE (the value the study minimises).
    """
    lgb_params = {
        # Fixed settings, passed through single-choice categoricals (which
        # also registers them among the trial's parameters).
        "random_state": trial.suggest_categorical("random_state", [42]),
        "verbose": trial.suggest_categorical("verbose", [-1]),
        "n_estimators": trial.suggest_categorical("n_estimators", [500]),
        "objective": trial.suggest_categorical("objective", ['mae']),
        "learning_rate": trial.suggest_categorical("learning_rate",
                                                   [0.014912792408663157]),
        # Earlier variants of the search space, kept for reference:
        #   objective: suggest_categorical(['mae', 'rmse', None])
        #   learning_rate: suggest_float(0.009, 0.025, log=True)
        # Tuned settings.
        "num_leaves": trial.suggest_int("num_leaves", 16, 25),
        "max_depth": trial.suggest_int("max_depth", 6, 10),
        "max_bin": trial.suggest_int("max_bin", 100, 250, step=10),
        "min_child_samples": trial.suggest_int("min_child_samples", 100, 1000,
                                               step=100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 0.9,
                                              step=0.1),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 3),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100)
    }

    # One group of patient ids per validation fold; the merged fold goes last.
    fold_groups = [[patient] for patient in patients] + [['p06', 'p01']]
    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Sized from the folds so no uninitialised slot can leak into the mean.
    mae_oof = np.empty(len(fold_groups))
    for idx, group in enumerate(fold_groups):
        is_val = train.p_num.isin(group)
        X_val, y_val = features.loc[is_val], target.loc[is_val]

        model = LGBMRegressor(**lgb_params)
        model.fit(features.loc[~is_val],
                  target.loc[~is_val],
                  eval_set=[(X_val, y_val)],
                  eval_metric="mae",
                  callbacks=[early_stopping(stopping_rounds=100)])

        mae_oof[idx] = mean_absolute_error(y_val, model.predict(X_val))

    return np.mean(mae_oof)

In [None]:
# Seed the (default, TPE) sampler so that re-running the search proposes the
# same sequence of trials. Full reproducibility also requires the objective
# itself to be deterministic.
study_3_lgb = optuna.create_study(
    direction="minimize",
    study_name="LGB_3",
    sampler=optuna.samplers.TPESampler(seed=42))
study_3_lgb.optimize(objective_3_lgb, n_trials=15, show_progress_bar=True)

In [None]:
# study_3_lgb.best_params

In [None]:
# study_3_lgb.best_value

In [18]:
# Best parameters of the third LGBM study, stored explicitly because
# re-running the search does not reproduce them.
lgbm_3_params = {
    # fixed settings
    "random_state": 42,
    "verbose": -1,
    "n_estimators": 500,
    "objective": "mae",
    "learning_rate": 0.014912792408663157,
    # tuned settings
    "num_leaves": 21,
    "max_depth": 8,
    "max_bin": 140,
    "min_child_samples": 700,
    "min_split_gain": 0.1,
    "subsample": 0.9986785929006112,
    "subsample_freq": 1,
    "colsample_bytree": 0.974113126619188,
    "reg_alpha": 55,
    "reg_lambda": 42,
}

**Проверяем параметры на CV**

In [19]:
# Re-fit with the stored study-3 parameters on the patient-wise CV; keeps the
# per-fold hold-out predictions and the mean train MAE for the next cell.
predicts_test, avg_mae_train = train_lgbm_cv(**lgbm_3_params)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 1.64693
MAE fold 1 = 1.647
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[307]	valid_0's l1: 1.18681
MAE fold 2 = 1.187
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[350]	valid_0's l1: 1.30105
MAE fold 3 = 1.301
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[94]	valid_0's l1: 1.50852
MAE fold 4 = 1.509
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's l1: 1.74855
MAE fold 5 = 1.749
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 1.96094
MAE fold 6 = 1.961
---
Mean MAE val = 1.5588003525948944
Mean MAE train = 1.385722003015296
Delta

In [20]:
# Average the per-fold predictions into a single hold-out prediction.
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
print(f"MAE holdout = {mae_holdout}")
print(
    f"Delta between train and holdout = {(abs(avg_mae_train - mae_holdout) / avg_mae_train * 100):.1f} %")

# Replace any row left by a previous run of this cell instead of appending a
# duplicate, and renumber the index so row labels stay unique.
metrics = pd.concat([
    metrics[metrics['model'] != 'lgbm tune 3'],
    get_metrics_regression(y_test, holdout_preds, X_test, 'lgbm tune 3')
], ignore_index=True)
metrics

MAE holdout = 1.6383688674840295
Delta between train and holdout = 18.2 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Третья комбинация гиперпараметров оказалась еще лучше для борьбы с переобучением, чем предыдущие две. Метрика на отложенной выборке еще немного ухудшилась, что может быть связано с увеличением смещения модели при снижении дисперсии, однако в данном случае переобучение снизилось значительно, а метрика на holdout ухудшилась незначительно.

## LGBM Tune 4

In [28]:
def objective_4_lgb(trial):
    """Optuna objective for LGBM study 4: mean validation MAE over patient folds.

    Uses LightGBM's default training objective (no ``objective`` key is set)
    with 500 estimators, a fixed learning rate, wider ``num_leaves`` /
    ``max_depth`` ranges and early stopping (100 rounds) on each validation
    fold. Folds: one per patient in the module-level ``patients`` plus one
    fold that merges patients 'p06' and 'p01', all taken from the module-level
    ``train`` frame.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.

    Returns
    -------
    Mean of the per-fold validation MAE (the value the study minimises).
    """
    lgb_params = {
        # Fixed settings, passed through single-choice categoricals (which
        # also registers them among the trial's parameters).
        "random_state": trial.suggest_categorical("random_state", [42]),
        "verbose": trial.suggest_categorical("verbose", [-1]),
        "n_estimators": trial.suggest_categorical("n_estimators", [500]),
        "learning_rate": trial.suggest_categorical("learning_rate",
                                                   [0.021936391093076747]),
        # Earlier variants of the search space, kept for reference:
        #   objective: suggest_categorical(['mae', 'rmse', None])
        #   learning_rate: suggest_float(0.009, 0.025, log=True)
        # Tuned settings.
        "num_leaves": trial.suggest_int("num_leaves", 16, 30),
        "max_depth": trial.suggest_int("max_depth", 6, 12),
        "max_bin": trial.suggest_int("max_bin", 100, 250, step=10),
        "min_child_samples": trial.suggest_int("min_child_samples", 100, 1000,
                                               step=100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 0.9,
                                              step=0.1),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 3),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100)
    }

    # One group of patient ids per validation fold; the merged fold goes last.
    fold_groups = [[patient] for patient in patients] + [['p06', 'p01']]
    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Sized from the folds so no uninitialised slot can leak into the mean.
    mae_oof = np.empty(len(fold_groups))
    for idx, group in enumerate(fold_groups):
        is_val = train.p_num.isin(group)
        X_val, y_val = features.loc[is_val], target.loc[is_val]

        model = LGBMRegressor(**lgb_params)
        model.fit(features.loc[~is_val],
                  target.loc[~is_val],
                  eval_set=[(X_val, y_val)],
                  eval_metric="mae",
                  callbacks=[early_stopping(stopping_rounds=100)])

        mae_oof[idx] = mean_absolute_error(y_val, model.predict(X_val))

    return np.mean(mae_oof)

In [None]:
# Seed the (default, TPE) sampler so that re-running the search proposes the
# same sequence of trials. Full reproducibility also requires the objective
# itself to be deterministic.
study_4_lgb = optuna.create_study(
    direction="minimize",
    study_name="LGB_4",
    sampler=optuna.samplers.TPESampler(seed=42))
study_4_lgb.optimize(objective_4_lgb, n_trials=15, show_progress_bar=True)

In [None]:
# study_4_lgb.best_params

In [None]:
# study_4_lgb.best_value

In [21]:
# Best parameters of the fourth LGBM study, stored explicitly because
# re-running the search does not reproduce them.
lgbm_4_params = {
    # fixed settings
    "random_state": 42,
    "verbose": -1,
    "n_estimators": 500,
    "learning_rate": 0.021936391093076747,
    # tuned settings
    "num_leaves": 23,
    "max_depth": 6,
    "max_bin": 140,
    "min_child_samples": 1000,
    "min_split_gain": 0.0,
    "subsample": 0.8809963922091697,
    "subsample_freq": 1,
    "colsample_bytree": 0.9906721118787845,
    "reg_alpha": 13,
    "reg_lambda": 3,
}

**Проверяем параметры на CV**

In [22]:
# Re-fit with the stored study-4 parameters on the patient-wise CV; keeps the
# per-fold hold-out predictions and the mean train MAE for the next cell.
predicts_test, avg_mae_train = train_lgbm_cv(**lgbm_4_params)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[243]	valid_0's l1: 1.629	valid_0's l2: 5.10374
MAE fold 1 = 1.629
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[265]	valid_0's l1: 1.28544	valid_0's l2: 2.60165
MAE fold 2 = 1.285
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[161]	valid_0's l1: 1.32444	valid_0's l2: 3.23219
MAE fold 3 = 1.324
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 1.51122	valid_0's l2: 4.00424
MAE fold 4 = 1.511
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[499]	valid_0's l1: 1.7105	valid_0's l2: 5.12673
MAE fold 5 = 1.711
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l

In [23]:
# Average the per-fold predictions into a single hold-out prediction.
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
print(f"MAE holdout = {mae_holdout}")
print(
    f"Delta between train and holdout = {(abs(avg_mae_train - mae_holdout) / avg_mae_train * 100):.1f} %")

# Replace any row left by a previous run of this cell instead of appending a
# duplicate, and renumber the index so row labels stay unique.
metrics = pd.concat([
    metrics[metrics['model'] != 'lgbm tune 4'],
    get_metrics_regression(y_test, holdout_preds, X_test, 'lgbm tune 4')
], ignore_index=True)
metrics

MAE holdout = 1.6255045781048656
Delta between train and holdout = 19.6 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Данная комбинация гиперпараметров также позволила снизить переобучение по сравнению с бейзлайном, но оказалась хуже, чем предыдущие.

## LGBM Tune 5

In [None]:
def objective_5_lgb(trial):
    """Optuna objective for the fifth LightGBM search: mean validation MAE.

    Samples one LGBMRegressor configuration from ``trial`` and fits it once
    per validation fold with early stopping on that fold. There is one fold
    per entry of the notebook-level ``patients`` list, plus a final fold that
    merges patients ``p01`` and ``p06``. Each fit is scored by MAE on its own
    validation fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    Mean of the per-fold validation MAE values (the quantity to minimise).
    """
    # Fixed settings go through single-choice categoricals, presumably so that
    # they are reported together with the tuned values in `study.best_params`.
    lgb_params = {
        "random_state": trial.suggest_categorical("random_state", [42]),
        "verbose": trial.suggest_categorical("verbose", [-1]),
        "n_estimators": trial.suggest_categorical("n_estimators", [1000]),
        "objective": trial.suggest_categorical("objective", ['mae']),
        "learning_rate": trial.suggest_categorical("learning_rate",
                                                   [0.005483200175064182]),
        # Wider search kept for reference (objective and learning rate are
        # pinned above):
        #   "objective": suggest_categorical("objective", ['mae', 'rmse', None])
        #   "learning_rate": suggest_float("learning_rate", 0.005, 0.008, log=True)
        "num_leaves": trial.suggest_int("num_leaves", 16, 22),
        "max_depth": trial.suggest_int("max_depth", 6, 10),
        "max_bin": trial.suggest_int("max_bin", 100, 250, step=10),
        "min_child_samples": trial.suggest_int("min_child_samples", 100, 1000,
                                               step=100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 0.9,
                                              step=0.1),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 3),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100)
    }

    # One validation fold per patient, then the merged p01 + p06 fold.
    fold_masks = [train.p_num == patient for patient in patients]
    fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Scores are collected in a list, so the result never depends on a
    # pre-sized buffer or on how many entries `patients` has.
    fold_mae = []
    for val_mask in fold_masks:
        X_val, y_val = features.loc[val_mask], target.loc[val_mask]

        model = LGBMRegressor(**lgb_params)
        model.fit(features.loc[~val_mask],
                  target.loc[~val_mask],
                  eval_set=[(X_val, y_val)],
                  eval_metric="mae",
                  callbacks=[early_stopping(stopping_rounds=100)])

        fold_mae.append(mean_absolute_error(y_val, model.predict(X_val)))

    return np.mean(fold_mae)

In [None]:
# Seed the sampler explicitly so that the hyper-parameter search proposes the
# same sequence of trials on every run (TPE is also the default sampler for a
# single-objective study, so only the seeding is new).
study_5_lgb = optuna.create_study(
    direction="minimize",
    study_name="LGB_5",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_5_lgb.optimize(objective_5_lgb, n_trials=12, show_progress_bar=True)

In [None]:
# study_5_lgb.best_params

In [None]:
# study_5_lgb.best_value

In [24]:
# LightGBM configuration no. 5, hard-coded so the notebook can be re-run
# without repeating the Optuna search.
lgbm_5_params = dict(
    random_state=42,
    verbose=-1,
    n_estimators=1000,
    objective='mae',
    learning_rate=0.005483200175064182,
    num_leaves=22,
    max_depth=8,
    max_bin=170,
    min_child_samples=600,
    min_split_gain=0.9,
    subsample=0.8104803788268242,
    subsample_freq=2,
    colsample_bytree=0.9987695679080075,
    reg_alpha=44,
    reg_lambda=2,
)

In [25]:
# Patient-wise cross-validation with configuration no. 5; the helper returns
# the per-fold hold-out predictions and the mean train MAE.
lgbm_5_cv = train_lgbm_cv(**lgbm_5_params)
predicts_test, avg_mae_train = lgbm_5_cv

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's l1: 1.65133
MAE fold 1 = 1.651
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[818]	valid_0's l1: 1.18961
MAE fold 2 = 1.190
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[839]	valid_0's l1: 1.30086
MAE fold 3 = 1.301
---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[260]	valid_0's l1: 1.50924
MAE fold 4 = 1.509
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.75208
MAE fold 5 = 1.752
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.97032
MAE fold 6 = 1.970
---
Mean MAE val = 1.5622397270327497
Mean MAE train = 1.395235387404081
De

In [26]:
# Blend the per-fold models: average their hold-out predictions row by row.
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
# Relative gap between the mean train MAE and the hold-out MAE, in percent.
delta_holdout = abs(avg_mae_train - mae_holdout) / avg_mae_train * 100
print(f"MAE holdout = {mae_holdout}")
print(f"Delta between train and holdout = {delta_holdout:.1f} %")

# Remove a row left over from an earlier run of this cell so that re-running
# does not append a duplicate, and renumber the rows so that index labels stay
# unique (a plain concat repeats label 0 for every appended row).
row_name = 'lgbm tune 5'
metrics = pd.concat(
    [
        metrics[metrics['model'] != row_name],
        get_metrics_regression(y_test, holdout_preds, X_test, row_name),
    ],
    ignore_index=True,
)
metrics

MAE holdout = 1.6441050178052556
Delta between train and holdout = 17.8 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Данная комбинация гиперпараметров оказалась лучше всех предыдущих, так как способствовала наибольшему снижению переобучения. Метрика на отложенной выборке хуже, но незначительно, прирост устойчивости модели к переобучению гораздо больше и значительнее, а также важнее в рамках задачи.

# Tuning CatBoostRegressor

In [35]:
def objective_cb(trial):
    """Optuna objective for CatBoostRegressor: mean validation MAE over folds.

    Samples one CatBoostRegressor configuration from ``trial`` and fits it
    once per validation fold with early stopping on that fold. There is one
    fold per entry of the notebook-level ``patients`` list, plus a final fold
    that merges patients ``p01`` and ``p06``. Each fit is scored by MAE on its
    own validation fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    Mean of the per-fold validation MAE values (the quantity to minimise).
    """
    # Fixed settings go through single-choice categoricals, presumably so that
    # they are reported together with the tuned values in `study.best_params`.
    cb_params = {
        "iterations": trial.suggest_categorical("iterations", [1000]),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "eval_metric": trial.suggest_categorical("eval_metric", ['MAE']),
        "loss_function": trial.suggest_categorical("loss_function", ['MAE']),
        "learning_rate": trial.suggest_categorical("learning_rate",
                                                   [0.029999999329447743]),
        # Wider search kept for reference (loss and learning rate are pinned
        # above):
        #   "loss_function": suggest_categorical("loss_function", ['MAE', 'RMSE'])
        #   "learning_rate": suggest_float("learning_rate", 0.025, 0.035, log=True)
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 3, 5, step=2),
        "border_count": trial.suggest_int("border_count", 230, 250, step=10)
    }

    # No pruning callback is created here: the previous one was a LightGBM
    # callback that was never passed to CatBoost.

    # One validation fold per patient, then the merged p01 + p06 fold.
    fold_masks = [train.p_num == patient for patient in patients]
    fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

    features = train.drop(['p_num', 'target'], axis=1)
    target = train['target']

    # Scores are collected in a list, so the result never depends on a
    # pre-sized buffer or on how many entries `patients` has.
    fold_mae = []
    for val_mask in fold_masks:
        X_val, y_val = features.loc[val_mask], target.loc[val_mask]

        model = CatBoostRegressor(**cb_params)
        model.fit(features.loc[~val_mask],
                  target.loc[~val_mask],
                  eval_set=[(X_val, y_val)],
                  verbose=False,
                  early_stopping_rounds=100)

        fold_mae.append(mean_absolute_error(y_val, model.predict(X_val)))

    return np.mean(fold_mae)

In [None]:
# Seed the sampler explicitly so that the hyper-parameter search proposes the
# same sequence of trials on every run (TPE is also the default sampler for a
# single-objective study, so only the seeding is new).
study_cb = optuna.create_study(
    direction="minimize",
    study_name="CatBoost",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cb.optimize(objective_cb, n_trials=5, show_progress_bar=True)

In [None]:
# study_cb.best_params

In [None]:
# study_cb.best_value

In [27]:
# Tuned CatBoost configuration, hard-coded so the notebook can be re-run
# without repeating the Optuna search.
catboost_params = dict(
    iterations=1000,
    random_state=42,
    eval_metric='MAE',
    loss_function='MAE',
    learning_rate=0.029999999329447743,
    l2_leaf_reg=5,
    border_count=240,
)

In [28]:
# Patient-wise cross-validation with the tuned CatBoost configuration.
# NOTE(review): train_catboost_cv presumably mirrors train_lgbm_cv and returns
# (per-fold hold-out predictions, mean train MAE) — confirm against its definition.
catboost_cv = train_catboost_cv(**catboost_params)
predicts_test, avg_mae_train = catboost_cv

MAE fold 1 = 1.711
---
MAE fold 2 = 1.156
---
MAE fold 3 = 1.312
---
MAE fold 4 = 1.512
---
MAE fold 5 = 1.744
---
MAE fold 6 = 1.951
---
Mean MAE val = 1.5641584170670286
Mean MAE train = 1.3929251340837556
Delta between train and val = 12.3 %


In [29]:
# Blend the per-fold models: average their hold-out predictions row by row.
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
# Relative gap between the mean train MAE and the hold-out MAE, in percent.
delta_holdout = abs(avg_mae_train - mae_holdout) / avg_mae_train * 100
print(f"MAE holdout = {mae_holdout}")
print(f"Delta between train and holdout = {delta_holdout:.1f} %")

# Remove a row left over from an earlier run of this cell so that re-running
# does not append a duplicate, and renumber the rows so that index labels stay
# unique (a plain concat repeats label 0 for every appended row).
row_name = 'catboost tune'
metrics = pd.concat(
    [
        metrics[metrics['model'] != row_name],
        get_metrics_regression(y_test, holdout_preds, X_test, row_name),
    ],
    ignore_index=True,
)
metrics

MAE holdout = 1.6637519839226906
Delta between train and holdout = 19.4 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Тюнинг Catboost способствовал очень значительному снижению переобучения: у baseline Catboost с кросс-валидацией разница между метриками на трейне и тесте была 27.9%, а стала 19.4%; разница между метриками на трейне и кросс-валидации была 22.3%, а стала 12.3%. По проценту переобучения на кросс-валидации Catboost с подобранными гиперпараметрами - одна из лучших моделей наряду с LightGBM с 3-й и 5-й комбинациями гиперпараметров. Однако метрика на отложенной выборке у Catboost хуже, чем у LightGBM.

**Вывод по качеству моделей после тюнинга:** лучшие модели, наиболее устойчивые к переобучению, - это LightGBM с 3-й и 5-й комбинациями гиперпараметров: у них самый низкий процент переобучения на валидационных и тестовых данных и очень незначительное ухудшение метрик на отложенной выборке, которое, вероятно, связано с bias-variance trade-off. Catboost с подобранными гиперпараметрами почти так же хорош в плане переобучения, но метрика на holdout немного хуже, чем у LightGBM.

# Stacking

## Stacking 3 models

Попробуем сделать стэкинг над тремя лучшими моделями - это LightGBM с параметрами lgbm_3_params, LightGBM с параметрами lgbm_5_params и Catboost (у Catboost результаты немного хуже, чем у первых двух моделей, но лучше добавить его в стэкинг для повышения разнородности базовых моделей).

LightGBM lgbm_3_params:

In [30]:
# Meta-feature frames for the stacking ensemble: out-of-fold predictions for
# the training rows and full-fit predictions for the test rows.
meta_X = pd.DataFrame()
meta_X_test = pd.DataFrame()

# Validation folds: one per patient in `patients`, then a final fold that
# merges patients p01 and p06.
fold_masks = [train.p_num == patient for patient in patients]
fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

fold_features = train.drop(['p_num', 'target'], axis=1)
fold_target = train['target']

# Out-of-fold predictions are written back at the row positions of their fold,
# so the meta-feature lines up with `train` whatever its physical row order
# (concatenating fold by fold only lines up when rows are grouped by fold).
# NOTE(review): assumes y_train is train['target'] in train's row order — confirm.
oof_pred = np.full(len(train), np.nan)
for val_mask in fold_masks:
    model = LGBMRegressor(**lgbm_3_params)
    model.fit(
        fold_features.loc[~val_mask],
        fold_target.loc[~val_mask],
        eval_set=[(fold_features.loc[val_mask], fold_target.loc[val_mask])],
        eval_metric="mae",
        callbacks=[early_stopping(stopping_rounds=100)])
    oof_pred[val_mask.to_numpy()] = model.predict(fold_features.loc[val_mask])

assert not np.isnan(oof_pred).any(), "some training rows belong to no fold"

# Refit on the full training set to produce the test-side meta-feature.
model.fit(X_train, y_train)

# First meta-feature: LightGBM with lgbm_3_params.
meta_X['lgbm_1'] = oof_pred
meta_X_test['lgbm_1'] = model.predict(X_test)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 1.64693
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[307]	valid_0's l1: 1.18681
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[350]	valid_0's l1: 1.30105
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[94]	valid_0's l1: 1.50852
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's l1: 1.74855
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 1.96094


Добавляем в стэкинг LightGBM с lgbm_5_params:

In [31]:
# Validation folds: one per patient in `patients`, then a final fold that
# merges patients p01 and p06.
fold_masks = [train.p_num == patient for patient in patients]
fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

fold_features = train.drop(['p_num', 'target'], axis=1)
fold_target = train['target']

# Out-of-fold predictions are written back at the row positions of their fold,
# so the meta-feature lines up with `train` whatever its physical row order
# (concatenating fold by fold only lines up when rows are grouped by fold).
# NOTE(review): assumes y_train is train['target'] in train's row order — confirm.
oof_pred = np.full(len(train), np.nan)
for val_mask in fold_masks:
    model = LGBMRegressor(**lgbm_5_params)
    model.fit(
        fold_features.loc[~val_mask],
        fold_target.loc[~val_mask],
        eval_set=[(fold_features.loc[val_mask], fold_target.loc[val_mask])],
        eval_metric="mae",
        callbacks=[early_stopping(stopping_rounds=100)])
    oof_pred[val_mask.to_numpy()] = model.predict(fold_features.loc[val_mask])

assert not np.isnan(oof_pred).any(), "some training rows belong to no fold"

# Refit on the full training set to produce the test-side meta-feature.
model.fit(X_train, y_train)

# Second meta-feature: LightGBM with lgbm_5_params.
meta_X['lgbm_2'] = oof_pred
meta_X_test['lgbm_2'] = model.predict(X_test)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's l1: 1.65133
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[818]	valid_0's l1: 1.18961
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[839]	valid_0's l1: 1.30086
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[260]	valid_0's l1: 1.50924
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.75208
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 1.97032


Добавляем Catboost:

In [32]:
# Validation folds: one per patient in `patients`, then a final fold that
# merges patients p01 and p06.
fold_masks = [train.p_num == patient for patient in patients]
fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

fold_features = train.drop(['p_num', 'target'], axis=1)
fold_target = train['target']

# Out-of-fold predictions are written back at the row positions of their fold,
# so the meta-feature lines up with `train` whatever its physical row order
# (concatenating fold by fold only lines up when rows are grouped by fold).
# NOTE(review): assumes y_train is train['target'] in train's row order — confirm.
oof_pred = np.full(len(train), np.nan)
for val_mask in fold_masks:
    model = CatBoostRegressor(**catboost_params)
    model.fit(fold_features.loc[~val_mask],
              fold_target.loc[~val_mask],
              eval_set=[(fold_features.loc[val_mask],
                         fold_target.loc[val_mask])],
              verbose=False,
              early_stopping_rounds=100)
    oof_pred[val_mask.to_numpy()] = model.predict(fold_features.loc[val_mask])

assert not np.isnan(oof_pred).any(), "some training rows belong to no fold"

# Refit on the full training set to produce the test-side meta-feature.
model.fit(X_train, y_train, verbose=False)

# Third meta-feature: CatBoost with catboost_params.
meta_X['catboost'] = oof_pred
meta_X_test['catboost'] = model.predict(X_test)

**Финальная мета-модель:**

In [33]:
# Final meta-model: a plain linear regression over the base-model predictions.
meta_model = LinearRegression()
meta_model.fit(meta_X, y_train)

y_pred_train = meta_model.predict(meta_X)
y_pred_test = meta_model.predict(meta_X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
# Relative gap between train and test MAE, in percent of the train MAE.
delta_pct = abs(mae_train - mae_test) / mae_train * 100

print(f"MAE train = {mae_train}")
print(f"MAE test = {mae_test}")
print(f"Delta = {delta_pct:.1f} %")

# Remove a row left over from an earlier run of this cell so that re-running
# does not append a duplicate, and renumber the rows so that index labels stay
# unique (a plain concat repeats label 0 for every appended row).
row_name = 'stacking 3 models'
metrics = pd.concat(
    [
        metrics[metrics['model'] != row_name],
        get_metrics_regression(y_test, y_pred_test, X_test, row_name),
    ],
    ignore_index=True,
)
metrics

MAE train = 1.5284184280620579
MAE test = 1.6036795141893496
Delta = 4.9 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Стэкинг трех лучших моделей позволил очень сильно снизить переобучение, а также улучшить метрику на отложенной выборке.

## Stacking 4 models

Попробуем добавить в стэкинг еще одну, 4-ю модель (LightGBM с lgbm_2_params):

In [34]:
# Validation folds: one per patient in `patients`, then a final fold that
# merges patients p01 and p06.
fold_masks = [train.p_num == patient for patient in patients]
fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

fold_features = train.drop(['p_num', 'target'], axis=1)
fold_target = train['target']

# Out-of-fold predictions are written back at the row positions of their fold,
# so the meta-feature lines up with `train` whatever its physical row order
# (concatenating fold by fold only lines up when rows are grouped by fold).
# NOTE(review): assumes y_train is train['target'] in train's row order — confirm.
oof_pred = np.full(len(train), np.nan)
for val_mask in fold_masks:
    model = LGBMRegressor(**lgbm_2_params)
    model.fit(
        fold_features.loc[~val_mask],
        fold_target.loc[~val_mask],
        eval_set=[(fold_features.loc[val_mask], fold_target.loc[val_mask])],
        eval_metric="mae",
        callbacks=[early_stopping(stopping_rounds=100)])
    oof_pred[val_mask.to_numpy()] = model.predict(fold_features.loc[val_mask])

assert not np.isnan(oof_pred).any(), "some training rows belong to no fold"

# Refit on the full training set to produce the test-side meta-feature.
model.fit(X_train, y_train)

# Fourth meta-feature. The column is named 'lgbm_3' because it is the third
# LightGBM column; the model itself uses lgbm_2_params.
meta_X['lgbm_3'] = oof_pred
meta_X_test['lgbm_3'] = model.predict(X_test)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[94]	valid_0's l1: 1.63002	valid_0's l2: 5.10138
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.29186	valid_0's l2: 2.63065
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[81]	valid_0's l1: 1.31681	valid_0's l2: 3.19354
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[98]	valid_0's l1: 1.543	valid_0's l2: 4.1296
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[98]	valid_0's l1: 1.70939	valid_0's l2: 5.11418
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's l1: 1.96795	valid_0's l2: 6.59215


In [35]:
# Refit the linear meta-model on the current set of meta-features.
meta_model = LinearRegression()
meta_model.fit(meta_X, y_train)

y_pred_train = meta_model.predict(meta_X)
y_pred_test = meta_model.predict(meta_X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
# Relative gap between train and test MAE, in percent of the train MAE.
delta_pct = abs(mae_train - mae_test) / mae_train * 100

print(f"MAE train = {mae_train}")
print(f"MAE test = {mae_test}")
print(f"Delta = {delta_pct:.1f} %")

# Remove a row left over from an earlier run of this cell so that re-running
# does not append a duplicate, and renumber the rows so that index labels stay
# unique (a plain concat repeats label 0 for every appended row).
row_name = 'stacking 4 models'
metrics = pd.concat(
    [
        metrics[metrics['model'] != row_name],
        get_metrics_regression(y_test, y_pred_test, X_test, row_name),
    ],
    ignore_index=True,
)
metrics

MAE train = 1.526122957506064
MAE test = 1.6031152346539208
Delta = 5.0 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Добавление 4-й модели не повлияло на переобучение, но совсем немного улучшило метрику на отложенной выборке.

## Stacking 5 models

Попробуем добавить в стэкинг 5-ю модель (с lgbm_1_params):

In [36]:
# Validation folds: one per patient in `patients`, then a final fold that
# merges patients p01 and p06.
fold_masks = [train.p_num == patient for patient in patients]
fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

fold_features = train.drop(['p_num', 'target'], axis=1)
fold_target = train['target']

# Out-of-fold predictions are written back at the row positions of their fold,
# so the meta-feature lines up with `train` whatever its physical row order
# (concatenating fold by fold only lines up when rows are grouped by fold).
# NOTE(review): assumes y_train is train['target'] in train's row order — confirm.
oof_pred = np.full(len(train), np.nan)
for val_mask in fold_masks:
    model = LGBMRegressor(**lgbm_1_params)
    model.fit(
        fold_features.loc[~val_mask],
        fold_target.loc[~val_mask],
        eval_set=[(fold_features.loc[val_mask], fold_target.loc[val_mask])],
        eval_metric="mae",
        callbacks=[early_stopping(stopping_rounds=100)])
    oof_pred[val_mask.to_numpy()] = model.predict(fold_features.loc[val_mask])

assert not np.isnan(oof_pred).any(), "some training rows belong to no fold"

# Refit on the full training set to produce the test-side meta-feature.
model.fit(X_train, y_train)

# Fifth meta-feature. The column is named 'lgbm_4' because it is the fourth
# LightGBM column; the model itself uses lgbm_1_params.
meta_X['lgbm_4'] = oof_pred
meta_X_test['lgbm_4'] = model.predict(X_test)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.65099
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[81]	valid_0's l1: 1.18964
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[95]	valid_0's l1: 1.29595
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.49862
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's l1: 1.75268
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.98058


In [37]:
# Refit the linear meta-model on the current set of meta-features.
meta_model = LinearRegression()
meta_model.fit(meta_X, y_train)

y_pred_train = meta_model.predict(meta_X)
y_pred_test = meta_model.predict(meta_X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
# Relative gap between train and test MAE, in percent of the train MAE.
delta_pct = abs(mae_train - mae_test) / mae_train * 100

print(f"MAE train = {mae_train}")
print(f"MAE test = {mae_test}")
print(f"Delta = {delta_pct:.1f} %")

# Remove a row left over from an earlier run of this cell so that re-running
# does not append a duplicate, and renumber the rows so that index labels stay
# unique (a plain concat repeats label 0 for every appended row).
row_name = 'stacking 5 models'
metrics = pd.concat(
    [
        metrics[metrics['model'] != row_name],
        get_metrics_regression(y_test, y_pred_test, X_test, row_name),
    ],
    ignore_index=True,
)
metrics

MAE train = 1.5262357635842605
MAE test = 1.6027954248888743
Delta = 5.0 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Добавление 5-й модели еще немного улучшило метрику на отложенной выборке.

## Stacking 6 models

Попробуем добавить в стэкинг 6-ю модель (с lgbm_4_params):

In [38]:
# Validation folds: one per patient in `patients`, then a final fold that
# merges patients p01 and p06.
fold_masks = [train.p_num == patient for patient in patients]
fold_masks.append((train.p_num == 'p06') | (train.p_num == 'p01'))

fold_features = train.drop(['p_num', 'target'], axis=1)
fold_target = train['target']

# Out-of-fold predictions are written back at the row positions of their fold,
# so the meta-feature lines up with `train` whatever its physical row order
# (concatenating fold by fold only lines up when rows are grouped by fold).
# NOTE(review): assumes y_train is train['target'] in train's row order — confirm.
oof_pred = np.full(len(train), np.nan)
for val_mask in fold_masks:
    model = LGBMRegressor(**lgbm_4_params)
    model.fit(
        fold_features.loc[~val_mask],
        fold_target.loc[~val_mask],
        eval_set=[(fold_features.loc[val_mask], fold_target.loc[val_mask])],
        eval_metric="mae",
        callbacks=[early_stopping(stopping_rounds=100)])
    oof_pred[val_mask.to_numpy()] = model.predict(fold_features.loc[val_mask])

assert not np.isnan(oof_pred).any(), "some training rows belong to no fold"

# Refit on the full training set to produce the test-side meta-feature.
model.fit(X_train, y_train)

# Sixth meta-feature. The column is named 'lgbm_5' because it is the fifth
# LightGBM column; the model itself uses lgbm_4_params.
meta_X['lgbm_5'] = oof_pred
meta_X_test['lgbm_5'] = model.predict(X_test)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[243]	valid_0's l1: 1.629	valid_0's l2: 5.10374
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[265]	valid_0's l1: 1.28544	valid_0's l2: 2.60165
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[161]	valid_0's l1: 1.32444	valid_0's l2: 3.23219
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 1.51122	valid_0's l2: 4.00424
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[499]	valid_0's l1: 1.7105	valid_0's l2: 5.12673
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 1.9517	valid_0's l2: 6.52526


In [39]:
# Refit the linear meta-model on the current set of meta-features.
meta_model = LinearRegression()
meta_model.fit(meta_X, y_train)

y_pred_train = meta_model.predict(meta_X)
y_pred_test = meta_model.predict(meta_X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
# Relative gap between train and test MAE, in percent of the train MAE.
delta_pct = abs(mae_train - mae_test) / mae_train * 100

print(f"MAE train = {mae_train}")
print(f"MAE test = {mae_test}")
print(f"Delta = {delta_pct:.1f} %")

# Remove a row left over from an earlier run of this cell so that re-running
# does not append a duplicate, and renumber the rows so that index labels stay
# unique (a plain concat repeats label 0 for every appended row).
row_name = 'stacking 6 models'
metrics = pd.concat(
    [
        metrics[metrics['model'] != row_name],
        get_metrics_regression(y_test, y_pred_test, X_test, row_name),
    ],
    ignore_index=True,
)
metrics

MAE train = 1.5241357588111524
MAE test = 1.594744789173618
Delta = 4.6 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
1,baseline ridge,1.644019,2.196696,0.505629,19.398459
2,baseline lgbm,1.623072,2.16294,0.520706,19.151295
3,baseline catboost,1.645628,2.206858,0.501044,19.417439
4,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
5,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
6,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
0,lgbm tune 1,1.629734,2.192108,0.507692,19.229901
0,lgbm tune 2,1.632234,2.165808,0.519434,19.259403
0,lgbm tune 3,1.638369,2.201705,0.503372,19.331786


Добавление 6-й модели в ансамбль способствовало небольшому дополнительному снижению переобучения и улучшению метрики на отложенной выборке.

In [42]:
# Use the model name as the row index of the comparison table.
# The guard makes the cell safe to re-run: once 'model' is already the index
# it is no longer a column, and an unconditional set_index raises KeyError.
if 'model' in metrics.columns:
    metrics = metrics.set_index('model')
metrics

Unnamed: 0_level_0,MAE,RMSE,R2 adjusted,WAPE_%
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline lasso,1.759607,2.310597,0.453032,20.762326
baseline ridge,1.644019,2.196696,0.505629,19.398459
baseline lgbm,1.623072,2.16294,0.520706,19.151295
baseline catboost,1.645628,2.206858,0.501044,19.417439
baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
baseline catboost with CV,1.647048,2.186179,0.510351,19.434197
lgbm tune 1,1.629734,2.192108,0.507692,19.229901
lgbm tune 2,1.632234,2.165808,0.519434,19.259403
lgbm tune 3,1.638369,2.201705,0.503372,19.331786


In [50]:
# Persist the final comparison table next to the notebook. The DataFrame index
# is written as the first CSV column (to_csv default).
metrics.to_csv(path_or_buf="metrics.csv")

# Выводы

- Лучшая модель после подбора гиперпараметров - это LightGBM с 5-й комбинацией гиперпараметров (lgbm_5_params). Эта модель переобучается значительно меньше, чем бейзлайн, то есть с помощью подбора гиперпараметров удалось улучшить обобщающую способность модели. Метрика на отложенной выборке немного ухудшилась, что, вероятно, обусловлено увеличением смещения модели при снижении ее дисперсии, однако это ухудшение незначительное. 
- Стэкинг 6 моделей (5 LGBM с различными гиперпараметрами и 1 Catboost с подобранными гиперпараметрами) способствовал еще большему снижению переобучения. Также с помощью стэкинга удалось добиться наилучшего значения метрики на отложенной выборке. 
- Поставленную цель - построить модель, обучающуюся на данных одних пациентов и выдающую релевантные предсказания на данных новых пациентов - можно считать выполненной. Однако при снижении дисперсии модели сильно пострадало ее смещение, о чем говорит ухудшение метрик на тренировочной выборке. 
- Модель можно улучшить сбором большего объема данных (от большего числа пациентов), а также улучшением качества самих данных (в использованных данных было очень много пропущенных значений, некоторые признаки пришлось удалить из-за огромного количества пропусков).