In [4]:
import re

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Lasso, Ridge

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor, early_stopping

In [5]:
def r2_adjusted(y_true: np.ndarray, y_pred: np.ndarray,
                X_test: np.ndarray) -> float:
    """Adjusted coefficient of determination (multiple regression).

    Takes the plain R^2 returned by ``r2_score`` and penalises it for the
    number of predictors:

        1 - (1 - R^2) * (n - 1) / (n - p - 1)

    where ``n = len(y_true)`` and ``p = X_test.shape[1]``.

    Args:
        y_true: observed target values.
        y_pred: predicted target values.
        X_test: feature matrix the predictions were made on; only its
            second dimension (number of features) is used.

    Returns:
        The adjusted R^2 score.
    """
    n_obs, n_feat = len(y_true), X_test.shape[1]
    unexplained = 1 - r2_score(y_true, y_pred)
    return 1 - unexplained * (n_obs - 1) / (n_obs - n_feat - 1)

def wape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Weighted Absolute Percentage Error, expressed in percent.

        WAPE = 100 * sum(|y_pred - y_true|) / sum(|y_true|)

    The denominator sums absolute values, so targets that are negative or
    of mixed sign cannot flip the sign of the result or shrink the
    denominator. For strictly positive targets the value is the same as
    with a plain sum.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and pandas objects are compared position by position rather
    than being aligned on their index labels.

    Args:
        y_true: observed target values.
        y_pred: predicted target values, same length as ``y_true``.

    Returns:
        WAPE in percent. If ``sum(|y_true|)`` is zero the division follows
        NumPy semantics (``inf`` or ``nan`` with a runtime warning).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    total_abs_error = np.sum(np.abs(y_pred - y_true))
    return total_abs_error / np.sum(np.abs(y_true)) * 100

def get_metrics_regression(y_test: np.ndarray,
                           y_pred: np.ndarray,
                           X_test: np.ndarray,
                           name: str = None) -> pd.DataFrame:
    """Build a one-row table of regression metrics for a single model.

    Args:
        y_test: observed target values.
        y_pred: predicted target values.
        X_test: feature matrix the predictions were made on (passed to
            ``r2_adjusted`` for the feature count).
        name: label stored in the ``model`` column.

    Returns:
        A single-row DataFrame with the columns ``model``, ``MAE``,
        ``RMSE``, ``R2 adjusted`` and ``WAPE_%``.
    """
    scores = {
        'model': [name],
        'MAE': [mean_absolute_error(y_test, y_pred)],
        # RMSE is the square root of the mean squared error
        'RMSE': [np.sqrt(mean_squared_error(y_test, y_pred))],
        'R2 adjusted': [r2_adjusted(y_test, y_pred, X_test)],
        'WAPE_%': [wape(y_test, y_pred)],
    }
    return pd.DataFrame(scores)

In [6]:
# Read data_clean.csv (path relative to the notebook) and preview the first rows.
df_clean = pd.read_csv("./data/data_clean.csv")
df_clean.head()

Unnamed: 0,id,p_num,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,cals-0:35,cals-0:30,cals-0:25,cals-0:20,cals-0:15,cals-0:10,cals-0:05,cals-0:00,bg+1:00,hour
0,p01_0,p01,7.82653,13.8,9.6,7.893227,13.4,9.7,10.15,12.8,...,5.49,5.49,5.68,5.49,5.49,5.68,5.49,5.49,13.4,6
1,p01_1,p01,7.82653,13.8,9.7,7.893227,13.4,9.2,10.15,12.8,...,5.49,5.49,5.68,5.49,5.49,5.68,5.49,5.49,12.8,6
2,p01_2,p01,7.82653,13.8,9.2,7.893227,13.4,8.7,10.15,12.8,...,5.49,5.49,5.68,5.49,5.49,5.68,5.49,5.49,15.5,7
3,p01_3,p01,7.82653,13.8,8.7,7.893227,13.4,8.4,10.15,12.8,...,5.49,5.49,5.68,5.49,5.49,5.68,5.49,5.49,14.8,7
4,p01_4,p01,7.82653,13.8,8.4,7.893227,13.4,8.1,10.15,12.8,...,5.49,5.49,5.68,5.49,5.49,5.68,5.49,5.49,12.7,7


# Постановка задачи

Я не стала рассматривать данную задачу как задачу на временные ряды, так как тогда пришлось бы строить отдельную модель для каждого участника исследования, ведь у каждого пациента отдельный временной ряд. Это была бы крутая задача с точки зрения персонализированной медицины, но в рамках данного проекта стоит цель построить одну модель по всем данным. Поэтому временную структуру данных я учитывала при обработке данных (при заполнении пропущенных значений), а дальше решала задачу обычной регрессии. \
При разделении данных на тренировочные и тестовые логично определить нескольких участников в тест, а остальных оставить в трейне, так как тестировать качество модели на тех же пациентах, на которых она обучалась, кажется нечестным: с точки зрения реального применения модели логично либо строить отдельную модель для каждого пациента (задача персонализированной медицины), либо строить модель, которая, обучившись на одних пациентах, сможет выдавать релевантные предсказания на новых. Так как стоит задача построить одну модель на всех данных, я выбрала второй подход. \
Понятно, что задача сложная и модель скорее всего получится переобученной: так как все люди разные, сложно давать предсказания на новых пациентах, которых модель еще не видела. В реальности (судя по литературным данным) подобные задачи решаются построением отдельной модели для каждого пациента. В рамках данного проекта стоит задача построить на данных от одних пациентов модель, которая обладает приемлемой обобщающей способностью и выдает релевантные прогнозы на данных от новых пациентов.
Разбиение на трейн и тест можно спокойно делать после предобработки данных, так как при заполнении пропущенных значений я все равно группировала данные по пациентам.

# Разделение данных на train и test

In [7]:
# Rename the columns: LightGBM (used below) does not accept some special
# characters in feature names, so every non-word character is replaced
# with an underscore. The prediction target 'bg+1:00' is renamed to
# 'target' first; otherwise the regex would turn it into 'bg_1_00'.
# `re` is imported in the imports cell at the top of the notebook.
df_clean = (
    df_clean
    .rename(columns={'bg+1:00': 'target'})
    .rename(columns=lambda col: re.sub(r'\W', '_', col))
)
df_clean.columns

Index(['id', 'p_num', 'bg_5_55', 'bg_5_50', 'bg_5_45', 'bg_5_40', 'bg_5_35',
       'bg_5_30', 'bg_5_25', 'bg_5_20',
       ...
       'cals_0_35', 'cals_0_30', 'cals_0_25', 'cals_0_20', 'cals_0_15',
       'cals_0_10', 'cals_0_05', 'cals_0_00', 'target', 'hour'],
      dtype='object', length=364)

In [8]:
# Number of rows per patient (p_num) in the full dataset.
df_clean.p_num.value_counts()

p_num
p03    26028
p02    25872
p10    25454
p12    25299
p04    24686
p11    24555
p01     8459
p06     8383
p05     8288
Name: count, dtype: int64

Количество данных по пациентам различается, есть "большие" пациенты, а есть "маленькие". Если отправить в holdout одного "большого" пациента и одного "маленького", то размер теста получится примерно 20%, что соответствует общепринятым стандартам.

In [9]:
# Hold out patient p03 (the largest one) and one small patient, e.g. p05,
# as the test set.
test_patients = ['p03', 'p05']

# A single concat over the per-patient slices keeps the rows grouped in the
# order of `test_patients`. It avoids re-copying a growing frame on every
# iteration and avoids concatenating onto an empty DataFrame.
test = pd.concat(
    [df_clean[df_clean.p_num == p] for p in test_patients], axis=0)
test

Unnamed: 0,id,p_num,bg_5_55,bg_5_50,bg_5_45,bg_5_40,bg_5_35,bg_5_30,bg_5_25,bg_5_20,...,cals_0_35,cals_0_30,cals_0_25,cals_0_20,cals_0_15,cals_0_10,cals_0_05,cals_0_00,target,hour
34331,p03_0,p03,6.500000,6.6,6.7,6.7,6.6,6.5,6.4,6.4,...,5.84,5.84,5.84,5.84,5.84,5.84,5.84,5.84,7.0,6
34332,p03_1,p03,6.600000,6.7,6.7,6.6,6.5,6.4,6.4,6.3,...,5.84,5.84,5.84,5.84,5.84,5.84,5.84,5.84,7.0,6
34333,p03_2,p03,6.700000,6.7,6.6,6.5,6.4,6.4,6.3,6.3,...,5.84,5.84,5.84,5.84,5.84,5.84,5.84,5.84,6.9,6
34334,p03_3,p03,6.700000,6.6,6.5,6.4,6.4,6.3,6.3,6.3,...,5.84,5.84,5.84,5.84,5.84,5.84,5.84,5.84,6.8,6
34335,p03_4,p03,6.600000,6.5,6.4,6.4,6.3,6.3,6.3,6.3,...,5.84,5.84,5.84,5.84,5.84,5.84,5.84,5.84,6.8,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93328,p05_8283,p05,8.045082,5.2,7.1,8.6,4.8,6.6,8.2,5.6,...,4.70,4.70,4.70,6.41,4.80,5.83,4.70,4.70,5.9,23
93329,p05_8284,p05,8.045082,4.8,7.1,8.6,5.6,6.6,8.2,5.7,...,6.41,4.80,5.83,4.70,4.70,4.70,8.96,11.33,6.1,23
93330,p05_8285,p05,8.045082,5.6,7.1,8.6,5.7,6.6,8.2,5.3,...,4.70,4.70,4.70,8.96,11.33,10.48,9.62,4.80,6.1,23
93331,p05_8286,p05,8.045082,5.7,7.1,8.6,5.3,6.6,8.2,5.3,...,8.96,11.33,10.48,9.62,4.80,4.70,8.10,4.80,5.7,0


In [10]:
# Fraction of all rows that ended up in the hold-out (test) set.
test.shape[0] / df_clean.shape[0]

0.19384942154736082

In [11]:
# All remaining patients form the training set; the order of this list
# defines the row order of `train`.
train_patients = ['p02', 'p10', 'p12', 'p04', 'p11', 'p01', 'p06']

# Guard against leakage: no patient may appear in both splits.
assert set(train_patients).isdisjoint(test_patients), \
    "a patient is assigned to both train and test"

# A single concat over the per-patient slices (same row order as appending
# patient by patient, without re-copying a growing frame).
train = pd.concat(
    [df_clean[df_clean.p_num == p] for p in train_patients], axis=0)

# Every row of the dataset must land in exactly one of the two splits.
assert len(train) + len(test) == len(df_clean), \
    "train and test together do not cover all rows of df_clean"
train

Unnamed: 0,id,p_num,bg_5_55,bg_5_50,bg_5_45,bg_5_40,bg_5_35,bg_5_30,bg_5_25,bg_5_20,...,cals_0_35,cals_0_30,cals_0_25,cals_0_20,cals_0_15,cals_0_10,cals_0_05,cals_0_00,target,hour
8459,p02_0,p02,9.500000,9.600000,9.6,9.900000,10.100000,10.0,10.000000,10.100000,...,2.50,2.51,2.505,2.51,2.505,2.50,2.51,2.51,7.1,6
8460,p02_1,p02,9.600000,9.600000,9.9,10.100000,10.000000,10.0,10.100000,10.200000,...,2.50,2.51,2.505,2.51,2.505,2.50,2.51,2.51,7.0,6
8461,p02_2,p02,9.600000,9.900000,10.1,10.000000,10.000000,10.1,10.200000,10.200000,...,2.50,2.51,2.505,2.51,2.505,2.50,2.51,2.51,7.2,6
8462,p02_3,p02,9.900000,10.100000,10.0,10.000000,10.100000,10.2,10.200000,10.300000,...,2.50,2.51,2.505,2.51,2.505,2.50,2.51,2.51,7.4,6
8463,p02_4,p02,10.100000,10.000000,10.0,10.100000,10.200000,10.2,10.300000,10.300000,...,2.50,2.51,2.505,2.51,2.505,2.50,2.51,2.51,7.5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101711,p06_8378,p06,9.430682,11.600000,8.2,9.316949,7.936487,9.2,9.803134,7.772222,...,6.15,6.15,6.150,6.15,6.150,6.15,6.15,6.15,12.4,20
101712,p06_8379,p06,9.430682,8.015068,9.2,9.316949,7.936487,9.3,9.803134,7.772222,...,6.15,6.15,6.150,6.15,6.150,6.15,6.15,6.15,11.5,21
101713,p06_8380,p06,9.430682,8.015068,9.3,9.316949,7.936487,9.6,9.803134,7.772222,...,6.15,6.15,6.150,6.15,6.150,6.15,6.15,6.15,11.0,21
101714,p06_8381,p06,9.430682,8.015068,10.1,9.316949,7.936487,10.3,9.803134,7.772222,...,6.15,6.15,6.150,6.15,6.150,6.15,6.15,6.15,7.9,0


In [12]:
# Save the row ids of the train and test sets.
train_ids = train['id']
test_ids = test['id']
test_ids

34331       p03_0
34332       p03_1
34333       p03_2
34334       p03_3
34335       p03_4
           ...   
93328    p05_8283
93329    p05_8284
93330    p05_8285
93331    p05_8286
93332    p05_8287
Name: id, Length: 34316, dtype: object

In [13]:
# Remove the 'id' column from both splits (rebinding the names instead of
# mutating the frames in place).
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [14]:
# Separate the feature matrices from the target vectors. Neither the target
# nor the patient label 'p_num' is passed to the models as a feature.
non_feature_cols = ['target', 'p_num']
X_train = train.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)
y_train = train['target']
y_test = test['target']

In [15]:
# Sanity check: (rows, features) of the hold-out feature matrix.
X_test.shape

(34316, 361)

# Modeling baselines

В качестве основной метрики будем использовать MAE.

## Линейные модели

In [16]:
# Baseline: Lasso with default hyperparameters.
model = Lasso()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_holdout = mean_absolute_error(y_test, y_pred_test)
# relative train/hold-out gap, in percent of the train MAE
overfit_pct = abs(mae_train - mae_holdout) / mae_train * 100
print(f"MAE train = {mae_train}",
      f"MAE holdout = {mae_holdout}",
      f"Delta = {overfit_pct:.1f} %",
      sep="\n")

# This cell starts the comparison table; later cells append to it.
metrics = get_metrics_regression(y_test, y_pred_test, X_test, 'baseline lasso')
metrics

MAE train = 1.604019639007525
MAE holdout = 1.7596070954979848
Delta = 9.7 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326


In [17]:
# Baseline: Ridge with default hyperparameters.
model = Ridge()
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_holdout = mean_absolute_error(y_test, y_pred_test)
print(f"MAE train = {mae_train}")
print(f"MAE holdout = {mae_holdout}")
print(f"Delta = {(abs(mae_train - mae_holdout) / mae_train * 100):.1f} %")

# Append this model's row to the comparison table. A row with the same label
# left over from a previous run of this cell is dropped first, so re-running
# does not duplicate it; ignore_index=True gives the rows unique labels
# instead of a repeated 0.
model_name = 'baseline ridge'
metrics = pd.concat([
    metrics[metrics['model'] != model_name],
    get_metrics_regression(y_test, y_pred_test, X_test, model_name)
], ignore_index=True)
metrics

MAE train = 1.4915448901465769
MAE holdout = 1.6440194281811653
Delta = 10.2 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
0,baseline ridge,1.644019,2.196696,0.505629,19.398459


## Бустинги

In [18]:
# Baseline: LightGBM.
model = LGBMRegressor(n_estimators=100, random_state=42,
                      verbose=-1)  # n_estimators=100 is the baseline setting
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_holdout = mean_absolute_error(y_test, y_pred_test)
print(f"MAE train = {mae_train}")
print(f"MAE holdout = {mae_holdout}")
print(f"Delta = {(abs(mae_train - mae_holdout) / mae_train * 100):.1f} %")

# Append this model's row to the comparison table. A row with the same label
# left over from a previous run of this cell is dropped first, so re-running
# does not duplicate it; ignore_index=True gives the rows unique labels
# instead of a repeated 0.
model_name = 'baseline lgbm'
metrics = pd.concat([
    metrics[metrics['model'] != model_name],
    get_metrics_regression(y_test, y_pred_test, X_test, model_name)
], ignore_index=True)
metrics

MAE train = 1.288538679280894
MAE holdout = 1.6230722683096066
Delta = 26.0 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
0,baseline ridge,1.644019,2.196696,0.505629,19.398459
0,baseline lgbm,1.623072,2.16294,0.520706,19.151295


In [19]:
# Baseline: CatBoost.
model = CatBoostRegressor(n_estimators=1000,
                          random_state=42)  # n_estimators=1000 is the baseline setting
model.fit(X_train, y_train, silent=True)

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_holdout = mean_absolute_error(y_test, y_pred_test)
print(f"MAE train = {mae_train}")
print(f"MAE holdout = {mae_holdout}")
print(f"Delta = {(abs(mae_train - mae_holdout) / mae_train * 100):.1f} %")

# Append this model's row to the comparison table. A row with the same label
# left over from a previous run of this cell is dropped first, so re-running
# does not duplicate it; ignore_index=True gives the rows unique labels
# instead of a repeated 0.
model_name = 'baseline catboost'
metrics = pd.concat([
    metrics[metrics['model'] != model_name],
    get_metrics_regression(y_test, y_pred_test, X_test, model_name)
], ignore_index=True)
metrics

MAE train = 1.1446709916367326
MAE holdout = 1.6456279226619241
Delta = 43.8 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
0,baseline ridge,1.644019,2.196696,0.505629,19.398459
0,baseline lgbm,1.623072,2.16294,0.520706,19.151295
0,baseline catboost,1.645628,2.206858,0.501044,19.417439


Модель Lasso отработала плохо (большое смещение), Ridge — гораздо лучше. По метрикам на отложенной выборке лучше всего отработал baseline LGBM, однако он сильно переобучается. Baseline CatBoost отработал хуже, чем Ridge и LGBM, очень сильно переобучившись, но в baseline CatBoost 1000 деревьев, поэтому имеет смысл попробовать раннюю остановку (early stopping). \
Сделаем для бейзлайнов Ridge, LGBM и Catboost кросс-валидацию + раннюю остановку для Catboost.

# Modeling baselines + cross-validation

In [20]:
# Number of rows per patient (p_num) in the training set.
train.p_num.value_counts()

p_num
p02    25872
p10    25454
p12    25299
p04    24686
p11    24555
p01     8459
p06     8383
Name: count, dtype: int64

Следуя общей идее модели, изложенной выше, в кросс-валидации будем так же разбивать данные на фолды по участникам. Данные 1-го и 6-го пациентов объединим в один фолд.

## Кросс-валидация Ridge

In [21]:
# Training patients that each form a cross-validation fold on their own;
# p01 and p06 are combined into one extra fold inside the CV cells.
patients = ['p02', 'p10', 'p12', 'p04', 'p11']

In [22]:
# Patient-wise cross-validation for Ridge. Each patient in `patients` is a
# validation fold on its own; patients p01 and p06 are merged into one last
# fold. All folds go through the same loop body.
folds = [[patient] for patient in patients] + [['p01', 'p06']]

mae_oof = np.empty(len(folds))        # validation MAE per fold
mae_train_oof = np.empty(len(folds))  # training MAE per fold
predicts_test = []                    # hold-out predictions of every fold model

for idx, fold_patients in enumerate(folds):
    # rows of the validation patients vs. all other training rows
    val_mask = train.p_num.isin(fold_patients)
    X_train_ = train.loc[~val_mask].drop(['p_num', 'target'], axis=1)
    X_val = train.loc[val_mask].drop(['p_num', 'target'], axis=1)
    y_train_ = train.loc[~val_mask, 'target']
    y_val = train.loc[val_mask, 'target']

    model = Ridge()
    model.fit(X_train_, y_train_)

    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_train_)

    mae_oof[idx] = mean_absolute_error(y_val, y_val_pred)
    mae_train_oof[idx] = mean_absolute_error(y_train_, y_train_pred)

    # each fold model also predicts the hold-out set
    y_test_pred = model.predict(X_test)
    predicts_test.append(y_test_pred)

    print(f"MAE fold {idx + 1} = {mae_oof[idx]:.3f}")
    print('---')

avg_mae_val = np.mean(mae_oof)
avg_mae_train = np.mean(mae_train_oof)
print(f"Mean MAE val = {avg_mae_val}")
print(f"Mean MAE train = {avg_mae_train}")
print(
    f"Delta between train and val = {(abs(avg_mae_train - avg_mae_val) / avg_mae_train * 100):.1f} %"
)

MAE fold 1 = 1.622
---
MAE fold 2 = 1.277
---
MAE fold 3 = 1.443
---
MAE fold 4 = 1.577
---
MAE fold 5 = 1.715
---
MAE fold 6 = 4.027
---
Mean MAE val = 1.9434668064597662
Mean MAE train = 1.4867311917748773
Delta between train and val = 30.7 %


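Судя по большой разнице между метрикой на трейне и валидации, Ridge сильно склонен к переобучению. При этом основной вклад в эту разницу вносит шестой фолд (пациенты p01 и p06, MAE = 4.027): на остальных пяти фолдах MAE на валидации (1.277–1.715) близок к среднему MAE на трейне (1.487).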

In [23]:
# Hold-out prediction = mean of the predictions of all fold models
# (one column per fold model, averaged row-wise).
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
print(f"MAE holdout = {mae_holdout}")
print(
    f"Delta between train and holdout = {(abs(avg_mae_train - mae_holdout) / avg_mae_train * 100):.1f} %")

# Append this model's row to the comparison table. A row with the same label
# left over from a previous run of this cell is dropped first, so re-running
# does not duplicate it; ignore_index=True gives the rows unique labels
# instead of a repeated 0.
model_name = 'baseline ridge with CV'
metrics = pd.concat([
    metrics[metrics['model'] != model_name],
    get_metrics_regression(y_test, holdout_preds, X_test, model_name)
], ignore_index=True)
metrics

MAE holdout = 1.6490974411284518
Delta between train and holdout = 10.9 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
0,baseline ridge,1.644019,2.196696,0.505629,19.398459
0,baseline lgbm,1.623072,2.16294,0.520706,19.151295
0,baseline catboost,1.645628,2.206858,0.501044,19.417439
0,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377


## Кросс-валидация LGBM

In [24]:
# Patient-wise cross-validation for LightGBM. Each patient in `patients` is a
# validation fold on its own; patients p01 and p06 are merged into one last
# fold. All folds go through the same loop body.
folds = [[patient] for patient in patients] + [['p01', 'p06']]

mae_oof = np.empty(len(folds))        # validation MAE per fold
mae_train_oof = np.empty(len(folds))  # training MAE per fold
predicts_test = []                    # hold-out predictions of every fold model

for idx, fold_patients in enumerate(folds):
    # rows of the validation patients vs. all other training rows
    val_mask = train.p_num.isin(fold_patients)
    X_train_ = train.loc[~val_mask].drop(['p_num', 'target'], axis=1)
    X_val = train.loc[val_mask].drop(['p_num', 'target'], axis=1)
    y_train_ = train.loc[~val_mask, 'target']
    y_val = train.loc[val_mask, 'target']

    # NOTE(review): stopping_rounds (100) equals n_estimators (100), so the
    # training can never actually stop early (the log prints "Did not meet
    # early stopping"); the callback only records the best iteration.
    # Raise n_estimators if real early stopping is intended.
    # NOTE(review): the validation fold is used both to pick the best
    # iteration and to score the fold, so the fold MAE is likely optimistic.
    model = LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
    model.fit(
        X_train_,
        y_train_,
        eval_set=[(X_val, y_val)],
        eval_metric="mae",
        callbacks=[early_stopping(stopping_rounds=100)])

    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_train_)

    mae_oof[idx] = mean_absolute_error(y_val, y_val_pred)
    mae_train_oof[idx] = mean_absolute_error(y_train_, y_train_pred)

    # each fold model also predicts the hold-out set
    y_test_pred = model.predict(X_test)
    predicts_test.append(y_test_pred)

    print(f"MAE fold {idx + 1} = {mae_oof[idx]:.3f}")
    print('---')

avg_mae_val = np.mean(mae_oof)
avg_mae_train = np.mean(mae_train_oof)
print(f"Mean MAE val = {avg_mae_val}")
print(f"Mean MAE train = {avg_mae_train}")
print(
    f"Delta between train and val = {(abs(avg_mae_train - avg_mae_val) / avg_mae_train * 100):.1f} %")

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[46]	valid_0's l1: 1.65108	valid_0's l2: 5.20581
MAE fold 1 = 1.651
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[43]	valid_0's l1: 1.28779	valid_0's l2: 2.62463
MAE fold 2 = 1.288
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[34]	valid_0's l1: 1.33072	valid_0's l2: 3.23762
MAE fold 3 = 1.331
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 1.51885	valid_0's l2: 4.04999
MAE fold 4 = 1.519
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[62]	valid_0's l1: 1.709	valid_0's l2: 5.12582
MAE fold 5 = 1.709
---
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. 

In [25]:
# Hold-out prediction = mean of the predictions of all fold models
# (one column per fold model, averaged row-wise).
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
print(f"MAE holdout = {mae_holdout}")
print(
    f"Delta between train and holdout = {(abs(avg_mae_train - mae_holdout) / avg_mae_train * 100):.1f} %")

# Append this model's row to the comparison table. A row with the same label
# left over from a previous run of this cell is dropped first, so re-running
# does not duplicate it; ignore_index=True gives the rows unique labels
# instead of a repeated 0.
model_name = 'baseline lgbm with CV'
metrics = pd.concat([
    metrics[metrics['model'] != model_name],
    get_metrics_regression(y_test, holdout_preds, X_test, model_name)
], ignore_index=True)
metrics

MAE holdout = 1.6290860861009977
Delta between train and holdout = 23.1 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
0,baseline ridge,1.644019,2.196696,0.505629,19.398459
0,baseline lgbm,1.623072,2.16294,0.520706,19.151295
0,baseline catboost,1.645628,2.206858,0.501044,19.417439
0,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
0,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255


С помощью кросс-валидации удалось немного улучшить обобщающую способность LGBM (разница между метрикой на трейне и холдауте уменьшилась с 26.0 % до 23.1 %), хотя переобучение все еще большое.

## Кросс-валидация CatBoost 

In [26]:
# Patient-wise cross-validation for CatBoost with early stopping. Each
# patient in `patients` is a validation fold on its own; patients p01 and
# p06 are merged into one last fold. All folds go through the same loop body.
folds = [[patient] for patient in patients] + [['p01', 'p06']]

mae_oof = np.empty(len(folds))        # validation MAE per fold
mae_train_oof = np.empty(len(folds))  # training MAE per fold
predicts_test = []                    # hold-out predictions of every fold model

for idx, fold_patients in enumerate(folds):
    # rows of the validation patients vs. all other training rows
    val_mask = train.p_num.isin(fold_patients)
    X_train_ = train.loc[~val_mask].drop(['p_num', 'target'], axis=1)
    X_val = train.loc[val_mask].drop(['p_num', 'target'], axis=1)
    y_train_ = train.loc[~val_mask, 'target']
    y_val = train.loc[val_mask, 'target']

    model = CatBoostRegressor(random_state=42, eval_metric='MAE')

    # NOTE(review): the validation fold drives early stopping and is also
    # the data the fold is scored on, so the fold MAE is likely optimistic.
    eval_set = [(X_val, y_val)]

    model.fit(X_train_,
              y_train_,
              eval_set=eval_set,
              verbose=False,
              early_stopping_rounds=100)

    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_train_)

    mae_oof[idx] = mean_absolute_error(y_val, y_val_pred)
    mae_train_oof[idx] = mean_absolute_error(y_train_, y_train_pred)

    # each fold model also predicts the hold-out set
    y_test_pred = model.predict(X_test)
    predicts_test.append(y_test_pred)

    print(f"MAE fold {idx + 1} = {mae_oof[idx]:.3f}")
    print('---')

avg_mae_val = np.mean(mae_oof)
avg_mae_train = np.mean(mae_train_oof)
print(f"Mean MAE val = {avg_mae_val}")
print(f"Mean MAE train = {avg_mae_train}")
print(
    f"Delta between train and val = {(abs(avg_mae_train - avg_mae_val) / avg_mae_train * 100):.1f} %")

MAE fold 1 = 1.688
---
MAE fold 2 = 1.281
---
MAE fold 3 = 1.323
---
MAE fold 4 = 1.458
---
MAE fold 5 = 1.724
---
MAE fold 6 = 1.976
---
Mean MAE val = 1.575160530647837
Mean MAE train = 1.2875746795962704
Delta between train and val = 22.3 %


In [27]:
# Hold-out prediction = mean of the predictions of all fold models
# (one column per fold model, averaged row-wise).
holdout_preds = np.mean(np.column_stack(predicts_test), axis=1)
mae_holdout = mean_absolute_error(y_test, holdout_preds)
print(f"MAE holdout = {mae_holdout}")
print(
    f"Delta between train and holdout = {(abs(avg_mae_train - mae_holdout) / avg_mae_train * 100):.1f} %")

# Append this model's row to the comparison table. A row with the same label
# left over from a previous run of this cell is dropped first, so re-running
# does not duplicate it; ignore_index=True gives the rows unique labels
# instead of a repeated 0.
model_name = 'baseline catboost with CV'
metrics = pd.concat([
    metrics[metrics['model'] != model_name],
    get_metrics_regression(y_test, holdout_preds, X_test, model_name)
], ignore_index=True)
metrics

MAE holdout = 1.6470482379958413
Delta between train and holdout = 27.9 %


Unnamed: 0,model,MAE,RMSE,R2 adjusted,WAPE_%
0,baseline lasso,1.759607,2.310597,0.453032,20.762326
0,baseline ridge,1.644019,2.196696,0.505629,19.398459
0,baseline lgbm,1.623072,2.16294,0.520706,19.151295
0,baseline catboost,1.645628,2.206858,0.501044,19.417439
0,baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
0,baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
0,baseline catboost with CV,1.647048,2.186179,0.510351,19.434197


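Кросс-валидация и ранняя остановка значительно улучшили обобщающую способность CatBoost: разница между метрикой на трейне и холдауте сократилась с 43.8 % до 27.9 % (MAE на холдауте при этом почти не изменился: 1.646 → 1.647), хотя переобучение все еще очень большое.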

In [28]:
# Show the comparison table indexed by model name (display only: the result
# is not assigned, so `metrics` itself is unchanged).
metrics.set_index('model')

Unnamed: 0_level_0,MAE,RMSE,R2 adjusted,WAPE_%
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline lasso,1.759607,2.310597,0.453032,20.762326
baseline ridge,1.644019,2.196696,0.505629,19.398459
baseline lgbm,1.623072,2.16294,0.520706,19.151295
baseline catboost,1.645628,2.206858,0.501044,19.417439
baseline ridge with CV,1.649097,2.223375,0.493548,19.458377
baseline lgbm with CV,1.629086,2.161262,0.521449,19.222255
baseline catboost with CV,1.647048,2.186179,0.510351,19.434197


In [30]:
# save the table with the baseline metrics
metrics.to_csv("metrics.csv", index=False)

# save the train and test data separately
train.to_csv("./data/train.csv", index=False)
test.to_csv("./data/test.csv", index=False)

# Вывод

Лучший бейзлайн по метрикам и проценту переобучения - LGBM (с кросс-валидацией). Чтобы снизить переобучение, надо подбирать гиперпараметры на кросс-валидации. Можно попробовать подбор гиперпараметров и для Catboost. Также можно попробовать стэкинг.