# Продвинутое решение с использованием техник машинного обучения

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
import warnings
from time import time

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from scipy.stats.mstats import winsorize

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
np.random.seed(42)

## Данные

Загрузим датасет из соревнования. Обучение и валидацию будем проводить на train части. Часть test содержит в себе признаки без таргета, по ней будем строить предсказания для дальнейшей отправки в submission.

In [2]:
# Загрузка данных
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

X_full = train.drop(columns=['id', 'FloodProbability'])
y_full = train['FloodProbability']
X_test = test.drop(columns=['id'])
test_ids = test['id']

original_features = X_full.columns.tolist()

print(f'Train: {train.shape}, Test: {test.shape}')
print(f'Target range: [{y_full.min():.4f}, {y_full.max():.4f}]')

Train: (1117957, 22), Test: (745305, 21)
Target range: [0.2850, 0.7250]


In [3]:
train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


Загрузим решение из прошлого ноутбука с бейзлайном для сравнения с продвинутым решением.

In [4]:
try:
    baseline_train_oof = np.load('baseline_output/baseline_predictions_train_oof.npy')
    baseline_test = np.load('baseline_output/baseline_predictions_test.npy')
    baseline_r2 = r2_score(y_full, baseline_train_oof)
    print(f'baseline: R² = {baseline_r2:.6f}')
    BASELINE_AVAILABLE = True
except FileNotFoundError:
    print('Файл не найден')
    BASELINE_AVAILABLE = False

baseline: R² = 0.858277


Теперь проводим обработку признаков. Для начала - винсоризация и масштабирования с помощью робастного scaler, как для бейзлайна.

In [5]:
def preprocess(df, scaler=None, fit=False):
    df_win = df.copy()
    for col in df.columns:
        df_win[col] = winsorize(df[col], limits=(0.01, 0.01))

    if fit:
        scaler = RobustScaler()
        df_scaled = pd.DataFrame(scaler.fit_transform(df_win), columns=df.columns, index=df.index)
        return df_scaled, scaler
    else:
        df_scaled = pd.DataFrame(scaler.transform(df_win), columns=df.columns, index=df.index)
        return df_scaled

X_full_scaled, scaler = preprocess(X_full, fit=True)
X_test_scaled = preprocess(X_test, scaler=scaler)

Получили предобработку для обучающей и тестовой части, тестовую мы снова будем использовать только в конце для получения submission.

## Генерация признаков x2

Сейчас будем генерировать новые признаки помимо тех, что делали для бейзлайна. Изучив решение победителя, мы обнаружили, что сумма значений признаков очень скоррелирована с таргетом. Мы развиваем эту идею. Во-первых, мы добавляем порядковые признаки, например сумму значений признаков выше трешхолда или ниже его. Это может помочь выделить сумму по экстремальным значениям, которые предположительно больше влияют на таргет. Также добавляем сортировку значений признаков и считаем k-ые статистики. Во-вторых, добавляем "магическую" фичу - стандартное отклонение целевой переменной для групп с одинаковой суммой по всем фичам. Для теста заполним средним значением из обучающей выборки.

In [None]:
def generate_features(df_orig, df_scaled, y=None, is_train=True):
    features = df_scaled.copy()

    # новые фичи из бейзлайна
    features['sum'] = df_orig.sum(axis=1)
    features['mean'] = df_orig.mean(axis=1)
    features['std'] = df_orig.std(axis=1)
    features['max'] = df_orig.max(axis=1)
    features['min'] = df_orig.min(axis=1)
    features['median'] = df_orig.median(axis=1)
    features['range'] = features['max'] - features['min']
    features['q25'] = df_orig.quantile(0.25, axis=1)
    features['q75'] = df_orig.quantile(0.75, axis=1)
    features['iqr'] = features['q75'] - features['q25']
    features['cv'] = features['std'] / (features['mean'] + 1e-10)

    # новые порядковые фичи
    # количество значений в строке, проходящих порог
    for threshold in [6, 7, 8]:
        features[f'nb_sup{threshold}'] = (df_orig > threshold).sum(axis=1)
    for threshold in [2, 3, 4]:
        features[f'nb_inf{threshold}'] = (df_orig < threshold).sum(axis=1)

    # i-тое значение в отсортированной по возрастанию строке
    sorted_vals = np.sort(df_orig.values, axis=1)
    for i in range(sorted_vals.shape[1]):
        features[f'sorted_{i}'] = sorted_vals[:, i]

    if is_train and y is not None:
        temp_df = features[['sum']].copy()
        temp_df['target'] = y
        features['magic_std'] = temp_df.groupby('sum')['target'].transform('std')
        features['magic_std'].fillna(features['magic_std'].mean(), inplace=True)
    else:
        features['magic_std'] = 0

    return features

X_full_feat = generate_features(X_full, X_full_scaled, y_full, is_train=True)
X_test_feat = generate_features(X_test, X_test_scaled, is_train=False)

X_test_feat['magic_std'] = X_full_feat['magic_std'].mean()

print(f'Всего фичей: {X_full_feat.shape[1]} ({len(original_features)} исходных + {X_full_feat.shape[1] - len(original_features)} новых)')

Всего фичей: 58 (20 исходных + 38 новых)


## Модели

Функция для обучения с RepeatedStratifiedKFold.

In [7]:
n_repeats = 3
n_splits = 5
seed = 42
y_discrete = (y_full * 400).astype(np.int16)

In [28]:
def train_model_cv_separate_repeats(model, X, y, model_name, use_aug=False):
    oofs_per_repeat = []
    preds_per_repeat = []

    total_folds = n_repeats * n_splits
    pbar = tqdm(total=total_folds, desc=f'{model_name:12}',
                bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {postfix}')

    fold_idx = 0
    for repeat in range(n_repeats):
        oof_repeat = np.zeros(len(X))
        pred_repeat = np.zeros(len(X_test_feat))
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed + repeat)

        for train_idx, val_idx in skf.split(X, y_discrete):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Аугментация если нужно
            if use_aug:
                np.random.seed(fold_idx)
                X_train = X_train + np.random.normal(0, 0.01, X_train.shape)
                y_train = y_train * 0.95 + y_train.mean() * 0.05

            model.fit(X_train, y_train)

            oof_repeat[val_idx] = model.predict(X_val)
            pred_repeat += model.predict(X_test_feat) / n_splits
            fold_r2 = r2_score(y_val, oof_repeat[val_idx])

            pbar.set_postfix({'R²': f'{fold_r2:.6f}', 'Repeat': repeat+1})
            pbar.update(1)
            fold_idx += 1

        oofs_per_repeat.append(oof_repeat)
        preds_per_repeat.append(pred_repeat)

    pbar.close()

    oof_combined = np.mean(oofs_per_repeat, axis=0)
    oof_r2 = r2_score(y, oof_combined)

    return oofs_per_repeat, preds_per_repeat, oof_r2

In [29]:
def train_with_pseudolabeling(model_class, params, X_train, y_train, X_test, model_name, use_aug=False):
    # Базовая модель
    model_base = model_class(**params)
    base_oofs, base_preds, _ = train_model_cv_separate_repeats(
        model_base, X_train, y_train, f"{model_name}_base", use_aug=use_aug
    )
    
    # Получаем pseudo-labels
    pseudo_labels = np.mean(base_preds, axis=0)
    conf_low = np.percentile(pseudo_labels, 25)
    conf_high = np.percentile(pseudo_labels, 75)
    confident_mask = (pseudo_labels >= conf_low) & (pseudo_labels <= conf_high)
    
    X_pseudo = X_test.iloc[confident_mask]
    y_pseudo = pseudo_labels[confident_mask]
    X_augmented = pd.concat([X_train, X_pseudo], ignore_index=True)
    y_augmented = pd.concat([y_train, pd.Series(y_pseudo)], ignore_index=True)
    
    # Дообучение на расширенных данных
    final_oofs = []
    final_preds = []
    
    for repeat in range(n_repeats):
        oof_repeat = np.zeros(len(y_train))
        pred_repeat = np.zeros(len(X_test))
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed + repeat)
        
        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_discrete)):
            train_idx_aug = np.concatenate([train_idx, np.arange(len(y_train), len(y_augmented))])
            model = model_class(**params)
            model.fit(X_augmented.iloc[train_idx_aug], y_augmented.iloc[train_idx_aug])
            oof_repeat[val_idx] = model.predict(X_train.iloc[val_idx])
            pred_repeat += model.predict(X_test) / n_splits
        
        final_oofs.append(oof_repeat)
        final_preds.append(pred_repeat)
    
    final_score = r2_score(y_train, np.mean(final_oofs, axis=0))
    return final_oofs, final_preds, final_score

## A/B Тесты дополнительных техник

Проверяем эффективность дополнительных методов улучшения качества.


### 1. Feature Jittering & Label Smoothing

In [14]:
test_params = {
    'device': 'cuda',
    'tree_method': 'hist',
    'random_state': 42,
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 6,
}

def train_with_augmentation(X, y, X_test, params, aug_type='none'):
    """
    aug_type: 'none', 'jittering', 'smoothing', 'both'
    """
    oofs = []
    for repeat in range(n_repeats):
        oof_repeat = np.zeros(len(X))
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed + repeat)
        
        fold_idx = 0
        for train_idx, val_idx in skf.split(X, y_discrete):
            X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx].copy(), y.iloc[val_idx]
            
            if aug_type in ['jittering', 'both']:
                np.random.seed(fold_idx)
                X_train = X_train + np.random.normal(0, 0.01, X_train.shape)
            
            if aug_type in ['smoothing', 'both']:
                y_train = y_train * 0.95 + y_train.mean() * 0.05
            
            model = XGBRegressor(**params)
            model.fit(X_train, y_train)
            oof_repeat[val_idx] = model.predict(X_val)
            fold_idx += 1
        
        oofs.append(oof_repeat)
    
    oof_combined = np.mean(oofs, axis=0)
    return r2_score(y, oof_combined)


score_baseline = train_with_augmentation(X_full_feat, y_full, X_test_feat, test_params, 'none')
print('БЕЗ аугментации', score_baseline)

score_jitter = train_with_augmentation(X_full_feat, y_full, X_test_feat, test_params, 'jittering')
print('Feature Jittering', score_jitter)

score_smooth = train_with_augmentation(X_full_feat, y_full, X_test_feat, test_params, 'smoothing')
print('Label Smoothing', score_smooth)

score_both = train_with_augmentation(X_full_feat, y_full, X_test_feat, test_params, 'both')
print('Jittering + Smoothing', score_both)

score_no_aug = score_baseline


БЕЗ аугментации 0.8690295999210678
Feature Jittering 0.8682842422163891
Label Smoothing 0.8668842747551073
Jittering + Smoothing 0.8660455759446903


### 2. Pseudo-labeling

Проверяем, помогает ли использование предсказаний на тестовых данных для дообучения.


In [15]:
score_no_pseudo = score_no_aug

base_model = XGBRegressor(**test_params)
base_model.fit(X_full_feat, y_full)

pseudo_labels = base_model.predict(X_test_feat)

confidence_threshold_low = np.percentile(pseudo_labels, 25)
confidence_threshold_high = np.percentile(pseudo_labels, 75)
confident_mask = (pseudo_labels >= confidence_threshold_low) & (pseudo_labels <= confidence_threshold_high)

X_pseudo = X_test_feat.iloc[confident_mask]
y_pseudo = pseudo_labels[confident_mask]

# Объединяем с обучающими данными
X_augmented = np.vstack([X_full_feat, X_pseudo])
y_augmented = np.concatenate([y_full, y_pseudo])

# Обучаем на расширенных данных
# используем только первые len(y_full) для OOF оценки
oofs_pseudo = []
for repeat in range(n_repeats):
    oof_repeat = np.zeros(len(y_full))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed + repeat)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_feat, y_discrete)):
        train_idx_aug = np.concatenate([train_idx, np.arange(len(y_full), len(y_augmented))])
        
        model = XGBRegressor(**test_params)
        model.fit(X_augmented[train_idx_aug], y_augmented[train_idx_aug])
        
        oof_repeat[val_idx] = model.predict(X_full_feat.iloc[val_idx])
    
    oofs_pseudo.append(oof_repeat)

oof_pseudo_combined = np.mean(oofs_pseudo, axis=0)
score_with_pseudo = r2_score(y_full, oof_pseudo_combined)

delta = score_with_pseudo - score_no_pseudo
print('Без Pseudo-labeling', score_no_pseudo)
print('С Pseudo-labeling', score_with_pseudo)

Без Pseudo-labeling 0.8690295999210678
С Pseudo-labeling 0.8692091004018079


### 3. Stacking (Multi-level Ensemble)

Проверяем, помогает ли добавление второго уровня мета-моделей (стекинг vs простой блендинг).


In [21]:
test_models_configs = [
    {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 500},
    {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 500},
    {'max_depth': 7, 'learning_rate': 0.1, 'n_estimators': 500},
    {'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 1000},
    {'max_depth': 6, 'learning_rate': 0.05, 'n_estimators': 1000},
]

test_oofs = []
for idx, config in enumerate(test_models_configs):
    params = {**test_params, **config}
    oofs_repeat, _, _ = train_model_cv_separate_repeats(
        XGBRegressor(**params), 
        X_full_feat, 
        y_full, 
        f'test_model_{idx}',
        use_aug=False
    )
    oof_avg = np.mean(oofs_repeat, axis=0)
    test_oofs.append(oof_avg)

blend_X_test = np.column_stack(test_oofs)
ridge_blend = Ridge(alpha=0.001, fit_intercept=False, random_state=42)
ridge_blend.fit(blend_X_test, y_full)
blend_pred = ridge_blend.predict(blend_X_test)
score_blending = r2_score(y_full, blend_pred)
print('BLENDING', score_blending)

meta_oof = np.zeros(len(y_full))
meta_model_params = {
    'device': 'cuda',
    'tree_method': 'hist',
    'random_state': 42,
    'learning_rate': 0.05,
    'n_estimators': 500,
    'max_depth': 3,
}

for repeat in range(n_repeats):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed + repeat)
    oof_X = blend_X_test
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(oof_X, y_discrete)):
        meta_model = XGBRegressor(**meta_model_params)
        meta_model.fit(oof_X[train_idx], y_full.iloc[train_idx])
        meta_oof[val_idx] += meta_model.predict(oof_X[val_idx]) / n_repeats

score_stacking = r2_score(y_full, meta_oof)
print('STACKING', score_stacking)

test_model_0:   0%|          | 0/15 [00:00<?] 

test_model_1:   0%|          | 0/15 [00:00<?] 

test_model_2:   0%|          | 0/15 [00:00<?] 

test_model_3:   0%|          | 0/15 [00:00<?] 

test_model_4:   0%|          | 0/15 [00:00<?] 

BLENDING 0.8692850213224765
STACKING 0.8685832345467893


### 4. Voting (Simple Averaging)

Проверяем, помогает ли простое усреднение (mean/median) vs взвешенный Ridge блендинг.


In [22]:
voting_X = blend_X_test

voting_mean = np.mean(voting_X, axis=1)
score_mean = r2_score(y_full, voting_mean)
print('равные веса', score_mean)

voting_median = np.median(voting_X, axis=1)
score_median = r2_score(y_full, voting_median)
print('робастное усреднение', score_median)

from scipy import stats
voting_trimmed = stats.trim_mean(voting_X, proportiontocut=0.1, axis=1)
score_trimmed = r2_score(y_full, voting_trimmed)
print('отбрасываем крайние 20%', score_trimmed)

print('оптимальные веса', score_blending)

равные веса 0.8692849692550277
робастное усреднение 0.86927598581468
отбрасываем крайние 20% 0.8692849692550277
оптимальные веса 0.8692850213224765


## Применение результатов A/B тестов

На основе A/B тестов:
- Pseudo-labeling показал улучшение
- Ridge Blending дал лучший результат
- Feature Jittering & Label Smoothing не улучшают


В этом ноутбуке мы решили не использовать перебор параметров по optuna, так как чтобы сделать это на выбарнной стратегии валидации нужно очень много времени. При этом ослабление стратегии валидации дает глобальное ухудшение по качетсву на закрытых тестах. Поэтому мы рассмотрели несколько наборов параметров для каждой модели. Параметры мы старались подбирать максимально различными между собой.

In [24]:
models_oofs = {}
models_preds = {}
models_scores = {}

In [31]:
xgb_configs = [
    {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8},
    {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.03, 'subsample': 0.7, 'colsample_bytree': 0.9, 'reg_alpha': 0.1},
    {'n_estimators': 250, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.7, 'min_child_weight': 3},
    {'n_estimators': 350, 'max_depth': 7, 'learning_rate': 0.02, 'subsample': 0.75, 'colsample_bytree': 0.85, 'reg_lambda': 0.1},
    {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.85, 'colsample_bytree': 0.9},
    {'n_estimators': 180, 'max_depth': 8, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.75, 'min_child_weight': 5},
    {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.04, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.1},
    {'n_estimators': 250, 'max_depth': 4, 'learning_rate': 0.06, 'subsample': 0.9, 'colsample_bytree': 0.85, 'reg_alpha': 0.05, 'reg_lambda': 0.05},
    {'n_estimators': 350, 'max_depth': 7, 'learning_rate': 0.025, 'subsample': 0.75, 'colsample_bytree': 0.9, 'min_child_weight': 2},
    {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.04, 'subsample': 0.8, 'colsample_bytree': 0.7, 'gamma': 0.05},
]

for i, params in enumerate(xgb_configs, 1):
    model = XGBRegressor(**params, random_state=42, n_jobs=-1, device='cuda', tree_method='hist', verbosity=0)
    oofs, preds, score = train_with_pseudolabeling(type(model), model.get_params(), X_full_feat, y_full, X_test_feat, f'xgb{i}', use_aug=False)
    models_oofs[f'xgb{i}'] = oofs
    models_preds[f'xgb{i}'] = preds
    models_scores[f'xgb{i}'] = score

xgb1_base   :   0%|          | 0/15 [00:00<?] 

xgb2_base   :   0%|          | 0/15 [00:00<?] 

xgb3_base   :   0%|          | 0/15 [00:00<?] 

xgb4_base   :   0%|          | 0/15 [00:00<?] 

xgb5_base   :   0%|          | 0/15 [00:00<?] 

xgb6_base   :   0%|          | 0/15 [00:00<?] 

xgb7_base   :   0%|          | 0/15 [00:00<?] 

xgb8_base   :   0%|          | 0/15 [00:00<?] 

xgb9_base   :   0%|          | 0/15 [00:00<?] 

xgb10_base  :   0%|          | 0/15 [00:00<?] 

In [35]:
lgbm_configs = [
    {'n_estimators': 200, 'num_leaves': 31, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8},
    {'n_estimators': 300, 'num_leaves': 63, 'learning_rate': 0.03, 'subsample': 0.7, 'colsample_bytree': 0.9, 'reg_alpha': 0.1},
    {'n_estimators': 250, 'num_leaves': 47, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.7, 'min_child_samples': 20},
    {'n_estimators': 350, 'num_leaves': 55, 'learning_rate': 0.02, 'subsample': 0.75, 'colsample_bytree': 0.85, 'reg_lambda': 0.1},
    {'n_estimators': 180, 'num_leaves': 25, 'learning_rate': 0.06, 'subsample': 0.85, 'colsample_bytree': 0.9, 'min_child_samples': 25},
    {'n_estimators': 300, 'num_leaves': 70, 'learning_rate': 0.03, 'subsample': 0.8, 'colsample_bytree': 0.75, 'reg_alpha': 0.05},
    {'n_estimators': 250, 'num_leaves': 40, 'learning_rate': 0.04, 'subsample': 0.9, 'colsample_bytree': 0.8, 'min_child_samples': 15},
    {'n_estimators': 400, 'num_leaves': 50, 'learning_rate': 0.025, 'subsample': 0.75, 'colsample_bytree': 0.85, 'reg_lambda': 0.05},
    {'n_estimators': 320, 'num_leaves': 60, 'learning_rate': 0.035, 'subsample': 0.82, 'colsample_bytree': 0.88, 'min_child_samples': 18},
    {'n_estimators': 280, 'num_leaves': 35, 'learning_rate': 0.045, 'subsample': 0.88, 'colsample_bytree': 0.72, 'reg_alpha': 0.08, 'reg_lambda': 0.08},
]

for i, params in enumerate(lgbm_configs, 1):
    model = LGBMRegressor(**params, device='gpu', random_state=42, n_jobs=-1, verbose=-1)
    oofs, preds, score = train_with_pseudolabeling(type(model), model.get_params(), X_full_feat, y_full, X_test_feat, f'lgbm{i}', use_aug=False)
    models_oofs[f'lgbm{i}'] = oofs
    models_preds[f'lgbm{i}'] = preds
    models_scores[f'lgbm{i}'] = score

lgbm1_base  :   0%|          | 0/15 [00:00<?] 

lgbm2_base  :   0%|          | 0/15 [00:00<?] 

lgbm3_base  :   0%|          | 0/15 [00:00<?] 

lgbm4_base  :   0%|          | 0/15 [00:00<?] 

lgbm5_base  :   0%|          | 0/15 [00:00<?] 

lgbm6_base  :   0%|          | 0/15 [00:00<?] 

lgbm7_base  :   0%|          | 0/15 [00:00<?] 

lgbm8_base  :   0%|          | 0/15 [00:00<?] 

lgbm9_base  :   0%|          | 0/15 [00:00<?] 

lgbm10_base :   0%|          | 0/15 [00:00<?] 

In [36]:
cat_configs = [
    {'iterations': 200, 'depth': 5, 'learning_rate': 0.05, 'subsample': 0.8, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 300, 'depth': 6, 'learning_rate': 0.03, 'subsample': 0.7, 'l2_leaf_reg': 3, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 250, 'depth': 7, 'learning_rate': 0.05, 'subsample': 0.9, 'l2_leaf_reg': 5, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 350, 'depth': 6, 'learning_rate': 0.02, 'subsample': 0.75, 'l2_leaf_reg': 2, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 180, 'depth': 8, 'learning_rate': 0.06, 'subsample': 0.85, 'l2_leaf_reg': 4, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 300, 'depth': 5, 'learning_rate': 0.04, 'subsample': 0.8, 'l2_leaf_reg': 1, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 250, 'depth': 7, 'learning_rate': 0.035, 'subsample': 0.9, 'l2_leaf_reg': 6, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 320, 'depth': 6, 'learning_rate': 0.045, 'subsample': 0.82, 'l2_leaf_reg': 3.5, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 280, 'depth': 8, 'learning_rate': 0.028, 'subsample': 0.88, 'l2_leaf_reg': 4.5, 'bootstrap_type': 'Bernoulli'},
    {'iterations': 360, 'depth': 5, 'learning_rate': 0.038, 'subsample': 0.78, 'l2_leaf_reg': 2.5, 'bootstrap_type': 'Bernoulli'},
]

for i, params in enumerate(cat_configs, 1):
    model = CatBoostRegressor(**params, random_seed=42, verbose=0, task_type='GPU')
    oofs, preds, score = train_with_pseudolabeling(type(model), model.get_params(), X_full_feat, y_full, X_test_feat, f'cat{i}', use_aug=False)
    models_oofs[f'cat{i}'] = oofs
    models_preds[f'cat{i}'] = preds
    models_scores[f'cat{i}'] = score

cat1_base   :   0%|          | 0/15 [00:00<?] 

cat2_base   :   0%|          | 0/15 [00:00<?] 

cat3_base   :   0%|          | 0/15 [00:00<?] 

cat4_base   :   0%|          | 0/15 [00:00<?] 

cat5_base   :   0%|          | 0/15 [00:00<?] 

cat6_base   :   0%|          | 0/15 [00:00<?] 

cat7_base   :   0%|          | 0/15 [00:00<?] 

cat8_base   :   0%|          | 0/15 [00:00<?] 

cat9_base   :   0%|          | 0/15 [00:00<?] 

cat10_base  :   0%|          | 0/15 [00:00<?] 

## Ансамбли

Добавим модель бейзлайна в общий список для сравнения.

In [37]:
if BASELINE_AVAILABLE:
    models_oofs['baseline'] = [baseline_train_oof for _ in range(n_repeats)]
    models_preds['baseline'] = [baseline_test for _ in range(n_repeats)]
    models_scores['baseline'] = baseline_r2

Далее реализованы функции для выбора моделей в итоговый ансамбль, пождобную стратегию использовали лидеры соревнования.

In [38]:
def compute_permutation_importance(models_list, n_permutations=3):
    oofs_averaged = {}
    for model_name in models_list:
        oofs_averaged[model_name] = np.mean(models_oofs[model_name], axis=0)

    blend_X = np.column_stack([oofs_averaged[m] for m in models_list])

    ridge = Ridge(alpha=0.01, fit_intercept=False, random_state=42)
    ridge.fit(blend_X, y_full)
    baseline_score = r2_score(y_full, ridge.predict(blend_X))

    perm_scores = {}
    for model_name in tqdm(models_list, desc='Computing importance'):
        scores = []
        for _ in range(n_permutations):
            blend_X_perm = blend_X.copy()
            model_idx = models_list.index(model_name)
            blend_X_perm[:, model_idx] = np.random.permutation(blend_X_perm[:, model_idx])

            ridge_temp = Ridge(alpha=0.01, fit_intercept=False, random_state=42)
            ridge_temp.fit(blend_X_perm, y_full)
            perm_score = r2_score(y_full, ridge_temp.predict(blend_X_perm))

            scores.append(baseline_score - perm_score)

        perm_scores[model_name] = np.mean(scores)

    return pd.DataFrame({
        'Model': perm_scores.keys(),
        'Importance': perm_scores.values()
    }).sort_values('Importance', ascending=False)

def find_correlated_models(models_list, threshold=0.9999):
    oofs_averaged = {}
    for model_name in models_list:
        oofs_averaged[model_name] = np.mean(models_oofs[model_name], axis=0)

    oofs_df = pd.DataFrame(oofs_averaged)
    correlations = oofs_df.corr()

    corr_pairs = []
    for i, col1 in enumerate(correlations.columns):
        for j, col2 in enumerate(correlations.columns):
            if i < j and correlations.iloc[i, j] > threshold:
                corr_pairs.append((col1, col2, correlations.iloc[i, j]))

    return sorted(corr_pairs, key=lambda x: x[2], reverse=True)

def create_model_groups(groups_dict):
    for group_name, group_models in groups_dict.items():
        oofs_grouped = []
        preds_grouped = []

        for repeat in range(n_repeats):
            oof_r = np.mean([models_oofs[m][repeat] for m in group_models], axis=0)
            pred_r = np.mean([models_preds[m][repeat] for m in group_models], axis=0)
            oofs_grouped.append(oof_r)
            preds_grouped.append(pred_r)

        models_oofs[group_name] = oofs_grouped
        models_preds[group_name] = preds_grouped
        models_scores[group_name] = r2_score(y_full, np.mean(oofs_grouped, axis=0))

        print(f'Created group {group_name}: {len(group_models)} models, R² = {models_scores[group_name]:.6f}')

### Запуск ансамблей

Ищем скоррелированные пары моделей, а также ранжируем их по важности.

In [40]:
all_base_models = [m for m in models_oofs.keys() if m != 'baseline']
perm_df_1 = compute_permutation_importance(all_base_models, n_permutations=3)
corr_pairs = find_correlated_models(all_base_models, threshold=0.9999)

Computing importance:   0%|          | 0/30 [00:00<?, ?it/s]

На основе типа моделей и correlation analysis создаем группы.

In [42]:
groups_to_create = {}

top_xgb = [m for m in perm_df_1.head(15)['Model'] if 'xgb' in m]
if len(top_xgb) >= 3:
    groups_to_create['xgb_top_group'] = top_xgb[:3]

top_lgbm = [m for m in perm_df_1.head(15)['Model'] if 'lgbm' in m]
if len(top_lgbm) >= 3:
    groups_to_create['lgbm_top_group'] = top_lgbm[:3]

top_cat = [m for m in perm_df_1.head(15)['Model'] if 'cat' in m]
if len(top_cat) >= 3:
    groups_to_create['cat_top_group'] = top_cat[:3]

if groups_to_create:
    create_model_groups(groups_to_create)

top_models = perm_df_1.head(10)['Model'].tolist()

selected_models = top_models + list(groups_to_create.keys())
print('Всего выбрано моделей', len(selected_models))

Created group xgb_top_group: 3 models, R² = 0.869309
Created group lgbm_top_group: 3 models, R² = 0.869198
Created group cat_top_group: 3 models, R² = 0.868272
Всего выбрано моделей 13


Теперь выберем итоговый список из 7 лучших моделей.

In [43]:
if 'baseline' in models_oofs:
    selected_models.append('baseline')

perm_df_final = compute_permutation_importance(selected_models, n_permutations=3)
print(perm_df_final.to_string(index=False))

final_models = perm_df_final.head(7)['Model'].tolist()

Computing importance:   0%|          | 0/14 [00:00<?, ?it/s]

         Model    Importance
          xgb6  3.757238e-05
          xgb9  9.484145e-06
 xgb_top_group  4.906292e-06
         lgbm4  4.844093e-06
         lgbm2  2.553880e-06
          cat2  2.368512e-06
          cat9  2.149385e-06
          cat5  1.862510e-06
lgbm_top_group  1.363201e-06
          cat4  1.051475e-06
          cat7  9.563283e-07
          cat3  4.913791e-07
 cat_top_group  1.418980e-08
      baseline -9.118151e-09


### Blending & Regularization

Теперь будем строить Blending на выбранных моделях. Практика и опыт участников соревнования показали, что именно этот вид ансамбля показывает наилучший результат. Также мы применяем регуляризацию с помощью ридж-регрессии.

In [45]:
models_oofs_averaged = {}
models_preds_averaged = {}

for model_name in final_models:
    oof_avg = np.mean([models_oofs[model_name][r] for r in range(n_repeats)], axis=0)
    pred_avg = np.mean([models_preds[model_name][r] for r in range(n_repeats)], axis=0)
    models_oofs_averaged[model_name] = oof_avg
    models_preds_averaged[model_name] = pred_avg

blend_X = np.column_stack([models_oofs_averaged[m] for m in final_models])
blend_test = np.column_stack([models_preds_averaged[m] for m in final_models])

ridge_final = Ridge(alpha=0.001, fit_intercept=False, random_state=42)
ridge_final.fit(blend_X, y_full)

ridge_oof = ridge_final.predict(blend_X)
ridge_test = ridge_final.predict(blend_test)
ridge_r2 = r2_score(y_full, ridge_oof)

print(f'FINAL RIDGE OOF R² = {ridge_r2:.6f}')

print('Веса моделей:')
coefs_df = pd.DataFrame({
    'Model': final_models,
    'Coefficient': ridge_final.coef_
}).sort_values('Coefficient', ascending=False)
print(coefs_df.to_string(index=False))

FINAL RIDGE OOF R² = 0.869461
Веса моделей:
        Model  Coefficient
         xgb6     0.827501
         xgb9     0.596657
        lgbm2     0.540200
         cat9     0.188468
         cat2    -0.188652
        lgbm4    -0.403095
xgb_top_group    -0.561057


## Результаты

Теперь построим сравнительную таблицу по всем моделям.

In [48]:
results_df = pd.DataFrame({
    'Model': list(models_scores.keys()) + ['Ridge_Blend'],
    'R² (OOF)': list(models_scores.values()) + [ridge_r2]
}).sort_values('R² (OOF)', ascending=False)

print('\n' + results_df.to_string(index=False))

best_model = results_df.iloc[0]['Model']
best_r2 = results_df.iloc[0]['R² (OOF)']


         Model  R² (OOF)
   Ridge_Blend  0.869461
          xgb6  0.869419
          xgb9  0.869367
          xgb4  0.869319
 xgb_top_group  0.869309
         lgbm2  0.869231
         lgbm9  0.869208
lgbm_top_group  0.869198
          xgb3  0.869166
         lgbm6  0.869139
         lgbm8  0.869130
         lgbm4  0.869107
         lgbm3  0.868937
        lgbm10  0.868922
         lgbm7  0.868910
         lgbm5  0.868877
          xgb2  0.868823
         lgbm1  0.868791
          cat5  0.868661
          cat3  0.868583
          xgb8  0.868539
          cat8  0.868473
 cat_top_group  0.868272
          cat9  0.868223
          xgb1  0.868162
          cat7  0.868075
         cat10  0.868065
          cat6  0.867871
          cat2  0.867750
          xgb5  0.867715
          cat1  0.867538
         xgb10  0.867342
          cat4  0.867212
          xgb7  0.867012
      baseline  0.858277


А также сравним лучшую модель с бейзлайном и посмотрим, насколько увеличилась метрика.

In [None]:
print(f'Лучшая модель: {best_model}')
print(f'Лучшее R²: {best_r2:.6f}')

if BASELINE_AVAILABLE:
    improvement = best_r2 - baseline_r2
    improvement_pct = (improvement / baseline_r2) * 100
    print(f'\nУлучшение относительно бейзлайна:')
    print(f'  Baseline:  R² = {baseline_r2:.6f}')
    print(f'  Advanced:  R² = {best_r2:.6f}')
    print(f'  Delta R²:  {improvement:+.6f}')
    print(f'  Delta %:   {improvement_pct:+.2f}%')


Лучшая модель: Ridge_Blend
Лучшее R²: 0.869461

Улучшение относительно бейзлайна:
  Baseline:  R² = 0.858277
  Advanced:  R² = 0.869461
  Delta R²:  +0.011184
  Delta %:   +1.30%


Видим хороший прирост по метрике в ходе исследования моделей. Минимальное требование выполнено. Построим файл submission по предсказаниям лучшей модели для получения скоров на kaggle.

In [50]:
submission = pd.DataFrame({
    'id': test_ids,
    'FloodProbability': ridge_test
})

submission.to_csv('results/solution.csv', index=False)