# Импортируем необходимые библиотеки

In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import lightgbm as lgb

import optuna

import warnings
warnings.filterwarnings('ignore')

# Загрузим данные

In [None]:
train_features = pd.read_csv('/kaggle/input/dota-res-predict/train_features.csv', index_col='match_id_hash')
train_targets = pd.read_csv('/kaggle/input/dota-res-predict/train_targets.csv', index_col='match_id_hash')

In [None]:
train_features.head(5)

In [None]:
# train_targets.head(2)
print(len(list(train_features.columns)))
features = list(train_features.columns)
features_idx = [x for x in range(1, 246)]
for i in range(len(features)):
    s = f'{features_idx[i]}. {features[i]}'
    print(s)

In [None]:
X = train_features
y = train_targets['radiant_win']
print(X.shape)
print(train_features.head(5))

# EDA (5 баллов)

In [None]:
# тут EDA, который вы считаете необходимым, например, изучим количество игр в разных режимах
print(X.shape)
print()
print(X['lobby_type'].value_counts())

In [None]:
avg_time = X['game_time'].mean()
median_time = X['game_time'].median()
min_t, max_t = X['game_time'].min(), X['game_time'].max()
print(f'{avg_time=:.2f}')
print(f'{median_time=:.2f}')
print(f'{min_t=:.2f}\n{max_t=:.2f}')

rgc = [f'r{i}_gold' for i in range(1, 6)]
dgc = [f'd{i}_gold' for i in range(1, 6)]

X['r_total_gold'] = X[rgc].sum(axis=1)
X['d_total_gold'] = X[dgc].sum(axis=1)
print(f"avg gold r: {X['r_total_gold'].mean():.2f}")
print(f"avg gold d: {X['d_total_gold'].mean():.2f}\n")

corr = X.corrwith(y).abs().sort_values(ascending=False)
print(corr.head(10))

In [None]:
# посмотрим распределение таргета
td = y.value_counts()
print(td)
print()
print(f"radiant win: {td[1]} ({td[1]/len(y)*100}%)")
print(f"dire win: {td[0]} ({td[0]/len(y)*100}%)")

In [None]:
# далее ваш EDA - все, что вы считаете необходимым

# Обучим CatBoost на чистых данных и посмотрим на метрики

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
oof_cat = np.zeros(len(X))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = CatBoostClassifier(
        early_stopping_rounds=100,
        random_state=69,
        verbose=0,
        thread_count=4
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

    y_pred = model.predict_proba(X_valid)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_valid, y_pred)
    fold_acc = accuracy_score(y_valid, y_pred > 0.5)
    print(f'FOLD {fold}, Validation ROC-AUC score: {fold_roc_auc:.4f}, Accuracy score: {fold_acc:.4f}')

    oof_cat[valid_idx] = y_pred

oof_roc_auc = roc_auc_score(y, oof_cat)
oof_accuracy = accuracy_score(y, oof_cat > 0.5)
print(f'\nOverall CAT OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

# Генерация фичей (5 баллов)

Генерация фичей является одной из важнейших частей построения пайплайна машинного обучения. В данной части вам предлагается придумать и сгенерировать полезные фичи для модели. Важно: они должны повысить метрику. В качестве примера сгенерированы статистические фичи. Вы можете не использовать их, если не считаете их нужными.

In [None]:
def fe(data):
    for c in [
        'kills', 'deaths', 'assists', 'denies', 'gold', 'lh', 'xp', 
        'health', 'max_health', 'max_mana', 'level', 'x', 'y', 
        'stuns', 'creeps_stacked', 'camps_stacked', 'rune_pickups',
        'firstblood_claimed', 'teamfight_participation', 'towers_killed', 
        'roshans_killed', 'obs_placed', 'sen_placed'
    ]:
        r_columns = [f'r{i}_{c}' for i in range(1, 6)]
        d_columns = [f'd{i}_{c}' for i in range(1, 6)]

        eps = 1e-8 # избегаем деления на 0
        
        data['r_total_' + c] = data[r_columns].sum(1)
        data['d_total_' + c] = data[d_columns].sum(1)
        data['total_' + c + '_ratio'] = data['r_total_' + c] / (data['d_total_' + c] + eps)
        
        # попробуйте другие статистики, например среднее или стандартное отклонение
        # более того, если считаете, что фичи, основанные на статистиках не нужны, можете их не использовать - они приведены в качестве примера
        data[f'r_mean_{c}'] = data[r_columns].mean(1)
        data[f'd_mean_{c}'] = data[d_columns].mean(1)
        data[f'r_std_{c}'] = data[r_columns].std(1)
        data[f'd_std_{c}'] = data[d_columns].std(1)
        
        data[f'r_max_{c}'] = data[r_columns].max(1)
        data[f'd_max_{c}'] = data[d_columns].max(1)
        data[f'r_min_{c}'] = data[r_columns].min(1)
        data[f'd_min_{c}'] = data[d_columns].min(1)

        for team in ['r', 'd']:
            kills_columns = [f'{team}{i}_kills' for i in range(1, 6)]
            deaths_columns = [f'{team}{i}_deaths' for i in range(1, 6)]
            assists_columns = [f'{team}{i}_assists' for i in range(1, 6)]
            
            total_kills = data[kills_columns].sum(1)
            total_deaths = data[deaths_columns].sum(1) + eps
            total_assists = data[assists_columns].sum(1)
            
            data[f'{team}_kda'] = (total_kills + total_assists) / total_deaths
            data[f'{team}_kill_participation'] = total_kills / (total_kills + eps)
        for team in ['r', 'd']:
            gold_cols = [f'{team}{i}_gold' for i in range(1, 6)]
            lh_cols = [f'{team}{i}_lh' for i in range(1, 6)]
            
            total_gold = data[gold_cols].sum(axis=1)
            total_lh = data[lh_cols].sum(axis=1) + eps
            
            data[f'{team}_gpm'] = total_gold / (data['game_time'] / 60 + eps)  # GPM
            data[f'{team}_gold_per_lh'] = total_gold / total_lh
    

        for team in ['r', 'd']:
            x_cols = [f'{team}{i}_x' for i in range(1, 6)]
            y_cols = [f'{team}{i}_y' for i in range(1, 6)]
    
            
            data[f'{team}_center_x'] = data[x_cols].mean(1) #ах*еть ху это центр масс
            data[f'{team}_center_y'] = data[y_cols].mean(1)
            
            data[f'{team}_spread_x'] = data[x_cols].std(1)
            data[f'{team}_spread_y'] = data[y_cols].std(1)
        
        for team in ['r', 'd']:
            obs_cols = [f'{team}{i}_obs_placed' for i in range(1, 6)]
            sen_cols = [f'{team}{i}_sen_placed' for i in range(1, 6)]
            
            data[f'{team}_vision_score'] = data[obs_cols].sum(axis=1) + data[sen_cols].sum(axis=1)
        
        for team in ['r', 'd']:
            level_cols = [f'{team}{i}_level' for i in range(1, 6)]
            xp_cols = [f'{team}{i}_xp' for i in range(1, 6)]
            
            data[f'{team}_avg_level'] = data[level_cols].mean(axis=1)
            data[f'{team}_level_spread'] = data[level_cols].std(axis=1)
            data[f'{team}_total_xp'] = data[xp_cols].sum(axis=1)
            
        data['gold_advantage'] = data['r_total_gold'] - data['d_total_gold']
        data['xp_advantage'] = data['r_total_xp'] - data['d_total_xp']
        data['kill_advantage'] = data['r_total_kills'] - data['d_total_kills']
        data['level_advantage'] = data['r_avg_level'] - data['d_avg_level']
        
        data['game_time_minutes'] = data['game_time'] / 60
        data['early_game'] = (data['game_time_minutes'] <= 15).astype(int)
        data['mid_game'] = ((data['game_time_minutes'] > 15) & (data['game_time_minutes'] <= 35)).astype(int)
        data['late_game'] = (data['game_time_minutes'] > 35).astype(int)
        
    return data

# Теперь обучим CatBoost на данных с новыми фичами и посмотрим на метрики

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
oof_cat = np.zeros(len(X))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # чтобы в тест не утекли глобальные статистики трейна, сначала сплитим, потом генерируем фичи
    X_train = fe(X_train)
    X_valid = fe(X_valid)

    model = CatBoostClassifier(
        early_stopping_rounds=100,
        random_state=69,
        verbose=0,
        thread_count=4
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

    y_pred = model.predict_proba(X_valid)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_valid, y_pred)
    fold_acc = accuracy_score(y_valid, y_pred > 0.5)
    print(f'FOLD {fold}, Validation ROC-AUC score: {fold_roc_auc:.4f}, Accuracy score: {fold_acc:.4f}')

    oof_cat[valid_idx] = y_pred

oof_roc_auc = roc_auc_score(y, oof_cat)
oof_accuracy = accuracy_score(y, oof_cat > 0.5)
print(f'\nOverall OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

# Обработка категориальных фичей средствами CatBoost

Попробуем подать категориальные фичи в модель через cat_features.

In [54]:
# выделим очевидные категориальные фичи - вы можете попробовать выделить больше / сгенерировать новые или же вовсе отказаться от этой идеи
cats = ['game_mode', 'lobby_type']

hero_features = []
for team in ['r', 'd']:
    for i in range(1, 6):
        hero_col = f'{team}{i}_hero_id'
        if hero_col in X.columns:
            hero_features.append(hero_col)

cats.extend(hero_features)

b_features = []
for team in ['r', 'd']:
    for i in range(1, 6):
        fb_col = f'{team}{i}_firstblood_claimed'
        if fb_col in X.columns:
            b_features.append(fb_col)

cats.extend(b_features)

custom_cats = [
    'game_duration_category', 'total_gold_category', 'kill_advantage_category'
]

for t in ['r', 'd']:
    custom_cats += [f'{t}_main_support', f'{t}_main_carry']
    custom_cats += [f'{t}{i}_map_quadrant' for i in range(1, 6)]

cats.extend([c for c in custom_cats if c in X.columns])

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
oof_cat = np.zeros(len(X))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # чтобы в тест не утекли глобальные статистики трейна, сначала сплитим, потом генерируем фичи
    X_train = fe(X_train)
    X_valid = fe(X_valid)

    model = CatBoostClassifier(
        cat_features=cats,
        early_stopping_rounds=100,
        random_state=69,
        verbose=0,
        thread_count=4
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

    y_pred = model.predict_proba(X_valid)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_valid, y_pred)
    fold_acc = accuracy_score(y_valid, y_pred > 0.5)
    print(f'FOLD {fold}, Validation ROC-AUC score: {fold_roc_auc:.4f}, Accuracy score: {fold_acc:.4f}')

    oof_cat[valid_idx] = y_pred

oof_roc_auc = roc_auc_score(y, oof_cat)
oof_accuracy = accuracy_score(y, oof_cat > 0.5)
print(f'\nOverall CAT w/ FE OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

# Отбор фичей (5 баллов)

Возможно, на этапе генерации фичей вы решили прибегнуть к автоматической генерации (если нет, то советую обратить внимание на библиотеку OpenFE), или же просто генерировали все подряд. Также, возможно, исходные данные содержали бесполезные и шумные фичи.

Вам предлагается исследовать это и оставить только те фичи, которые вы считаете важными.

Идеи для отбора фичей: 
- по feature importance
- по permutation importance
- по shap values
- recursive feature elimination
- sequential feature selection

Также обратите внимание на гайд от catboost: https://github.com/catboost/catboost/blob/master/catboost/tutorials/feature_selection/select_features_tutorial.ipynb

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score
import shap
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [None]:
# бейзлайн: используем встроенные инструменты catboost - вы можете работать с ним или обратиться к другим инструментам
# здесь нужно быть аккуратным, ведь удаление фичей всегда может привести к падению скора 
# необходимо подобрать грамотный подход, а это можно сделать только путем экспериментов

model = CatBoostClassifier(
    cat_features=cats,
    random_state=69,
    verbose=0,
    thread_count=4
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)

X_train = fe(X_train)
X_test = fe(X_test)

train_pool = Pool(X_train, y_train, cat_features=cats)
test_pool = Pool(X_test, y_test, cat_features=cats)

summary = model.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=list(range(X_train.shape[1])),
    num_features_to_select=300, # количество фичей, которое хотим отобрать - один из самых важных параметров
    algorithm=None, # точно стоит поиграться с выбором алгоритма - важный параметр !!!
    steps=None, # количество шагов
    shap_calc_type=None,
    train_final_model=False,
    verbose=None,
    logging_level=None,
    plot=False
)

catboost_features = summary['selected_features'] # отобранные фичи
catboost_features = X_train.columns[catboost_features].tolist()


#permutation importance
model_full = CatBoostClassifier(
    cat_features=cats,
    random_state=69,
    verbose=0,
    thread_count=4,
    iterations=500
)

model_full.fit(X_train, y_train)
perm_imp = permutation_importance(
    model_full, X_test, y_test, 
    n_repeats=5, 
    random_state=69,
    scoring='roc_auc'
)

perm_f_scores = pd.DataFrame({
    'feature': X_train.columns,
    'importance': perm_imp.importances_mean,
    'std': perm_imp.importances_std
}).sort_values('importance', ascending=False)

permutation_f = perm_f_scores.head(300)['feature'].tolist()
print(f"perm imp: {len(permutation_f)}")

#shap
sample_size = min(1000, len(X_train))
X_sample = X_train.sample(n=sample_size, random_state=69)
y_sample = y_train.loc[X_sample.index]


model_shap = CatBoostClassifier(
    cat_features=cats,
    random_state=69,
    verbose=0,
    thread_count=4,
    iterations=300
)
model_shap.fit(X_sample, y_sample)

explainer = shap.TreeExplainer(model_shap)
shap_values = explainer.shap_values(X_sample.iloc[:500])

shap_importance = np.abs(shap_values).mean(0)
shap_feature_scores = pd.DataFrame({
    'feature': X_sample.columns,
    'shap_importance': shap_importance
}).sort_values('shap_importance', ascending=False)

shap_features = shap_feature_scores.head(300)['feature'].tolist()
print(f"SHAP: {len(shap_features)}")


#ensemble

feature_votes = defaultdict(int)
feature_methods = {
    'CatBoost': catboost_features,
    'Permutation': permutation_f,
    'SHAP': shap_features,
}

for method, features in feature_methods.items():
    for feature in features:
        feature_votes[feature] += 1

ensemble_ranking = sorted(feature_votes.items(), key=lambda x: x[1], reverse=True)

ensemble_f2 = [f for f, votes in ensemble_ranking if votes >= 2]

if len(ensemble_f2) < 300:
    top_features = [f for f, _ in ensemble_ranking[:300]]
    ensemble_features = top_features
else:
    ensemble_features = ensemble_f2[:300]

print(f"ensemble >=2: {len(ensemble_f2)}")
print(f"final: {len(ensemble_features)}")


results = {}

X_train_cb = X_train[catboost_features]
model_cb = CatBoostClassifier(cat_features=[f for f in cats if f in catboost_features], random_state=69, verbose=0, iterations=300)
cv_cb = cross_val_score(model_cb, X_train_cb, y_train, cv=3, scoring='roc_auc')
results['CatBoost'] = cv_cb.mean()
print('catboost training...')

X_train_perm = X_train[permutation_f]
model_perm = CatBoostClassifier(cat_features=[f for f in cats if f in permutation_f], random_state=69, verbose=0, iterations=300)
cv_perm = cross_val_score(model_perm, X_train_perm, y_train, cv=3, scoring='roc_auc')
results['Permutation'] = cv_perm.mean()
print('permutation training...')

X_train_shap = X_train[shap_features]
model_shap_eval = CatBoostClassifier(cat_features=[f for f in cats if f in shap_features], random_state=69, verbose=0, iterations=300)
cv_shap = cross_val_score(model_shap_eval, X_train_shap, y_train, cv=3, scoring='roc_auc')
results['SHAP'] = cv_shap.mean()
print('shap training...')

X_train_ens = X_train[ensemble_features]
model_ens = CatBoostClassifier(cat_features=[f for f in cats if f in ensemble_features], random_state=69, verbose=0, iterations=300)
cv_ens = cross_val_score(model_ens, X_train_ens, y_train, cv=3, scoring='roc_auc')
results['Ensemble'] = cv_ens.mean()
print('ensemble training...')

for method, score in results.items():
    print(f"{method}: {score:.4f}")

best_method = max(results, key=results.get)
print(f"THE BEST: {best_method} ({results[best_method]:.4f})")

if best_method == 'CatBoost':
    selected_features = catboost_features
elif best_method == 'Permutation':
    selected_features = permutation_f
elif best_method == 'SHAP':
    selected_features = shap_features
else:
    selected_features = ensemble_features

print(f"Итого признаков: {len(selected_features)}")
print()
print(selected_features)

# Обучим модель на отобранных и фичах и посмотрим на метрики

In [None]:
selected_features = ['game_mode', 'r1_kills', 'r1_lh', 'r1_max_health', 'r1_max_mana',
                     'r2_hero_id', 'r2_health', 'r2_rune_pickups', 'r3_max_health',
                     'r3_teamfight_participation', 'r4_deaths', 'r4_health', 'r4_level',
                     'r4_y', 'r4_teamfight_participation', 'r5_max_health', 'r5_x',
                     'r5_rune_pickups', 'r5_firstblood_claimed', 'd1_lh', 'd1_xp', 
                     'd1_health', 'd1_level', 'd1_rune_pickups', 'd2_hero_id', 'd2_assists',
                     'd2_creeps_stacked', 'd2_sen_placed', 'd3_hero_id', 'd3_lh', 'd3_xp', 'd3_max_mana',
                     'd3_y', 'd4_assists', 'd4_y', 'd4_stuns', 'd5_hero_id', 'd5_denies', 'd5_xp', 'd5_rune_pickups',
                     'r_total_kills', 'r_std_kills', 'd_max_kills', 'r_kda', 'd_kda', 'r_gold_per_lh', 'd_gpm', 'd_gold_per_lh',
                     'r_center_y', 'r_spread_y', 'd_center_x', 'r_level_spread', 'd_avg_level', 'gold_advantage', 'xp_advantage',
                     'level_advantage', 'total_deaths_ratio', 'r_mean_deaths', 'total_assists_ratio', 'd_mean_assists', 'd_std_assists',
                     'total_denies_ratio', 'd_mean_denies', 'r_std_denies', 'd_max_denies', 'total_gold_ratio', 'r_mean_lh', 'r_std_lh',
                     'd_std_lh', 'd_max_lh', 'd_min_lh', 'total_xp_ratio', 'r_mean_xp', 'r_std_xp', 'r_total_health', 'total_health_ratio',
                     'r_mean_health', 'd_mean_health', 'r_std_health', 'r_min_health', 'total_max_health_ratio', 'r_total_max_mana',
                     'r_std_max_mana', 'd_std_max_mana', 'r_max_max_mana', 'd_max_max_mana', 'd_total_level', 'total_level_ratio',
                     'r_std_level', 'd_min_level', 'd_total_x', 'r_min_x', 'd_std_y', 'd_max_y', 'total_stuns_ratio', 'd_mean_creeps_stacked',
                     'd_total_camps_stacked', 'r_total_rune_pickups', 'total_rune_pickups_ratio', 'r_std_rune_pickups', 'd_std_rune_pickups',
                     'd_mean_teamfight_participation', 'd_max_teamfight_participation', 'r_min_teamfight_participation', 'r_total_towers_killed',
                     'd_total_towers_killed', 'total_towers_killed_ratio', 'r_mean_towers_killed', 'd_mean_towers_killed', 'r_std_towers_killed',
                     'total_obs_placed_ratio', 'r_mean_sen_placed', 'objectives_len', 'r1_hero_id', 'r1_denies', 'r1_gold', 'r1_x', 'r1_towers_killed', 
                     'r2_assists', 'r2_camps_stacked', 'r2_roshans_killed', 'r3_hero_id', 'r3_health', 'r3_level', 'r3_x', 'r3_y', 'r3_rune_pickups',
                     'r3_towers_killed', 'r3_sen_placed', 'r4_hero_id', 'r4_assists', 'r4_xp', 'r4_max_health', 'r4_rune_pickups', 'r4_obs_placed', 
                     'r5_hero_id', 'r5_towers_killed', 'r5_obs_placed', 'd1_hero_id', 'd1_assists', 'd1_stuns', 'd1_creeps_stacked', 'd1_camps_stacked',
                     'd1_firstblood_claimed', 'd1_towers_killed', 'd2_deaths', 'd2_denies', 'd2_max_health', 'd2_level', 'd2_x', 'd2_y',
                     'd2_teamfight_participation', 'd3_kills', 'd3_health', 'd3_level', 'd3_creeps_stacked', 'd4_hero_id', 'd4_lh', 'd4_max_mana',
                     'd4_rune_pickups', 'd4_firstblood_claimed', 'd4_towers_killed', 'd5_assists', 'd5_max_mana', 'd5_stuns', 'd5_camps_stacked',
                     'd5_towers_killed', 'r_total_gold', 'd_total_kills', 'd_mean_kills', 'd_min_kills', 'r_center_x', 'r_spread_x', 'd_center_y',
                     'r_avg_level', 'r_total_xp', 'd_total_xp', 'kill_advantage', 'game_time_minutes', 'r_total_deaths', 'r_max_deaths', 'd_min_deaths',
                     'd_total_assists', 'r_mean_assists', 'r_std_assists', 'r_max_assists', 'r_mean_denies', 'r_max_denies', 'd_min_denies', 'd_std_gold',
                     'd_min_gold', 'r_total_lh', 'd_total_lh', 'total_lh_ratio', 'd_mean_lh', 'r_max_lh', 'r_min_xp', 'd_total_health', 'r_max_health',
                     'd_max_health', 'r_total_max_health', 'd_total_max_health', 'r_max_max_health', 'r_min_max_health', 'd_total_max_mana',
                     'r_mean_max_mana', 'r_mean_level', 'r_max_level', 'd_max_level', 'r_total_x', 'r_mean_x', 'd_mean_x', 'r_max_x', 'd_max_x',
                     'd_total_y', 'total_y_ratio', 'r_mean_y', 'd_min_y', 'd_mean_stuns', 'r_total_creeps_stacked', 'total_creeps_stacked_ratio',
                     'r_max_creeps_stacked', 'd_mean_camps_stacked', 'r_total_teamfight_participation', 'd_total_teamfight_participation',
                     'r_std_teamfight_participation', 'r_max_teamfight_participation', 'r_max_towers_killed', 'd_max_towers_killed',
                     'r_max_roshans_killed', 'd_total_obs_placed', 'r_mean_obs_placed', 'r_std_obs_placed', 'd_min_obs_placed', 'd_total_sen_placed',
                     'd_mean_sen_placed', 'd_max_sen_placed', 'r1_health', 'r2_gold', 'd1_max_mana', 'r2_lh', 'd_min_xp', 'total_x_ratio',
                     'd_mean_max_health', 'total_teamfight_participation_ratio', 'total_kills_ratio', 'r3_xp', 'r_min_gold', 'r_std_gold',
                     'd3_teamfight_participation', 'r2_max_mana', 'd1_max_health', 'total_max_mana_ratio', 'd_min_x', 'd3_x', 'd_min_max_mana',
                     'd4_kills', 'r4_x', 'r3_gold', 'r2_teamfight_participation', 'r_std_y', 'r_max_stuns', 'd_max_creeps_stacked', 'd2_lh',
                     'd_max_rune_pickups', 'r_gpm', 'r2_xp', 'total_sen_placed_ratio', 'd_mean_y', 'd_max_obs_placed', 'd_std_denies', 'd_spread_y',
                     'r4_stuns', 'd4_health', 'd_std_xp', 'd_total_stuns', 'r1_stuns', 'r5_denies', 'r4_lh', 'r5_teamfight_participation', 'r4_kills',
                     'd_std_creeps_stacked', 'd_total_rune_pickups', 'd_min_teamfight_participation', 'r1_teamfight_participation', 'd2_health', 'd2_rune_pickups',
                     'd_max_max_health', 'total_camps_stacked_ratio', 'd1_x']

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
oof_cat = np.zeros(len(X))


# missing = [f for f in selected_features if f not in fe(X_train).columns]
# print('Missing features:', missing)


for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # чтобы в тест не утекли глобальные статистики трейна, сначала сплитим, потом генерируем фичи
    X_train = fe(X_train)[selected_features]
    X_valid = fe(X_valid)[selected_features]

    model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.1,
    depth=10,
    thread_count=4
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

    y_pred = model.predict_proba(X_valid)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_valid, y_pred)
    fold_acc = accuracy_score(y_valid, y_pred > 0.5)
    print(f'FOLD {fold}, Validation ROC-AUC score: {fold_roc_auc:.4f}, Accuracy score: {fold_acc:.4f}')

    oof_cat[valid_idx] = y_pred

oof_roc_auc = roc_auc_score(y, oof_cat)
oof_accuracy = accuracy_score(y, oof_cat > 0.5)
print(f'\nOverall CAT w/ FE&FS OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

# Блендинг: добавим XGBoost и LightGBM

In [None]:
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    X_train_fe = fe(X_train)
    X_valid_fe = fe(X_valid)
    
    X_train_fs = X_train_fe[selected_features]
    X_valid_fs = X_valid_fe[selected_features]

    for col in cats:
        if col in X_train_fs.columns:
            X_train_fs[col] = X_train_fs[col].astype('category')
            X_valid_fs[col] = X_valid_fs[col].astype('category')

    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=69,
        n_jobs=4,
        metric='auc',
        early_stopping_round=100,
        verbosity=-1
    )
    
    categorical_features = [col for col in cats if col in X_train_fs.columns]
    model.fit(
        X_train_fs, y_train,
        eval_set=[(X_valid_fs, y_valid)],
        categorical_feature=categorical_features
    )

    y_pred = model.predict_proba(X_valid_fs)[:, 1]
    oof_lgb[valid_idx] = y_pred

print(f'LightGBM OOF ROC-AUC: {roc_auc_score(y, oof_lgb):.4f}')

In [None]:
oof_xgb = np.zeros(len(X))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    X_train_fe = fe(X_train)
    X_valid_fe = fe(X_valid)
    
    X_train_fs = X_train_fe[selected_features]
    X_valid_fs = X_valid_fe[selected_features]

    for col in cats:
        if col in X_train_fs.columns:
            X_train_fs[col] = X_train_fs[col].astype('category')
            X_valid_fs[col] = X_valid_fs[col].astype('category')

    model = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=69,
        n_jobs=4,
        eval_metric='auc',
        early_stopping_rounds=100,
        use_label_encoder=False,
        enable_categorical=True,
        tree_method='hist'
    )
    
    model.fit(
        X_train_fs, y_train,
        eval_set=[(X_valid_fs, y_valid)],
        verbose=False
    )

    y_pred = model.predict_proba(X_valid_fs)[:, 1]
    oof_xgb[valid_idx] = y_pred

print(f'\nOverall XGB w/ FE&FS OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

In [None]:
# сблендим предсказания и посмотрим на метрики

oof_blend = (oof_cat + oof_lgb + oof_xgb) / 3

oof_roc_auc = roc_auc_score(y, oof_blend)
oof_accuracy = accuracy_score(y, oof_blend > 0.5)
print(f'\nOverall BLEND w/ FE&FS OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

# Подбор гиперпараметров (15 баллов)

Для изучения того, какие гиперпараметры доступны у модели для выбора можно использовать '? ModelName'
Определитесь с тем, какие гиперпараметры вы хотите оптимизировать и укажите их в objective. Выполните поиск гиперпараметров с Optuna

In [None]:
# ? CatBoostClassifier

In [None]:
# ? lgb.LGBMClassifier

In [None]:
# ? XGBClassifier

In [67]:
def objective(trial):
    bootstrap_type = trial.suggest_categorical('bootstrap_type',
                                               ['Bayesian', 'Bernoulli', 'MVS'])
    
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'task_type': 'CPU',
        'random_state': 69,
        'verbose': 0,
        # добавьте сюда прочие гиперпараметры, к примеру l2_leaf_reg, random_strength, bagging_temperature, ...
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'random_strength': trial.suggest_float('random_strength', 0.5, 2.0),
        # 'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),
        'rsm': trial.suggest_float('rsm', 0.5, 1.0),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    }
    
    # if 'cats' in globals():
    #     params['cat_features'] = cats
    if bootstrap_type == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)
    else:
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
    oof_cat = np.zeros(len(X))
    
    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
        X_train = fe(X_train).loc[:, selected_features]
        X_valid = fe(X_valid).loc[:, selected_features]

        actual_cats = [c for c in cats if c in X_train.columns]
        if len(actual_cats) < len(cats):
            dropped = set(cats) - set(actual_cats)
            print(f"dropped missing cat_features: {dropped}")

        
        model = CatBoostClassifier(**params, cat_features=actual_cats)
        model.fit(
            X_train, y_train,
            eval_set=(X_valid, y_valid),
            early_stopping_rounds=100,
            verbose=0
        )
        
        y_pred = model.predict_proba(X_valid)[:, 1]
        oof_cat[valid_idx] = y_pred

    oof_roc_auc = roc_auc_score(y, oof_cat)
    
    return oof_roc_auc

In [63]:
# print("Columns in X_train:", list(X_train.columns))
# print("cat_features requested:", params.get('cat_features'))
# missing = [c for c in params.get('cat_features', []) if c not in X_train.columns]
# print("missing cat_features:", missing)

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=3600)

cat_params = study.best_params
cat_params

[I 2025-05-26 22:36:03,476] A new study created in memory with name: no-name-b81efa1d-1132-43b5-9595-419abc78eccb


dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 

[I 2025-05-26 22:46:00,103] Trial 0 finished with value: 0.8305562985196095 and parameters: {'bootstrap_type': 'Bayesian', 'iterations': 1315, 'learning_rate': 0.023336870969410214, 'depth': 7, 'l2_leaf_reg': 3.7053046186268253, 'random_strength': 1.3786416747538321, 'border_count': 84, 'leaf_estimation_iterations': 10, 'rsm': 0.68504513805649, 'bagging_temperature': 0.34844744507314473}. Best is trial 0 with value: 0.8305562985196095.


dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 

[I 2025-05-26 22:49:49,709] Trial 1 finished with value: 0.8300833719030136 and parameters: {'bootstrap_type': 'MVS', 'iterations': 1218, 'learning_rate': 0.07927583425616179, 'depth': 6, 'l2_leaf_reg': 1.0937817930180922, 'random_strength': 1.0067404809184528, 'border_count': 218, 'leaf_estimation_iterations': 9, 'rsm': 0.8620273591909112, 'subsample': 0.9052127389234579}. Best is trial 0 with value: 0.8305562985196095.


dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 

[I 2025-05-26 22:55:08,419] Trial 2 finished with value: 0.8307594144219459 and parameters: {'bootstrap_type': 'MVS', 'iterations': 723, 'learning_rate': 0.051873047558966476, 'depth': 7, 'l2_leaf_reg': 7.537855949196294, 'random_strength': 0.7484659550520107, 'border_count': 136, 'leaf_estimation_iterations': 7, 'rsm': 0.7468730427348923, 'subsample': 0.6831055388672738}. Best is trial 2 with value: 0.8307594144219459.


dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 

[I 2025-05-26 23:10:22,908] Trial 3 finished with value: 0.8283858346914176 and parameters: {'bootstrap_type': 'Bernoulli', 'iterations': 1699, 'learning_rate': 0.032892774794699625, 'depth': 10, 'l2_leaf_reg': 3.954954572401087, 'random_strength': 0.9729935349996279, 'border_count': 245, 'leaf_estimation_iterations': 5, 'rsm': 0.6288805577888466, 'subsample': 0.8205374650630195}. Best is trial 2 with value: 0.8307594144219459.


dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}
dropped missing cat_features: {'d5_firstblood_claimed', 'd3_firstblood_claimed', 'lobby_type', 'r1_firstblood_claimed', 'd2_firstblood_claimed', 'r4_firstblood_claimed', 'r3_firstblood_claimed', 'r2_firstblood_claimed'}


In [None]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'random_state': 69,
        'n_jobs': 4,
        'verbosity': -1,
        'early_stopping_rounds': 100
        # добавьте сюда прочие гиперпараметры, к примеру num_leaves, min_child_samples, subsample, ...
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
    oof_lgb = np.zeros(len(X))
    
    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
        X_train = fe(X_train).iloc[:, selected_features]
        X_valid = fe(X_valid).iloc[:, selected_features]
        
        for col in cats:
            X_train[col] = X_train[col].astype('category')
            X_valid[col] = X_valid[col].astype('category')
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            categorical_feature=cats,
        )
        
        y_pred = model.predict_proba(X_valid)[:, 1]
        oof_lgb[valid_idx] = y_pred

    oof_roc_auc = roc_auc_score(y, oof_lgb)
    
    return oof_roc_auc

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=3600)

lgb_params = study.best_params
lgb_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'random_state': 69,
    'n_jobs': 4,
    'verbosity': -1,
    'early_stopping_rounds': 100
})
lgb_params

In [None]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 2.0),
        'eval_metric': 'auc',
        'early_stopping_rounds': 100,
        'use_label_encoder': False,
        'enable_categorical': True,
        'tree_method': 'hist',
        'random_state': 69,
        'n_jobs': 4
        # добавьте сюда прочие гиперпараметры, к примеру gamma, min_child_weight, subsample, ...
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
    oof_xgb = np.zeros(len(X))
    
    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        

        X_train = fe(X_train).iloc[:, selected_features]
        X_valid = fe(X_valid).iloc[:, selected_features]
        
        for col in cats:
            X_train[col] = X_train[col].astype('category')
            X_valid[col] = X_valid[col].astype('category')
        
        model = XGBClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )
        
        y_pred = model.predict_proba(X_valid)[:, 1]
        oof_xgb[valid_idx] = y_pred

    oof_roc_auc = roc_auc_score(y, oof_xgb)
    
    return oof_roc_auc

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=3600)

xgb_params = study.best_params
xgb_params.update({
    'eval_metric': 'auc',
    'early_stopping_rounds': 100,
    'use_label_encoder': False,
    'enable_categorical': True,
    'tree_method': 'hist',
    'random_state': 69,
    'n_jobs': 4
})

# Обучим модели с новыми гиперпараметрами

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
oof_cat = np.zeros(len(X))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # чтобы в тест не утекли глобальные статистики трейна, сначала сплитим, потом генерируем фичи
    X_train = fe(X_train).iloc[:, selected_features]
    X_valid = fe(X_valid).iloc[:, selected_features]

    model = CatBoostClassifier(
        **cat_params,
        thread_count=4
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

    y_pred = model.predict_proba(X_valid)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_valid, y_pred)
    fold_acc = accuracy_score(y_valid, y_pred > 0.5)
    print(f'FOLD {fold}, Validation ROC-AUC score: {fold_roc_auc:.4f}, Accuracy score: {fold_acc:.4f}')

    oof_cat[valid_idx] = y_pred

oof_roc_auc = roc_auc_score(y, oof_cat)
oof_accuracy = accuracy_score(y, oof_cat > 0.5)
print(f'\nOverall Tuned CAT w/ FE&FS OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
oof_lgb = np.zeros(len(X))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # чтобы в тест не утекли глобальные статистики трейна, сначала сплитим, потом генерируем фичи
    X_train = fe(X_train).iloc[:, selected_features]
    X_valid = fe(X_valid).iloc[:, selected_features]

    # lgbm требует указания категориальных фичей как 'category'
    for col in cats:
        X_train[col] = X_train[col].astype('category')
        X_valid[col] = X_valid[col].astype('category')

    final_params = {
        **best_params,
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'random_state': 69,
        'n_jobs': 4,
        'verbosity': -1,
        'early_stopping_round': 100
    }

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        categorical_feature=cats
    )

    y_pred = model.predict_proba(X_valid)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_valid, y_pred)
    fold_acc = accuracy_score(y_valid, y_pred > 0.5)
    print(f'FOLD {fold}, Validation ROC-AUC score: {fold_roc_auc:.4f}, Accuracy score: {fold_acc:.4f}')

    oof_lgb[valid_idx] = y_pred

oof_roc_auc = roc_auc_score(y, oof_lgb)
oof_accuracy = accuracy_score(y, oof_lgb > 0.5)
print(f'\nOverall Tuned LGB w/ FE&FS OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
oof_xgb = np.zeros(len(X))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # чтобы в тест не утекли глобальные статистики трейна, сначала сплитим, потом генерируем фичи
    X_train = fe(X_train).iloc[:, selected_features]
    X_valid = fe(X_valid).iloc[:, selected_features]

    # xgb требует указания категориальных фичей как 'category'
    for col in cats:
        X_train[col] = X_train[col].astype('category')
        X_valid[col] = X_valid[col].astype('category')

    cat_columns = [X_train.columns.get_loc(col) for col in cats]

    final_params = {
        **xgb_params,
        'eval_metric': 'auc',
        'early_stopping_rounds': 100,
        'use_label_encoder': False,
        'enable_categorical': True,
        'tree_method': 'hist',
        'random_state': 69,
        'n_jobs': 4
    }

    model = XGBClassifier(**final_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    y_pred = model.predict_proba(X_valid)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_valid, y_pred)
    fold_acc = accuracy_score(y_valid, y_pred > 0.5)
    print(f'FOLD {fold}, Validation ROC-AUC score: {fold_roc_auc:.4f}, Accuracy score: {fold_acc:.4f}')

    oof_xgb[valid_idx] = y_pred

oof_roc_auc = roc_auc_score(y, oof_xgb)
oof_accuracy = accuracy_score(y, oof_xgb > 0.5)
print(f'\nOverall Tuned XGB w/ FE&FS OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

In [None]:
# сблендим предсказания и посмотрим на метрики

oof_blend = (oof_cat + oof_lgb + oof_xgb) / 3

oof_roc_auc = roc_auc_score(y, oof_blend)
oof_accuracy = accuracy_score(y, oof_blend > 0.5)
print(f'\nOverall Tuned BLEND w/ FE&FS OOF ROC-AUC: {oof_roc_auc:.4f}, OOF Accuracy: {oof_accuracy:.4f}')

# Итоговая важность фичей

In [None]:
# изучите итоговые важности фичей моделей catboost, lgbm, xgb (используйте обычный feature importance)
# можно использовать модели с последнего фолда
# если хочется более точно - можно посчитать важности на каждом фолде и усреднить (это опционально)

# ваш код:
from catboost import CatBoostClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
for train_idx, valid_idx in skf.split(X, y):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

cat_model = CatBoostClassifier(**cat_params, thread_count=4)
cat_model.fit(X_train, y_train)
cat_imp = dict(zip(X_train.columns, cat_model.get_feature_importance()))
cat_top = sorted(cat_imp.items(), key=lambda x: x[1], reverse=True)[:20]

for col in cats:
    X_train[col] = X_train[col].astype('category')
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)
lgb_imp = dict(zip(X_train.columns, lgb_model.feature_importances_))
lgb_top = sorted(lgb_imp.items(), key=lambda x: x[1], reverse=True)[:20]

xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_imp = dict(zip(X_train.columns, xgb_model.feature_importances_))
xgb_top = sorted(xgb_imp.items(), key=lambda x: x[1], reverse=True)[:20]

print("\nCATBOOST TOP-20:")
for f, imp in cat_top: print(f"{f:30s} {imp:.4f}")

print("\nLIGHTGBM TOP-20:")
for f, imp in lgb_top: print(f"{f:30s} {imp:.4f}")

print("\nXGBOOST TOP-20:")
for f, imp in xgb_top: print(f"{f:30s} {imp:.4f}")
