In [1]:
import optuna
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np
import joblib

In [2]:
# Загрузка данных
train = pd.read_csv('my_data/2_my_train.csv')
X = train.drop(columns=['target', 'variantid1', 'variantid2'])
y = train['target']

In [3]:
# Установка параметров кросс-валидации
n_splits = 5   
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [4]:
# Фиксированные параметры
fixed_params = {
    'max_iter': 1000,  # можно изменить при необходимости
    'loss': 'log_loss',
    'random_state': 42,
    'scoring': 'roc_auc',
    'n_iter_no_change': 100,
    'verbose': 0,
    'class_weight': 'balanced',
    'early_stopping': 'auto',
    'warm_start': True
}

In [5]:
# Функция для оптимизации HistGradientBoostingClassifier
def optimize_hgb(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 3, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 100),
        'l2_regularization': trial.suggest_float('l2_regularization', 0, 10.0),
        'max_features': trial.suggest_float('max_features', 0.8, 1.0),
        'max_bins': trial.suggest_int('max_bins', 8, 255),
        'tol': trial.suggest_float('tol', 1e-7, 1e-3),  
    }
    
    aucs = []
    
    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
        model = HistGradientBoostingClassifier(**params)
        model.fit(X_train, y_train)
        
        preds = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, preds)
        aucs.append(auc)
    
    return np.mean(aucs)

In [6]:
# Оптимизация гиперпараметров с помощью Optuna
study_hgb = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_hgb.optimize(optimize_hgb, n_trials=4)

[I 2024-09-08 08:26:29,214] A new study created in memory with name: no-name-fa0d579e-d8c4-4cc2-b028-55cc255070b1
[I 2024-09-08 08:47:48,712] Trial 0 finished with value: 0.9638123102572287 and parameters: {'learning_rate': 0.03751655787285152, 'max_leaf_nodes': 951, 'max_depth': 74, 'min_samples_leaf': 60, 'l2_regularization': 1.5601864044243652, 'max_features': 0.8311989040672406, 'max_bins': 22, 'tol': 0.0008661895281603576}. Best is trial 0 with value: 0.9638123102572287.
[I 2024-09-08 08:52:44,225] Trial 1 finished with value: 0.9526150592931998 and parameters: {'learning_rate': 0.060151389673146566, 'max_leaf_nodes': 709, 'max_depth': 5, 'min_samples_leaf': 97, 'l2_regularization': 8.324426408004218, 'max_features': 0.8424678221356553, 'max_bins': 53, 'tol': 0.00018348616940244846}. Best is trial 0 with value: 0.9638123102572287.
[I 2024-09-08 09:11:36,405] Trial 2 finished with value: 0.9618270216290895 and parameters: {'learning_rate': 0.03049380007165782, 'max_leaf_nodes': 526

In [7]:
best_params_hgb = study_hgb.best_params
final_params = {**fixed_params, **best_params_hgb}

In [8]:
# Финальное обучение на всей выборке и сохранение модели
final_hgb = HistGradientBoostingClassifier(**final_params)
final_hgb.fit(X, y)
joblib.dump(final_hgb, 'final_hgb_model.pkl')

['final_hgb_model.pkl']