
# 07 — LightGBM + Optuna

Быстрый тюнинг LGBM через Optuna с приоритетом GPU. Данные — как в `01_eda.ipynb`.


In [2]:

from __future__ import annotations

import json
from pathlib import Path
from typing import List, Tuple

import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

try:
    import lightgbm as lgb
except ImportError as exc:
    raise ImportError("Установите lightgbm: pip install lightgbm") from exc

optuna.logging.set_verbosity(optuna.logging.WARNING)


## Настройка путей и констант



In [3]:

PROJECT_ROOT = Path('..', '').resolve()
DATA_DIR = PROJECT_ROOT / 'data' / 'raw'
MODELS_DIR = PROJECT_ROOT / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

TARGET_COL = 'hospital_death'
ID_COL = 'encounter_id'
PATIENT_COL = 'patient_id'
RANDOM_STATE = 42
N_TRIALS = 20
N_SPLITS = 3

print(f'Project root: {PROJECT_ROOT}')
print(f'Data dir: {DATA_DIR}')


Project root: D:\cursor projects\automl2025
Data dir: D:\cursor projects\automl2025\data\raw


## Загрузка и базовая подготовка данных

Читаем тренировочный и тестовый CSV-файлы, отделяем таргет от признаков и убираем ID-колонки из фичей. Одновременно объединяем train и test, чтобы единообразно привести категориальные признаки к типу category.


In [4]:

def load_data(data_dir: Path) -> Tuple[pd.DataFrame, pd.DataFrame]:
    train_path = data_dir / 'training_v2.csv'
    test_path = data_dir / 'unlabeled.csv'
    if not train_path.exists() or not test_path.exists():
        raise FileNotFoundError('Скачайте данные Kaggle через scripts/download_data.py --unzip')
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    print(f'train shape={train_df.shape}, test shape={test_df.shape}')
    return train_df, test_df


def prepare_frames(train_df: pd.DataFrame, test_df: pd.DataFrame):
    y = train_df[TARGET_COL].copy()
    train_df = train_df.drop(columns=[TARGET_COL])

    test_ids = test_df[ID_COL].copy()

    drop_cols = {ID_COL, PATIENT_COL}
    feature_cols = [c for c in train_df.columns if c not in drop_cols]

    full = pd.concat([train_df, test_df], axis=0, ignore_index=True)

    cat_cols = [c for c in feature_cols if full[c].dtype == 'object']
    for col in cat_cols:
        full[col] = full[col].astype('category')

    X_all = full[feature_cols]
    X_train = X_all.iloc[: len(train_df)].copy()
    X_test = X_all.iloc[len(train_df) :].copy()

    return X_train, y, X_test, test_ids, cat_cols


## Проверка и выбор устройства (GPU/CPU)

Небольшая проба LightGBM на подвыборке: если обучение успешно запускается на GPU, используем `device='gpu'`, иначе автоматически переключаемся на CPU. Это позволяет запускать ноутбук и на машинах без видеокарты.


In [5]:

train_df, test_df = load_data(DATA_DIR)
X, y, X_test, test_ids, cat_cols = prepare_frames(train_df, test_df)
print(f'Features: {X.shape[1]}, categorical: {len(cat_cols)}')


train shape=(91713, 186), test shape=(39308, 186)
Features: 183, categorical: 8


In [6]:

# Проверка GPU: если недоступен — вернём CPU, чтобы не упасть

def detect_device(sample_X: pd.DataFrame, sample_y: pd.Series, categorical: List[str]) -> str:
    try:
        probe = lgb.LGBMClassifier(
            objective='binary',
            metric='auc',
            device='gpu',
            n_estimators=5,
            num_leaves=8,
            learning_rate=0.1,
            random_state=RANDOM_STATE,
        )
        probe.fit(
            sample_X,
            sample_y,
            categorical_feature=categorical,
            eval_set=[(sample_X, sample_y)],
            eval_metric='auc',
        )
        print("GPU доступен, используем device='gpu'")
        return 'gpu'
    except Exception as exc:  # pragma: no cover
        print(f'GPU недоступен, fallback на CPU: {exc}')
        return 'cpu'


device_choice = detect_device(X.head(200), y.head(200), cat_cols)


[LightGBM] [Info] Number of positive: 12, number of negative: 188
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4293
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 165
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 126 dense feature groups (0.02 MB) transferred to GPU in 0.005212 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060000 -> initscore=-2.751535
[LightGBM] [Info] Start training from score -2.751535
GPU доступен, используем device='gpu'


In [7]:

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)


def build_params(trial: optuna.Trial) -> dict:
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 200),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 400, 1600),
    }


def objective(trial: optuna.Trial) -> float:
    params = build_params(trial)
    params.update(
        {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'random_state': RANDOM_STATE,
            'device': device_choice,
        }
    )

    scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            categorical_feature=cat_cols,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(50, verbose=False)],
        )
        preds = model.predict_proba(X_val)[:, 1]
        scores.append(roc_auc_score(y_val, preds))

    mean_score = float(np.mean(scores))
    trial.set_user_attr('fold_scores', scores)
    return mean_score


In [8]:

study = optuna.create_study(direction='maximize', study_name='lgbm_gpu_optuna')
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

print(f'Best AUC: {study.best_value:.5f}')
print('Best params:')
print(json.dumps(study.best_params, indent=2))


[LightGBM] [Info] Number of positive: 5277, number of negative: 55865
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25428
[LightGBM] [Info] Number of data points in the train set: 61142, number of used features: 182
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 120 dense feature groups (7.00 MB) transferred to GPU in 0.016120 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086307 -> initscore=-2.359580
[LightGBM] [Info] Start training from score -2.359580
[LightGBM] [Info] Number of positive: 5277, number of negative: 55865
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25454
[LightGBM] [Info] Number of data points in the train set: 61142, number of used features: 182
[LightGBM] [Info] Using

In [9]:

best_params = {
    **study.best_params,
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'random_state': RANDOM_STATE,
    'device': device_choice,
}

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(
    X_train,
    y_train,
    categorical_feature=cat_cols,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(100, verbose=False)],
)

val_pred = final_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_pred)
print(f'Hold-out AUC: {val_auc:.5f}')

final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(
    X,
    y,
    categorical_feature=cat_cols,
    eval_set=[(X, y)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(50, verbose=False)],
)

submit = pd.DataFrame({ID_COL: test_ids, TARGET_COL: final_model.predict_proba(X_test)[:, 1]})
submission_path = MODELS_DIR / 'submission_lgbm_optuna_gpu.csv'
submit.to_csv(submission_path, index=False)

print(f'Saved submission to {submission_path}')


[LightGBM] [Info] Number of positive: 6332, number of negative: 67038
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25604
[LightGBM] [Info] Number of data points in the train set: 73370, number of used features: 182
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 125 dense feature groups (8.96 MB) transferred to GPU in 0.007921 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086302 -> initscore=-2.359643
[LightGBM] [Info] Start training from score -2.359643
Hold-out AUC: 0.91144
[LightGBM] [Info] Number of positive: 7915, number of negative: 83798
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25817
[LightGBM] [Info] Number of data points in the train set: 91713, number of used features: 182
[



Saved submission to D:\cursor projects\automl2025\models\submission_lgbm_optuna_gpu.csv


## Результат

На сабмите в kaggle LGBM+Optuna (Private score: 0.90696) показал незначительное превосходство по сравнению с LAMA extended (Private score: 0.90504). 