# 02 — LightAutoML Baseline

Реализован бейслайн на LAMA с GPU. План:

1. Подключение общих утилит.
2. Подготовка данных на основе выводов EDA.
3. Формирование стратегии split без утечки.
4. Запуск LightAutoML.
5. Сравнение метрик, выбор лучшей модели, сохранение артефактов.
6. Анализ результатов.

## 0. Подготовка окружения и импортов



In [1]:
from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

try:
    from lightautoml.automl.presets.tabular_presets import TabularAutoML
    from lightautoml.tasks import Task
except ImportError as exc:
    raise ImportError(
        "LightAutoML не установлен. Выполните `pip install lightautoml` в используемом окружении и перезапустите ядро."
    ) from exc

# Column names and reproducibility settings shared by the cells below.
TARGET_COL = "hospital_death"
ID_COL = "encounter_id"
RANDOM_STATE = 42
FOLDS = 5

# The parent of the current working directory is treated as the project root.
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")
MODELS_DIR = PROJECT_ROOT.joinpath("models")
# Create the artifact directory up front so later cells can write into it.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")


'nlp' extra dependency package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependency package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.




Project root: D:\cursor projects\automl2025


## 1. Загрузка данных и первичная обработка

Используем CSV-файлы (`training_v2.csv` и `unlabeled.csv`) и выполняем следующие шаги:
- удаляем признаки с >40% пропусков;
- добавляем биннинги возраста/ИМТ, взаимодействие `apache` и отношение LOS;
- считаем среднюю смертность по группам `hospital_id`/`icu_type`/`apache_3j_bodysystem` (статистики по таргету, посчитанные только на train).



In [2]:
# Columns whose share of missing values in train exceeds this value are dropped.
HIGH_MISSING_THRESHOLD = 0.4
# Grouping columns for which a per-group mean of the target is added as a feature.
GROUP_STAT_COLS = ["hospital_id", "icu_type", "apache_3j_bodysystem"]


def add_feature_engineering(df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
    """Return a copy of ``df`` extended with derived features.

    Each feature is added only when its source columns are present:

    * ``bmi_missing`` / ``bmi_bucket`` -- missing-value flag and a quantile
      bucket (up to 5 bins) of ``bmi``; the quantile edges are computed from
      ``df`` itself.
    * ``age_bucket`` -- left-closed age bands ``[0, 30)`` ... ``[80, 120)``.
    * ``apache_prob_interaction`` -- product of the two
      ``apache_4a_*_death_prob`` columns.
    * ``los_ratio`` -- ``pre_icu_los_days`` divided by ``icu_los_days``.

    Args:
        df: Source frame; it is copied, not modified.
        is_train: Accepted by the signature but not used by the current body.

    Returns:
        A new frame with the derived columns appended.
    """
    enriched = df.copy()
    available = set(enriched.columns)

    if "bmi" in available:
        bmi = enriched["bmi"]
        enriched["bmi_missing"] = bmi.isna().astype(int)
        enriched["bmi_bucket"] = pd.qcut(bmi, q=5, duplicates="drop")

    if "age" in available:
        age_edges = [0, 30, 50, 65, 80, 120]
        enriched["age_bucket"] = pd.cut(enriched["age"], bins=age_edges, right=False)

    hospital_prob = "apache_4a_hospital_death_prob"
    icu_prob = "apache_4a_icu_death_prob"
    if hospital_prob in available and icu_prob in available:
        enriched["apache_prob_interaction"] = enriched[hospital_prob] * enriched[icu_prob]

    if "pre_icu_los_days" in available and "icu_los_days" in available:
        # A missing pre-ICU stay counts as 0 and a missing ICU stay as 1; the
        # small constant guards against division by an exact zero.
        numerator = enriched["pre_icu_los_days"].fillna(0)
        denominator = enriched["icu_los_days"].fillna(1) + 1e-3
        enriched["los_ratio"] = numerator / denominator

    return enriched


def add_group_statistics(
    train_df: pd.DataFrame, test_df: pd.DataFrame, group_cols: List[str], target_col: str
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Add per-group mean-target features to copies of train and test.

    For every column in ``group_cols`` that exists in ``train_df`` a new column
    ``<col>_death_rate`` is added to both frames. It holds the mean of
    ``target_col`` over the train rows sharing the same group value. Rows whose
    group value has no train statistic (unseen category or missing key) get the
    global train mean, in both frames.

    Args:
        train_df: Labelled frame the statistics are computed from; not modified.
        test_df: Frame that receives the mapped statistics; not modified.
        group_cols: Candidate grouping columns; ones absent from train are skipped.
        target_col: Name of the target column in ``train_df``.

    Returns:
        ``(train, test)`` copies with the new feature columns appended.
    """
    # Work on copies so the caller's frames are never mutated as a side effect.
    train_out = train_df.copy()
    test_out = test_df.copy()
    global_mean = train_out[target_col].mean()

    for col in group_cols:
        if col not in train_out.columns:
            continue
        feature = f"{col}_death_rate"
        mapping = train_out.groupby(col)[target_col].mean()
        # NOTE(review): the train-side value is computed from the full train set,
        # including each row's own target, so it is not an out-of-fold estimate.
        # groupby drops missing keys, hence the same fallback is needed on train
        # as on test to keep both features NaN-free and consistent.
        train_out[feature] = train_out[col].map(mapping).fillna(global_mean)
        test_out[feature] = test_out[col].map(mapping).fillna(global_mean)

    return train_out, test_out


def load_datasets(data_dir: Path) -> Dict[str, pd.DataFrame]:
    """Load the raw train/test CSVs and build the modelling frames.

    Processing steps:

    1. Read ``training_v2.csv`` and ``unlabeled.csv`` and drop rows that
       duplicate ``ID_COL``.
    2. Keep columns whose missing share in train is at most
       ``HIGH_MISSING_THRESHOLD``; the target, ``ID_COL`` and ``patient_id``
       are always kept.
    3. Apply ``add_feature_engineering`` to both frames and re-bucket the test
       BMI with the train intervals.
    4. Add the group statistics for ``GROUP_STAT_COLS``.
    5. Cast object/category columns to ``str``.

    Args:
        data_dir: Directory that contains the two CSV files.

    Returns:
        A dict with the keys ``"train"`` and ``"test"``.

    Raises:
        FileNotFoundError: If either CSV file does not exist.
    """
    train_path = data_dir / "training_v2.csv"
    test_path = data_dir / "unlabeled.csv"
    if not train_path.exists() or not test_path.exists():
        raise FileNotFoundError(
            "Не найдены CSV-файлы"
        )

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    train_df = train_df.drop_duplicates(subset=ID_COL)
    test_df = test_df.drop_duplicates(subset=ID_COL)

    missing_share = train_df.isna().mean()
    keep_cols = missing_share[missing_share <= HIGH_MISSING_THRESHOLD].index.tolist()
    keep_cols += [TARGET_COL, ID_COL, "patient_id"]
    # dict.fromkeys removes duplicates while preserving the column order.
    keep_cols = list(dict.fromkeys(keep_cols))

    train_df = train_df[keep_cols].copy()
    test_df = test_df[[col for col in keep_cols if col != TARGET_COL]].copy()

    train_df = add_feature_engineering(train_df, is_train=True)
    test_df = add_feature_engineering(test_df, is_train=False)

    # The feature-engineering step derives quantile edges from whichever frame
    # it receives, so train and test would get different ``bmi_bucket`` labels
    # and the stringified categories below would not match. Re-bucket test with
    # the intervals learned on train instead.
    if (
        "bmi_bucket" in train_df.columns
        and "bmi" in test_df.columns
        and isinstance(train_df["bmi_bucket"].dtype, pd.CategoricalDtype)
    ):
        train_bins = train_df["bmi_bucket"].cat.categories
        # Clip to the range covered by the train intervals so that extreme test
        # values land in the outermost buckets rather than becoming NaN.
        test_bmi = test_df["bmi"].clip(
            lower=train_df["bmi"].min(), upper=train_bins.right.max()
        )
        test_df["bmi_bucket"] = pd.cut(test_bmi, bins=train_bins)

    train_df, test_df = add_group_statistics(train_df, test_df, GROUP_STAT_COLS, TARGET_COL)

    # Interval/object categories are turned into plain strings in both frames.
    cat_cols = train_df.select_dtypes(include=["object", "category"]).columns
    for col in cat_cols:
        train_df[col] = train_df[col].astype(str)
        if col in test_df.columns:
            test_df[col] = test_df[col].astype(str)

    return {"train": train_df, "test": test_df}


# Build the modelling frames once and expose them under short names.
data = load_datasets(DATA_DIR)
train_df = data["train"]
test_df = data["test"]
train_df.head()


Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,bmi_missing,bmi_bucket,age_bucket,apache_prob_interaction,hospital_id_death_rate,icu_type_death_rate,apache_3j_bodysystem_death_rate
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,Sepsis,Cardiovascular,0,"(14.844000000000001, 22.769]","[65, 80)",0.005,0.070621,0.060205,0.157922
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,Respiratory,Respiratory,0,"(26.059, 29.445]","[65, 80)",0.1363,0.043103,0.087495,0.112068
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,Metabolic,Metabolic,0,"(29.445, 34.49]","[0, 30)",0.0,0.070621,0.087495,0.015163
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,Cardiovascular,Cardiovascular,0,"(14.844000000000001, 22.769]","[80, 120)",0.0012,0.070621,0.060205,0.079669
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,Trauma,Trauma,1,,"[0, 30)",,0.027864,0.087495,0.067413


In [3]:
# Sanity check: frame dimensions and the normalised class balance of the target.
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
print(train_df[TARGET_COL].value_counts(normalize=True))


Train shape: (91713, 119), Test shape: (39308, 118)
hospital_death
0    0.913698
1    0.086302
Name: proportion, dtype: float64


## 2. Стратегия валидации

Используем `StratifiedKFold` (5 фолдов, фиксированный seed).


In [4]:
# Every column except the target is passed on as a feature.
feature_cols = [name for name in train_df.columns if name != TARGET_COL]
train_data = train_df[[*feature_cols, TARGET_COL]].copy()

task = Task("binary", loss="logloss")

# Materialise the stratified splits once so they can be reused as a fixed CV iterator.
splitter = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
folds = list(splitter.split(train_data[feature_cols], train_data[TARGET_COL]))
print(f"Prepared {len(folds)} folds for CV")


Prepared 5 folds for CV


In [5]:
def run_lama_experiment(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    preset_params: dict,
    exp_name: str,
) -> dict:
    """Fit a TabularAutoML preset, score it out-of-fold and save test predictions.

    Uses the module-level ``task``, ``folds``, ``TARGET_COL``, ``ID_COL`` and
    ``MODELS_DIR``.

    Args:
        train_df: Training frame including the target column.
        test_df: Frame to predict on; must contain ``ID_COL``.
        preset_params: Keyword arguments forwarded to ``TabularAutoML``.
        exp_name: Suffix used in the submission file name.

    Returns:
        A dict with ``"score"`` (out-of-fold ROC-AUC), ``"submission_path"``
        (path of the written CSV) and ``"predictions"`` (the submission frame).
    """
    roles = {"target": TARGET_COL, "drop": [ID_COL]}
    automl = TabularAutoML(task=task, **preset_params)
    oof_pred = automl.fit_predict(train_df, roles=roles, cv_iter=folds, verbose=1)
    # The first prediction column is scored against the target.
    oof_score = roc_auc_score(train_df[TARGET_COL], oof_pred.data[:, 0])

    test_scores = automl.predict(test_df).data[:, 0]
    submission = pd.DataFrame({ID_COL: test_df[ID_COL], TARGET_COL: test_scores})
    submission_path = MODELS_DIR / f"submission_{exp_name}.csv"
    submission.to_csv(submission_path, index=False)

    return dict(
        score=oof_score,
        submission_path=submission_path,
        predictions=submission,
    )



## 3. LightAutoML конфигурация A

Первая конфигурация — быстрый GPU preset с небольшим временем обучения.


In [6]:
# Configuration A: one-hour budget with the lgb / cb / xgb algorithm list.
preset_a_params = dict(
    gpu_ids="0",
    timeout=60 * 60,  # one-hour limit, in seconds
    reader_params=dict(n_jobs=4),
    general_params=dict(use_algos=[["lgb", "cb", "xgb"]]),
)

results_a = run_lama_experiment(
    train_data,
    test_df[feature_cols],
    preset_a_params,
    "gpu_tiny",
)
print(f"Config A ROC-AUC: {results_a['score']:.4f}")


[15:19:40] Stdout logging level is INFO.
[15:19:40] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[15:19:40] Task: binary

[15:19:40] Start automl preset with listed constraints:
[15:19:40] - time: 3600.00 seconds
[15:19:40] - CPU: 4 cores
[15:19:40] - memory: 16 GB

[15:19:40] [1mTrain data shape: (91713, 119)[0m

[15:19:51] Layer [1m1[0m train process start. Time left 3588.61 secs
[15:20:04] [1mSelector_LightGBM[0m fitting and predicting completed
[15:20:08] Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
[15:21:08] Fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m finished. score = [1m0.9010776395970835[0m
[15:21:08] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[15:21:08] Start fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m ...
[15:22:35] Fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m finished. score = [1m0.9003271064398358[0m
[15:22:35] [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m fitting and predicting complet

## 4. LightAutoML конфигурация B

Второй запуск — более тяжёлый, с расширенным стеком моделей: увеличиваем `timeout` и включаем feature selection.

In [7]:
# Configuration B: two-hour budget, linear model added to the algorithm list,
# a feature-count cap and a tuning-iteration limit.
preset_b_params = dict(
    gpu_ids="0",
    timeout=2 * 60 * 60,  # up to two hours, in seconds
    reader_params=dict(n_jobs=6),
    general_params=dict(
        use_algos=[["lgb", "cb", "xgb", "linear_l2"]],
        max_features_cnt=500,
    ),
    tuning_params=dict(max_tuning_iter=50),
)

results_b = run_lama_experiment(
    train_data,
    test_df[feature_cols],
    preset_b_params,
    "gpu_extended",
)
print(f"Config B ROC-AUC: {results_b['score']:.4f}")

[15:23:00] Stdout logging level is INFO.
[15:23:00] Task: binary

[15:23:00] Start automl preset with listed constraints:
[15:23:00] - time: 7200.00 seconds
[15:23:00] - CPU: 4 cores
[15:23:00] - memory: 16 GB

[15:23:00] [1mTrain data shape: (91713, 119)[0m

[15:23:08] Layer [1m1[0m train process start. Time left 7192.40 secs
[15:23:12] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[15:23:32] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.8935206315485046[0m
[15:23:32] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[15:23:32] Time left 7167.88 secs

[15:23:47] [1mSelector_LightGBM[0m fitting and predicting completed
[15:23:52] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[15:25:04] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.9010776395970835[0m
[15:25:04] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[15:25:04] Start fitting [1mLvl_0_Pipe_1_Mod_1_CatBoost[0m ...
[15:26:

## 5. Сравнение результатов


In [8]:
# One row per experiment: label, out-of-fold ROC-AUC and submission file name.
labelled_results = [("GPU tiny", results_a), ("GPU extended", results_b)]
comparison = pd.DataFrame(
    [
        {
            "config": label,
            "roc_auc": outcome["score"],
            "submission": outcome["submission_path"].name,
        }
        for label, outcome in labelled_results
    ]
)
comparison.sort_values("roc_auc", ascending=False)

Unnamed: 0,config,roc_auc,submission
1,GPU extended,0.904368,submission_gpu_extended.csv
0,GPU tiny,0.903791,submission_gpu_tiny.csv


**Вывод:** Конфигурация с расширенными настройками показала лучший результат (ROC-AUC 0.904368). Берём её за условный бейслайн.


## 6. Вывод

- Файл submission_gpu_extended.csv сабмитим на Kaggle. Кастомное решение будем сравнивать с ним.