In [3]:
import os
import warnings
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import optuna

import torch

from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# Silence all library warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Reproducibility and cross-validation settings shared by every cell below.
RANDOM_STATE = 42
N_SPLITS = 5
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Project directories (relative to the notebook location).
PROCESSED_DATA_PATH = '../data/processed/'
SUBMISSIONS_PATH = '../submissions/'
OOF_PREDS_PATH = '../oof_preds/'
MODELS_PATH = '../models/'

# Create every output directory up front; existing directories are left untouched.
for _out_dir in (SUBMISSIONS_PATH, OOF_PREDS_PATH, MODELS_PATH):
    os.makedirs(_out_dir, exist_ok=True)

# Pick the GPU when torch reports one, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)
print(f"Используемое устройство: {DEVICE}")

Используемое устройство: cpu


# 1. Настройка и Загрузка

In [4]:
# Load the processed train/test tables produced by 1_data_preparation.ipynb.
# PROCESSED_DATA_PATH / SUBMISSIONS_PATH are defined (and the output directory is
# created) once in the setup cell; they are intentionally not redefined here so
# the project paths have a single source of truth.
try:
    train_df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'train_processed.csv'))
    test_df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'test_processed.csv'))
    print(f"Размер тренировочных данных: {train_df.shape}")
    print(f"Размер тестовых данных: {test_df.shape}")
except FileNotFoundError:
    print("Ошибка: Убедитесь, что файлы train_processed.csv и test_processed.csv находятся в папке ../data/processed/")
    print("Пожалуйста, запустите сначала ноутбук 1_data_preparation.ipynb.")
    # Fall back to empty frames so the guard below skips feature preparation.
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()

if not train_df.empty:
    # Split the columns into identifiers, the target and the model features.
    TARGET = 'taste_cluster'
    ID_COLS = ['PubChem_ID', 'SMILES', 'SMILES_standardized']
    excluded_cols = set(ID_COLS) | {TARGET}
    features = [col for col in train_df.columns if col not in excluded_cols]

    # Fail with a readable message if the test table lacks any training feature
    # (pandas would raise a KeyError here anyway, just a less helpful one).
    missing_in_test = [col for col in features if col not in test_df.columns]
    if missing_in_test:
        raise KeyError(f"test_processed.csv is missing feature columns: {missing_in_test}")

    X = train_df[features]
    y = train_df[TARGET].astype(int)
    X_test = test_df[features]

    print(f"Количество признаков для обучения: {len(features)}")

    # Sanity check: columns with at most one distinct non-null value cannot be
    # split on by any tree model. Only warn; do not stop the notebook.
    n_uninformative = int((X.nunique(dropna=True) <= 1).sum())
    if n_uninformative:
        print(f"Внимание: {n_uninformative} из {len(features)} признаков константны или полностью пусты.")

Размер тренировочных данных: (2064, 221)
Размер тестовых данных: (888, 221)
Количество признаков для обучения: 217


# 2. Тюнинг моделей

In [5]:
# --- 1. Тюнинг гиперпараметров для LightGBM ---

def objective_lgbm(trial):
    """Optuna objective: macro-F1 of a LightGBM classifier on one validation fold.

    Args:
        trial: Optuna trial used to sample the learning rate, L1/L2
            regularisation, tree size and bagging/feature-fraction settings.

    Returns:
        Macro-averaged F1 score of the fitted model on the validation fold.
    """
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'seed': RANDOM_STATE,
        'class_weight': 'balanced',
        'n_estimators': 1500,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 40),
    }

    # For speed, score each trial on the first fold only.
    train_idx, val_idx = next(kf.split(X, y))
    # split() yields positional row numbers, so select by position (.iloc);
    # label-based .loc would only be correct for a default RangeIndex.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(**params)
    # Stop adding trees once the validation loss has not improved for 100 rounds.
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(100, verbose=False)])

    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average='macro')
    return f1

print("Запускаем тюнинг для LightGBM...")
# Seed the sampler so that re-running the notebook proposes the same sequence of
# hyperparameters; an unseeded study draws different trials on every run.
study_lgbm = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
# 50-100 trials are recommended for a good-quality result.
study_lgbm.optimize(objective_lgbm, n_trials=50, show_progress_bar=True)

lgbm_best_params = study_lgbm.best_params
print("Лучшие параметры для LightGBM:", lgbm_best_params)

[I 2025-11-20 12:16:21,063] A new study created in memory with name: no-name-415e6969-b0b9-44e8-96b4-f601a4ce19ad


Запускаем тюнинг для LightGBM...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-20 12:16:21,142] Trial 0 finished with value: 0.04056795131845842 and parameters: {'learning_rate': 0.01428793580767417, 'lambda_l1': 2.696935298465617e-06, 'lambda_l2': 1.2281574331428233, 'num_leaves': 25, 'feature_fraction': 0.8345355669118427, 'bagging_fraction': 0.6296492564696317, 'bagging_freq': 4, 'min_child_samples': 10}. Best is trial 0 with value: 0.04056795131845842.
[I 2025-11-20 12:16:21,174] Trial 1 finished with value: 0.04056795131845842 and parameters: {'learning_rate': 0.0420451328532718, 'lambda_l1': 6.767644351792039, 'lambda_l2': 0.0015087129482529008, 'num_leaves': 36, 'feature_fraction': 0.8811832853802439, 'bagging_fraction': 0.8936542444991236, 'bagging_freq': 1, 'min_child_samples': 10}. Best is trial 0 with value: 0.04056795131845842.
[I 2025-11-20 12:16:21,211] Trial 2 finished with value: 0.04056795131845842 and parameters: {'learning_rate': 0.017544454912721913, 'lambda_l1': 4.038416429418575e-05, 'lambda_l2': 3.7191239709201573, 'num_leaves': 

In [6]:
# --- 2. Тюнинг гиперпараметров для RandomForest ---

def objective_rf(trial):
    """Optuna objective: macro-F1 of a RandomForest classifier on one validation fold.

    Args:
        trial: Optuna trial used to sample the number of trees, depth,
            split/leaf size limits and the `max_features` rule.

    Returns:
        Macro-averaged F1 score of the fitted forest on the validation fold.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'class_weight': 'balanced',
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
    }

    # Score each trial on the first fold only.
    train_idx, val_idx = next(kf.split(X, y))
    # split() yields positional row numbers, so select by position (.iloc);
    # label-based .loc would only be correct for a default RangeIndex.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average='macro')
    return f1

print("\nЗапускаем тюнинг для RandomForest...")
# Seed the sampler so that re-running the notebook proposes the same sequence of
# hyperparameters; an unseeded study draws different trials on every run.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_rf.optimize(objective_rf, n_trials=30, show_progress_bar=True)

rf_best_params = study_rf.best_params
print("Лучшие параметры для RandomForest:", rf_best_params)

[I 2025-11-20 12:16:23,392] A new study created in memory with name: no-name-6100395d-feeb-4a94-87ce-2158e799e17a



Запускаем тюнинг для RandomForest...


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-20 12:16:23,839] Trial 0 finished with value: 0.04056795131845842 and parameters: {'n_estimators': 374, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.04056795131845842.
[I 2025-11-20 12:16:24,633] Trial 1 finished with value: 0.04056795131845842 and parameters: {'n_estimators': 921, 'max_depth': 14, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.04056795131845842.
[I 2025-11-20 12:16:25,215] Trial 2 finished with value: 0.04056795131845842 and parameters: {'n_estimators': 685, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.04056795131845842.
[I 2025-11-20 12:16:25,463] Trial 3 finished with value: 0.04056795131845842 and parameters: {'n_estimators': 251, 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.04056795

In [7]:
# --- 3. Тюнинг гиперпараметров для XGBoost ---

def objective_xgb(trial):
    """Optuna objective: macro-F1 of an XGBoost classifier on one validation fold.

    Args:
        trial: Optuna trial used to sample the learning rate, tree depth,
            row/column subsampling and L1/L2 regularisation.

    Returns:
        Macro-averaged F1 score (computed on label-encoded targets) of the
        fitted model on the validation fold.
    """
    params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'n_estimators': 1500,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'lambda': trial.suggest_float('lambda', 1e-4, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-4, 1.0, log=True),
        'random_state': RANDOM_STATE,
        # NOTE(review): `use_label_encoder` is reported as deprecated by recent
        # XGBoost releases - confirm against the installed version before removing.
        'use_label_encoder': False,
        'n_jobs': -1,
        'early_stopping_rounds': 100
    }

    # Encode the target as consecutive integers 0..K-1 for XGBoost.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Score each trial on the first fold only.
    train_idx, val_idx = next(kf.split(X, y))
    # split() yields positional row numbers: index the frame with .iloc so it
    # stays aligned with the positional indexing of the NumPy array y_encoded.
    X_train, y_train_enc = X.iloc[train_idx], y_encoded[train_idx]
    X_val, y_val_enc = X.iloc[val_idx], y_encoded[val_idx]

    model = XGBClassifier(**params)
    model.fit(X_train, y_train_enc, eval_set=[(X_val, y_val_enc)], verbose=False)

    preds = model.predict(X_val)
    f1 = f1_score(y_val_enc, preds, average='macro')
    return f1

print("\nЗапускаем тюнинг для XGBoost...")
# Seed the sampler so that re-running the notebook proposes the same sequence of
# hyperparameters; an unseeded study draws different trials on every run.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_xgb.optimize(objective_xgb, n_trials=30, show_progress_bar=True)

xgb_best_params = study_xgb.best_params
print("Лучшие параметры для XGBoost:", xgb_best_params)

[I 2025-11-20 12:16:41,269] A new study created in memory with name: no-name-0eb009b3-3f67-4656-a2fe-e04e43d8b589



Запускаем тюнинг для XGBoost...


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-20 12:16:48,645] Trial 0 finished with value: 0.051059730250481696 and parameters: {'learning_rate': 0.018827272279491263, 'max_depth': 7, 'subsample': 0.6139506544669787, 'colsample_bytree': 0.7430762406623399, 'lambda': 0.16130431697089057, 'alpha': 0.00029216004978966986}. Best is trial 0 with value: 0.051059730250481696.
[I 2025-11-20 12:16:51,313] Trial 1 finished with value: 0.051059730250481696 and parameters: {'learning_rate': 0.08462437882366416, 'max_depth': 7, 'subsample': 0.8475009569468306, 'colsample_bytree': 0.838330222387445, 'lambda': 0.00011314540728913313, 'alpha': 0.34374232704368113}. Best is trial 0 with value: 0.051059730250481696.
[I 2025-11-20 12:16:54,256] Trial 2 finished with value: 0.051059730250481696 and parameters: {'learning_rate': 0.06886781937715362, 'max_depth': 6, 'subsample': 0.6486690951083381, 'colsample_bytree': 0.8608209910021588, 'lambda': 0.0005639196654455856, 'alpha': 0.0021625981731811963}. Best is trial 0 with value: 0.05105973

# 3. Обучение моделей

In [8]:
# --- Train LightGBM with the tuned parameters ---

# Fixed settings merged with the values found by Optuna (the tuned values win on
# any key collision because they are unpacked last).
final_lgb_params = {
    'objective': 'multiclass', 'metric': 'multi_logloss', 'verbosity': -1,
    'boosting_type': 'gbdt', 'n_jobs': -1, 'seed': RANDOM_STATE, 'class_weight': 'balanced',
    'n_estimators': 1500,  # generous upper bound; early stopping picks the actual count
    **lgbm_best_params
}

# One probability column per class; OOF rows are filled fold by fold, test
# predictions are averaged over the folds.
n_classes = y.nunique()
oof_preds_lgb = np.zeros((len(train_df), n_classes))
test_preds_lgb = np.zeros((len(test_df), n_classes))

print("\nОбучение LightGBM на лучших параметрах...")
for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=N_SPLITS, desc="LGBM OOF")):
    # split() yields positional row numbers, so select by position (.iloc);
    # label-based .loc would only be correct for a default RangeIndex.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(**final_lgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(100, verbose=False)])

    oof_preds_lgb[val_idx] = model.predict_proba(X_val)
    test_preds_lgb += model.predict_proba(X_test) / N_SPLITS

# Persist the predictions for the blending cell.
np.save(os.path.join(OOF_PREDS_PATH, 'oof_lgbm.npy'), oof_preds_lgb)
np.save(os.path.join(OOF_PREDS_PATH, 'test_lgbm.npy'), test_preds_lgb)


Обучение LightGBM на лучших параметрах...


LGBM OOF:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# --- Train RandomForest with the tuned parameters ---

# Fixed settings merged with the values found by Optuna (the tuned values win on
# any key collision because they are unpacked last).
final_rf_params = {
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'class_weight': 'balanced',
    **rf_best_params
}

# One probability column per class; OOF rows are filled fold by fold, test
# predictions are averaged over the folds.
n_classes = y.nunique()
oof_preds_rf = np.zeros((len(train_df), n_classes))
test_preds_rf = np.zeros((len(test_df), n_classes))

print("\nОбучение RandomForest на лучших параметрах...")
for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=N_SPLITS, desc="RandomForest OOF")):
    # split() yields positional row numbers, so select by position (.iloc);
    # label-based .loc would only be correct for a default RangeIndex.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = RandomForestClassifier(**final_rf_params)
    model.fit(X_train, y_train)

    oof_preds_rf[val_idx] = model.predict_proba(X_val)
    test_preds_rf += model.predict_proba(X_test) / N_SPLITS

# Persist the predictions for the blending cell.
np.save(os.path.join(OOF_PREDS_PATH, 'oof_rf.npy'), oof_preds_rf)
np.save(os.path.join(OOF_PREDS_PATH, 'test_rf.npy'), test_preds_rf)


Обучение RandomForest на лучших параметрах...


RandomForest OOF:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# --- Train XGBoost with the tuned parameters ---

# Fixed settings merged with the values found by Optuna (the tuned values win on
# any key collision because they are unpacked last).
final_xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'n_estimators': 1500,
    'random_state': RANDOM_STATE,
    'use_label_encoder': False,
    'n_jobs': -1,
    'early_stopping_rounds': 100,
    **xgb_best_params
}

# One probability column per class; OOF rows are filled fold by fold, test
# predictions are averaged over the folds.
n_classes = y.nunique()
oof_preds_xgb = np.zeros((len(train_df), n_classes))
test_preds_xgb = np.zeros((len(test_df), n_classes))

# Fit the label encoder once on the full target (as objective_xgb does) so the
# label -> column mapping is identical in every fold; re-fitting it per fold
# would change the mapping if a fold's training part happened to miss a class.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("\nОбучение XGBoost на лучших параметрах...")
for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=N_SPLITS, desc="XGBoost OOF")):
    # split() yields positional row numbers, so select by position (.iloc);
    # label-based .loc would only be correct for a default RangeIndex.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    y_train_encoded = y_encoded[train_idx]
    y_val_encoded = y_encoded[val_idx]

    model = XGBClassifier(**final_xgb_params)
    model.fit(X_train, y_train_encoded,
              eval_set=[(X_val, y_val_encoded)],
              verbose=False)

    oof_preds_xgb[val_idx] = model.predict_proba(X_val)
    test_preds_xgb += model.predict_proba(X_test) / N_SPLITS

# Persist the predictions for the blending cell.
np.save(os.path.join(OOF_PREDS_PATH, 'oof_xgb.npy'), oof_preds_xgb)
np.save(os.path.join(OOF_PREDS_PATH, 'test_xgb.npy'), test_preds_xgb)


Обучение XGBoost на лучших параметрах...


XGBoost OOF:   0%|          | 0/5 [00:00<?, ?it/s]

# 4. Взвешенный блендинг и сводки

In [11]:
# --- Final weighted blend and summary ---

# Load the OOF and test predictions saved by the training cells.
try:
    oof_lgbm = np.load(os.path.join(OOF_PREDS_PATH, 'oof_lgbm.npy'))
    test_lgbm = np.load(os.path.join(OOF_PREDS_PATH, 'test_lgbm.npy'))

    oof_rf = np.load(os.path.join(OOF_PREDS_PATH, 'oof_rf.npy'))
    test_rf = np.load(os.path.join(OOF_PREDS_PATH, 'test_rf.npy'))

    oof_xgb = np.load(os.path.join(OOF_PREDS_PATH, 'oof_xgb.npy'))
    test_xgb = np.load(os.path.join(OOF_PREDS_PATH, 'test_xgb.npy'))

    print("Предсказания всех моделей после тюнинга успешно загружены.")
except FileNotFoundError as e:
    print(f"Ошибка при загрузке предсказаний: {e}. Убедитесь, что все модели обучены.")
    # Sentinel: the rest of the cell is skipped when any file is missing.
    oof_lgbm = None

results_summary = []

if oof_lgbm is not None:
    # np.argmax returns a column position, not a class label. The probability
    # columns are assumed to follow the sorted unique labels (the order used by
    # scikit-learn style estimators and by LabelEncoder), so map positions back
    # to real labels before scoring or writing the submission. When the labels
    # are already 0..K-1 this mapping is the identity.
    class_labels = np.unique(y)

    # Guard against stale prediction files produced with a different class set.
    for _name, _arr in (('oof_lgbm', oof_lgbm), ('test_lgbm', test_lgbm),
                        ('oof_rf', oof_rf), ('test_rf', test_rf),
                        ('oof_xgb', oof_xgb), ('test_xgb', test_xgb)):
        if _arr.shape[1] != len(class_labels):
            raise ValueError(
                f"{_name} has {_arr.shape[1]} probability columns, "
                f"expected {len(class_labels)} (one per class in y)"
            )

    def _proba_to_labels(proba):
        """Return the class label with the highest probability for each row."""
        return class_labels[np.argmax(proba, axis=1)]

    # --- Per-model OOF macro-F1 ---
    f1_lgbm = f1_score(y, _proba_to_labels(oof_lgbm), average='macro')
    f1_rf = f1_score(y, _proba_to_labels(oof_rf), average='macro')
    f1_xgb = f1_score(y, _proba_to_labels(oof_xgb), average='macro')

    # --- Manually chosen blend weights ---
    weights = {
        'RandomForest': 0.7,
        'XGBoost': 0.2,
        'LGBM': 0.1
    }

    print("\nНовые, вручную подобранные веса для блендинга:")
    for name, weight in weights.items():
        print(f"{name}: {weight:.2f}")

    # --- Weighted average of the class probabilities ---
    oof_blend = (weights['LGBM'] * oof_lgbm +
                 weights['RandomForest'] * oof_rf +
                 weights['XGBoost'] * oof_xgb)

    test_blend = (weights['LGBM'] * test_lgbm +
                  weights['RandomForest'] * test_rf +
                  weights['XGBoost'] * test_xgb)

    f1_blend = f1_score(y, _proba_to_labels(oof_blend), average='macro')

    # --- Write the submission file (real class labels, not column positions) ---
    submission_df = pd.DataFrame({'taste_cluster': _proba_to_labels(test_blend)})
    file_path = os.path.join(SUBMISSIONS_PATH, 'solution_new_weighted_blend.csv')
    submission_df.to_csv(file_path, index=False)

    # --- Summary table, best score first ---
    results_summary = [
        {'Модель': 'LGBM (Tuned)', 'F1-macro (OOF)': f1_lgbm},
        {'Модель': 'RandomForest (Tuned)', 'F1-macro (OOF)': f1_rf},
        {'Модель': 'XGBoost (Tuned)', 'F1-macro (OOF)': f1_xgb},
        {'Модель': 'New Weighted Blend', 'F1-macro (OOF)': f1_blend}
    ]

    summary_df = pd.DataFrame(results_summary).sort_values('F1-macro (OOF)', ascending=False).reset_index(drop=True)

    print("\n\n" + "="*50)
    print("          ИТОГОВАЯ СВОДКА С НОВЫМИ ВЕСАМИ")
    print("="*50)
    display(summary_df)

    # Note: the saved file is always the blend, even when a single model ranks first.
    best_model_name = summary_df.loc[0, 'Модель']
    print(f"\n✅ Лучший результат у: '{best_model_name}'")
    print(f"Файл финального решения сохранен как: {file_path}")

Предсказания всех моделей после тюнинга успешно загружены.

Новые, вручную подобранные веса для блендинга:
RandomForest: 0.70
XGBoost: 0.20
LGBM: 0.10


          ИТОГОВАЯ СВОДКА С НОВЫМИ ВЕСАМИ


Unnamed: 0,Модель,F1-macro (OOF)
0,LGBM (Tuned),0.062215
1,RandomForest (Tuned),0.055498
2,XGBoost (Tuned),0.051079
3,New Weighted Blend,0.051079



✅ Лучший результат у: 'LGBM (Tuned)'
Файл финального решения сохранен как: ../submissions/solution_new_weighted_blend.csv
