In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
import lightgbm as lgb
from lightgbm import LGBMClassifier
import optuna
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv("train.csv")
validation_data = pd.read_csv("test.csv")
X, y = data.drop('target', axis = 1), data['target']

In [3]:
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X, y)

In [4]:
def objective(trial):

    # Feature pipeline
    pipeline = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(degree=2),
    )

    X_trans = pd.DataFrame(pipeline.fit_transform(X_train), columns = pipeline.get_feature_names_out())

    # Parameters for base models
    lgbm_params = {
        'objective': 'binary',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 8, 64),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 10, 32),
        'random_state': 42,
        'n_jobs': -1,
    }

    xgb_params = {
        'objective': 'binary:logistic',
        'learning_rate': trial.suggest_float('xgb_learning_rate', 1e-2, 0.2, log=True),
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 500),
        'max_depth': trial.suggest_int('xgb_max_depth', 10, 32),
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    base_models = [
        ('lgbm', lgb.LGBMClassifier(**lgbm_params)),
        ('xgb', XGBClassifier(**xgb_params)),
    ]

    # Stacking ensemble
    model = StackingClassifier(
        estimators=base_models,
        final_estimator=LogisticRegression(max_iter = trial.suggest_int('max_iter', 500, 10000), random_state=42),
        passthrough=True,
        n_jobs=-1
    )

    # Cross-validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    y_pred_proba = cross_val_predict(model, X_trans, y_train, cv=cv, method='predict_proba')[:, 1]

    # Best threshold
    fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba)
    accuracies = [accuracy_score(y_train, y_pred_proba >= t) for t in thresholds]
    best_idx = np.argmax(accuracies)
    best_threshold = thresholds[best_idx]
    best_acc = accuracies[best_idx]

    trial.set_user_attr('best_threshold', best_threshold)
    return best_acc

# 2. Start optimization
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

print("Лучшие параметры:", study.best_params)
print("Лучшее Accuracy:", study.best_value)
best_threshold = study.best_trial.user_attrs['best_threshold']
print(f"Лучший порог, сохраненный из objective: {best_threshold:.4f}")


[I 2025-04-27 13:45:18,673] A new study created in memory with name: no-name-80a0c07b-c646-457c-ae26-ef9a0b1d3736
[I 2025-04-27 13:50:38,549] Trial 0 finished with value: 0.8934871099050203 and parameters: {'boosting_type': 'dart', 'num_leaves': 49, 'learning_rate': 0.060099747183803134, 'n_estimators': 162, 'max_depth': 13, 'xgb_learning_rate': 0.011900590783184251, 'xgb_n_estimators': 447, 'xgb_max_depth': 23, 'max_iter': 7227}. Best is trial 0 with value: 0.8934871099050203.
[I 2025-04-27 13:54:00,035] Trial 1 finished with value: 0.89280868385346 and parameters: {'boosting_type': 'dart', 'num_leaves': 55, 'learning_rate': 0.018891200276189388, 'n_estimators': 172, 'max_depth': 14, 'xgb_learning_rate': 0.024878734419814436, 'xgb_n_estimators': 310, 'xgb_max_depth': 19, 'max_iter': 3266}. Best is trial 0 with value: 0.8934871099050203.
[I 2025-04-27 13:57:49,442] Trial 2 finished with value: 0.8955223880597015 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 24, 'learning_rate

Лучшие параметры: {'boosting_type': 'gbdt', 'num_leaves': 21, 'learning_rate': 0.06344502132388632, 'n_estimators': 235, 'max_depth': 30, 'xgb_learning_rate': 0.19907919525772444, 'xgb_n_estimators': 261, 'xgb_max_depth': 32, 'max_iter': 9200}
Лучшее Accuracy: 0.898236092265943
Лучший порог, сохраненный из objective: 0.5865


In [5]:
# 3. Final model with the best parameters
best_max_iter = study.best_params.pop('max_iter')

pipeline = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
)

X_train_transformed = pd.DataFrame(pipeline.fit_transform(X_train), columns = pipeline.get_feature_names_out())
validation_data_transformed = pd.DataFrame(pipeline.transform(validation_data), columns = pipeline.get_feature_names_out())

# LGBM
best_lgbm = lgb.LGBMClassifier(
    **{k: v for k, v in study.best_params.items() if k.startswith(('boosting_type', 'num_leaves', 'learning_rate', 'n_estimators', 'max_depth', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda'))},
    random_state=42
)

# XGB (с удалением префикса)
best_xgb = XGBClassifier(
    **{k.replace('xgb_', ''): v for k, v in study.best_params.items() if k.startswith('xgb_')},
    random_state=42, use_label_encoder=False, eval_metric='logloss'
)


best_model = StackingClassifier(
    estimators=[
        ('lgbm', best_lgbm),
        ('xgb', best_xgb),
    ],
    final_estimator=LogisticRegression(max_iter=best_max_iter, random_state=42),
    passthrough=True,
    n_jobs=-1
)

best_model.fit(X_train_transformed, y_train)

# 4. Validation
val_proba = best_model.predict_proba(validation_data_transformed)[:, 1]
val_pred = (val_proba >= best_threshold).astype(int)


print("Validation complete.")

Validation complete.


In [6]:
df_answer = pd.DataFrame({'Prediction': val_pred})
df_answer.to_csv('answer.csv', index=False, header=False)