In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Read the labelled training table and the unlabelled test table from CSV.
train, test = map(pd.read_csv, ('my_data/my_train.csv', 'my_data/my_test.csv'))

In [None]:
# Target and feature selection
y = train['apply_promo']
X = train.drop(columns=['apply_promo', 'ClientUUId'])  # ClientUUId is an identifier, not a feature
X_test = test.drop(columns=['ClientUUId'])


def _sanitize_columns(columns):
    """Return `columns` with every character outside [a-zA-Z0-9_] replaced by '_'.

    LightGBM rejects special characters in feature names, so both frames are
    passed through the same replacement.
    """
    return columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)


X.columns = _sanitize_columns(X.columns)
X_test.columns = _sanitize_columns(X_test.columns)

# Two distinct raw names can collapse to the same sanitised name (e.g. "a b" and "a-b");
# fail here with a clear message instead of deep inside a model's fit().
if X.columns.has_duplicates:
    duplicated = X.columns[X.columns.duplicated()].unique().tolist()
    raise ValueError(f'Feature names collide after sanitising: {duplicated}')

# Give the test frame exactly the training features, in the training order, so that
# predict_proba() sees the same layout the models were fitted on. A feature missing
# from the test file raises KeyError here.
X_test = X_test[X.columns]

In [None]:
# Stratified K-fold splitter (shuffled, fixed seed)
n_splits = 2  # number of folds
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Per-model accumulators: fitted estimators and their validation ROC-AUC, one entry per fold
_model_names = ('xgb', 'lgbm', 'catboost')
models = {name: [] for name in _model_names}
val_scores = {name: [] for name in _model_names}

In [None]:
# Objective functions for Optuna
def objective_xgb(trial):
    """Optuna objective for XGBoost: validation ROC-AUC for one sampled configuration.

    Reads the module-level X_train / y_train / X_valid / y_valid, which the fold
    loop assigns before calling `study.optimize`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC-AUC of the fitted model on the validation fold (to be maximised).
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
    # and matches the API already used by objective_lgbm.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0)
    }
    model = XGBClassifier(**params, eval_metric='auc', use_label_encoder=False)
    model.fit(X_train, y_train)
    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_pred_valid)

def objective_lgbm(trial):
    """Optuna objective for LightGBM: validation ROC-AUC for one sampled configuration.

    Reads the module-level X_train / y_train / X_valid / y_valid, which the fold
    loop assigns before calling `study.optimize`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC-AUC of the fitted model on the validation fold (to be maximised).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        # LightGBM only performs row subsampling when subsample_freq > 0; with the
        # default of 0 the `subsample` value above is ignored. It is sampled through
        # the trial (not hard-coded) so that it ends up in study.best_params and the
        # model refitted from those parameters uses the same setting.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        # Canonical scikit-learn-API name for LightGBM's `min_data_in_leaf`; passing
        # the alias makes LightGBM warn that the default min_child_samples is ignored.
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 50),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1e-2, log=True)
    }
    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)
    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_pred_valid)

def objective_catboost(trial):
    """Optuna objective for CatBoost: validation ROC-AUC for one sampled configuration.

    Reads the module-level X_train / y_train / X_valid / y_valid, which the fold
    loop assigns before calling `study.optimize`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC-AUC of the fitted model on the validation fold (to be maximised).
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
    # and matches the API already used by objective_lgbm.
    params = {
        'iterations': trial.suggest_int('iterations', 50, 200),
        'depth': trial.suggest_int('depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0)
    }
    model = CatBoostClassifier(**params, eval_metric='AUC', verbose=0)
    model.fit(X_train, y_train)
    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_pred_valid)

In [None]:
# Loop over folds and model families
OPTUNA_SEED = 42  # fixed sampler seed so the hyperparameter search is repeatable
N_TRIALS = 5      # Optuna trials per model per fold


def _tune_and_fit(objective, build_model):
    """Tune one model family on the current fold, then refit it with the best parameters.

    Reads the module-level X_train / y_train / X_valid / y_valid assigned by the
    fold loop below (the same globals the objective functions read).

    Parameters
    ----------
    objective : callable
        Optuna objective taking a trial and returning the validation ROC-AUC.
    build_model : callable
        Maps the best-parameter dict to an unfitted classifier.

    Returns
    -------
    tuple
        (study, fitted model, validation probabilities of class 1, validation ROC-AUC).
    """
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
    )
    study.optimize(objective, n_trials=N_TRIALS)
    model = build_model(study.best_params)
    model.fit(X_train, y_train)
    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    return study, model, y_pred_valid, roc_auc_score(y_valid, y_pred_valid)


# Start from empty accumulators so that re-running this cell replaces the previous
# models instead of appending a second set to the ensemble.
for key in models:
    models[key].clear()
    val_scores[key].clear()

# NOTE: each validation fold is used both to pick the hyperparameters and to compute
# the reported ROC-AUC, so the printed scores are optimistic estimates.
for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # XGBoost
    study_xgb, best_model_xgb, y_pred_valid_xgb, roc_auc_xgb = _tune_and_fit(
        objective_xgb,
        lambda params: XGBClassifier(**params, eval_metric='auc', use_label_encoder=False),
    )
    models['xgb'].append(best_model_xgb)
    val_scores['xgb'].append(roc_auc_xgb)

    # LightGBM
    study_lgbm, best_model_lgbm, y_pred_valid_lgbm, roc_auc_lgbm = _tune_and_fit(
        objective_lgbm,
        lambda params: LGBMClassifier(**params),
    )
    models['lgbm'].append(best_model_lgbm)
    val_scores['lgbm'].append(roc_auc_lgbm)

    # CatBoost
    study_catboost, best_model_catboost, y_pred_valid_catboost, roc_auc_catboost = _tune_and_fit(
        objective_catboost,
        lambda params: CatBoostClassifier(**params, eval_metric='AUC', verbose=0),
    )
    models['catboost'].append(best_model_catboost)
    val_scores['catboost'].append(roc_auc_catboost)

    print(f"Фолд {fold} - XGB ROC-AUC: {roc_auc_xgb}, LGBM ROC-AUC: {roc_auc_lgbm}, CatBoost ROC-AUC: {roc_auc_catboost}")


In [None]:
# Accumulate the positive-class probabilities of every fitted model on the test set
# (this is the running sum; it still has to be divided by the number of models).
_test_probas = (
    fitted.predict_proba(X_test)[:, 1]
    for fitted_models in models.values()
    for fitted in fitted_models
)
y_pred_test = sum(_test_probas, np.zeros(len(X_test)))

In [None]:
# Turn the accumulated sum into a mean. The divisor is the number of models actually
# stored in `models`, not a hard-coded n_splits * 3, so the average stays correct if a
# model family is added or removed or if the lists hold a different number of fits.
# The division is in place: re-run the accumulation cell before re-running this one.
n_fitted_models = sum(len(model_list) for model_list in models.values())
if n_fitted_models == 0:
    raise ValueError('No fitted models to average; run the fold loop first.')
y_pred_test /= n_fitted_models

In [None]:
# Write submit.csv: a single `apply_promo` column holding the averaged probabilities,
# one row per test row, without the index.
submit = pd.Series(y_pred_test, name='apply_promo').to_frame()
submit.to_csv('submit.csv', index=False)