In [1]:
# Load saved train/test datasets from CSV files
import pandas as pd

X_train = pd.read_csv("X_train.csv")  # Features train set
X_test = pd.read_csv("X_test.csv")    # Features test set

# Load target variables and convert from DataFrame to Series with .squeeze()
y_train = pd.read_csv("y_train.csv").squeeze()  # Target train set
y_test = pd.read_csv("y_test.csv").squeeze()    # Target test set


In [5]:
import os
import random
import numpy as np
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

warnings.filterwarnings("ignore")

# Define the objective function for Optuna
def objective(trial, X_train, X_test, y_train, y_test):
    # RandomForest parameters
    rf_n = trial.suggest_int("rf_n_estimators", 50, 200)
    rf_depth = trial.suggest_int("rf_max_depth", 3, 20)

    # XGBoost parameters
    xgb_n = trial.suggest_int("xgb_n_estimators", 50, 200)
    xgb_depth = trial.suggest_int("xgb_max_depth", 3, 10)

    # LightGBM parameters
    lgb_n = trial.suggest_int("lgb_n_estimators", 50, 200)
    lgb_depth = trial.suggest_int("lgb_max_depth", 3, 20)
    lgb_lr = trial.suggest_float("lgb_learning_rate", 0.01, 0.3, log=True)

    # LogisticRegression parameter
    lr_C = trial.suggest_float("lr_C", 1e-3, 10, log=True)

    # Apply SMOTE to balance training data
    sm = SMOTE(random_state=SEED)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    # Define base learners
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=rf_n, max_depth=rf_depth, random_state=SEED)),
        ('xgb', XGBClassifier(n_estimators=xgb_n, max_depth=xgb_depth, use_label_encoder=False, eval_metric='logloss', random_state=SEED)),
        ('lgb', LGBMClassifier(n_estimators=lgb_n, max_depth=lgb_depth, learning_rate=lgb_lr, random_state=SEED))
    ]

    final_estimator = LogisticRegression(C=lr_C, max_iter=500, random_state=SEED)

    model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=3, n_jobs=-1)
    model.fit(X_res, y_res)

    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    return f1

# Load your train-test split data beforehand:
# X_train, X_test, y_train, y_test

# Run Optuna with fixed sampler seed
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
study.optimize(lambda trial: objective(trial, X_train, X_test, y_train, y_test), n_trials=30)

print("✅ Best Params:", study.best_params)

# Final model training using best params
best = study.best_params
sm = SMOTE(random_state=SEED)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

estimators = [
    ('rf', RandomForestClassifier(n_estimators=best['rf_n_estimators'], max_depth=best['rf_max_depth'], random_state=SEED)),
    ('xgb', XGBClassifier(n_estimators=best['xgb_n_estimators'], max_depth=best['xgb_max_depth'], use_label_encoder=False, eval_metric='logloss', random_state=SEED)),
    ('lgb', LGBMClassifier(n_estimators=best['lgb_n_estimators'], max_depth=best['lgb_max_depth'], learning_rate=best['lgb_learning_rate'], random_state=SEED))
]

final_estimator = LogisticRegression(C=best['lr_C'], max_iter=500, random_state=SEED)

stacking_clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5, n_jobs=-1)
stacking_clf.fit(X_resampled, y_resampled)

y_pred_final = stacking_clf.predict(X_test)


[I 2025-06-07 12:47:03,398] A new study created in memory with name: no-name-f446382c-82d0-410f-abe4-f27e7661668f


[I 2025-06-07 12:47:04,420] Trial 0 finished with value: 0.7733333333333333 and parameters: {'rf_n_estimators': 106, 'rf_max_depth': 20, 'xgb_n_estimators': 160, 'xgb_max_depth': 7, 'lgb_n_estimators': 73, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.012184186502221764, 'lr_C': 2.9154431891537547}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-06-07 12:47:05,654] Trial 1 finished with value: 0.7580645161290323 and parameters: {'rf_n_estimators': 140, 'rf_max_depth': 15, 'xgb_n_estimators': 53, 'xgb_max_depth': 10, 'lgb_n_estimators': 175, 'lgb_max_depth': 6, 'lgb_learning_rate': 0.01855998084649059, 'lr_C': 0.00541524411940254}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-06-07 12:47:06,518] Trial 2 finished with value: 0.7788079470198676 and parameters: {'rf_n_estimators': 95, 'rf_max_depth': 12, 'xgb_n_estimators': 115, 'xgb_max_depth': 5, 'lgb_n_estimators': 142, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.027010527749605478, 'lr_C': 0.029204338471814112}. Best is

✅ Best Params: {'rf_n_estimators': 63, 'rf_max_depth': 6, 'xgb_n_estimators': 56, 'xgb_max_depth': 5, 'lgb_n_estimators': 108, 'lgb_max_depth': 7, 'lgb_learning_rate': 0.16755052359850303, 'lr_C': 0.026730883107816707}


In [6]:
# Convert predictions to DataFrame and save to CSV
y_pred_final = pd.DataFrame(y_pred_final)  # Convert array to DataFrame
y_pred_final.to_csv('y_pred_final.csv', index=False)  # Save without index


In [7]:
# Save the trained stacking model using joblib
import joblib

joblib.dump(stacking_clf, 'stacking_model.pkl')  # Save model to file


['stacking_model.pkl']