In [None]:
# Load the previously saved train/test splits from CSV files in the working directory.
import pandas as pd

X_train = pd.read_csv("X_train.csv")  # Feature matrix used for fitting
X_test = pd.read_csv("X_test.csv")    # Feature matrix held out for evaluation

# read_csv always returns a DataFrame; collapse the single target column to a Series.
# squeeze("columns") is used instead of a bare squeeze() so that only the column
# axis is collapsed: a bare squeeze() would turn a one-row file into a scalar.
y_train = pd.read_csv("y_train.csv").squeeze("columns")  # Target for the training rows
y_test = pd.read_csv("y_test.csv").squeeze("columns")    # Target for the test rows


In [None]:
import optuna
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings

# Install a global "ignore" filter: every warning category is silenced from this
# point on, which keeps the tuning log readable but also hides deprecation and
# convergence warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

def objective(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: F1 score of a stacking ensemble on a validation split.

    The trial samples hyperparameters for three base learners (RandomForest,
    XGBoost, LightGBM) and for the LogisticRegression meta-learner, fits a
    StackingClassifier on SMOTE-balanced training data, and returns the F1
    score (f1_score with its default averaging) on held-out validation rows.

    Model selection must not look at the test set, otherwise the test metrics
    reported afterwards are optimistically biased. The validation rows are
    therefore split off from ``X_train``/``y_train`` and SMOTE is applied only
    to the remaining fit rows, so no synthetic sample is derived from a
    validation row.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X_train, y_train
        Training features and target; split internally into fit/validation.
    X_test, y_test
        Accepted for backward compatibility with existing callers; deliberately
        not used for scoring.

    Returns
    -------
    float
        F1 score on the internal validation split (higher is better).
    """
    # Tune RandomForest parameters
    rf_n = trial.suggest_int("rf_n_estimators", 50, 200)
    rf_depth = trial.suggest_int("rf_max_depth", 3, 20)

    # Tune XGBoost parameters
    xgb_n = trial.suggest_int("xgb_n_estimators", 50, 200)
    xgb_depth = trial.suggest_int("xgb_max_depth", 3, 10)

    # Tune LightGBM parameters (learning rate sampled on a log scale)
    lgb_n = trial.suggest_int("lgb_n_estimators", 50, 200)
    lgb_depth = trial.suggest_int("lgb_max_depth", 3, 20)
    lgb_lr = trial.suggest_float("lgb_learning_rate", 0.01, 0.3, log=True)

    # Tune LogisticRegression regularization strength (log scale)
    lr_C = trial.suggest_float("lr_C", 1e-3, 10, log=True)

    # Hold out a stratified validation fold from the training data. The fixed
    # random_state makes every trial score on the same rows, so trial values
    # are directly comparable.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )

    # Balance only the fit portion with SMOTE; validation rows stay untouched
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_fit, y_fit)

    # Define base learners with current trial parameters
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=rf_n, max_depth=rf_depth, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=xgb_n, max_depth=xgb_depth, use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('lgb', LGBMClassifier(n_estimators=lgb_n, max_depth=lgb_depth, learning_rate=lgb_lr, random_state=42))
    ]

    # Meta-learner that combines the base learners' predictions
    final_estimator = LogisticRegression(C=lr_C, max_iter=500, random_state=42)

    # Stacking ensemble; cv=3 controls the internal folds used to build the
    # meta-features during tuning
    model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=3, n_jobs=-1)
    model.fit(X_res, y_res)

    # Score on the validation fold, never on the test set
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

# Create and run a study that maximizes the F1 score returned by `objective`.
# The sampler is seeded explicitly: every estimator and SMOTE below already use
# random_state=42, but an unseeded sampler would still propose different
# hyperparameters on each run, making `best_params` irreproducible.
# TPESampler is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(lambda trial: objective(trial, X_train, X_test, y_train, y_test), n_trials=30)

print("✅ Best Params:", study.best_params)

# Final training with the best parameters on the SMOTE-balanced training data
best = study.best_params
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Base learners rebuilt from the winning trial's parameters
estimators = [
    ('rf', RandomForestClassifier(n_estimators=best['rf_n_estimators'], max_depth=best['rf_max_depth'], random_state=42)),
    ('xgb', XGBClassifier(n_estimators=best['xgb_n_estimators'], max_depth=best['xgb_max_depth'], use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('lgb', LGBMClassifier(n_estimators=best['lgb_n_estimators'], max_depth=best['lgb_max_depth'], learning_rate=best['lgb_learning_rate'], random_state=42))
]

# Meta-learner with the tuned regularization strength
final_estimator = LogisticRegression(C=best['lr_C'], max_iter=500, random_state=42)

# The final ensemble uses cv=5 internal folds (the tuning objective uses cv=3)
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5, n_jobs=-1)
stacking_clf.fit(X_resampled, y_resampled)

# Predictions of the final model on the test features
y_pred_final = stacking_clf.predict(X_test)


[I 2025-06-06 13:12:16,305] A new study created in memory with name: no-name-f0cb7173-1ae4-4203-83f7-ad60e3f2196d


[I 2025-06-06 13:12:24,310] Trial 0 finished with value: 0.7723035952063915 and parameters: {'rf_n_estimators': 56, 'rf_max_depth': 4, 'xgb_n_estimators': 107, 'xgb_max_depth': 3, 'lgb_n_estimators': 82, 'lgb_max_depth': 14, 'lgb_learning_rate': 0.026444618527784403, 'lr_C': 0.05819607919312124}. Best is trial 0 with value: 0.7723035952063915.
[I 2025-06-06 13:12:30,157] Trial 1 finished with value: 0.7630522088353414 and parameters: {'rf_n_estimators': 60, 'rf_max_depth': 18, 'xgb_n_estimators': 117, 'xgb_max_depth': 4, 'lgb_n_estimators': 70, 'lgb_max_depth': 13, 'lgb_learning_rate': 0.06658543054140154, 'lr_C': 0.02520592270292888}. Best is trial 0 with value: 0.7723035952063915.
[I 2025-06-06 13:12:31,494] Trial 2 finished with value: 0.7710843373493976 and parameters: {'rf_n_estimators': 132, 'rf_max_depth': 19, 'xgb_n_estimators': 174, 'xgb_max_depth': 3, 'lgb_n_estimators': 191, 'lgb_max_depth': 19, 'lgb_learning_rate': 0.01738814216670061, 'lr_C': 0.018323007058340945}. Best is

✅ بهترین پارامترها: {'rf_n_estimators': 170, 'rf_max_depth': 7, 'xgb_n_estimators': 130, 'xgb_max_depth': 5, 'lgb_n_estimators': 108, 'lgb_max_depth': 12, 'lgb_learning_rate': 0.05582511670322425, 'lr_C': 0.055269590001462116}


In [None]:
# Wrap the final predictions in a DataFrame (rebinding the same name) and
# persist them as CSV, omitting the row index from the file.
y_pred_final = pd.DataFrame(data=y_pred_final)
y_pred_final.to_csv(path_or_buf='y_pred_final.csv', index=False)


In [None]:
# Persist the fitted stacking ensemble to disk with joblib so it can be
# reloaded later without retraining.
import joblib

joblib.dump(value=stacking_clf, filename='stacking_model.pkl')


['model/stacking_model.pkl']