<a href="https://colab.research.google.com/github/mshoaib40458/BankChurnPrediction/blob/main/Intern_Intelligence_Automated_Hyperparameter_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Optuna (imported below for the hyperparameter search); the %pip magic
# installs into the environment of the running kernel.
%pip install optuna



In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
import plotly.express as px

In [None]:
# Synthetic classification dataset: 10,000 rows and 20 features
# (15 informative + 5 redundant), generated with a fixed seed.
x, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42,
)

# Hold out 20% of the rows for testing, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Untuned XGBoost reference model: only the seed and the eval metric are set.
baseline_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# Accuracy under plain 5-fold CV, printed as mean +/- two standard deviations.
baseline_scores = cross_val_score(baseline_model, x_train, y_train, cv=5, scoring='accuracy')
print(f"Baseline Accuracy: {baseline_scores.mean():.4f} (+/- {baseline_scores.std()*2:.4f})")

# The Optuna objective is scored with F1 on seeded, shuffled stratified folds.
# Score the baseline the same way so the tuned F1 has a like-for-like reference;
# the accuracy figure above is not comparable to an F1 value.
baseline_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
baseline_f1_scores = cross_val_score(baseline_model, x_train, y_train, cv=baseline_kfold, scoring='f1')
print(f"Baseline F1 Score: {baseline_f1_scores.mean():.4f} (+/- {baseline_f1_scores.std()*2:.4f})")


Baseline Accuracy: 0.9656 (+/- 0.0123)


In [None]:
def objective(trial):
    """Score one Optuna trial by cross-validated F1 of an XGBoost classifier.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean F1 score over 5 stratified folds of ``x_train`` / ``y_train``.
    """
    # Suggest hyperparameters for Optuna to try
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # Sampled on a log scale so small learning rates are explored as densely as large ones
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    # Built exactly like the baseline and final models in this notebook: the
    # deprecated `use_label_encoder` flag is not passed, so the estimator that
    # gets tuned is the same one that is later refit with the best parameters.
    model = xgb.XGBClassifier(**params, random_state=42, eval_metric='logloss')
    # Seeded, shuffled stratified folds: every trial is scored on identical splits
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(model, x_train, y_train, cv=kfold, scoring='f1', n_jobs=-1).mean() # Using F1 as our target metric

    return score

In [None]:
# We want to maximize the F1 score. The TPE sampler is seeded like every other
# random component in this notebook, so re-running the search proposes the same
# sequence of trials and reproduces the same best parameters.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))

# Run the optimization
study.optimize(objective, n_trials=20)

[I 2025-08-30 21:36:10,492] A new study created in memory with name: no-name-a1261aaa-2c18-487b-bdc7-2665019bbd9d
[I 2025-08-30 21:36:24,084] Trial 0 finished with value: 0.9361485582035286 and parameters: {'n_estimators': 238, 'max_depth': 10, 'learning_rate': 0.04986061836960352, 'subsample': 0.786105065488794, 'colsample_bytree': 0.7611531768131291, 'gamma': 2.5844804717875087, 'reg_alpha': 7.55492740806107, 'reg_lambda': 9.442653526850208, 'min_child_weight': 10}. Best is trial 0 with value: 0.9361485582035286.
[I 2025-08-30 21:36:26,391] Trial 1 finished with value: 0.9096769336907016 and parameters: {'n_estimators': 219, 'max_depth': 3, 'learning_rate': 0.11471577542677909, 'subsample': 0.925339394674856, 'colsample_bytree': 0.9015752801258438, 'gamma': 4.859271929369917, 'reg_alpha': 6.805007181161925, 'reg_lambda': 6.772743707507921, 'min_child_weight': 10}. Best is trial 0 with value: 0.9361485582035286.
[I 2025-08-30 21:36:27,787] Trial 2 finished with value: 0.91589338607657

In [None]:
# ANALYSIS OF RESULTS
# Report the best trial: its objective value, then one line per tuned parameter.
print("Best trial:")
trial = study.best_trial
print(f"  Value (F1 Score): {trial.value:.4f}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

# Visualize the optimization history
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

Best trial:
  Value (F1 Score): 0.9504
  Params: 
    n_estimators: 431
    max_depth: 7
    learning_rate: 0.016967335883823403
    subsample: 0.6899890177982625
    colsample_bytree: 0.6467790775609144
    gamma: 0.054733057061037815
    reg_alpha: 0.5715234510149076
    reg_lambda: 1.8993080186927165
    min_child_weight: 5


In [None]:
# Refit on the full training split using the best hyperparameters from the study.
best_params = study.best_params
final_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    **best_params,
)
final_model.fit(x_train, y_train)

In [None]:
# Held-out predictions: column 1 of predict_proba, and the predicted class labels.
y_pred_proba = final_model.predict_proba(x_test)[:, 1]
y_pred = final_model.predict(x_test)

In [None]:
# Headline test-set metrics, one line each, followed by the per-class report.
print("\n--- FINAL MODEL PERFORMANCE REPORT ---")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"Test {metric_label}: {metric_fn(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- FINAL MODEL PERFORMANCE REPORT ---
Test Accuracy: 0.9530
Test Precision: 0.9584
Test Recall: 0.9469
Test F1-Score: 0.9526

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1002
           1       0.96      0.95      0.95       998

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [None]:
# Pair each feature's positional label with the fitted model's importance score,
# ordered from most to least important.
feature_labels = [f'Feature {i}' for i in range(x.shape[1])]
importance_df = pd.DataFrame(
    {'feature': feature_labels, 'importance': final_model.feature_importances_}
)
importance_df = importance_df.sort_values('importance', ascending=False)

# Bar chart with importance on the x-axis and one bar per feature.
fig = px.bar(importance_df, x='importance', y='feature', title='Feature Importance')
fig.show()

In [None]:
import pickle

# Persist the fitted model. The context manager flushes and closes the file
# handle even if pickling fails, instead of leaving it to garbage collection.
# NOTE: only unpickle model.pkl from a trusted source; pickle.load can execute
# arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)
