## ***Essential libraries:***
***

In [19]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization

import warnings
warnings.filterwarnings('ignore')


In [20]:

data = pd.read_csv('../data/clean_data.csv')
data.head()


Unnamed: 0,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


***
#### ***Encoding Categorical Variables***
***

In [21]:

new_data = pd.get_dummies(data, columns=['geography', 'gender'], drop_first=True)

bool_cols = new_data.select_dtypes(include='bool').columns
bool_cols = new_data.select_dtypes(include='bool').columns

new_data[bool_cols] = new_data[bool_cols].astype(int)


***
### ***Data Splitting (Train/Test Sets)***
***

In [22]:

x = new_data.drop(columns='exited',axis=1)
y = new_data['exited']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)


In [23]:

x_train.shape, x_test.shape, y_train.shape, y_test.shape


((8000, 11), (2000, 11), (8000,), (2000,))

***
#### ***Feature Scaling***
***


In [24]:

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


***
#### ***Apply SMOTE to training set***
***


In [25]:

smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_scaled, y_train)


In [26]:

unique, counts = np.unique(y_train_resampled, return_counts=True)
print(dict(zip(unique, counts)))


{0: 6365, 1: 6365}


***
#### ***Modeling & Evaluation***
***


In [27]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "xgboost": xgb.XGBClassifier(),
    "lightgbm": lgb.LGBMClassifier(),
    "catboost": cb.CatBoostClassifier(verbose=0)
}

results = {}

for name, model in models.items():
    model.fit(x_train_resampled, y_train_resampled)
    y_pred = model.predict(x_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1 Score": f1}


[LightGBM] [Info] Number of positive: 6365, number of negative: 6365
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1685
[LightGBM] [Info] Number of data points in the train set: 12730, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [29]:

# Results DataFrame
model_results = pd.DataFrame(results).T
display(model_results.sort_values(by='Accuracy', ascending=False))
print(47*'-')
display(model_results.sort_values(by='F1 Score', ascending=False))


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
catboost,0.8515,0.65861,0.542289,0.594816
lightgbm,0.847,0.637143,0.554726,0.593085
xgboost,0.835,0.598901,0.542289,0.569191
Random Forest,0.8265,0.569975,0.557214,0.563522
Gradient Boosting,0.8265,0.558887,0.649254,0.60069
Decision Tree,0.767,0.436,0.542289,0.48337
Logistic Regression,0.684,0.351804,0.679104,0.463497


-----------------------------------------------


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Gradient Boosting,0.8265,0.558887,0.649254,0.60069
catboost,0.8515,0.65861,0.542289,0.594816
lightgbm,0.847,0.637143,0.554726,0.593085
xgboost,0.835,0.598901,0.542289,0.569191
Random Forest,0.8265,0.569975,0.557214,0.563522
Decision Tree,0.767,0.436,0.542289,0.48337
Logistic Regression,0.684,0.351804,0.679104,0.463497


#### ***Insights:***
- ***`Best F1 Score:`*** Gradient Boosting (0.6007) – Even though it doesn’t have the highest accuracy, its balance between precision and recall gives it the best F1 score, ideal for imbalanced datasets.

- ***`Best Accuracy:`*** CatBoost (0.8515) – Performs well overall and is likely the best option if accuracy is your primary metric.

- Logistic Regression shows the weakest performance — typical for linear models on complex data.

***
### ***Tuning***
***

In [30]:

param_grids = {
    "Logistic Regression": {
        'model': LogisticRegression(max_iter=500),
        'params': {
            'C': [0.1, 1, 10]
        }
    },
    "Decision Tree": {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [3, 5],
            'min_samples_split': [2, 5]
        }
    },
    "Random Forest": {
        'model': RandomForestClassifier(n_jobs=-1),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [3, 5]
        }
    },
    "Gradient Boosting": {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [50],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3]
        }
    },
    "XGBoost": {
        'model': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
        'params': {
            'n_estimators': [50],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3]
        }
    },
    "LightGBM": {
        'model': lgb.LGBMClassifier(n_jobs=-1),
        'params': {
            'n_estimators': [50],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3]
        }
    },
    "CatBoost": {
        'model': cb.CatBoostClassifier(verbose=0),
        'params': {
            'iterations': [50],
            'learning_rate': [0.05, 0.1],
            'depth': [3]
        }
    }
}

results = {}

for name, mp in param_grids.items():
    grid = GridSearchCV(mp['model'], mp['params'], cv=3, scoring='f1', n_jobs=-1)
    grid.fit(x_train_resampled, y_train_resampled)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(x_test_scaled)

    results[name] = {
        "Best Params": grid.best_params_,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

model_results = pd.DataFrame(results).T
display(model_results.sort_values(by='Accuracy', ascending=False))


[LightGBM] [Info] Number of positive: 6365, number of negative: 6365
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1685
[LightGBM] [Info] Number of data points in the train set: 12730, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Best Params,Accuracy,Precision,Recall,F1 Score
Gradient Boosting,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.8005,0.502762,0.679104,0.577778
LightGBM,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.799,0.5,0.679104,0.575949
XGBoost,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.7965,0.495512,0.686567,0.5756
CatBoost,"{'depth': 3, 'iterations': 50, 'learning_rate'...",0.79,0.484211,0.686567,0.567901
Decision Tree,"{'max_depth': 5, 'min_samples_split': 2}",0.7845,0.473684,0.649254,0.547744
Random Forest,"{'max_depth': 5, 'n_estimators': 50}",0.78,0.468013,0.691542,0.558233
Logistic Regression,{'C': 1},0.684,0.351804,0.679104,0.463497


#### ***Key Insights:***
Overall Performance Drop After Tuning:
- `Most models slightly dropped in accuracy and F1 Score after hyperparameter tuning`, likely due to reduced complexity (e.g., lower max_depth) or constraints added to avoid overfitting.

- `Gradient Boosting Still Performs Well:` Even after tuning, Gradient Boosting maintains a strong F1 Score (0.5778) and good general performance — making it a reliable choice, especially for imbalanced classification tasks.

- C`atBoost Dropped in Accuracy:` Accuracy went down from 0.8515 to 0.790 — hyperparameters like fewer iterations or depth may have caused underfitting.

- `Logistic Regression Remained the Same:` No noticeable improvement — expected for linear models on non-linear data.
***

# ***Final Recommendation***
*After comparing the performance of various classification models before and after hyperparameter tuning, the following recommendations are proposed:*

***$->$ Best Overall Model (Balanced Performance)***       
`Gradient Boosting (Tuned)`
- Best Params: learning_rate=0.1, max_depth=3, n_estimators=100
- F1 Score: 0.5778
- Recall: 0.6791
- Stable and reliable performance across all metrics
- Recommended when both precision and recall are important (e.g., churn, fraud detection)

***$->$ Best Model (Baseline - Highest Accuracy)***             
`CatBoost (Baseline)`
- Accuracy: 0.8515
- Slight trade-off in recall and F1 Score
- Fast and requires minimal preprocessing (handles categorical features internally)
- Recommended for scenarios where accuracy matters more than recall
- Best Lightweight Model (Simple & Interpretable)

`Decision Tree (Tuned)`
- Best Params: max_depth=5, min_samples_split=2
- Improved F1 Score after tuning
- Easy to visualize and explain
- Recommended for quick baseline testing or when explainability is a priority
- Models Not Recommended for This Dataset

`Logistic Regression`
- Consistently underperformed on all metrics
- Suitable only for linearly separable data
***


### ***final model:***

In [31]:

cbc = cb.CatBoostClassifier(verbose=0)  # Suppress training output
cbc.fit(x_train_resampled, y_train_resampled)

y_pred = cbc.predict(x_test_scaled)

acc = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy (CatBoost): {acc:.4f}")


Final Model Accuracy (CatBoost): 0.8515


In [32]:

import joblib

joblib.dump(cbc, '../models/final_catboost_model.pkl')


['../models/final_catboost_model.pkl']