In [3]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'validated_churn_data.parquet')
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [6]:
from src.data_split import train_test_split_stratified

X_train, y_train, X_test, y_test = train_test_split_stratified(
    df,
    target_column_name='Exited'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')


X_train.shape=(8000, 10)
y_train.shape=(8000,)
X_test.shape=(2000, 10)
y_test.shape=(2000,)


In [32]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
# from sklearn.pipeline import make_pipeline
from sklearn.metrics import auc,accuracy_score,f1_score
import optuna


from src.data_split import train_test_split_stratified
from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a train test split
    """
    # pick hyper-parameters
    hyperparams = {
        "objective": 'binary',
        "metric" : "f1_score",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),  
        "max_depth" : trial.suggest_int("max_depth", 3, 50)
    }
    

    skf = StratifiedKFold(n_splits=5)
    f1_scores_ = []
    for train_index, val_index in skf.split(X_train, y_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        f1_ = f1_score(y_val_, y_pred)

        f1_scores_.append(f1_)
   
    # Return the mean score
    return np.array(f1_scores_).mean()

In [33]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-05-30 12:35:14,804] A new study created in memory with name: no-name-ad7011a5-6179-4608-91b0-fc0179d6a881
[I 2024-05-30 12:35:16,114] Trial 0 finished with value: 0.569163202304188 and parameters: {'num_leaves': 108, 'feature_fraction': 0.5989685431256477, 'bagging_fraction': 0.5982248268556387, 'min_child_samples': 56, 'max_depth': 41}. Best is trial 0 with value: 0.569163202304188.
[I 2024-05-30 12:35:16,633] Trial 1 finished with value: 0.5600002622975699 and parameters: {'num_leaves': 93, 'feature_fraction': 0.45484235339222523, 'bagging_fraction': 0.931853804620481, 'min_child_samples': 57, 'max_depth': 3}. Best is trial 0 with value: 0.569163202304188.
[I 2024-05-30 12:35:18,336] Trial 2 finished with value: 0.5257522322355329 and parameters: {'num_leaves': 248, 'feature_fraction': 0.31143449504481624, 'bagging_fraction': 0.8823853321280797, 'min_child_samples': 31, 'max_depth': 26}. Best is trial 0 with value: 0.569163202304188.
[I 2024-05-30 12:35:19,552] Trial 3 finish

In [34]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 70, 'feature_fraction': 0.6627412441646062, 'bagging_fraction': 0.5105893676260749, 'min_child_samples': 91, 'max_depth': 8}


In [36]:
study.best_trial.value

0.5838020681872037

In [37]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1630, number of negative: 6370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 855
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203750 -> initscore=-1.363019
[LightGBM] [Info] Start training from score -1.363019


In [38]:
predictions = pipeline.predict(X_test)




In [39]:
from sklearn.metrics import accuracy_score,recall_score, f1_score

test_accuracy = accuracy_score(y_test, predictions)
test_recall = recall_score(y_test,predictions )
test_f1 =f1_score(y_test,predictions)
print(f'{test_accuracy=:.4f}')
print(f'{test_recall=:.4f}')
print(f'{test_f1=:.4f}')

test_accuracy=0.8710
test_recall=0.5233
test_f1=0.6228


In [40]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['C:\\Users\\karthikeya\\bank_customer_attrition_prediction\\models\\model.pkl']

In [42]:
X_test

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9723,526,France,Male,32,7,125540.05,1,0,0,86786.41
1809,639,France,Female,50,6,115335.32,2,1,1,53130.41
456,733,France,Male,33,3,0.00,1,1,1,7666.73
7458,728,Spain,Female,43,5,0.00,1,1,1,120088.17
7403,796,Spain,Male,56,6,94231.13,1,0,0,121164.60
...,...,...,...,...,...,...,...,...,...,...
7338,708,Germany,Female,54,8,145151.40,1,0,1,125311.17
7095,753,Germany,Female,38,1,117314.92,1,1,0,122021.33
1411,685,Germany,Female,30,4,84958.60,2,0,1,194343.72
435,762,France,Female,51,3,99286.98,1,0,1,85578.63
