# Libraries 

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier


# Functions

In [2]:


def preprocess_data(file_path, drop_cols, target_col):

    # Load dataset with only necessary columns
    df = pd.read_csv(file_path)
    print(f'Initial Data Shape: {df.shape}')

    # Drop specified columns efficiently
    df.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Remove duplicate rows in place
    print(f'Duplicate Rows: { df.duplicated().sum()}')
    df.drop_duplicates(inplace=True)

    print(f'Shape after dropping duplicates: {df.shape}')

    # Standardize features 
    cols_std = [col for col in df.columns if col != target_col]
    print(f'Num of features to standardize: {len(cols_std)}')

    if cols_std:  
        df[cols_std] = StandardScaler().fit_transform(df[cols_std])

    # shuffle df
    df = df.sample(frac=1).reset_index(drop=True)


    return df


In [3]:
def split_data(df, target_col, test_size=0.1, val_size=0.2):
    X, y = df.drop(columns=[target_col]), df[target_col]
    # split data  train val , test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, 
                                                        random_state=42, stratify=y)
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, 
                                                      random_state=42, stratify=y_train)

    print(f"Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}")
    target_sets = { "Train": y_train, "Validation": y_val, "Test": y_test }


    for set_name, target in target_sets.items():
        print(f"\nTarget Distribution in {set_name} Set:")
        print(target.value_counts(normalize=True))
        
    return X_train, X_val, X_test, y_train, y_val, y_test

In [4]:

def evaluate_model(model, X_true, y_true):
    
    # predict class and probability 
    y_pred = model.predict(X_true)
    y_pred_proba = model.predict_proba(X_true)
    
    # confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0  # True Negative Rate

    # Store metrics in a dictionary
    metrics_dict = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Specificity": specificity}
      
    print(f"\nConfusion Matrix:\n{cm}")
    print("\nEvaluation Metrics:")
    for key, value in metrics_dict.items():
        if value is not None: print(f"{key}: {value:.4f}")
    
    return metrics_dict


# Variables 

In [5]:
file_path = '/kaggle/input/bank-personal-loan-modelling/Bank_Personal_Loan_Modelling.csv'
drop_cols = ['ID', 'ZIP Code']
target_col = 'Personal Loan'
gift_cols_target = ['Income', 'CCAvg', 'CD Account', 'Education', 'Mortgage', 'Family', 'Personal Loan']

In [6]:
classifiers = {
    "SVM" : svm.SVC(probability=True), 
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "LightGBM": LGBMClassifier() }

classifiers_gift = {
    "SVM" : svm.SVC(probability=True), 
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "LightGBM": LGBMClassifier() }

# Main 

**Preprocessing**

In [7]:
df_pre_processed  = preprocess_data(file_path=file_path, 
                drop_cols=drop_cols, 
                target_col=target_col)

Initial Data Shape: (5000, 14)
Duplicate Rows: 13
Shape after dropping duplicates: (4987, 12)
Num of features to standardize: 11


In [8]:
df_gift = df_pre_processed[gift_cols_target]
print(f'GIFT cols: {df_gift.shape}')

GIFT cols: (4987, 7)


In [9]:
def main_function(df, target_col, classifiers): 
    trained_models = {}
    print(f' {"*" * 20} Train Test Split {"*" * 20}')
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df=df, target_col=target_col)
    print("=" * 60)

    for name, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        trained_models[name]  = classifier
        print(f"\n{classifier.__class__.__name__} Trained")
    
        print(f' {"*" * 20} Validation Set {"*" * 20}')
        evaluate_model(model = classifier, X_true=X_val, y_true=y_val)
        print(f' {"*" * 20} Test Set {"*" * 20}')
        evaluate_model(model = classifier, X_true = X_test, y_true=y_test)
        print("=" * 60)


## **ML models withOUT feature selection**

In [10]:
main_function(df=df_pre_processed, target_col = target_col, classifiers=classifiers)

 ******************** Train Test Split ********************
Train Shape: (3590, 11), Validation Shape: (898, 11), Test Shape: (499, 11)

Target Distribution in Train Set:
Personal Loan
0    0.903621
1    0.096379
Name: proportion, dtype: float64

Target Distribution in Validation Set:
Personal Loan
0    0.904232
1    0.095768
Name: proportion, dtype: float64

Target Distribution in Test Set:
Personal Loan
0    0.903808
1    0.096192
Name: proportion, dtype: float64

SVC Trained
 ******************** Validation Set ********************

Confusion Matrix:
[[811   1]
 [ 21  65]]

Evaluation Metrics:
Accuracy: 0.9755
Precision: 0.9848
Recall: 0.7558
F1-Score: 0.8553
Specificity: 0.9988
 ******************** Test Set ********************

Confusion Matrix:
[[449   2]
 [ 16  32]]

Evaluation Metrics:
Accuracy: 0.9639
Precision: 0.9412
Recall: 0.6667
F1-Score: 0.7805
Specificity: 0.9956

LogisticRegression Trained
 ******************** Validation Set ********************

Confusion Matrix:
[[

## **ML models with feature selection -- GIFT**

In [11]:
main_function(df=df_gift, target_col = target_col, classifiers=classifiers_gift)

 ******************** Train Test Split ********************
Train Shape: (3590, 6), Validation Shape: (898, 6), Test Shape: (499, 6)

Target Distribution in Train Set:
Personal Loan
0    0.903621
1    0.096379
Name: proportion, dtype: float64

Target Distribution in Validation Set:
Personal Loan
0    0.904232
1    0.095768
Name: proportion, dtype: float64

Target Distribution in Test Set:
Personal Loan
0    0.903808
1    0.096192
Name: proportion, dtype: float64

SVC Trained
 ******************** Validation Set ********************

Confusion Matrix:
[[812   0]
 [ 14  72]]

Evaluation Metrics:
Accuracy: 0.9844
Precision: 1.0000
Recall: 0.8372
F1-Score: 0.9114
Specificity: 1.0000
 ******************** Test Set ********************

Confusion Matrix:
[[449   2]
 [ 15  33]]

Evaluation Metrics:
Accuracy: 0.9659
Precision: 0.9429
Recall: 0.6875
F1-Score: 0.7952
Specificity: 0.9956

LogisticRegression Trained
 ******************** Validation Set ********************

Confusion Matrix:
[[803