In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

path = r"D:\Percia_MTech\GUVI\python\Projects\bank_deposit\data\train.csv"
df = pd.read_csv(path)
df.tail()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
749995,749995,29,services,single,secondary,no,1282,no,yes,unknown,4,jul,1006,2,-1,0,unknown,1
749996,749996,69,retired,divorced,tertiary,no,631,no,no,cellular,19,aug,87,1,-1,0,unknown,0
749997,749997,50,blue-collar,married,secondary,no,217,yes,no,cellular,17,apr,113,1,-1,0,unknown,0
749998,749998,32,technician,married,secondary,no,-274,no,no,cellular,26,aug,108,6,-1,0,unknown,0
749999,749999,42,technician,married,secondary,no,1559,no,no,cellular,4,aug,143,1,1,7,failure,0


In [2]:
df.dtypes

id            int64
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object

In [3]:
categorical_cols = df.select_dtypes(include='object').columns
numeric_cols = df.select_dtypes(include=['int64']).columns
print("\nCategorical Columns:", list(categorical_cols))
print("Numeric Columns:", list(numeric_cols))


Categorical Columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numeric Columns: ['id', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y']


In [4]:
df['y'].value_counts()

y
0    659512
1     90488
Name: count, dtype: int64

In [5]:
# remove unwanted columns
unwanted_cols = ['id', 'contact']  
df.drop(columns=unwanted_cols, inplace=True, errors='ignore')

print("Removed unwanted columns:", unwanted_cols)

Removed unwanted columns: ['id', 'contact']


In [96]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,married,secondary,no,7,no,no,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,no,514,no,no,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,no,602,yes,no,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,no,34,yes,no,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,no,889,yes,no,3,feb,902,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,29,services,single,secondary,no,1282,no,yes,4,jul,1006,2,-1,0,unknown,1
749996,69,retired,divorced,tertiary,no,631,no,no,19,aug,87,1,-1,0,unknown,0
749997,50,blue-collar,married,secondary,no,217,yes,no,17,apr,113,1,-1,0,unknown,0
749998,32,technician,married,secondary,no,-274,no,no,26,aug,108,6,-1,0,unknown,0


In [6]:
# label encoding for ordinal or binary features
encoder = LabelEncoder()

for col in ['education','housing','loan','default','poutcome','month','marital']:
    df[col] = encoder.fit_transform(df[col])

In [7]:
# one hot encoding for nominal (no order) features
df_encode = pd.get_dummies(df, columns=['job'], drop_first=True)

In [80]:
df['y'] = df['y'].map({1: 'yes', 0: 'no'})

In [81]:
df['y']

0          no
1          no
2          no
3          no
4         yes
         ... 
749995    yes
749996     no
749997     no
749998     no
749999     no
Name: y, Length: 750000, dtype: object

In [None]:
categorical_cols = df.select_dtypes(include='object').columns
numeric_cols = df.select_dtypes(include=['int64']).columns
print("\nCategorical Columns:", list(categorical_cols))
print("Numeric Columns:", list(numeric_cols))

In [None]:
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [8]:
df['y']

0         0
1         0
2         0
3         0
4         1
         ..
749995    1
749996    0
749997    0
749998    0
749999    0
Name: y, Length: 750000, dtype: int64

In [9]:
X = df_encode.drop('y', axis=1)
y = df['y']

print("Feature Shape:", X.shape)
print("Target Shape:", y.shape)

Feature Shape: (750000, 25)
Target Shape: (750000,)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression

In [11]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Random Forest Classifier

In [12]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

# XGBoost Classifier

In [13]:
xgb_model = XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# LightGBM Classifier

In [14]:
lgb_model = LGBMClassifier(random_state=42, n_estimators=100, learning_rate=0.1)
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

[LightGBM] [Info] Number of positive: 72283, number of negative: 527717
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1006
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120472 -> initscore=-1.987971
[LightGBM] [Info] Start training from score -1.987971


# Gradient Boosting

In [15]:
gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Support Vector Machine

In [None]:
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Naive Bayes

In [16]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [29]:
results = []
def evaluate_model(name, model, X_test, y_test, y_prob=None):
    y_pred = model.predict(X_test)
    if y_prob is None:
        # SVM may not have predict_proba
        try:
            y_prob = model.predict_proba(X_test)[:, 1]
        except:
            y_prob = np.zeros(len(y_pred))

    print(f"\n📈 Model: {name}")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("Precision:", round(precision_score(y_test, y_pred), 4))
    print("Recall:", round(recall_score(y_test, y_pred), 4))
    print("F1 Score:", round(f1_score(y_test, y_pred), 4))
    print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_prob)
    }

In [30]:
results.append(evaluate_model("Logistic Regression",log_reg, X_test, y_test))
results.append(evaluate_model("Random Forest",rf,X_test, y_test))
results.append(evaluate_model("XGBoost", xgb_model, X_test, y_test))
results.append(evaluate_model("LightGBM", lgb_model, X_test, y_test))
results.append(evaluate_model("Gradient Boosting", gb_model, X_test, y_test))
#results.append(evaluate_model("SVM", svm_model, X_test, y_test))
results.append(evaluate_model("Naive Bayes", nb_model, X_test, y_test))
results_df = pd.DataFrame(results)


📈 Model: Logistic Regression
Accuracy: 0.9008
Precision: 0.6456
Recall: 0.4054
F1 Score: 0.4981
ROC-AUC: 0.9118

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94    131795
           1       0.65      0.41      0.50     18205

    accuracy                           0.90    150000
   macro avg       0.78      0.69      0.72    150000
weighted avg       0.89      0.90      0.89    150000


📈 Model: Random Forest
Accuracy: 0.9296
Precision: 0.7513
Recall: 0.6277
F1 Score: 0.684
ROC-AUC: 0.9592

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96    131795
           1       0.75      0.63      0.68     18205

    accuracy                           0.93    150000
   macro avg       0.85      0.80      0.82    150000
weighted avg       0.93      0.93      0.93    150000


📈 Model: XGBoost
Accuracy: 0.9291
Precision: 0.7537
Recall: 0.6177
F1 Score:

In [32]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="ROC-AUC", ascending=False)
print("\n🏆 Model Comparison Table:")
display(results_df)


🏆 Model Comparison Table:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC-AUC
3,LightGBM,0.93158,0.754976,0.645867,0.696172,0.961991
2,XGBoost,0.9291,0.753652,0.617742,0.678963,0.960153
1,Random Forest,0.9296,0.751298,0.62774,0.683984,0.959178
4,Gradient Boosting,0.92256,0.7371,0.562593,0.638131,0.950411
0,Logistic Regression,0.900827,0.645587,0.405438,0.498077,0.911834
5,Naive Bayes,0.836873,0.385426,0.578742,0.462704,0.842047


In [None]:
import joblib
model = joblib.load("D:/Percia_MTech/GUVI/python/Projects/emi_prediction/models/classification/XGBoost_best.pkl")
print(type(model))
print(model)
