In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier

**Loading the dataset**

In [None]:
# Read the semicolon-delimited bank-marketing file and preview the first rows.
DATA_PATH = "/content/bank-additional-full.csv.zip"
df = pd.read_csv(DATA_PATH, sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
# Discard the columns that are not used as model inputs in this notebook.
columns_to_drop = ['default', 'contact', 'month', 'day_of_week', 'poutcome', 'pdays', 'duration']
df = df.drop(columns=columns_to_drop)

In [None]:
# Keep only the rows in which none of these categorical fields is 'unknown'.
has_unknown = df[['job', 'marital', 'education', 'housing', 'loan']].eq('unknown').any(axis=1)
df = df[~has_unknown]
#df.loc[df['previous'] >= 1, 'previous'] = 1
#df.loc[df['campaign'] >= 5, 'campaign'] = 5

**Feature extraction**

In [None]:
# Collapse the two indicators into a single binary flag:
# 1 when either 'housing' or 'loan' is 'yes', otherwise 0.
has_any_loan = df[['housing', 'loan']].eq('yes').any(axis=1)
df['loan'] = has_any_loan.astype(int)

# 'loan' was overwritten in place above, so only 'housing' remains to be dropped.
df = df.drop('housing', axis=1)

In [None]:
# Add 'previous' to 'campaign' and cap the resulting count at 3.
df['campaign'] = (df['campaign'] + df['previous']).clip(upper=3)

In [None]:
# Binary-encode marital status: 1 for 'married', 0 for 'single' or 'divorced'.
marital_codes = {'married': 1, 'single': 0, 'divorced': 0}
df['marital'] = df['marital'].map(marital_codes)

In [None]:
# Encode the target column: 'yes' -> 1, 'no' -> 0.
target_codes = {'yes': 1, 'no': 0}
df['y'] = df['y'].map(target_codes)

In [None]:
# Bucket education into three ordinal levels:
# 0 = illiterate / basic schooling, 1 = high school, 2 = university or professional course.
education_levels = {
    'illiterate': 0,
    'basic.6y': 0,
    'basic.4y': 0,
    'basic.9y': 0,
    'high.school': 1,
    'university.degree': 2,
    'professional.course': 2,
}
df['education'] = df['education'].map(education_levels)

In [None]:
# Raw job titles, grouped into broader categories.
White_collar = ['admin.', 'management']
blue_collor = ['technician', 'blue-collar']
service_sector = ['services', 'housemaid']
not_employed = ['retired', 'unemployed', 'student']
self_employed = ['entrepreneur', 'self-employed']


def map_job(job):
    """Return the category label for ``job``; ungrouped values pass through unchanged."""
    # Checked in this order; the first group containing the job wins.
    groups = (
        ('White_collar', White_collar),
        ('blue_collor', blue_collor),
        ('service_sector', service_sector),
        ('not_employed', not_employed),
        ('self_employed', self_employed),
    )
    for label, members in groups:
        if job in members:
            return label
    return job

# Replace each raw job title with its category label.
df['job'] = df['job'].map(map_job)

In [None]:
# One-hot encode the grouped job label (first level dropped as the baseline) and
# swap the integer 0/1 indicator columns in for the original 'job' column.
one_hot = pd.get_dummies(df['job'], prefix='job', drop_first=True)
df = pd.concat([df.drop(columns=['job']), one_hot.astype(int)], axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns rescaled with StandardScaler.
# NOTE(review): the scaler is fitted on every row of df, before the train/test
# split further down, so held-out rows influence the scaling statistics.
cols_to_standardize = ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'nr.employed']

scaler = StandardScaler()

# Learn the per-column statistics, then overwrite the columns with scaled values.
scaler.fit(df[cols_to_standardize])
df[cols_to_standardize] = scaler.transform(df[cols_to_standardize])

In [None]:
# Preview the frame after cleaning, encoding and scaling.
df.head()

Unnamed: 0,age,marital,education,loan,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,job_blue_collor,job_not_employed,job_self_employed,job_service_sector
0,56,1,0,0,1,0,0.649546,0.735109,0.895747,4.857,0.328423,0,0,0,0,1
1,57,1,1,0,1,0,0.649546,0.735109,0.895747,4.857,0.328423,0,0,0,0,1
2,37,1,1,1,1,0,0.649546,0.735109,0.895747,4.857,0.328423,0,0,0,0,1
3,40,1,0,0,1,0,0.649546,0.735109,0.895747,4.857,0.328423,0,0,0,0,0
4,56,1,1,1,1,0,0.649546,0.735109,0.895747,4.857,0.328423,0,0,0,0,1


In [None]:
# (rows, columns) of the processed frame.
df.shape

(38245, 16)

**Feature selection**

In [None]:
import statsmodels.api as sm
import pandas as pd

def backward_selection(X, y, significance_level=0.05):
    """Select features by backward elimination on OLS p-values.

    On each pass an ordinary-least-squares model of ``y`` on the current
    features (plus an intercept) is fitted, and the feature with the largest
    p-value is removed. Elimination stops once no remaining p-value exceeds
    ``significance_level`` or no features are left. Each removal and the final
    feature list are printed.

    NOTE(review): ``y`` is a 0/1 target in this notebook, so the OLS p-values
    are those of a linear probability model; a logistic fit may be a better
    basis for selection -- confirm before relying on the result.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; column labels are used as feature names.
    y : array-like
        Response passed to ``sm.OLS``.
    significance_level : float, default 0.05
        A feature is dropped while its p-value is strictly greater than this.

    Returns
    -------
    list
        Labels of the retained columns, in their original order.
    """
    features = list(X.columns)
    while features:
        X_with_const = sm.add_constant(X[features])
        model = sm.OLS(y, X_with_const).fit()
        # Pick the candidates' p-values by label rather than slicing off the
        # first entry: add_constant() leaves the frame unchanged when it already
        # holds a constant column, and a positional slice would then hide a real
        # feature instead of the intercept.
        pvalues = model.pvalues[features]
        max_pval = pvalues.max()
        if max_pval > significance_level:
            excluded_feature = pvalues.idxmax()
            print(f"Dropping '{excluded_feature}' with p-value {max_pval}")
            features.remove(excluded_feature)
        else:
            break
    print("Final features:", features)
    return features

# Usage example:
# X = df.drop(columns=['target'])
# y = df['target']
# selected_features = backward_selection(X, y)


In [None]:
# Separate the target from the predictors, then prune the predictors by
# backward elimination.
y = df['y']
X = df.drop(columns=['y'])
selected_features = backward_selection(X, y)


Dropping 'loan' with p-value 0.8775197069977294
Dropping 'euribor3m' with p-value 0.3211010360177482
Dropping 'job_self_employed' with p-value 0.11992082874443685
Dropping 'age' with p-value 0.09369243093649549
Dropping 'campaign' with p-value 0.07038746271864607
Final features: ['marital', 'education', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'nr.employed', 'job_blue_collor', 'job_not_employed', 'job_service_sector']


In [None]:
# Build the model matrix from the features kept by backward_selection, rather
# than a hand-copied list of dropped columns that would go stale if the data or
# the significance threshold changed.
X = df[selected_features]

In [None]:
# Preview the selected predictor columns.
X.head()

Unnamed: 0,marital,education,previous,emp.var.rate,cons.price.idx,cons.conf.idx,nr.employed,job_blue_collor,job_not_employed,job_service_sector
0,1,0,0,0.649546,0.735109,0.895747,0.328423,0,0,1
1,1,1,0,0.649546,0.735109,0.895747,0.328423,0,0,1
2,1,1,0,0.649546,0.735109,0.895747,0.328423,0,0,1
3,1,0,0,0.649546,0.735109,0.895747,0.328423,0,0,0
4,1,1,0,0.649546,0.735109,0.895747,0.328423,0,0,1


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the selected features and the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Untuned logistic-regression baseline, fitted on the training split.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the baseline on both splits.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

Train accuracy: 0.8915871355732775
Test accuracy: 0.8895280428814224


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Stratified 80/20 split (same arguments as the earlier split cell), repeated
# here so this cell does not depend on that cell having been run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Candidate estimators, each with the hyper-parameter grid to search.
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'params': {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2'], 'solver': ['lbfgs']}
    },
    'SVM': {
        'model': SVC(random_state=42),
        # `gamma` is not used by the linear kernel, so it is searched for 'rbf'
        # only; one combined grid would fit every linear candidate twice.
        'params': [
            {'kernel': ['linear'], 'C': [0.1, 1, 10]},
            {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
        ]
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]}
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
    }
}

# Best refitted estimator per model name, kept for later evaluation.
best_estimators = {}

for name, mp in models.items():
    print(f"Training {name}...")
    # 5-fold cross-validated search on the training split, scored by accuracy.
    grid = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_estimators[name] = grid.best_estimator_
    print(f"Best params for {name}: {grid.best_params_}")
    # NOTE(review): picking a winner from these held-out accuracies would use
    # the test split for model selection.
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}\n")


Training LogisticRegression...
Best params for LogisticRegression: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
LogisticRegression Accuracy: 0.8911

Training SVM...
Best params for SVM: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
SVM Accuracy: 0.8941

Training RandomForest...
Best params for RandomForest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
RandomForest Accuracy: 0.8940

Training DecisionTree...
Best params for DecisionTree: {'max_depth': 10, 'min_samples_split': 2}
DecisionTree Accuracy: 0.8916



In [None]:
# Report test-set AUC and accuracy for every tuned model in `best_estimators`.
# Reading `grid` and `name` left over from the training loop would only cover
# whichever model happened to be trained last.
for name, estimator in best_estimators.items():
    # Use class-1 probabilities when available, otherwise the decision margin.
    if hasattr(estimator, "predict_proba"):
        y_scores = estimator.predict_proba(X_test)[:, 1]
    else:
        y_scores = estimator.decision_function(X_test)

    auc = roc_auc_score(y_test, y_scores)
    print(f"{name} AUC: {auc:.4f}")

    # Accuracy from the hard class predictions.
    y_pred = estimator.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}\n")


DecisionTree AUC: 0.7463
DecisionTree Accuracy: 0.8916



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Candidate estimators, each with the hyper-parameter grid to search.
# An empty grid means the estimator is cross-validated with its constructor settings.
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'params': {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2'], 'solver': ['lbfgs']}
    },
    'SVM': {
        # probability=True so predict_proba is available for the AUC below.
        'model': SVC(probability=True, random_state=42),
        # `gamma` is not used by the linear kernel, so it is searched for 'rbf'
        # only; one combined grid would fit every linear candidate twice.
        'params': [
            {'kernel': ['linear'], 'C': [0.1, 1, 10]},
            {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
        ]
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]}
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
    },
    'LDA': {
        'model': LinearDiscriminantAnalysis(),
        'params': {}
    },
    'RidgeClassifier': {
        'model': RidgeClassifier(random_state=42),
        'params': {'alpha': [0.1, 1.0, 10.0]}
    },
    'Perceptron': {
        'model': Perceptron(random_state=42),
        # NOTE(review): `alpha` scales the regularisation term, which presumably
        # only applies when `penalty` is set -- confirm these candidates differ.
        'params': {'alpha': [0.0001, 0.001, 0.01], 'max_iter': [1000]}
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=500, random_state=42),
        'params': {'hidden_layer_sizes': [(50,), (100,)], 'alpha': [0.0001, 0.001]}
    }
}

# Best refitted estimator per model name.
best_estimators = {}

for name, mp in models.items():
    print(f"Training {name}...")
    # 5-fold cross-validated search on the training split, scored by accuracy.
    grid = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_estimators[name] = grid.best_estimator_
    print(f"Best params for {name}: {grid.best_params_}")

    # Continuous scores for AUC: class-1 probability if the estimator exposes
    # it, else the decision margin, else no AUC is reported.
    if hasattr(grid.best_estimator_, "predict_proba"):
        y_scores = grid.predict_proba(X_test)[:, 1]
    elif hasattr(grid.best_estimator_, "decision_function"):
        y_scores = grid.decision_function(X_test)
    else:
        y_scores = None

    if y_scores is not None:
        auc = roc_auc_score(y_test, y_scores)
        print(f"{name} AUC: {auc:.4f}")
    else:
        print(f"{name} AUC: Not available")

    # Accuracy from the hard class predictions on the held-out split.
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}\n")


Training LogisticRegression...
Best params for LogisticRegression: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
LogisticRegression AUC: 0.7484
LogisticRegression Accuracy: 0.8911

Training SVM...
Best params for SVM: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
SVM AUC: 0.6303
SVM Accuracy: 0.8941

Training RandomForest...
Best params for RandomForest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
RandomForest AUC: 0.7744
RandomForest Accuracy: 0.8940

Training DecisionTree...
Best params for DecisionTree: {'max_depth': 10, 'min_samples_split': 2}
DecisionTree AUC: 0.7463
DecisionTree Accuracy: 0.8916

Training LDA...
Best params for LDA: {}
LDA AUC: 0.7441
LDA Accuracy: 0.8852

Training RidgeClassifier...
Best params for RidgeClassifier: {'alpha': 0.1}
RidgeClassifier AUC: 0.7441
RidgeClassifier Accuracy: 0.8915

Training Perceptron...
Best params for Perceptron: {'alpha': 0.0001, 'max_iter': 1000}
Perceptron AUC: 0.4766
Perceptron Accuracy: 0.8876

Training Gaussia