In [4]:
import pandas as pd

# Load dataset
df = pd.read_csv('17.1 UCBA/bank-additional-full.csv', sep=';')

# Overview
print("Shape of dataset:", df.shape)
df.info()
df.head()



Shape of dataset: (41188, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  c

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
# Total number of missing values
df.isna().sum().sum()   
# Check for duplicate rows
df.duplicated().sum()   
# Visually scan for weird entries
df.head(20)             

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [6]:
# Identifying all the unknowns
for col in df.select_dtypes(include='object'):
    if 'unknown' in df[col].unique():
        print(f"{col}: {df[col].value_counts()['unknown']} unknowns")


job: 330 unknowns
marital: 80 unknowns
education: 1731 unknowns
default: 8597 unknowns
housing: 990 unknowns
loan: 990 unknowns


In [7]:
# Drop marital unknowns (low count)
df = df[df['marital'] != 'unknown']

# Replace "unknown" with np.nan where we want to impute
import numpy as np
df['housing'] = df['housing'].replace('unknown', np.nan)
df['loan'] = df['loan'].replace('unknown', np.nan)

# Impute housing and loan with mode
df.loc[:, 'housing'] = df['housing'].fillna(df['housing'].mode()[0])
df.loc[:, 'loan'] = df['loan'].fillna(df['loan'].mode()[0])


In [8]:
# Check for nulls again to confirm imputation worked
print("Missing values remaining:", df.isna().sum().sum())

# Quick peek at class balance in the target
print("\nTarget variable breakdown:")
print(df['y'].value_counts(normalize=True))

# Recheck dataset shape after row drop
print("\nShape after cleaning:", df.shape)


Missing values remaining: 0

Target variable breakdown:
y
no     0.887419
yes    0.112581
Name: proportion, dtype: float64

Shape after cleaning: (41108, 21)


In [10]:
from sklearn.preprocessing import LabelEncoder

# Drop 'duration' (not known before call, so it's data leakage)
df = df.drop(columns=['duration'])

# Encode target variable (yes -> 1, no -> 0)
df['y'] = LabelEncoder().fit_transform(df['y'])

# One-hot encode all categorical features
df_encoded = pd.get_dummies(df, drop_first=True)

# Final check
print("Encoded shape:", df_encoded.shape)
df_encoded.head()


Encoded shape: (41108, 50)


Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,True,False,False,False,True,False,False,False,True,False
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,True,False,False,False,True,False,False,False,True,False
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,True,False,False,False,True,False,False,False,True,False
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,True,False,False,False,True,False,False,False,True,False
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,True,False,False,False,True,False,False,False,True,False


In [11]:
from sklearn.model_selection import train_test_split

# Separate features and target
X = df_encoded.drop(columns='y')
y = df_encoded['y']

# Train/test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Check balance in both sets
print("Train target breakdown:")
print(y_train.value_counts(normalize=True))
print("\nTest target breakdown:")
print(y_test.value_counts(normalize=True))



Train target breakdown:
y
0    0.887402
1    0.112598
Name: proportion, dtype: float64

Test target breakdown:
y
0    0.887456
1    0.112544
Name: proportion, dtype: float64


In [12]:
from sklearn.preprocessing import StandardScaler

# Get numeric columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# Initialize scaler
scaler = StandardScaler()

# Fit on training set, transform both train and test
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

print("Scaling complete. Sample of scaled features:")
X_train_scaled[numeric_features].head()


Scaling complete. Sample of scaled features:


Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
39128,1.827963,0.525755,0.195565,1.662834,-1.958253,-1.486388,1.614028,-1.680512,-1.976637
28774,-0.576627,-0.202194,0.195565,-0.347757,-1.195308,-0.862093,-1.425962,-1.271636,-0.934219
38916,1.250861,-0.566168,0.195565,1.662834,-2.212568,-1.596761,2.239274,-1.671298,-2.059368
30911,-0.095709,-0.202194,0.195565,1.662834,-1.195308,-1.175965,-1.23192,-1.309644,-0.934219
25849,0.481393,-0.566168,0.195565,1.662834,-0.114468,-0.646521,-0.326391,0.289001,0.399137


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(name, y_true, y_pred, y_proba):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_proba)
    return {
        "Model": name,
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1 Score": round(f1, 4),
        "ROC AUC": round(auc, 4)
    }


In [19]:
from sklearn.linear_model import LogisticRegression

# Initialize and train
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Predict
y_pred_lr = lr.predict(X_test)
y_proba_lr = lr.predict_proba(X_test)[:, 1]

# Evaluate
lr_results = evaluate_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr)
print(lr_results)



{'Model': 'Logistic Regression', 'Accuracy': 0.9034, 'Precision': 0.7244, 'Recall': 0.2291, 'F1 Score': 0.3481, 'ROC AUC': 0.799}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
from sklearn.tree import DecisionTreeClassifier

# Train
dt = DecisionTreeClassifier(random_state=42, max_depth=6)
dt.fit(X_train, y_train)

# Predict
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)[:, 1]

# Evaluate
dt_results = evaluate_model("Decision Tree", y_test, y_pred_dt, y_proba_dt)
print(dt_results)


{'Model': 'Decision Tree', 'Accuracy': 0.9018, 'Precision': 0.6712, 'Recall': 0.25, 'F1 Score': 0.3643, 'ROC AUC': 0.7842}


In [16]:
from sklearn.neighbors import KNeighborsClassifier

# Train kNN (k=5 default)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Predict
y_pred_knn = knn.predict(X_test_scaled)
y_proba_knn = knn.predict_proba(X_test_scaled)[:, 1]

# Evaluate
knn_results = evaluate_model("k-Nearest Neighbors", y_test, y_pred_knn, y_proba_knn)
print(knn_results)



{'Model': 'k-Nearest Neighbors', 'Accuracy': 0.8939, 'Precision': 0.5552, 'Recall': 0.2896, 'F1 Score': 0.3807, 'ROC AUC': 0.7418}


In [17]:
from sklearn.svm import SVC

# Train SVM (RBF kernel)
svm = SVC(probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)

# Predict
y_pred_svm = svm.predict(X_test_scaled)
y_proba_svm = svm.predict_proba(X_test_scaled)[:, 1]

# Evaluate
svm_results = evaluate_model("Support Vector Machine", y_test, y_pred_svm, y_proba_svm)
print(svm_results)


{'Model': 'Support Vector Machine', 'Accuracy': 0.9029, 'Precision': 0.7081, 'Recall': 0.2341, 'F1 Score': 0.3519, 'ROC AUC': 0.7056}
