# Step 1: Import Libraries

In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Step 2: Load Dataset

In [61]:
def load_bank_csv(path="bank.csv"):
    """Read the bank CSV at ``path``, retrying with ``;`` as the separator.

    The file is first parsed with pandas' default (comma) separator. If that
    yields a single column, the file is re-read with ``sep=";"`` and that
    result is returned instead.

    Parameters
    ----------
    path : str
        Location of the CSV file. Defaults to ``"bank.csv"``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(path)
    # A lone column after a comma parse suggests the delimiter was not a comma.
    parsed_as_single_column = frame.shape[1] == 1
    return pd.read_csv(path, sep=";") if parsed_as_single_column else frame

# Load the dataset once and show its dimensions and column names.
df = load_bank_csv("bank.csv")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

Shape: (4521, 17)
Columns: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


# Step 3: Data Preprocessing

In [62]:
# Build the feature matrix and label directly from `df` so this cell runs on a
# fresh kernel (Restart & Run All). Without these three lines the cell relies
# on `X`, `y` and `target_col` left over from earlier, unsaved kernel state.
target_col = "y"  # label column, as listed in the loaded frame's columns
X = df.drop(columns=[target_col])
y = df[target_col]

# Text labels must be turned into integers before modelling. `dtype == "O"`
# covers object columns; `is_string_dtype` also covers pandas' dedicated
# string dtypes.
if y.dtype == "O" or pd.api.types.is_string_dtype(y):
    # common mapping for bank marketing dataset
    mapping = {"yes": 1, "no": 0, "y": 1, "n": 0, "true": 1, "false": 0}
    y = y.str.lower().map(mapping)
    if y.isna().any():
        # Fallback: at least one value was not in `mapping`, so encode every
        # distinct string as an integer code instead. `pd.factorize` returns a
        # NumPy array; wrap it in a Series so `.nunique()` below keeps working
        # and the index stays aligned with `X`.
        codes, _ = pd.factorize(df[target_col])
        y = pd.Series(codes, index=df.index, name=target_col)
y = y.astype(int)

# One-hot encode non-numeric feature columns (dropping the first level of each)
# and replace any remaining missing values with 0.
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(0)

# A single-class label cannot be stratified or scored; fail early and clearly.
if y.nunique() < 2:
    raise ValueError("Label column has only one class in the whole dataset.")

In [63]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Feature scaling mainly helps Logistic Regression; the tree models do not
# need it. `with_mean=False` scales without centering (the sparse-safe mode).
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers, all using balanced class weights.
_balanced = {"class_weight": "balanced"}
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, **_balanced),
    "Decision Tree": DecisionTreeClassifier(random_state=42, **_balanced),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42, **_balanced),
}

# Step 4: Model Training

In [64]:
# Fit every candidate on the training split and report test-set metrics.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC should be computed from a continuous score. Choose the score by
    # what the estimator offers rather than wrapping the call in
    # `except Exception`, which would silently downgrade to a label-based AUC
    # on *any* unexpected error and hide the real problem.
    if hasattr(model, "predict_proba"):
        # Second column: predicted probability of the higher-valued class (1).
        y_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_proba = model.decision_function(X_test)
    else:
        # Last resort: hard labels (gives a coarse, single-threshold AUC).
        y_proba = y_pred
    auc = roc_auc_score(y_test, y_proba)

    print(f"\n=== {name} ===")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("ROC-AUC:", round(auc, 4))

# (Nice to have) Feature importances for Random Forest, largest first.
rf = models["Random Forest"]
rf_scores = getattr(rf, "feature_importances_", None)
if rf_scores is not None:
    # Label each importance with its column name from the encoded feature frame.
    importances = (
        pd.Series(rf_scores, index=X.columns)
        .sort_values(ascending=False)
    )
    print("\nTop 15 important features (Random Forest):")
    print(importances.head(15))


=== Logistic Regression ===
Confusion Matrix:
 [[664 137]
 [ 22  82]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9679    0.8290    0.8931       801
           1     0.3744    0.7885    0.5077       104

    accuracy                         0.8243       905
   macro avg     0.6712    0.8087    0.7004       905
weighted avg     0.8997    0.8243    0.8488       905

ROC-AUC: 0.8909

=== Decision Tree ===
Confusion Matrix:
 [[733  68]
 [ 59  45]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9255    0.9151    0.9203       801
           1     0.3982    0.4327    0.4147       104

    accuracy                         0.8597       905
   macro avg     0.6619    0.6739    0.6675       905
weighted avg     0.8649    0.8597    0.8622       905

ROC-AUC: 0.6739

=== Random Forest ===
Confusion Matrix:
 [[789  12]
 [ 87  17]]
Classification Report:
               precision    recall  f1-sco