# Bank Marketing - Experimental Machine Learning Pipeline

**Author:** Umair Tariq (Student UE: 2433032)  
**Purpose:** Progress Evaluation with Parameter-Based Experiments

**Note:** This pipeline implements the strategy defined in the accompanying Coursework Assignment Report.

> **ACADEMIC NOTE:** The 'duration' feature is included for comparison with literature benchmarks (e.g., Moro et al., 2014), but it is a post-contact metric: its value is only known once a call has ended. Using it in a real-world deployment would therefore constitute data leakage. See Section 2.1 of the report.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
    accuracy_score,
    f1_score,
    recall_score
)

import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from scipy.stats import ttest_rel

# ----------------------------------------------------------
# RESULTS DIRECTORY
# ----------------------------------------------------------
# Folder that receives every figure saved by the plotting helpers.
RESULTS_DIR = "results_run"
# exist_ok=True replaces the exists()-then-makedirs() pair: it cannot race
# with another process creating the folder, and re-running is a no-op.
os.makedirs(RESULTS_DIR, exist_ok=True)

# ----------------------------------------------------------
# GLOBAL SETTINGS
# ----------------------------------------------------------
# Seaborn "whitegrid" theme for all subsequent plots.
sns.set_style("whitegrid")
# Default figure size for plots that do not request their own.
plt.rcParams.update({"figure.figsize": (10, 6)})

## 1. Data Loading & Preprocessing

In [None]:
def load_dataset(path="bank-additional/bank-additional.csv"):
    """Load the bank-marketing CSV and return it with all columns encoded.

    The file is read with ``;`` as the field separator. The target column
    ``y`` is mapped from ``"yes"``/``"no"`` to ``1``/``0``; every remaining
    object-typed column is replaced by integer codes from a ``LabelEncoder``.

    Args:
        path: Location of the semicolon-separated CSV file.

    Returns:
        The loaded ``pandas.DataFrame`` with ``y`` as 0/1 and the former
        object-typed columns holding label codes.

    Raises:
        ValueError: If ``y`` contains any value other than ``"yes"``/``"no"``.
    """
    df = pd.read_csv(path, sep=';')

    # Series.map turns unknown labels into NaN; fail loudly here instead of
    # letting NaN targets reach the split/training code.
    target = df["y"].map({"yes": 1, "no": 0})
    unmapped = target.isna()
    if unmapped.any():
        bad_labels = sorted(df.loc[unmapped, "y"].astype(str).unique())
        raise ValueError(f"Unexpected target labels in 'y': {bad_labels}")
    df["y"] = target

    # One encoder per column so no fitted state is shared between columns.
    for col in df.select_dtypes(include="object").columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    return df

## 2. Exploratory Data Analysis (EDA)

**Note:** Quantitative correlation analysis shows that 'duration' (0.40) and 'euribor3m' (-0.31) have the strongest linear correlations with the target; these are associations, not evidence of causal effect.

In [None]:
def plot_class_distribution(y):
    """Plot, save and show a bar chart of the binary target's class counts.

    Args:
        y: pandas Series of target labels encoded as 0 ("No") and 1 ("Yes").

    The chart is saved as ``class_distribution.png`` inside ``RESULTS_DIR``.
    """
    # value_counts() orders by frequency, not by label. Pin the order to
    # [0, 1] so each bar sits under the right name; a missing class plots as 0.
    counts = y.value_counts().reindex([0, 1], fill_value=0)
    plt.bar(["No", "Yes"], counts.values)
    plt.title("Target Class Distribution")
    plt.ylabel("Count")
    plt.savefig(os.path.join(RESULTS_DIR, "class_distribution.png"))
    plt.show()

def plot_feature_correlations(df, top_n=15):
    """Plot a correlation heatmap of the features most correlated with ``y``.

    Args:
        df: DataFrame containing the target column ``y`` and the features.
        top_n: Number of features to keep, ranked by absolute correlation
            with ``y``.

    The heatmap covers the selected features plus ``y`` and is saved as
    ``feature_correlations.png`` inside ``RESULTS_DIR``.
    """
    # Remove the target's self-correlation by label rather than assuming it
    # sorts first; a feature with |r| == 1 could otherwise take its place.
    target_corr = df.corr()["y"].drop("y").abs().sort_values(ascending=False)
    top_features = target_corr.head(top_n).index.tolist()

    plt.figure(figsize=(10, 8))
    sns.heatmap(df[top_features + ["y"]].corr(), annot=True, cmap="coolwarm")
    plt.title("Top Feature Correlations with Target")
    plt.savefig(os.path.join(RESULTS_DIR, "feature_correlations.png"))
    plt.show()

## 3. Visualization Utilities & Data Split

In [None]:
def split_data(df):
    """Split ``df`` into stratified train and test parts (80/20).

    The column ``y`` is the target; every other column is a feature.
    Returns whatever ``train_test_split`` returns for (features, target):
    X_train, X_test, y_train, y_test.
    """
    target = df["y"]
    features = df.drop(columns="y")
    return train_test_split(
        features,
        target,
        test_size=0.2,
        stratify=target,
        random_state=42,
    )

def plot_confusion(cm, title):
    """Draw, save and show a confusion-matrix heatmap on its own figure.

    Args:
        cm: Confusion matrix of integer counts (cells are annotated with
            the ``"d"`` format).
        title: Plot title; also used for the file name, lower-cased with
            spaces replaced by underscores and prefixed with ``cm_``.
    """
    # Start from a fresh figure: with backends where plt.show() does not
    # close the figure, drawing on the current axes would stack this heatmap
    # (and another colorbar) on top of the previous plot.
    fig = plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(os.path.join(RESULTS_DIR, f"cm_{title.replace(' ', '_').lower()}.png"))
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def plot_roc(y_true, y_prob, label):
    """Add one ROC curve to the current axes.

    The legend entry is ``label`` followed by the curve's AUC to three
    decimals. No figure is created, saved or shown here, so a caller can
    overlay several curves on one plot.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_prob)
    area = auc(false_pos_rate, true_pos_rate)
    legend_text = f"{label} (AUC={area:.3f})"
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)

def plot_pr(y_true, y_prob, title):
    """Draw, save and show a precision-recall curve on its own figure.

    Args:
        y_true: True binary labels.
        y_prob: Scores for the positive class.
        title: Plot title; also used for the file name, lower-cased with
            spaces replaced by underscores and prefixed with ``pr_``.

    The legend shows the average precision (AP) to three decimals.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)

    # Start from a fresh figure: with backends where plt.show() does not
    # close the figure, the curve would otherwise be drawn onto whatever
    # plot was produced last.
    fig = plt.figure()
    plt.plot(recall, precision, label=f"AP={ap:.3f}")
    plt.title(title)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.savefig(os.path.join(RESULTS_DIR, f"pr_{title.replace(' ', '_').lower()}.png"))
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

## 4. Experiment Runner

In [None]:
def evaluate_model(name, model, X_test, y_test):
    """Report a fitted classifier's test-set performance and return metrics.

    Prints ``name`` and the classification report, then plots the confusion
    matrix and the precision-recall curve (both titled with ``name``).

    Returns:
        dict with keys ``"accuracy"``, ``"recall"``, ``"f1"`` and
        ``"y_prob"`` (column 1 of ``model.predict_proba(X_test)``).
    """
    predictions = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    print(f"\n{name}")
    print(classification_report(y_test, predictions))

    matrix = confusion_matrix(y_test, predictions)
    plot_confusion(matrix, f"{name} – Confusion Matrix")
    plot_pr(y_test, positive_scores, f"{name} – Precision Recall Curve")

    # Scalar metrics first, then the raw scores for later ROC plotting.
    scorers = {
        "accuracy": accuracy_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    summary = {key: scorer(y_test, predictions) for key, scorer in scorers.items()}
    summary["y_prob"] = positive_scores
    return summary

## 5. Main Experimental Pipeline

This pipeline runs 9 experiments in a 3x3 design matrix:
- **Families:** Decision Tree, Random Forest, LightGBM
- **Objectives:** Accuracy, Recall, Balanced

In [None]:
def run_pipeline():
    """Run the full 3x3 experiment grid and report the results.

    Steps: load the data, produce the EDA plots, make the train/test split,
    fit and evaluate three configurations ("Accuracy", "Recall", "Balanced")
    for each of Decision Tree, Random Forest and LightGBM, overlay the ROC
    curves of all nine models, and run a paired t-test comparing the
    LightGBM "Accuracy" and "Balanced" configurations.

    Returns:
        None. Results are printed and figures are saved under ``RESULTS_DIR``.
    """
    print("Loading data...")
    df = load_dataset()

    print("EDA...")
    plot_class_distribution(df["y"])
    plot_feature_correlations(df)

    X_train, X_test, y_train, y_test = split_data(df)

    # =======================
    # EXPERIMENT GRID (3 FAMILIES x 3 OBJECTIVES)
    # =======================
    # Each entry: (result-key prefix, display name, {objective: model}).
    experiment_grid = [
        ("DT", "Decision Tree", {
            "Accuracy": DecisionTreeClassifier(max_depth=10, random_state=42),
            "Recall": DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=42),
            "Balanced": DecisionTreeClassifier(max_depth=7, min_samples_leaf=20, random_state=42),
        }),
        ("RF", "Random Forest", {
            "Accuracy": RandomForestClassifier(n_estimators=300, random_state=42),
            "Recall": RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42),
            "Balanced": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
        }),
        # random_state is fixed here as well, matching the other families.
        ("LGBM", "LightGBM", {
            "Accuracy": lgb.LGBMClassifier(n_estimators=1500, learning_rate=0.02, random_state=42),
            "Recall": lgb.LGBMClassifier(n_estimators=800, learning_rate=0.05, is_unbalance=True, random_state=42),
            "Balanced": lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.03, random_state=42),
        }),
    ]

    results = {}
    for prefix, family, models in experiment_grid:
        for exp, model in models.items():
            model.fit(X_train, y_train)
            results[f"{prefix}_{exp}"] = evaluate_model(f"{family} – {exp}", model, X_test, y_test)

    # =======================
    # ROC COMPARISON (ALL MODELS)
    # =======================
    plt.figure()
    for key in results:
        plot_roc(y_test, results[key]["y_prob"], key)
    plt.plot([0, 1], [0, 1], "--", color="gray")
    plt.legend()
    plt.title("ROC Curve Comparison")
    plt.savefig(os.path.join(RESULTS_DIR, "roc_comparison.png"))
    plt.show()

    # =======================
    # STATISTICAL TEST (LightGBM)
    # =======================
    print("\nStatistical Test (LightGBM Accuracy vs Balanced)")
    # A paired t-test needs many paired observations; two single accuracy
    # numbers give zero degrees of freedom and a NaN result. Pair the models
    # per test sample instead: 1.0 where the prediction (positive-class
    # probability thresholded at 0.5) is correct, 0.0 where it is not.
    y_true = np.asarray(y_test)
    acc_pred = (np.asarray(results["LGBM_Accuracy"]["y_prob"]) > 0.5).astype(int)
    bal_pred = (np.asarray(results["LGBM_Balanced"]["y_prob"]) > 0.5).astype(int)
    acc = (acc_pred == y_true).astype(float)
    bal = (bal_pred == y_true).astype(float)

    if np.array_equal(acc, bal):
        # All paired differences are zero, so the t-statistic is undefined.
        print("Both models are correct on exactly the same samples; t-test is undefined.")
        print("No statistically significant difference.")
        return

    t_stat, p_val = ttest_rel(acc, bal)
    print(f"T-statistic: {t_stat:.4f}, p-value: {p_val:.4f}")

    if p_val < 0.05:
        print("Difference is statistically significant.")
    else:
        print("No statistically significant difference.")

In [None]:
# Execute the pipeline only when this file is run as the main program;
# importing it as a module does not trigger a run.
if __name__ == "__main__":
    run_pipeline()