## AdaBoost: Finetuning, regularization and debiasing

In [9]:
# === AdaBoost Tuning ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib, os
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve
)

# Load preprocessed data and pipeline
df = pd.read_csv("../data/cox-violent-preprocessed.csv")
pipeline = joblib.load("../models/pipeline_preprocessing.pkl")

# Feature preparation
X_raw = df.drop("is_recid", axis=1)
y = df["is_recid"]
X_transformed = pipeline.transform(X_raw)
selected_features = ["age", "sex", "juv_misd_count", "juv_fel_count", "priors_count", "c_charge_degree", "c_charge_desc"]
feature_indices = [X_raw.columns.get_loc(f) for f in selected_features]
X_selected = X_transformed[:, feature_indices]

# Data split
X_trainval, X_test, y_trainval, y_test = train_test_split(X_selected, y, test_size=0.15, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.176, stratify=y_trainval, random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define AdaBoost pipeline & GridSearch

base_estimator = DecisionTreeClassifier(max_depth=1)

pipe = Pipeline([
    ("clf", AdaBoostClassifier(base_estimator=base_estimator, random_state=42))
])

param_grid = {
    "clf__n_estimators": [50, 100, 150],
    "clf__learning_rate": [0.5, 1.0, 1.5]
}

grid = GridSearchCV(pipe, param_grid, scoring="roc_auc", cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_

# Evaluation
y_proba = best_model.predict_proba(X_val)[:, 1]
y_pred = best_model.predict(X_val)
print("Validation Results:")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_pred):.3f}")
print(f"Recall: {recall_score(y_val, y_pred):.3f}")
print(f"F1: {f1_score(y_val, y_pred):.3f}")
print(f"AUC: {roc_auc_score(y_val, y_proba):.3f}")

# Learning curve
train_sizes, train_scores, val_scores = learning_curve(best_model, X_train, y_train, cv=cv, scoring="f1", train_sizes=np.linspace(0.1, 1.0, 5))
plt.figure()
plt.plot(train_sizes, train_scores.mean(axis=1), label="Train F1")
plt.plot(train_sizes, val_scores.mean(axis=1), label="Validation F1")
plt.title("Learning Curve - AdaBoost")
plt.xlabel("Training Size")
plt.ylabel("F1 Score")
plt.legend()
plt.grid()
plt.tight_layout()
os.makedirs("../reports/02_modeling/tuning", exist_ok=True)
plt.savefig("../reports/02_modeling/tuning/learning_curve_adaboost_finetuned.png")
plt.show()

# === Logging dynamic results to CSV ===

model_name = "AdaBoost"

# Prepare log path
log_file = Path("../reports/02_modeling/tuning/model_validation_comparison_log.csv")
log_file.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists

# Prepare result row dynamically
result_row = {
    "Model": model_name,
    "Accuracy": round(accuracy_score(y_val, y_pred), 3),
    "Precision": round(precision_score(y_val, y_pred), 3),
    "Recall": round(recall_score(y_val, y_pred), 3),
    "F1 Score": round(f1_score(y_val, y_pred), 3),
    "AUC": round(roc_auc_score(y_val, y_proba), 3),
    "Best Params": str(grid.best_params_)
}

# Append to log file
if log_file.exists():
    df_log = pd.read_csv(log_file)
else:
    df_log = pd.DataFrame(columns=result_row.keys())

df_log = pd.concat([df_log, pd.DataFrame([result_row])], ignore_index=True)
df_log.to_csv(log_file, index=False)
df_log

TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'