In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scripts.data_loader import load_caravan

In [2]:
# ---------------------------
# 1) Load data and names
# ---------------------------
# load_caravan is a project helper; this notebook only shows how its five
# return values are unpacked: two frames, a feature/label pair, and TARGET.
train, test, X, y, TARGET = load_caravan(data_dir="./data")

# Report the dimensions of both frames, one line per frame.
print(
    *(
        f"{caption} {frame.shape}"
        for caption, frame in (("Train shape:", train), ("Test shape:", test))
    ),
    sep="\n",
)

Train shape: (5822, 86)
Test shape: (4000, 85)


In [4]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the
# positive rate nearly equal in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

print("Train/Val sizes:", X_train.shape, X_val.shape)
# Share of positive labels in each part, rounded to 4 decimals.
print(
    "Positive rate (train/val):",
    *(round(labels.mean(), 4) for labels in (y_train, y_val)),
)

Train/Val sizes: (4657, 85) (1165, 85)
Positive rate (train/val): 0.0597 0.0601


In [None]:
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV,
)
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint, uniform

# Base learner whose hyper-parameters are tuned below.
gb = GradientBoostingClassifier(random_state=42)

# Sampling distributions for the random search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans
# [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(150, 451),       # integers 150..450
    learning_rate=uniform(0.03, 0.17),    # roughly 0.03..0.20
    max_depth=randint(2, 5),              # integers 2..4 (depth of each tree)
    min_samples_split=randint(2, 21),     # integers 2..20
    min_samples_leaf=randint(1, 11),      # integers 1..10
    subsample=uniform(0.7, 0.3),          # roughly 0.7..1.0
    max_features=['sqrt', 'log2', None],
)

# Five shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# 40 sampled candidates x 5 folds = 200 fits, ranked by average precision
# (PR-AUC); refit=True retrains the winner on all of X_train.
gb_search = RandomizedSearchCV(
    gb,
    param_dist,
    n_iter=40,
    scoring='average_precision',
    cv=cv,
    refit=True,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
gb_search.fit(X_train, y_train)

# Keep the refitted winner for the evaluation cells that follow.
gb_best = gb_search.best_estimator_

print("Best GB params:", gb_search.best_params_)
print("Best CV PR-AUC:", round(gb_search.best_score_, 4))

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [None]:
import numpy as np
# These four metric functions are used below but were not imported by any
# visible cell, so the cell raised NameError on a fresh kernel.
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Score the validation split with the tuned model. Column 1 of predict_proba
# is taken as the positive-class probability (assumes a binary target whose
# second class is the positive one -- confirm against gb_best.classes_).
proba_val = gb_best.predict_proba(X_val)[:, 1]
# Hard labels at the conventional 0.5 cut-off.
preds_val = (proba_val >= 0.5).astype(int)

# Threshold-free ranking metrics computed from the raw probabilities.
roc = roc_auc_score(y_val, proba_val)
pra = average_precision_score(y_val, proba_val)
print("\n[GB Tuned] Validation ROC-AUC:", round(roc, 4))
print("[GB Tuned] Validation PR-AUC :", round(pra, 4))

# Threshold-dependent metrics; zero_division=0 reports 0 instead of warning
# when a class receives no predictions at this cut-off.
print("\n[GB Tuned] Classification report @ 0.5")
print(classification_report(y_val, preds_val, zero_division=0))

cm = confusion_matrix(y_val, preds_val)
print("Confusion matrix:\n", cm)

def precision_at_k(y_true, scores, k=0.05):
    """Return the mean of ``y_true`` among the top ``k`` fraction of ``scores``.

    Items are ranked by descending score, the first ``max(1, int(k * n))``
    are kept (capped at ``n``), and the mean of their labels is returned.
    For 0/1 labels this is the precision within the top-k fraction.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list, ndarray or pandas Series). Values are
        matched to ``scores`` by position, not by index label.
    scores : array-like
        Model scores, same length as ``y_true``; higher means more likely
        positive.
    k : float, default 0.05
        Fraction of the items to keep. At least one item is always kept.

    Returns
    -------
    float
        Mean label of the selected items, or ``nan`` for empty input.

    Raises
    ------
    ValueError
        If ``y_true`` and ``scores`` have different lengths.
    """
    # Coerce to float arrays so lists, boolean/unsigned arrays and Series
    # with arbitrary indexes all behave the same way.
    labels = np.asarray(y_true, dtype=float).ravel()
    ranked = np.asarray(scores, dtype=float).ravel()
    if labels.shape[0] != ranked.shape[0]:
        raise ValueError(
            f"y_true and scores must have the same length "
            f"(got {labels.shape[0]} and {ranked.shape[0]})"
        )

    n = ranked.shape[0]
    if n == 0:
        # Nothing to rank; avoid numpy's empty-mean warning.
        return float("nan")

    top = min(n, max(1, int(k * n)))
    # A stable sort keeps tied scores in input order, making the result
    # deterministic.
    idx = np.argsort(-ranked, kind="stable")[:top]
    return float(labels[idx].mean())

# Precision among the top 5% and top 10% of validation rows ranked by
# predicted probability.
p5, p10 = (precision_at_k(y_val, proba_val, frac) for frac in (0.05, 0.10))

print("Precision@Top5%:", round(p5, 4))
print("Precision@Top10%:", round(p10, 4))
