In [8]:
# 04_advanced_models.ipynb — Jigsaw Agile Community Rules (XGBoost + submit)

# Works locally and on Kaggle (Internet OFF). Produces /kaggle/working/submission.csv on Kaggle.

# ========= 0) Imports & environment info =========

import sys, os, glob, re, warnings
import numpy as np
import pandas as pd

# Show the versions of the interpreter, NumPy and pandas in use.
print(
    f"Python: {sys.version}",
    f"NumPy : {np.__version__}",
    f"Pandas: {pd.__version__}",
    sep="\n",
)
# Hold out 20% of the rows for validation, stratified on the label so both
# splits keep the same class balance. train_test_split returns the pieces in
# the order (X_train, X_test, y_train, y_test) for the inputs (X, y).
# NOTE(review): X, y and train_test_split are expected to be defined/imported
# earlier in the notebook — confirm.
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preferred: XGBoost with early stopping; Fallback: Logistic Regression.
# Both paths tune a decision threshold on the validation split, refit on all
# of (X, y) and leave `test_prob` / `test_pred` behind for the submission step.
use_xgb = True
best_threshold = 0.5
val_f1 = None

EARLY_STOPPING_ROUNDS = 50
thr_grid = np.linspace(0.2, 0.8, 61)  # 0.01 steps; coarse but fine for small set


def _tune_threshold(y_true, prob, grid, label=""):
    """Pick the probability cut-off that maximises macro-F1 and report on it.

    Parameters
    ----------
    y_true : array-like of 0/1 labels for the validation rows.
    prob : array of positive-class probabilities, aligned with `y_true`.
    grid : 1-D array of candidate thresholds to scan.
    label : optional prefix for the summary line (e.g. "[LR] ").

    Returns
    -------
    (threshold, macro_f1, predictions) where `predictions` are the 0/1 labels
    obtained by applying `threshold` to `prob`.

    Prints the chosen threshold, the confusion matrix and the classification
    report as a side effect.
    """
    scores = [
        f1_score(y_true, (prob >= t).astype(int), average="macro") for t in grid
    ]
    top = int(np.argmax(scores))  # first maximum wins on ties
    threshold = float(grid[top])
    score = float(scores[top])
    print(f"{label}Best threshold (val) = {threshold:.3f} | Val F1(macro) = {score:.4f}")

    pred = (prob >= threshold).astype(int)
    print("Validation confusion matrix:\n", confusion_matrix(y_true, pred))
    print(classification_report(y_true, pred, digits=4))
    return threshold, score, pred


try:
    import xgboost as xgb
    print("Using XGBoost …")
    xgb_params = dict(
        max_depth=8,
        learning_rate=0.07,
        n_estimators=800,              # large cap; early stopping will trim
        objective="binary:logistic",
        eval_metric="logloss",
        colsample_bytree=0.8,
        subsample=0.9,
        min_child_weight=1,
        reg_lambda=1.0,
        random_state=42,
        tree_method="hist",
        n_jobs=-1
    )

    # XGBoost >= 1.6 takes early_stopping_rounds in the estimator constructor;
    # passing it to fit() is deprecated there and rejected by newer releases,
    # which would otherwise trigger the Logistic Regression fallback below.
    # Older releases only understand the fit() keyword.
    xgb_version = tuple(int(p) for p in re.findall(r"\d+", xgb.__version__)[:2])
    if xgb_version >= (1, 6):
        model = xgb.XGBClassifier(
            **xgb_params, early_stopping_rounds=EARLY_STOPPING_ROUNDS
        )
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    else:
        model = xgb.XGBClassifier(**xgb_params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS
        )

    # Probabilities and dynamic threshold for F1(macro)
    va_prob = model.predict_proba(X_va)[:, 1]
    best_threshold, val_f1, y_pred_va = _tune_threshold(y_va, va_prob, thr_grid)

    # Refit on ALL data with the number of trees found by early stopping.
    # best_iteration is a 0-based round index, so the tree count is index + 1;
    # best_ntree_limit (older releases) is already a count and is used as-is.
    best_iter = getattr(model, "best_iteration", None)
    if best_iter is not None:
        best_n = int(best_iter) + 1
    else:
        ntree_limit = getattr(model, "best_ntree_limit", None)
        best_n = int(ntree_limit) if ntree_limit is not None else xgb_params["n_estimators"]

    # No early stopping here: there is no held-out set when training on everything.
    model_final = xgb.XGBClassifier(**{**xgb_params, "n_estimators": best_n})
    model_final.fit(X, y, verbose=False)

    # Predict test with tuned threshold
    test_prob = model_final.predict_proba(X_test)[:, 1]
    test_pred = (test_prob >= best_threshold).astype(int)

except Exception as e:
    # Deliberate best-effort: any failure on the XGBoost path (missing package,
    # training error) degrades to a linear model instead of aborting the run.
    warnings.warn(f"XGBoost unavailable or errored ({e}). Falling back to Logistic Regression.")
    use_xgb = False
    from sklearn.linear_model import LogisticRegression

    lr = LogisticRegression(
        solver="saga",
        penalty="l2",
        class_weight="balanced",
        max_iter=3000,
        n_jobs=-1,
        random_state=42
    )
    lr.fit(X_tr, y_tr)
    va_prob = lr.predict_proba(X_va)[:, 1]
    # threshold tuning
    best_threshold, val_f1, y_pred_va = _tune_threshold(
        y_va, va_prob, thr_grid, label="[LR] "
    )

    # Train on all & predict test
    lr.fit(X, y)
    test_prob = lr.predict_proba(X_test)[:, 1]
    test_pred = (test_prob >= best_threshold).astype(int)

# ========= 6) Build & validate submission =========
# Copy the sample frame and overwrite the target column with the integer
# test predictions.
submission = sample.copy()
submission[TARGET_OUT] = test_pred.astype(int)


def _submission_problems(sub, ref):
    """Compare `sub` against the reference frame `ref` and list every problem.

    Checks, in order: column names/order, row count, ID column (set and
    order), NaNs in the target column, and that the target only holds 0/1.
    Returns a list of message strings; an empty list means no problem found.
    """
    problems = []

    expected_cols = list(ref.columns)
    actual_cols = list(sub.columns)
    if actual_cols != expected_cols:
        problems.append(f"Columns mismatch. Expected {expected_cols}, got {actual_cols}")

    if len(sub) != len(ref):
        problems.append(f"Row count mismatch. Expected {len(ref)}, got {len(sub)}")

    sub_ids, ref_ids = sub[ID_COL], ref[ID_COL]
    if not sub_ids.equals(ref_ids):
        sub_id_set, ref_id_set = set(sub_ids), set(ref_ids)
        if sub_id_set == ref_id_set:
            problems.append("ID order differs from sample. Must match sample_submission order.")
        else:
            # Show at most five offending IDs on each side.
            missing = sorted(ref_id_set - sub_id_set)[:5]
            extra = sorted(sub_id_set - ref_id_set)[:5]
            problems.append(f"ID set differs. Missing: {missing} | Extra: {extra}")

    target = sub[TARGET_OUT]
    if target.isna().any():
        problems.append("Target has NaNs.")

    seen_values = set(np.unique(target))
    if not seen_values.issubset({0, 1}):
        problems.append(f"Target invalid values {sorted(seen_values)}; must be 0/1.")

    return problems


errors = _submission_problems(submission, sample)

if errors:
    print("❌ Submission invalid:")
    for problem in errors:
        print(" -", problem)
    raise SystemExit(1)

# ========= 7) Save submission (Kaggle + local) =========
if IS_KAGGLE:
    submission.to_csv(OUT_KAGGLE, index=False)
    print(f"✅ Saved Kaggle file: {OUT_KAGGLE}")

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing; a bare filename has no parent to make.
local_dir = os.path.dirname(OUT_LOCAL)
if local_dir:
    os.makedirs(local_dir, exist_ok=True)
submission.to_csv(OUT_LOCAL, index=False)
print(f"✅ Saved local copy : {OUT_LOCAL}")

# Short recap of which model produced the file and how it scored on validation.
print(f"\nModel used: {'XGBoost' if use_xgb else 'LogisticRegression'}")
print(f"Validation F1 (macro): {val_f1:.4f} at threshold {best_threshold:.3f}")
print("Final submission head:\n", submission.head())


Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
NumPy : 1.26.4
Pandas: 2.2.3
Using XGBoost …




Best threshold (val) = 0.610 | Val F1(macro) = 0.7602
Validation confusion matrix:
 [[167  33]
 [ 64 142]]
              precision    recall  f1-score   support

           0     0.7229    0.8350    0.7749       200
           1     0.8114    0.6893    0.7454       206

    accuracy                         0.7611       406
   macro avg     0.7672    0.7622    0.7602       406
weighted avg     0.7678    0.7611    0.7600       406

✅ Saved local copy : submissions/submission.csv

Model used: XGBoost
Validation F1 (macro): 0.7602 at threshold 0.610
Final submission head:
    row_id  rule_violation
0    2029               0
1    2030               0
2    2031               1
3    2032               1
4    2033               1
