In [2]:
# ===== FORCE DATA PATHS (idempotent, no dependencies) =====
import os, pandas as pd

# 1) Directory that holds the confirmed local copy of the competition CSVs
ABS_DIR = "/Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw"

# 2) Absolute paths, derived from ABS_DIR only (nothing from earlier cells is used)
FORCE_TRAIN, FORCE_TEST, FORCE_SAMP = (
    os.path.join(ABS_DIR, csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Fail fast on the first file (in train/test/sample order) that is absent
first_missing = next(
    (path for path in (FORCE_TRAIN, FORCE_TEST, FORCE_SAMP) if not os.path.exists(path)),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(f"Missing file: {first_missing}")

print("📄 Using:")
for label, path in (
    ("  train.csv →", FORCE_TRAIN),
    ("  test.csv  →", FORCE_TEST),
    ("  sample    →", FORCE_SAMP),
):
    print(label, path)

# 3) Read each CSV once; the resulting frames are notebook-level globals
train_df, test_df, sample = (
    pd.read_csv(path) for path in (FORCE_TRAIN, FORCE_TEST, FORCE_SAMP)
)

# 4) Standard names, taken from the sample submission header:
#    - ID_COL: first column whose lower-cased name contains "id" or "row",
#      otherwise the first column
#    - TARGET_OUT: second column when there is one, otherwise "prediction"
id_candidates = [
    col
    for col in sample.columns
    if any(token in col.lower() for token in ("id", "row"))
]
if id_candidates:
    ID_COL = id_candidates[0]
else:
    ID_COL = sample.columns[0]
if sample.shape[1] >= 2:
    TARGET_OUT = sample.columns[1]
else:
    TARGET_OUT = "prediction"
print(f"[NAMES] ID_COL={ID_COL} | TARGET_OUT={TARGET_OUT}")

# 5) Override resolver helpers/variables that older cells might still call
KAGGLE_DIR = ""  # blank out the Kaggle-only input path
def _first_existing(paths):
    """Resolve candidate paths to the forced dataset files.

    Candidates are scanned in order. One whose basename is ``train.csv``,
    ``test.csv`` or ``sample_submission.csv`` resolves to the matching forced
    path; any other candidate is returned unchanged when it is an absolute
    path that exists. If no candidate resolves, the first forced path that
    exists on disk is returned, and ``None`` when none of them exists.
    """
    forced = {
        "train.csv": FORCE_TRAIN,
        "test.csv": FORCE_TEST,
        "sample_submission.csv": FORCE_SAMP,
    }
    for candidate in paths:
        file_name = os.path.basename(candidate)
        if file_name in forced:
            return forced[file_name]
        if os.path.isabs(candidate) and os.path.exists(candidate):
            return candidate
    # Last resort: whichever forced file is found first, else None
    return next((target for target in forced.values() if os.path.exists(target)), None)

# Legacy aliases: some cells read these names instead of the FORCE_* ones
TRAIN_PATH, TEST_PATH, SAMP_PATH = FORCE_TRAIN, FORCE_TEST, FORCE_SAMP

# Flag that the data paths have been set by this cell
DATA_READY = True
print("[PATHS] Forced paths set and resolver overridden.")


📄 Using:
  train.csv → /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/train.csv
  test.csv  → /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/test.csv
  sample    → /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/sample_submission.csv
[NAMES] ID_COL=row_id | TARGET_OUT=rule_violation
[PATHS] Forced paths set and resolver overridden.


In [3]:
import os, glob
# Sanity check: the data folder, its contents, and each of the three CSV paths
print("ABS_DIR exists:", os.path.isdir(ABS_DIR))
dir_entries = glob.glob(os.path.join(ABS_DIR, "*"))
print("Files in ABS_DIR:", sorted(os.path.basename(entry) for entry in dir_entries))
for label, path in (
    ("Exists train:", TRAIN_PATH),
    ("Exists test :", TEST_PATH),
    ("Exists samp :", SAMP_PATH),
):
    print(label, os.path.exists(path))

ABS_DIR exists: True
Files in ABS_DIR: ['sample_submission.csv', 'test.csv', 'train.csv']
Exists train: True
Exists test : True
Exists samp : True


In [6]:
# 05_ensemble.ipynb — Jigsaw Agile Community Rules
# Purpose:
#  - Load OOF (train) & test probabilities from prior notebooks
#  - Explore blends: mean, rank-avg, weight-avg (optional weights from results/models.json)
#  - Train a simple meta-learner (LogReg stacker) as Platt-like calibrator
#  - Tune threshold (macro-F1) on OOF
#  - Produce /kaggle/working/submission.csv and submissions/submission.csv

# ========= 0) Imports & environment =========
import os, sys, glob, json, math, warnings, re
from datetime import datetime
import numpy as np
import pandas as pd

# Report interpreter and library versions alongside the results
for label, version in (
    ("Python:", sys.version),
    ("NumPy :", np.__version__),
    ("Pandas:", pd.__version__),
):
    print(label, version)

# Kaggle is detected by the presence of its input mount; the Kaggle output
# locations are only defined there and are None everywhere else.
IS_KAGGLE = os.path.exists("/kaggle/input")
KAGGLE_DIR = "/kaggle/input/jigsaw-agile-community-rules"
if IS_KAGGLE:
    KAGGLE_WORKING = "/kaggle/working"
    OUT_KAGGLE = os.path.join(KAGGLE_WORKING, "submission.csv")
else:
    KAGGLE_WORKING = None
    OUT_KAGGLE = None

# Local output locations (relative to the working directory), created on demand
OUT_LOCAL = "submissions/submission.csv"
for out_dir in ("submissions", "results"):
    os.makedirs(out_dir, exist_ok=True)

# ========= 1) Load core CSVs from the hard-set local folder =========

# --- HARD-SET DATA PATHS (replacement for the old resolver cell) ---
import os
import pandas as pd

ABS_DIR = "/Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw"

TRAIN_PATH = os.path.join(ABS_DIR, "train.csv")
TEST_PATH = os.path.join(ABS_DIR, "test.csv")
SAMP_PATH = os.path.join(ABS_DIR, "sample_submission.csv")

# Every path must be path-like and present on disk; stop at the first that is not
for p in (TRAIN_PATH, TEST_PATH, SAMP_PATH):
    if not (isinstance(p, (str, bytes, os.PathLike)) and os.path.exists(p)):
        raise FileNotFoundError(f"Missing file: {p}")

print("📄 Using:")
for label, path in (
    ("  train.csv →", TRAIN_PATH),
    ("  test.csv  →", TEST_PATH),
    ("  sample    →", SAMP_PATH),
):
    print(label, path)

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMP_PATH)

# Names used later, read off the sample submission header:
#  - ID_COL: first column whose lower-cased name contains "id" or "row",
#    otherwise the first column
#  - TARGET_OUT: second column if present, otherwise "prediction"
sample_cols = list(sample.columns)
id_candidates = [c for c in sample_cols if "id" in c.lower() or "row" in c.lower()]
ID_COL = (id_candidates or sample_cols)[0]
TARGET_OUT = sample_cols[1] if len(sample_cols) >= 2 else "prediction"
print(f"[NAMES] ID_COL={ID_COL} | TARGET_OUT={TARGET_OUT}")

# Lock out any later resolvers: blank the Kaggle input dir set earlier in this cell
KAGGLE_DIR = ""
def _first_existing(paths):
    """Resolve candidate paths to the hard-set dataset files.

    Candidates are scanned in order. One whose basename is ``train.csv``,
    ``test.csv`` or ``sample_submission.csv`` resolves to TRAIN_PATH,
    TEST_PATH or SAMP_PATH respectively; any other candidate is returned
    unchanged when it is an absolute path that exists. Returns ``None`` when
    no candidate resolves.
    """
    forced = {
        "train.csv": TRAIN_PATH,
        "test.csv": TEST_PATH,
        "sample_submission.csv": SAMP_PATH,
    }
    for candidate in paths:
        file_name = os.path.basename(candidate)
        if file_name in forced:
            return forced[file_name]
        if os.path.isabs(candidate) and os.path.exists(candidate):
            return candidate
    return None



# train_df / test_df / sample were already read from TRAIN_PATH / TEST_PATH /
# SAMP_PATH right after the path checks earlier in this cell, so the files
# are not parsed a second time here.
print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)
print("Sample shape:", sample.shape)

# First candidate name present in train.csv wins; None when none is present.
TEXT_COL = next((c for c in ["comment_text","body","text"] if c in train_df.columns), None)
TARGET_COL = next((c for c in ["rule_violation","target","label"] if c in train_df.columns), None)
# ID_COL and TARGET_OUT keep the values resolved from the sample submission
# header earlier in this cell (id/row name match, "prediction" fallback).
# Re-deriving them positionally here would discard that resolution and raise
# IndexError for a single-column sample file.
assert TARGET_COL, "Target column not found."


# ========= 2) Load OOF & test probabilities =========
# Per-model prediction files are discovered by suffix and kept in sorted order.
OOF_DIR = "results/oof"
TEST_DIR = "results/test_probs"

oof_files = glob.glob(os.path.join(OOF_DIR, "*_oof.csv"))
oof_files.sort()
test_files = glob.glob(os.path.join(TEST_DIR, "*_test.csv"))
test_files.sort()

# Both sides are required: stop if either directory yields nothing
if not (oof_files and test_files):
    problem = (
        "No OOF/test files found.\n"
        f"Expected OOF in {OOF_DIR}/*_oof.csv and test in {TEST_DIR}/*_test.csv "
        "with columns: row_id, prob"
    )
    raise FileNotFoundError(problem)

def model_key_from_path(p):
    """Return the model key encoded in a prediction file path.

    The key is the file's basename with a trailing ``.csv`` removed, together
    with an ``_oof`` or ``_test`` suffix directly before it when present.
    A basename that does not end in ``.csv`` is returned unchanged.
    """
    return re.sub(r"(_oof|_test)?\.csv$", "", os.path.basename(p))

# Map model key -> file path.
# model_key_from_path() already removes the trailing "_oof" / "_test" suffix,
# so its result is used as the key directly. A further str.replace would also
# delete those tokens from the middle of a model name, and would do so
# differently on the two sides (e.g. "bert_test_aug" stays intact in the OOF
# map but becomes "bert_aug" in the test map), silently dropping the model
# from the intersection below.
oof_map  = {model_key_from_path(p): p for p in oof_files}
test_map = {model_key_from_path(p): p for p in test_files}

# Keep intersection only (models that have BOTH oof and test files)
models = sorted(set(oof_map).intersection(set(test_map)))
if len(models) < 2:
    raise RuntimeError(f"Need at least 2 models with both OOF and test probs; found: {models}")

print("\nModels detected:")
for m in models:
    print(" -", m)

def _merge_model_probs(base_df, path_by_model, model_names, id_col, label):
    """Left-join every model's ``prob`` column onto ``base_df`` by row id.

    Parameters
    ----------
    base_df : DataFrame with a single id column named ``id_col``; its row
        order is the order of the result.
    path_by_model : mapping of model key -> CSV path; each CSV must have the
        columns ``row_id`` and ``prob``.
    model_names : iterable of model keys to merge, in output column order.
    id_col : name of the id column in ``base_df``.
    label : word used in the missing-columns message ("OOF" or "Test").

    Returns
    -------
    DataFrame with ``id_col`` plus one ``prob_<model>`` column per model.

    Raises
    ------
    AssertionError if a CSV lacks ``row_id`` or ``prob``.
    pandas.errors.MergeError if a CSV repeats a ``row_id``.
    """
    merged = base_df
    for name in model_names:
        path = path_by_model[name]
        probs = pd.read_csv(path)
        assert {"row_id","prob"}.issubset(probs.columns), f"{label} file {path} missing columns."
        # Only the two contractual columns are merged, so extra columns in a
        # prediction file cannot collide between models or be mistaken for a
        # model column by the later "prob_" prefix filter.
        probs = probs[["row_id", "prob"]].rename(columns={"row_id": id_col, "prob": f"prob_{name}"})
        # many_to_one: a repeated row_id in the file would otherwise duplicate
        # rows of the left frame and break positional alignment downstream.
        merged = merged.merge(probs, on=id_col, how="left", validate="many_to_one")
    return merged


# Load & merge OOF by row_id aligned to train order
train_ids = train_df[ID_COL].values
y_true = train_df[TARGET_COL].astype(int).values

oof_df = _merge_model_probs(pd.DataFrame({ID_COL: train_ids}), oof_map, models, ID_COL, "OOF")

# A NaN after the left join means some train row has no prediction from that model
missing_cols = [c for c in oof_df.columns if c.startswith("prob_") and oof_df[c].isna().any()]
if missing_cols:
    raise ValueError(f"OOF merge mismatch; NaNs in {missing_cols}. Ensure row_id alignment with train.csv")

# Load & merge Test by row_id aligned to sample order
test_ids = sample[ID_COL].values
test_df_probs = _merge_model_probs(pd.DataFrame({ID_COL: test_ids}), test_map, models, ID_COL, "Test")

missing_cols_t = [c for c in test_df_probs.columns if c.startswith("prob_") and test_df_probs[c].isna().any()]
if missing_cols_t:
    raise ValueError(f"TEST merge mismatch; NaNs in {missing_cols_t}. Ensure row_id alignment with sample_submission.csv")

print("\nOOF matrix shape:", oof_df.shape, "| Test matrix shape:", test_df_probs.shape)

# ========= 3) Quick diagnostics =========
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix

# Matrix of OOF probabilities: one row per train row, one column per model
prob_cols = [c for c in oof_df.columns if c.startswith("prob_")]
probs_mat = oof_df[prob_cols].values

# Pairwise correlation between models (each column is treated as one variable)
corr = np.corrcoef(probs_mat.T)
print("\nModel correlation matrix (OOF probs):")
with np.printoptions(precision=3, suppress=True):
    print(corr)
# ========= 4) Blending strategies =========
def macro_f1_at_threshold(y, p, thr):
    """Macro-averaged F1 of the hard labels obtained by cutting ``p`` at ``thr``.

    Entries of ``p`` greater than or equal to ``thr`` become class 1 and the
    rest class 0; the labels are scored against ``y`` with
    ``f1_score(..., average="macro")``.
    """
    hard_labels = (p >= thr).astype(int)
    return f1_score(y, hard_labels, average="macro")

def tune_threshold(y, p, grid=None):
    """Pick the cut-off on ``grid`` that maximises macro-F1 of ``p`` against ``y``.

    When ``grid`` is None, 201 evenly spaced thresholds from 0.25 to 0.75
    (both ends included) are tried. Returns ``(best_threshold, best_macro_f1)``
    as plain floats; on ties the earliest threshold in the grid wins.
    """
    thresholds = np.linspace(0.25, 0.75, 201) if grid is None else grid
    scores = [macro_f1_at_threshold(y, p, cutoff) for cutoff in thresholds]
    best = int(np.argmax(scores))
    return float(thresholds[best]), float(scores[best])

# 4a) Simple mean: unweighted average of the model probabilities per row
mean_oof = np.mean(probs_mat, axis=1)
thr_mean, f1_mean = tune_threshold(y_true, mean_oof)
print(f"\n[Mean] OOF macro-F1 = {f1_mean:.4f} @ thr={thr_mean:.3f}")

# 4b) Rank average (robust when scales differ)
#     A double argsort turns each model column into 0-based ranks; the
#     per-row mean rank is then divided by n-1 to scale it into [0, 1].
sort_order = np.argsort(probs_mat, axis=0)
column_ranks = np.argsort(sort_order, axis=0)
rank_oof = np.mean(column_ranks, axis=1) / (len(y_true)-1)
thr_rank, f1_rank = tune_threshold(y_true, rank_oof)
print(f"[Rank-Avg] OOF macro-F1 = {f1_rank:.4f} @ thr={thr_rank:.3f}")

# 4c) Weighted average
#    - If results/models.json exists and has val_f1 per model, use them as soft weights; else uniform.
weights = None
meta_path = "results/models.json"
if os.path.exists(meta_path):
    try:
        # Context manager so the file handle is closed even if parsing fails
        with open(meta_path) as meta_file:
            meta = json.load(meta_file)
        # meta can be list of dicts or dict keyed by model
        valf1_by_model = {}
        if isinstance(meta, list):
            for r in meta:
                if "model" in r and "val_f1" in r:
                    valf1_by_model[r["model"]] = r["val_f1"]
        elif isinstance(meta, dict):
            for k, v in meta.items():
                if isinstance(v, dict) and "val_f1" in v:
                    valf1_by_model[k] = v["val_f1"]
        # Raw weight per model, floored at 1e-6 so the sum is never zero.
        # NOTE(review): a model absent from the metadata gets 1.0, which is
        # presumably at least as large as any real val_f1 — confirm intended.
        raw_weights = np.array(
            [max(1e-6, float(valf1_by_model.get(m, 1.0))) for m in models],
            dtype=float,
        )
        # Assigned only once fully computed, so a failure above leaves
        # `weights` as None and the uniform fallback below applies.
        weights = raw_weights / raw_weights.sum()
        print("\nUsing val_f1-based weights:", dict(zip(models, np.round(weights,4))))
    except Exception as e:
        # Deliberate best-effort: any read/parse problem falls back to uniform
        print("Could not parse results/models.json; falling back to uniform weights.", e)

if weights is None:
    weights = np.ones(len(models), dtype=float) / len(models)

wavg_oof = np.average(probs_mat, axis=1, weights=weights)
thr_wavg, f1_wavg = tune_threshold(y_true, wavg_oof)
print(f"[Weighted-Avg] OOF macro-F1 = {f1_wavg:.4f} @ thr={thr_wavg:.3f}")

# 4d) Logistic Regression stacker (acts as Platt-style calibrator too)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Stacker inputs: the per-model OOF probabilities and the true labels
X_oof, y_oof = probs_mat, y_true
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validated meta-probabilities: each row is scored by a stacker that
# was fitted on the other folds only
meta_oof = np.zeros(y_oof.shape, dtype=float)
coefs = []
for fit_idx, holdout_idx in skf.split(X_oof, y_oof):
    clf = LogisticRegression(
        solver="lbfgs",
        max_iter=200,
        C=1.0,
        class_weight="balanced",
        n_jobs=None,
    )
    clf.fit(X_oof[fit_idx], y_oof[fit_idx])
    fold_probs = clf.predict_proba(X_oof[holdout_idx])
    meta_oof[holdout_idx] = fold_probs[:, 1]  # column 1 = positive class
    coefs.append(clf.coef_.ravel())

# Per-model coefficient averaged over the folds, reported as the meta weights
coef_mean = np.mean(coefs, axis=0)
thr_meta, f1_meta = tune_threshold(y_oof, meta_oof)
print(f"[LR Stacker] OOF macro-F1 = {f1_meta:.4f} @ thr={thr_meta:.3f}")
print(" Meta weights (mean coef):", dict(zip(models, np.round(coef_mean,4))))

# ========= 5) Pick the winner on OOF =========
# Each candidate is (name, OOF macro-F1, tuned threshold, OOF scores).
candidates = [
    ("mean", f1_mean, thr_mean, mean_oof),
    ("rank", f1_rank, thr_rank, rank_oof),
    ("wavg", f1_wavg, thr_wavg, wavg_oof),
    ("stacker", f1_meta, thr_meta, meta_oof),
]
# Highest macro-F1 wins; on a tie the candidate listed first is kept
winner = max(candidates, key=lambda cand: cand[1])
WIN_NAME, WIN_F1, WIN_THR, WIN_OOF = winner
print(f"\n=== WINNER: {WIN_NAME} | OOF macro-F1={WIN_F1:.4f} @ thr={WIN_THR:.3f} ===")

from sklearn.metrics import confusion_matrix, classification_report
# Hard OOF labels of the winner at its tuned threshold
win_pred = (WIN_OOF >= WIN_THR).astype(int)
print("Winner confusion matrix:\n", confusion_matrix(y_true, win_pred))
print(classification_report(y_true, win_pred, digits=4))

# ========= 6) Build test predictions in the same way =========
# Test probability matrix: one row per sample-submission row, one column per model
test_mat = test_df_probs[[c for c in test_df_probs.columns if c.startswith("prob_")]].values

if WIN_NAME == "mean":
    test_probs = test_mat.mean(axis=1)
elif WIN_NAME == "rank":
    # Double argsort gives 0-based ranks within the test set; dividing by n-1
    # scales the mean rank into [0, 1]. For a single-row test set n-1 is 0,
    # so the divisor is clamped to 1 (the row then scores 0.0 rather than NaN).
    test_ranks = np.argsort(np.argsort(test_mat, axis=0), axis=0)
    test_probs = np.mean(test_ranks, axis=1) / max(len(test_mat) - 1, 1)
elif WIN_NAME == "wavg":
    test_probs = np.average(test_mat, axis=1, weights=weights)
else:
    # Fit LR on full OOF to get final stacker, then apply to test
    clf_full = LogisticRegression(solver="lbfgs", max_iter=200, C=1.0, class_weight="balanced")
    clf_full.fit(X_oof, y_oof)
    test_probs = clf_full.predict_proba(test_mat)[:,1]

# Hard labels using the threshold tuned on OOF for the winning strategy
test_pred = (test_probs >= WIN_THR).astype(int)

# ========= 7) Validate and write submission =========
# Start from the sample file so column names, ids and row order are taken
# from it; only the target column is overwritten (positionally).
submission = sample.copy()
submission[TARGET_OUT] = test_pred.astype(int)

# Collect every problem before failing so all of them are reported at once
errors = []
if list(submission.columns) != list(sample.columns):
    errors.append(f"Columns mismatch. Expected {list(sample.columns)}, got {list(submission.columns)}")
if len(submission) != len(sample):
    errors.append(f"Row count mismatch. Expected {len(sample)}, got {len(submission)}")
if not submission[ID_COL].equals(sample[ID_COL]):
    if set(submission[ID_COL]) != set(sample[ID_COL]):
        missing = list(sorted(set(sample[ID_COL]) - set(submission[ID_COL])))[:5]
        extra   = list(sorted(set(submission[ID_COL]) - set(sample[ID_COL])))[:5]
        errors.append(f"ID set differs. Missing: {missing} | Extra: {extra}")
    else:
        errors.append("ID order differs from sample. Must match sample_submission order.")
if submission[TARGET_OUT].isna().any():
    errors.append("Target has NaNs.")
u = set(np.unique(submission[TARGET_OUT]))
if not u.issubset({0,1}):
    errors.append(f"Target invalid values {sorted(u)}; must be 0/1.")
if errors:
    # Plain loop instead of a list comprehension used only for its side effect
    print("❌ Submission invalid:")
    for e in errors:
        print(" -", e)
    raise SystemExit(1)

# The Kaggle copy is only written when running on Kaggle; the local copy always
if IS_KAGGLE:
    submission.to_csv(OUT_KAGGLE, index=False)
    print(f"✅ Saved Kaggle file: {OUT_KAGGLE}")
submission.to_csv(OUT_LOCAL, index=False)
print(f"✅ Saved local copy : {OUT_LOCAL}")

# ========= 8) Log run info =========
# Summary of this ensemble run, written as JSON under results/
run_info = dict(
    task="05_ensemble",
    time=datetime.now().isoformat(timespec="seconds"),
    models=models,
    winner=WIN_NAME,
    oof_f1_macro=float(WIN_F1),
    threshold=float(WIN_THR),
)
with open("results/run_05_ensemble.json","w") as f:
    f.write(json.dumps(run_info, indent=2))

print("\nFinal summary:")
print(f" Winner: {WIN_NAME} | OOF F1(macro)={WIN_F1:.4f} | thr={WIN_THR:.3f}")
print(" First 5 submission rows:\n", submission.head())


Python: 3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]
NumPy : 1.26.4
Pandas: 2.2.3
📄 Using:
  train.csv → /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/train.csv
  test.csv  → /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/test.csv
  sample    → /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/sample_submission.csv
[NAMES] ID_COL=row_id | TARGET_OUT=rule_violation
Train shape: (2029, 9)
Test  shape: (10, 8)
Sample shape: (10, 2)

Models detected:
 - logreg_tfidf_feats
 - xgb_tfidf_feats

OOF matrix shape: (2029, 3) | Test matrix shape: (10, 3)

Model correlation matrix (OOF probs):
[[1.    0.859]
 [0.859 1.   ]]

[Mean] OOF macro-F1 = 0.7364 @ thr=0.483
[Rank-Avg] OOF macro-F1 = 0.7391 @ thr=0.438

Using val_f1-based weights: {'logreg_tfidf_feats': 0.5, 'xgb_tfidf_feats': 0.5}
[Weighted-Avg] OOF