In [41]:
# 04_advanced_models.ipynb — Jigsaw Agile Community Rules (XGBoost + submit)

# Works locally and on Kaggle (Internet OFF). Produces /kaggle/working/submission.csv on Kaggle.

# --- NEW IMPORTS (added 2025-09-24 for running 05_ensemble model) ---
import os, json, gc, re
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import joblib
import xgboost as xgb
# ------------------------------------------------------

# ========= 0) Imports & environment info =========

import sys, os, glob, re, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import xgboost as xgb

print("Python:", sys.version)
print("NumPy :", np.__version__)
print("Pandas:", pd.__version__)


# ===== DEFINE X and y (REPLACEMENT BLOCK) =====
import os, re, json, joblib
import numpy as np, pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# 0) Where processed features live (keep as-is)
PROC_DIR = "data/processed"

# 1) Robust data path resolution
def first_existing_dir(candidates, required_files=("train.csv","test.csv","sample_submission.csv")):
    for d in candidates:
        try:
            if d and all(os.path.exists(os.path.join(d, f)) for f in required_files):
                return d
        except Exception:
            pass
    return None

DATA_DIR_CANDIDATES = [
    "data/raw",  # project-local default
    "/Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw",  # your local absolute path
    "/kaggle/input/jigsaw-competition/data/raw",  # if you staged a copy on Kaggle
    "/kaggle/input/jigsaw-toxic-comment-classification-challenge",  # common Kaggle dataset root
]

DATA_DIR = first_existing_dir(DATA_DIR_CANDIDATES)
if DATA_DIR is None:
    raise FileNotFoundError(
        "Could not find train/test/sample CSVs. "
        "Either copy them to ./data/raw or update DATA_DIR_CANDIDATES with your absolute path."
    )
print(f"[DATA] Using DATA_DIR = {DATA_DIR}")

train_path  = os.path.join(DATA_DIR, "train.csv")
test_path   = os.path.join(DATA_DIR, "test.csv")
sample_path = os.path.join(DATA_DIR, "sample_submission.csv")

objective="binary:logistic",
    eval_metric="logloss",      # keep simple; we tune F1 on OOF afterwards
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)
print(f"[XGB] Using scale_pos_weight={scale_pos_weight:.3f} (pos_rate={pos_rate:.4f})")

# ========= 6) Build & validate submission =========
# Ensure we have test_pred; if not, derive from test_prob (with tuned threshold if available)
if "test_pred" not in globals():
    if "test_prob" in globals():
        thr = best_thr if "best_thr" in globals() else 0.5
        test_pred = (test_prob >= thr).astype(int)
        print(f"[SUB] Derived test_pred from test_prob with thr={thr:.3f}")
    else:
        raise RuntimeError("Neither test_pred nor test_prob found. Run the model/test prediction cell first.")

# Ensure sample + names exist
if "sample" not in globals():
    sample = pd.read_csv(sample_path)
if "TARGET_OUT" not in globals():
    TARGET_OUT = sample.columns[1] if sample.shape[1] >= 2 else "prediction"
if "ID_COL" not in globals():
    ID_COL = sample.columns[0]

# Build submission with exact columns/ordering as sample
submission = sample.copy()
submission[TARGET_OUT] = test_pred.astype(int)

# Sanity checks
errors = []
if list(submission.columns) != list(sample.columns):
    errors.append("Submission columns don't match sample exactly.")
if submission[TARGET_OUT].isna().any():
    errors.append("Found NaNs in prediction column.")
if errors:
    raise ValueError(" | ".join(errors))

# Save
os.makedirs("submissions", exist_ok=True)
sub_path = "submissions/submission.csv"
submission.to_csv(sub_path, index=False)
print(f"[SUB] Wrote {sub_path} with shape {submission.shape}")


# ========= 6) Build & validate submission =========
# Ensure we have test_pred; if not, derive from test_prob (with tuned threshold if available)
if "test_pred" not in globals():
    if "test_prob" in globals():
        thr = best_thr if "best_thr" in globals() else 0.5
        test_pred = (test_prob >= thr).astype(int)
        print(f"[SUB] Derived test_pred from test_prob with thr={thr:.3f}")
    else:
        raise RuntimeError("Neither test_pred nor test_prob found. Run the model/test prediction cell first.")

# Ensure sample + names exist
if "sample" not in globals():
    sample = pd.read_csv(sample_path)
if "TARGET_OUT" not in globals():
    TARGET_OUT = sample.columns[1] if sample.shape[1] >= 2 else "prediction"
if "ID_COL" not in globals():
    ID_COL = sample.columns[0]

# Build submission with exact columns/ordering as sample
submission = sample.copy()
submission[TARGET_OUT] = test_pred.astype(int)

# Sanity checks
errors = []
if list(submission.columns) != list(sample.columns):
    errors.append("Submission columns don't match sample exactly.")
if submission[TARGET_OUT].isna().any():
    errors.append("Found NaNs in prediction column.")
if errors:
    raise ValueError(" | ".join(errors))

# Save
os.makedirs("submissions", exist_ok=True)
sub_path = "submissions/submission.csv"
submission.to_csv(sub_path, index=False)
print(f"[SUB] Wrote {sub_path} with shape {submission.shape}")


for fold,(tr,va) in enumerate(skf.split(X_combined, y), 1):
    print(f"[XGB OOF] Fold {fold}/{N_FOLDS}")
    model = xgb.XGBClassifier(**params)  # reuse your 'params'
    model.fit(X_combined[tr], y[tr], eval_set=[(X_combined[va], y[va])],
              verbose=False, early_stopping_rounds=50)
    oof_prob[va] = model.predict_proba(X_combined[va])[:,1]
    test_prob_folds.append(model.predict_proba(X_test)[:,1])

test_prob = np.mean(np.column_stack(test_prob_folds), axis=1)

# Save files for ensembling
pd.DataFrame({"row_id": train_ids, "prob": oof_prob}).to_csv("results/oof/xgb_tfidf_feats_oof.csv", index=False)
pd.DataFrame({"row_id": test_ids,  "prob": test_prob}).to_csv("results/test_probs/xgb_tfidf_feats_test.csv", index=False)
print("Saved OOF/test probs for XGB → results/oof/xgb_tfidf_feats_oof.csv & results/test_probs/xgb_tfidf_feats_test.csv")

# Update results/models.json so 05_ensemble can weight models by val_f1
import json, os
meta_path = "results/models.json"
model_key = "xgb_tfidf_feats"
try:
    meta = json.load(open(meta_path)) if os.path.exists(meta_path) else {}
except Exception:
    meta = {}
# If file was a list, coerce to dict keyed by model
if isinstance(meta, list):
    meta = {d["model"]: d for d in meta if isinstance(d, dict) and "model" in d}
meta[model_key] = {"val_f1": float(val_f1)}
os.makedirs("results", exist_ok=True)
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)
print(f"Logged {model_key} val_f1={val_f1:.4f} → results/models.json")


# ========= 6) Build & validate submission =========
submission = sample.copy()
submission[TARGET_OUT] = test_pred.astype(int)

errors = []
if list(submission.columns) != list(sample.columns):
    errors.append(f"Columns mismatch. Expected {list(sample.columns)}, got {list(submission.columns)}")
if len(submission) != len(sample):
    errors.append(f"Row count mismatch. Expected {len(sample)}, got {len(submission)}")
if not submission[ID_COL].equals(sample[ID_COL]):
    if set(submission[ID_COL]) != set(sample[ID_COL]):
        missing = list(sorted(set(sample[ID_COL]) - set(submission[ID_COL])))[:5]
        extra   = list(sorted(set(submission[ID_COL]) - set(sample[ID_COL])))[:5]
        errors.append(f"ID set differs. Missing: {missing} | Extra: {extra}")
    else:
        errors.append("ID order differs from sample. Must match sample_submission order.")
if submission[TARGET_OUT].isna().any():
    errors.append("Target has NaNs.")
u = set(np.unique(submission[TARGET_OUT]))
if not u.issubset({0,1}):
    errors.append(f"Target invalid values {sorted(u)}; must be 0/1.")

if errors:
    print("❌ Submission invalid:")
    for e in errors: print(" -", e)
    raise SystemExit(1)

# ========= 7) Save submission (Kaggle + local) =========
if IS_KAGGLE:
    submission.to_csv(OUT_KAGGLE, index=False)
    print(f"✅ Saved Kaggle file: {OUT_KAGGLE}")
submission.to_csv(OUT_LOCAL, index=False)
print(f"✅ Saved local copy : {OUT_LOCAL}")

print(f"\nModel used: {'XGBoost' if use_xgb else 'LogisticRegression'}")
print(f"Validation F1 (macro): {val_f1:.4f} at threshold {best_threshold:.3f}")
print("Final submission head:\n", submission.head())


IndentationError: unexpected indent (4140818716.py, line 70)