In [53]:
# 04_advanced_models.ipynb — Jigsaw Agile Community Rules (XGBoost + submit)

# Works locally and on Kaggle (Internet OFF). Produces /kaggle/working/submission.csv on Kaggle.

# --- NEW IMPORTS (added 2025-09-24 for running 05_ensemble model) ---
import sys, os, glob, re, warnings
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import joblib
import xgboost as xgb
#

print("Python:", sys.version)
print("NumPy :", np.__version__)
print("Pandas:", pd.__version__)


# ===== DEFINE X and y (REPLACEMENT BLOCK) =====
import os, re, json, joblib
import numpy as np, pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# 0) Where processed features live (keep as-is)
PROC_DIR = "data/processed"

# 1) Robust data path resolution
def first_existing_dir(candidates, required_files=("train.csv","test.csv","sample_submission.csv")):
    for d in candidates:
        try:
            if d and all(os.path.exists(os.path.join(d, f)) for f in required_files):
                return d
        except Exception:
            pass
    return None

DATA_DIR_CANDIDATES = [
    "data/raw",  # project-local default
    "/Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw",  # your local absolute path
    "/kaggle/input/jigsaw-competition/data/raw",  # if you staged a copy on Kaggle
    "/kaggle/input/jigsaw-toxic-comment-classification-challenge",  # common Kaggle dataset root
]

DATA_DIR = first_existing_dir(DATA_DIR_CANDIDATES)
if DATA_DIR is None:
    raise FileNotFoundError(
        "Could not find train/test/sample CSVs. "
        "Either copy them to ./data/raw or update DATA_DIR_CANDIDATES with your absolute path."
    )
print(f"[DATA] Using DATA_DIR = {DATA_DIR}")

train_path  = os.path.join(DATA_DIR, "train.csv")
test_path   = os.path.join(DATA_DIR, "test.csv")
sample_path = os.path.join(DATA_DIR, "sample_submission.csv")

# 2) Try processed features; otherwise build TF-IDF from raw
X, y = None, None
try:
    X = joblib.load(os.path.join(PROC_DIR, "X_tfidf.pkl"))
    y = joblib.load(os.path.join(PROC_DIR, "y.pkl"))
    X_test_full = joblib.load(os.path.join(PROC_DIR, "X_test_tfidf.pkl"))
    print(f"[PROC] Loaded processed features: X={X.shape}, y={y.shape}, X_test={X_test_full.shape}")
except Exception as e:
    print("[PROC] Processed features not found; will build TF-IDF from raw…", e)
    train_df  = pd.read_csv(train_path)
    test_df   = pd.read_csv(test_path)
    sample    = pd.read_csv(sample_path)

    # Heuristics to find ID/TEXT/TARGET columns
    id_candidates = [c for c in train_df.columns if "id" in c.lower() or "row" in c.lower()]
    ID_COL = id_candidates[0] if id_candidates else train_df.columns[0]

    text_candidates = [c for c in train_df.columns if train_df[c].dtype == "object"]
    pref = [c for c in text_candidates if c.lower() in {"text","comment_text","content","message"}]
    TEXT_COL = pref[0] if pref else (text_candidates[0] if text_candidates else None)
    if TEXT_COL is None:
        raise ValueError("Could not detect a TEXT column. Set TEXT_COL manually.")

    num_cols = [c for c in train_df.columns if pd.api.types.is_numeric_dtype(train_df[c]) and c != ID_COL]
    # Prefer common binary label names; otherwise find any {0,1} column
    for cand in ["label","target","toxic","is_toxic"]:
        if cand in train_df.columns and cand in num_cols:
            TARGET_COL = cand
            break
    else:
        TARGET_COL = None
        for c in num_cols:
            vals = set(pd.Series(train_df[c]).dropna().unique().tolist())
            if vals.issubset({0,1}):
                TARGET_COL = c
                break
        if TARGET_COL is None:
            raise ValueError("Could not detect a binary TARGET column. Set TARGET_COL manually.")

    print(f"[COLUMNS] ID_COL={ID_COL} | TEXT_COL={TEXT_COL} | TARGET_COL={TARGET_COL}")

    def clean_text(s):
        if pd.isna(s): return ""
        s = str(s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()

    train_df[TEXT_COL] = train_df[TEXT_COL].map(clean_text)
    test_df[TEXT_COL]  = test_df[TEXT_COL].map(clean_text)

    tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=15000, min_df=2)
    X_tr_text = tfidf.fit_transform(train_df[TEXT_COL].values)
    X_te_text = tfidf.transform(test_df[TEXT_COL].values)

    # Optional: stack numeric aux features saved by 03 (if present)
    aux_tr_path = os.path.join(PROC_DIR, "aux_train.npy")
    aux_te_path = os.path.join(PROC_DIR, "aux_test.npy")
    if os.path.exists(aux_tr_path) and os.path.exists(aux_te_path):
        aux_tr = np.load(aux_tr_path); aux_te = np.load(aux_te_path)
        if aux_tr.ndim == 1: aux_tr = aux_tr[:, None]
        if aux_te.ndim == 1: aux_te = aux_te[:, None]
        X = sparse.hstack([X_tr_text, sparse.csr_matrix(aux_tr)], format="csr")
        X_test_full = sparse.hstack([X_te_text, sparse.csr_matrix(aux_te)], format="csr")
        print(f"[FEATS] TF-IDF + AUX → X={X.shape}, test={X_test_full.shape}")
    else:
        X = X_tr_text
        X_test_full = X_te_text
        print(f"[FEATS] TF-IDF only → X={X.shape}, test={X_test_full.shape}")

    y = train_df[TARGET_COL].astype(int).values

    # Save for reuse (optional)
    os.makedirs(PROC_DIR, exist_ok=True)
    try:
        joblib.dump(X, os.path.join(PROC_DIR, "X_tfidf.pkl"))
        joblib.dump(y, os.path.join(PROC_DIR, "y.pkl"))
        joblib.dump(X_test_full, os.path.join(PROC_DIR, "X_test_tfidf.pkl"))
        joblib.dump(tfidf, os.path.join(PROC_DIR, "tfidf_vectorizer.pkl"))
        joblib.dump({"ID_COL":ID_COL,"TEXT_COL":TEXT_COL,"TARGET_COL":TARGET_COL}, os.path.join(PROC_DIR, "column_meta.pkl"))
        print("[PROC] Saved TF-IDF features to data/processed/")
    except Exception as e2:
        print("[PROC] Skipping joblib dump:", e2)

# --- alias for downstream cells expecting `X_test` ---
X_combined = X
X_test = X_test_full


assert X.shape[0] == len(y), "X and y length mismatch"
# =====================================================


# 1) Try to load prebuilt matrices from 03_feature_engineering (if available)
X, y = None, None
try:
    X = joblib.load(os.path.join(PROC_DIR, "X_tfidf.pkl"))
    y = joblib.load(os.path.join(PROC_DIR, "y.pkl"))
    print(f"Loaded processed features: X shape={X.shape}, y shape={y.shape}")
except Exception as e:
    print("Processed features not found, will build TF-IDF from raw CSV…", e)

# 2) If not found, build from raw CSV now
if X is None or y is None:
    train_path = os.path.join(DATA_DIR, "train.csv")
    test_path  = os.path.join(DATA_DIR, "test.csv")
    sample_path = os.path.join(DATA_DIR, "sample_submission.csv")

    train_df  = pd.read_csv(train_path)
    test_df   = pd.read_csv(test_path)
    sample    = pd.read_csv(sample_path)

    # Heuristics to find text/label/id cols
    # (Adjust if your column names are different)
    id_candidates = [c for c in train_df.columns if "id" in c.lower() or "row" in c.lower()]
    ID_COL = id_candidates[0] if id_candidates else train_df.columns[0]

    # pick the first long string column as text
    text_candidates = [c for c in train_df.columns if train_df[c].dtype == "object"]
    # prefer common names if present
    pref = [c for c in text_candidates if c.lower() in {"text", "comment_text", "content", "message"}]
    TEXT_COL = pref[0] if pref else (text_candidates[0] if text_candidates else None)
    if TEXT_COL is None:
        raise ValueError("Could not detect a TEXT column. Please set TEXT_COL manually.")

    # pick a binary numeric label column
    num_cols = [c for c in train_df.columns if pd.api.types.is_numeric_dtype(train_df[c])]
    # exclude typical non-label numerics like id-like columns
    blacklist = {ID_COL}
    num_cols = [c for c in num_cols if c not in blacklist]
    # try common names first
    for cand in ["label", "target", "toxic", "is_toxic"]:
        if cand in train_df.columns and cand in num_cols:
            TARGET_COL = cand
            break
    else:
        # fallback: any numeric column with only {0,1}
        TARGET_COL = None
        for c in num_cols:
            vals = set(pd.Series(train_df[c]).dropna().unique().tolist())
            if vals.issubset({0,1}):
                TARGET_COL = c
                break
        if TARGET_COL is None:
            raise ValueError("Could not detect a binary TARGET column. Please set TARGET_COL manually.")

    print(f"ID_COL={ID_COL} | TEXT_COL={TEXT_COL} | TARGET_COL={TARGET_COL}")

    # basic clean
    def clean_text(s):
        if pd.isna(s): return ""
        s = str(s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()

    train_df[TEXT_COL] = train_df[TEXT_COL].map(clean_text)
    test_df[TEXT_COL]  = test_df[TEXT_COL].map(clean_text)

    # TF-IDF features (1,2)-grams as per your earlier setup
    tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=15000, min_df=2)
    X_tr_text = tfidf.fit_transform(train_df[TEXT_COL].values)
    X_te_text = tfidf.transform(test_df[TEXT_COL].values)

    # If you have numeric aux features from 03 (e.g., lengths, counts), try to load & hstack
    aux_tr_path = os.path.join(PROC_DIR, "aux_train.npy")
    aux_te_path = os.path.join(PROC_DIR, "aux_test.npy")
    if os.path.exists(aux_tr_path) and os.path.exists(aux_te_path):
        aux_tr = np.load(aux_tr_path)
        aux_te = np.load(aux_te_path)
        # ensure 2D
        if aux_tr.ndim == 1: aux_tr = aux_tr[:, None]
        if aux_te.ndim == 1: aux_te = aux_te[:, None]
        X = sparse.hstack([X_tr_text, sparse.csr_matrix(aux_tr)], format="csr")
        X_test_full = sparse.hstack([X_te_text, sparse.csr_matrix(aux_te)], format="csr")
        print(f"HSTACK with aux feats → X shape={X.shape}, test shape={X_test_full.shape}")
    else:
        X = X_tr_text
        X_test_full = X_te_text
        print(f"TF-IDF only → X shape={X.shape}, test shape={X_test_full.shape}")

    y = train_df[TARGET_COL].astype(int).values

    # Save for reuse (optional)
    os.makedirs(PROC_DIR, exist_ok=True)
    try:
        joblib.dump(X, os.path.join(PROC_DIR, "X_tfidf.pkl"))
        joblib.dump(y, os.path.join(PROC_DIR, "y.pkl"))
        joblib.dump(X_test_full, os.path.join(PROC_DIR, "X_test_tfidf.pkl"))
        joblib.dump(tfidf, os.path.join(PROC_DIR, "tfidf_vectorizer.pkl"))
    except Exception as e:
        print("Skipping joblib dump:", e)

# Small sanity check
assert X.shape[0] == len(y), "X and y length mismatch"
# =======================================================================


X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train/valid shapes:", X_tr.shape, X_va.shape, y_tr.shape, y_va.shape)
print("Pos rate (train/valid):", y_tr.mean().round(4), y_va.mean().round(4))


# Preferred: XGBoost with early stopping; Fallback: Logistic Regression
use_xgb = True
best_threshold = 0.5
val_f1 = None

try:
    print("Using XGBoost …")
    xgb_params = dict(
        max_depth=8,
        learning_rate=0.07,
        n_estimators=800,              # large cap; early stopping will trim
        objective="binary:logistic",
        eval_metric="logloss",
        colsample_bytree=0.8,
        subsample=0.9,
        min_child_weight=1,
        reg_lambda=1.0,
        random_state=42,
        tree_method="hist",
        n_jobs=-1
    )
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=False,
        early_stopping_rounds=50
    )
    # Probabilities and dynamic threshold for F1(macro)
    va_prob = model.predict_proba(X_va)[:, 1]
    # Scan thresholds to maximise macro-F1 on validation
    thr_grid = np.linspace(0.2, 0.8, 61)  # coarse but fine for small set
    f1s = []
    for t in thr_grid:
        f1s.append(f1_score(y_va, (va_prob >= t).astype(int), average="macro"))
    best_idx = int(np.argmax(f1s))
    best_threshold = float(thr_grid[best_idx])
    val_f1 = float(f1s[best_idx])
    print(f"Best threshold (val) = {best_threshold:.3f} | Val F1(macro) = {val_f1:.4f}")

    # Confusion matrix at best threshold
    y_pred_va = (va_prob >= best_threshold).astype(int)
    print("Validation confusion matrix:\n", confusion_matrix(y_va, y_pred_va))
    print(classification_report(y_va, y_pred_va, digits=4))

    # Refit on ALL data with best n_estimators (best_iteration_) if available
    best_n = getattr(model, "best_iteration", None)
    if best_n is None:
        best_n = getattr(model, "best_ntree_limit", None)
    if best_n is None:
        best_n = xgb_params["n_estimators"]
    else:
        best_n = int(best_n) + 1

    model_final = xgb.XGBClassifier(**{**xgb_params, "n_estimators": best_n})
    model_final.fit(X, y, verbose=False)

    # Predict test with tuned threshold
    test_prob = model_final.predict_proba(X_test)[:, 1]
    test_pred = (test_prob >= best_threshold).astype(int)

except Exception as e:
    warnings.warn(f"XGBoost unavailable or errored ({e}). Falling back to Logistic Regression.")
    use_xgb = False
    from sklearn.linear_model import LogisticRegression

    lr = LogisticRegression(
        solver="saga",
        penalty="l2",
        class_weight="balanced",
        max_iter=3000,
        n_jobs=-1,
        random_state=42
    )
    lr.fit(X_tr, y_tr)
    va_prob = lr.predict_proba(X_va)[:, 1]
    # threshold tuning
    thr_grid = np.linspace(0.2, 0.8, 61)
    f1s = [f1_score(y_va, (va_prob >= t).astype(int), average="macro") for t in thr_grid]
    best_idx = int(np.argmax(f1s))
    best_threshold = float(thr_grid[best_idx])
    val_f1 = float(f1s[best_idx])
    print(f"[LR] Best threshold (val) = {best_threshold:.3f} | Val F1(macro) = {val_f1:.4f}")

    y_pred_va = (va_prob >= best_threshold).astype(int)
    print("Validation confusion matrix:\n", confusion_matrix(y_va, y_pred_va))
    print(classification_report(y_va, y_pred_va, digits=4))

    # Train on all & predict test
    lr.fit(X, y)
    test_prob = lr.predict_proba(X_test)[:, 1]
    test_pred = (test_prob >= best_threshold).astype(int)

# === OOF + test probabilities for ensembling (XGBoost) ===
import numpy as np, os, pandas as pd
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

IS_KAGGLE = os.path.exists("/kaggle/input")
os.makedirs("results/oof", exist_ok=True)
os.makedirs("results/test_probs", exist_ok=True)

ID_COL = sample.columns[0]        # assumes you already loaded sample_submission.csv
train_ids = train_df[ID_COL].values
test_ids  = sample[ID_COL].values

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

oof_prob = np.zeros(len(train_df), dtype=float)
test_prob_folds = []

for fold,(tr,va) in enumerate(skf.split(X_combined, y), 1):
    print(f"[XGB OOF] Fold {fold}/{N_FOLDS}")
    model = xgb.XGBClassifier(**params)  # reuse your 'params'
    model.fit(X_combined[tr], y[tr], eval_set=[(X_combined[va], y[va])],
              verbose=False, early_stopping_rounds=50)
    oof_prob[va] = model.predict_proba(X_combined[va])[:,1]
    test_prob_folds.append(model.predict_proba(X_test)[:,1])

test_prob = np.mean(np.column_stack(test_prob_folds), axis=1)

# Save files for ensembling
pd.DataFrame({"row_id": train_ids, "prob": oof_prob}).to_csv("results/oof/xgb_tfidf_feats_oof.csv", index=False)
pd.DataFrame({"row_id": test_ids,  "prob": test_prob}).to_csv("results/test_probs/xgb_tfidf_feats_test.csv", index=False)
print("Saved OOF/test probs for XGB → results/oof/xgb_tfidf_feats_oof.csv & results/test_probs/xgb_tfidf_feats_test.csv")

# === OOF + test probabilities for ensembling (Logistic Regression on TF-IDF) ===
import os, json
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Preconditions: we rely on names already created earlier in 04:
# X (csr_matrix), y (1D array), X_test (csr_matrix), train_df, sample, ID_COL
for name in ["X","y","X_test","train_df","sample","ID_COL"]:
    if name not in globals():
        raise RuntimeError(f"Missing `{name}` — run the features/paths cells first.")

os.makedirs("results/oof", exist_ok=True)
os.makedirs("results/test_probs", exist_ok=True)

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

oof_prob = np.zeros(X.shape[0], dtype=float)
test_prob_folds = []

# A solid baseline config for sparse TF-IDF (binary)
lr = LogisticRegression(
    penalty="l2",
    C=2.0,
    solver="liblinear",   # good for sparse, binary
    class_weight="balanced",
    max_iter=2000,
    n_jobs=-1,
)

for fold, (tr, va) in enumerate(skf.split(X, y), 1):
    print(f"[LR OOF] Fold {fold}/{N_FOLDS}")
    lr.fit(X[tr], y[tr])
    oof_prob[va] = lr.predict_proba(X[va])[:, 1]
    test_prob_folds.append(lr.predict_proba(X_test)[:, 1])

test_prob = np.mean(np.column_stack(test_prob_folds), axis=1)

# Tune threshold on OOF (use existing helper if present)
def _tune_thr(y_true, p):
    if "tune_threshold" in globals():
        thr, f1 = tune_threshold(y_true, p, lo=0.25, hi=0.75, steps=201)
        return float(thr), float(f1)
    grid = np.linspace(0.25, 0.75, 201)
    scores = [f1_score(y_true, (p >= t).astype(int)) for t in grid]
    i = int(np.argmax(scores))
    return float(grid[i]), float(scores[i])

best_thr, val_f1 = _tune_thr(y, oof_prob)
print(f"[LR OOF] Best OOF F1={val_f1:.4f} at thr={best_thr:.3f}")

# Save for 05_ensemble
oof_path   = "results/oof/logreg_tfidf_feats_oof.csv"
test_path  = "results/test_probs/logreg_tfidf_feats_test.csv"
pd.DataFrame({"row_id": train_df[ID_COL].values, "prob": oof_prob}).to_csv(oof_path, index=False)
pd.DataFrame({"row_id": sample[ID_COL].values, "prob": test_prob}).to_csv(test_path, index=False)
print(f"[LR SAVE] Wrote {oof_path} and {test_path}")

# Log val_f1 so 05 can do weighted blends
meta_path = "results/models.json"
model_key = "logreg_tfidf_feats"
try:
    meta = json.load(open(meta_path)) if os.path.exists(meta_path) else {}
except Exception:
    meta = {}
if isinstance(meta, list):
    meta = {d["model"]: d for d in meta if isinstance(d, dict) and "model" in d}
meta[model_key] = {"val_f1": float(val_f1)}
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)
print(f"[LR SAVE] Logged {model_key} val_f1={val_f1:.4f} → results/models.json")


# Update results/models.json so 05_ensemble can weight models by val_f1
import json, os
meta_path = "results/models.json"
model_key = "xgb_tfidf_feats"
try:
    meta = json.load(open(meta_path)) if os.path.exists(meta_path) else {}
except Exception:
    meta = {}
# If file was a list, coerce to dict keyed by model
if isinstance(meta, list):
    meta = {d["model"]: d for d in meta if isinstance(d, dict) and "model" in d}
meta[model_key] = {"val_f1": float(val_f1)}
os.makedirs("results", exist_ok=True)
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)
print(f"Logged {model_key} val_f1={val_f1:.4f} → results/models.json")


# ========= 6) Build & validate submission =========
submission = sample.copy()
submission[TARGET_OUT] = test_pred.astype(int)

errors = []
if list(submission.columns) != list(sample.columns):
    errors.append(f"Columns mismatch. Expected {list(sample.columns)}, got {list(submission.columns)}")
if len(submission) != len(sample):
    errors.append(f"Row count mismatch. Expected {len(sample)}, got {len(submission)}")
if not submission[ID_COL].equals(sample[ID_COL]):
    if set(submission[ID_COL]) != set(sample[ID_COL]):
        missing = list(sorted(set(sample[ID_COL]) - set(submission[ID_COL])))[:5]
        extra   = list(sorted(set(submission[ID_COL]) - set(sample[ID_COL])))[:5]
        errors.append(f"ID set differs. Missing: {missing} | Extra: {extra}")
    else:
        errors.append("ID order differs from sample. Must match sample_submission order.")
if submission[TARGET_OUT].isna().any():
    errors.append("Target has NaNs.")
u = set(np.unique(submission[TARGET_OUT]))
if not u.issubset({0,1}):
    errors.append(f"Target invalid values {sorted(u)}; must be 0/1.")

if errors:
    print("❌ Submission invalid:")
    for e in errors: print(" -", e)
    raise SystemExit(1)

# ========= 7) Save submission (Kaggle + local) =========
if IS_KAGGLE:
    submission.to_csv(OUT_KAGGLE, index=False)
    print(f"✅ Saved Kaggle file: {OUT_KAGGLE}")
submission.to_csv(OUT_LOCAL, index=False)
print(f"✅ Saved local copy : {OUT_LOCAL}")

print(f"\nModel used: {'XGBoost' if use_xgb else 'LogisticRegression'}")
print(f"Validation F1 (macro): {val_f1:.4f} at threshold {best_threshold:.3f}")
print("Final submission head:\n", submission.head())


Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
NumPy : 1.26.4
Pandas: 2.2.3
[DATA] Using DATA_DIR = /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw
[PROC] Loaded processed features: X=(2029, 9966), y=(2029,), X_test=(10, 9966)
Loaded processed features: X shape=(2029, 9966), y shape=(2029,)
Train/valid shapes: (1623, 9966) (406, 9966) (1623,) (406,)
Pos rate (train/valid): 0.5083 0.5074
Using XGBoost …




Best threshold (val) = 0.440 | Val F1(macro) = 0.7373
Validation confusion matrix:
 [[134  66]
 [ 40 166]]
              precision    recall  f1-score   support

           0     0.7701    0.6700    0.7166       200
           1     0.7155    0.8058    0.7580       206

    accuracy                         0.7389       406
   macro avg     0.7428    0.7379    0.7373       406
weighted avg     0.7424    0.7389    0.7376       406

[XGB OOF] Fold 1/5




[XGB OOF] Fold 2/5
[XGB OOF] Fold 3/5
[XGB OOF] Fold 4/5
[XGB OOF] Fold 5/5
Saved OOF/test probs for XGB → results/oof/xgb_tfidf_feats_oof.csv & results/test_probs/xgb_tfidf_feats_test.csv
[LR OOF] Fold 1/5
[LR OOF] Fold 2/5
[LR OOF] Fold 3/5
[LR OOF] Fold 4/5
[LR OOF] Fold 5/5
[LR OOF] Best OOF F1=0.7685 at thr=0.445
[LR SAVE] Wrote results/oof/logreg_tfidf_feats_oof.csv and results/test_probs/logreg_tfidf_feats_test.csv
[LR SAVE] Logged logreg_tfidf_feats val_f1=0.7685 → results/models.json
Logged xgb_tfidf_feats val_f1=0.7685 → results/models.json




NameError: name 'TARGET_OUT' is not defined