In [2]:
# 06_neural_advanced.ipynb — Task 3.2
# Advanced neural variants (Keras) with dual-run support (local + Kaggle)
# - Loads data (auto-detect)
# - Reuses engineered features from 03_feature_engineering
# - Two model variants: Stacked BiGRU + Attention, and CNN+BiGRU+MultiHeadAttention
# - Mini hyperparam sweep (seq_len, batch, dropout)
# - Threshold tune (macro-F1), write Kaggle /kaggle/working/submission.csv and local submissions/submission.csv

# ========= 0) Imports & environment =========
import os, sys, re, glob, json, math, warnings
from datetime import datetime
import numpy as np
import pandas as pd

# Report interpreter and core-library versions so a run can be reproduced later.
print(f"Python: {sys.version}")
print(f"NumPy : {np.__version__}")
print(f"Pandas: {pd.__version__}")

# Kaggle mounts competition data under /kaggle/input; its presence selects the Kaggle code paths.
KAGGLE_DIR = "/kaggle/input/jigsaw-agile-community-rules"
IS_KAGGLE = os.path.exists("/kaggle/input")
if IS_KAGGLE:
    KAGGLE_WORKING = "/kaggle/working"
    OUT_KAGGLE = os.path.join(KAGGLE_WORKING, "submission.csv")
else:
    # Off Kaggle there is no working directory and no Kaggle submission path.
    KAGGLE_WORKING = None
    OUT_KAGGLE = None

# Local output locations (relative to the current working directory).
OUT_LOCAL = "submissions/submission.csv"
os.makedirs("submissions", exist_ok=True)
os.makedirs("results", exist_ok=True)
os.makedirs("models", exist_ok=True)

# ========= 1) Require TensorFlow (Kaggle TF image or local TF install) =========
# Optional: quieter logs.
# TF_CPP_MIN_LOG_LEVEL is read when TensorFlow's native runtime is loaded, so it must be
# in the environment *before* `import tensorflow`; setting it afterwards has no effect.
import logging
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL","2")

try:
    import tensorflow as tf
    from tensorflow import keras
except Exception as e:
    # Deliberately broad: a broken TF install can fail with more than ImportError.
    # `from e` keeps the original failure attached as the explicit cause.
    raise RuntimeError(
        "TensorFlow is required for Task 3.2.\n"
        "On Kaggle: set Image=TensorFlow, Accelerator=None/CPU is fine. Internet OFF.\n"
        "On Mac (Apple Silicon): use tensorflow-macos 2.16.x + tensorflow-metal.\n"
        f"Import error: {e}"
    ) from e

# Reported outside the try block so a failure here is not misreported as a missing install.
print("TensorFlow:", tf.__version__)
print("Devices:", tf.config.list_physical_devices())

# Silence Python-side TF logging below ERROR and let TF fall back to another device
# when an op has no kernel on the requested one.
tf.get_logger().setLevel(logging.ERROR)
tf.config.set_soft_device_placement(True)

# ========= 2) Data loaders (Kaggle-first, then local fallbacks) =========
# Relative folders probed for the competition CSVs when running outside Kaggle.
CANDIDATE_DIRS = [
    ".",
    "data/raw",
    "../data/raw",
    "../../data/raw",
    "jigsaw-agile-community-rules",
    "../jigsaw-agile-community-rules",
    "../../jigsaw-agile-community-rules",
]


def _candidate_paths(filename: str):
    """Return every existing location of ``filename``, best candidate first.

    Search order: the Kaggle input folder (only when it exists), then each
    entry of ``CANDIDATE_DIRS``, then a recursive glob below the current
    working directory.

    Returns:
        A list of absolute paths that exist on disk, de-duplicated while
        keeping the search order. Empty when nothing was found.
    """
    raw = [os.path.join(KAGGLE_DIR, filename)] if os.path.exists(KAGGLE_DIR) else []
    raw += [os.path.join(folder, filename) for folder in CANDIDATE_DIRS]
    raw += glob.glob(f"**/{filename}", recursive=True)

    # A dict keeps insertion order, so it doubles as an ordered set of resolved paths.
    resolved = {}
    for candidate in raw:
        absolute = os.path.abspath(candidate)
        if absolute in resolved:
            continue
        if os.path.exists(absolute):
            resolved[absolute] = None
    return list(resolved)

def read_first_csv(filename: str):
    """Read the highest-priority copy of ``filename`` into a DataFrame.

    The path is the first entry returned by ``_candidate_paths`` and is
    printed before loading.

    Raises:
        FileNotFoundError: if no candidate location contains ``filename``.
    """
    matches = _candidate_paths(filename)
    if len(matches) == 0:
        raise FileNotFoundError(f"Could not find {filename} in Kaggle folder or local fallbacks.")
    chosen = matches[0]
    print(f"📄 Loading {filename} from: {chosen}")
    return pd.read_csv(chosen)

# Load the three competition files; each call prints the path it resolved to
# and raises FileNotFoundError when the file cannot be located.
train_df = read_first_csv("train.csv")
test_df  = read_first_csv("test.csv")
sample   = read_first_csv("sample_submission.csv")

# Quick (rows, columns) report for the loaded frames.
print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)
print("Sample shape:", sample.shape)

# ========= 3) Column detection =========
# Pick the first known text / target column name that exists in the training frame.
TEXT_COL = next((c for c in ["comment_text","body","text"] if c in train_df.columns), None)
TARGET_COL = next((c for c in ["rule_violation","target","label"] if c in train_df.columns), None)

# Explicit raises instead of `assert`: asserts are stripped under `python -O`,
# which would let a missing column surface later as an obscure KeyError.
if not (TEXT_COL and TARGET_COL):
    raise ValueError(
        "Expected text and target columns. "
        f"Training columns found: {list(train_df.columns)}"
    )
if TEXT_COL not in test_df.columns:
    raise ValueError(
        f"Text column {TEXT_COL!r} is missing from the test data. "
        f"Test columns found: {list(test_df.columns)}"
    )
if len(sample.columns) < 2:
    raise ValueError(
        "sample_submission.csv must have an ID column followed by a target column. "
        f"Columns found: {list(sample.columns)}"
    )

# The submission layout is taken from the sample file: first column = ID, second = target.
ID_COL, TARGET_OUT = sample.columns[0], sample.columns[1]

print(f"TEXT_COL  = {TEXT_COL}")
print(f"TARGET_COL= {TARGET_COL}")
print(f"ID_COL    = {ID_COL} | TARGET_OUT = {TARGET_OUT}")

# ========= 4) Data prep & split =========
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Single seed reused for the split below and for dataset shuffling.
SEED = 42
# NOTE(review): `rng` is not referenced anywhere else in the visible source.
rng = np.random.default_rng(SEED)

# Missing texts become empty strings; labels are cast to int.
X_text = train_df[TEXT_COL].fillna("").astype(str).values
y = train_df[TARGET_COL].astype(int).values
X_test_text = test_df[TEXT_COL].fillna("").astype(str).values

# 80/20 stratified hold-out. The same arguments are reused later on an index
# array to recover which rows landed in each part, so they must stay in sync.
X_tr_text, X_va_text, y_tr, y_va = train_test_split(
    X_text, y, test_size=0.2, random_state=SEED, stratify=y
)

def compute_class_weights(y_array):
    """Return balanced class weights for binary labels as ``{0: w0, 1: w1}``.

    Each class that is present gets ``n / (n_present_classes * class_count)``,
    i.e. ``n / (2 * count)`` in the usual two-class case. Every label that is
    not ``1`` is counted as class 0.

    A class with no samples gets the neutral weight ``1.0`` instead of raising
    ``ZeroDivisionError``; its weight is never applied to any sample anyway.

    Args:
        y_array: 1-D array-like of labels (NumPy array, list, ...).

    Returns:
        Dict mapping class id (0, 1) to a float weight.
    """
    labels = np.asarray(y_array)
    n = len(labels)
    pos = int((labels == 1).sum())
    neg = n - pos
    # Number of classes that actually occur (0, 1 or 2).
    n_present = int(pos > 0) + int(neg > 0)

    def _weight(count):
        return n / (n_present * count) if count else 1.0

    return {0: _weight(neg), 1: _weight(pos)}
# Class weights are derived from the training part of the split only.
CLASS_WEIGHTS = compute_class_weights(y_tr)
print("Class weights:", CLASS_WEIGHTS)

# ========= 5) Load engineered features from Task 2 (or compute if missing) =========
# (Repeats imports from the top of the cell; harmless.)
import os, re, numpy as np, pandas as pd

# write to both local project path and Kaggle-working so later notebooks can see them
PKL_DIRS = ["data/processed", "/kaggle/working/data/processed"] if os.path.exists("/kaggle") else ["data/processed"]
for d in PKL_DIRS: os.makedirs(d, exist_ok=True)

# Cache files that are checked for first; both paths are relative to the working directory.
TR_FEATS_PKL = "data/processed/train_features.pkl"
TE_FEATS_PKL = "data/processed/test_features.pkl"

def _extract_features_minimal(df, text_col):
    """Compute a compact set of hand-crafted text statistics for ``df[text_col]``.

    Missing texts are treated as empty strings. All ratios use a denominator
    clamped to at least 1, so empty texts yield 0 rather than inf/NaN.

    Args:
        df: DataFrame containing the text column.
        text_col: Name of the column holding the raw text.

    Returns:
        DataFrame with one row per input row and 26 numeric feature columns
        (counts as int32, flags as int8, ratios as float32), with columns in
        alphabetical order.
    """
    s = df[text_col].fillna("").astype(str)

    # helpers
    def ratio_safe(num, den):
        # Clamp the denominator so zero-length texts do not divide by zero.
        den = np.maximum(den, 1)
        return num / den

    # counts
    char_count = s.str.len().astype(np.int32)
    word_count = s.str.split().str.len().fillna(0).astype(np.int32)
    uniq_word_count = s.apply(lambda x: len(set(x.split())) if x else 0).astype(np.int32)
    avg_word_len = ratio_safe(char_count, word_count).astype(np.float32)

    # casings & punctuation
    caps_count = s.str.count(r"[A-Z]")
    caps_ratio = ratio_safe(caps_count, char_count).astype(np.float32)
    all_caps_words = s.str.count(r"\b[A-Z]{2,}\b").astype(np.int32)

    excl_count = s.str.count("!")
    ques_count = s.str.count(r"\?")
    dots3_count = s.str.count(r"\.\.\.")
    punct_count = s.str.count(r"[^\w\s]")  # crude punctuation
    punct_ratio = ratio_safe(punct_count, char_count).astype(np.float32)
    excl_ratio = ratio_safe(excl_count, char_count).astype(np.float32)
    ques_ratio = ratio_safe(ques_count, char_count).astype(np.float32)

    # reddit / markup / links / quotes
    # The (?<!\w) look-behind stops "u/" and "r/" from matching inside ordinary
    # words such as "menu/items" or "or/and", while still accepting "/u/name".
    has_user = s.str.contains(r"(?<!\w)u/\w+", regex=True).astype(np.int8)
    has_sub  = s.str.contains(r"(?<!\w)r/\w+", regex=True).astype(np.int8)
    has_url  = s.str.contains(r"http", regex=True).astype(np.int8)
    quote_count = s.str.count(r"^>|\n>", flags=re.MULTILINE).astype(np.int32)
    md_links = s.str.count(r"\[[^\]]+\]\([^)]+\)").astype(np.int32)

    # lexical tokens
    you_count = s.str.count(r"\byou\b", flags=re.IGNORECASE).astype(np.int32)
    i_count   = s.str.count(r"\bi\b", flags=re.IGNORECASE).astype(np.int32)
    num_count = s.str.count(r"\d").astype(np.int32)
    # "n't" sits inside a word ("don't"), so it must not be preceded by \b:
    # there is no word boundary between the "o" and the "n". Both straight and
    # curly apostrophes are accepted.
    negate_count = s.str.count(
        r"\b(?:no|not|never)\b|n['’]t\b", flags=re.IGNORECASE
    ).astype(np.int32)

    you_ratio = ratio_safe(you_count, word_count).astype(np.float32)
    i_ratio   = ratio_safe(i_count, word_count).astype(np.float32)
    num_ratio = ratio_safe(num_count, word_count).astype(np.float32)

    # lexical diversity (unique / total)
    lexical_diversity = ratio_safe(uniq_word_count, word_count).astype(np.float32)

    feats = pd.DataFrame({
        "char_count": char_count,
        "word_count": word_count,
        "uniq_word_count": uniq_word_count,
        "avg_word_len": avg_word_len,
        "caps_ratio": caps_ratio,
        "all_caps_words": all_caps_words,
        "excl_count": excl_count,
        "ques_count": ques_count,
        "dots3_count": dots3_count,
        "punct_count": punct_count,
        "punct_ratio": punct_ratio,
        "excl_ratio": excl_ratio,
        "ques_ratio": ques_ratio,
        "has_user_mention": has_user,
        "has_subreddit_mention": has_sub,
        "has_url": has_url,
        "quote_count": quote_count,
        "md_links": md_links,
        "you_count": you_count,
        "i_count": i_count,
        "num_count": num_count,
        "negate_count": negate_count,
        "you_ratio": you_ratio,
        "i_ratio": i_ratio,
        "num_ratio": num_ratio,
        "lexical_diversity": lexical_diversity,
    }).astype({
        # keep dtypes tight where possible
        "all_caps_words":"int32","excl_count":"int32","ques_count":"int32","dots3_count":"int32",
        "punct_count":"int32","quote_count":"int32","md_links":"int32","you_count":"int32",
        "i_count":"int32","num_count":"int32","negate_count":"int32",
        "has_user_mention":"int8","has_subreddit_mention":"int8","has_url":"int8",
    })

    # Ensure consistent column order
    return feats.reindex(sorted(feats.columns), axis=1)

# Load or compute
if os.path.exists(TR_FEATS_PKL) and os.path.exists(TE_FEATS_PKL):
    # NOTE(review): unpickling can execute arbitrary code; only load cache files
    # that this project produced itself.
    train_features = pd.read_pickle(TR_FEATS_PKL)
    test_features  = pd.read_pickle(TE_FEATS_PKL)
    print("✅ Loaded engineered features from 03.")
else:
    print("⚙️  Computing engineered features (03 fallback)…")
    train_features = _extract_features_minimal(train_df, TEXT_COL)
    test_features  = _extract_features_minimal(test_df,  TEXT_COL)

    # Save to both places so future runs find them
    for base in PKL_DIRS:
        pf_tr = os.path.join(base, "train_features.pkl")
        pf_te = os.path.join(base, "test_features.pkl")
        train_features.to_pickle(pf_tr)
        test_features.to_pickle(pf_te)
        print("💾 Saved:", pf_tr, "and", pf_te)

# A cached pickle may come from a different data snapshot. Features are matched to
# texts purely by row position, so a row-count mismatch would silently misalign them.
if len(train_features) != len(train_df) or len(test_features) != len(test_df):
    raise ValueError(
        "Engineered features do not match the loaded data "
        f"(train: {len(train_features)} vs {len(train_df)} rows, "
        f"test: {len(test_features)} vs {len(test_df)} rows). "
        f"Delete {TR_FEATS_PKL} and {TE_FEATS_PKL} to recompute them."
    )
# The scaler works on raw arrays, so train and test must expose the same columns
# in the same order.
if set(train_features.columns) != set(test_features.columns):
    raise ValueError(
        "Train and test feature columns differ: "
        f"{sorted(set(train_features.columns) ^ set(test_features.columns))}"
    )
test_features = test_features[list(train_features.columns)]

print(f"Features ready. Train: {train_features.shape} | Test: {test_features.shape}")

from sklearn.preprocessing import StandardScaler

# keep split consistent: same arguments as the text split, applied to row indices
X_idx = np.arange(len(X_text))
idx_tr, idx_va, _, _ = train_test_split(X_idx, y, test_size=0.2, random_state=SEED, stratify=y)

# Fit the scaler on the training rows only, so validation statistics do not leak
# into preprocessing (the text vectorizer is likewise adapted on the train split).
feats_tr_raw = train_features.values.astype(np.float32)
feat_scaler = StandardScaler()
feat_scaler.fit(feats_tr_raw[idx_tr])
feats_tr_all = feat_scaler.transform(feats_tr_raw)
feats_te     = feat_scaler.transform(test_features.values.astype(np.float32))

feats_tr_split = feats_tr_all[idx_tr]
feats_va_split = feats_tr_all[idx_va]

# ========= 6) FAST_DEBUG toggle + TextVectorization =========
FAST_DEBUG = False  # set True to smoke-test quickly
MAX_TOKENS = 30000  # vocabulary cap passed to TextVectorization
SEQ_LEN = 160 if FAST_DEBUG else 200  # token ids per example (padded / truncated)
HAS_GPU = bool(tf.config.list_physical_devices('GPU'))
# Smaller batches in debug mode or when no GPU is visible.
BATCH = 32 if FAST_DEBUG or not HAS_GPU else 64
EPOCHS = 4 if FAST_DEBUG else 14  # early stopping will cap earlier

print(f"[FAST_DEBUG={FAST_DEBUG}] HAS_GPU={HAS_GPU} | SEQ_LEN={SEQ_LEN} | BATCH={BATCH} | EPOCHS={EPOCHS}")

AUTOTUNE = tf.data.AUTOTUNE
# The output length is fixed here, at construction time; changing SEQ_LEN later
# does not change what this layer emits.
text_vec = keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS, output_mode="int", output_sequence_length=SEQ_LEN,
    standardize="lower_and_strip_punctuation", split="whitespace"
)
# The vocabulary is learned from the training split only.
text_vec.adapt(tf.data.Dataset.from_tensor_slices(X_tr_text).batch(256))

# ========= 7) tf.data with auxiliary features =========
def make_ds_with_feats(x_arr, feats_arr, y_arr=None, train=False):
    """Build a batched, prefetched ``tf.data`` pipeline of (token ids, aux features).

    Reads the module-level ``BATCH``, ``SEED``, ``AUTOTUNE`` and ``text_vec``
    at call time.

    Args:
        x_arr: Raw text strings.
        feats_arr: Auxiliary feature rows aligned with ``x_arr``.
        y_arr: Optional labels. When given, elements are
            ``((token_ids, feats), label)``; otherwise ``(token_ids, feats)``.
        train: When True, shuffle with a buffer covering the whole array.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    labelled = y_arr is not None
    tensors = (x_arr, feats_arr, y_arr) if labelled else (x_arr, feats_arr)

    pipeline = tf.data.Dataset.from_tensor_slices(tensors)
    if train:
        pipeline = pipeline.shuffle(len(x_arr), seed=SEED)
    pipeline = pipeline.batch(BATCH)

    # Vectorise after batching so the text layer runs once per batch.
    if labelled:
        pipeline = pipeline.map(
            lambda text, feats, label: ((text_vec(text), feats), label),
            num_parallel_calls=AUTOTUNE,
        )
    else:
        pipeline = pipeline.map(
            lambda text, feats: (text_vec(text), feats),
            num_parallel_calls=AUTOTUNE,
        )
    return pipeline.prefetch(AUTOTUNE)

# Default train / validation pipelines built with the initial BATCH setting.
ds_tr = make_ds_with_feats(X_tr_text, feats_tr_split, y_tr, train=True)
ds_va = make_ds_with_feats(X_va_text, feats_va_split, y_va, train=False)

# sanity: pull one batch and print the shapes of token ids, aux features and labels
for batch in ds_tr.take(1):
    (tok_ids, aux_feats), yb = batch
    print("Sample batch:", tok_ids.shape, aux_feats.shape, yb.shape)

# ========= 8) Model builders (two variants) =========
# One L2 regularizer instance (factor 1e-5) shared by both model builders.
L2 = keras.regularizers.l2(1e-5)

def build_v1_stacked_bigru_attn(vocab_size, n_aux, embed_dim=128, gru_units=96, dropout=0.35):
    """Build and compile variant 1: two stacked BiGRUs with pooled-query attention.

    The token branch embeds the ids, runs two bidirectional GRUs and attends
    over the sequence with a single query made from its mean-pooled output.
    The auxiliary features pass through a small dense layer; both branches are
    concatenated and fed to a dense head with a sigmoid output.

    The token input length is the module-level ``SEQ_LEN`` at call time.

    Args:
        vocab_size: Embedding input dimension.
        n_aux: Number of auxiliary features per example.
        embed_dim: Embedding width.
        gru_units: Units per direction in the first GRU (second uses half).
        dropout: Dropout rate applied before and after the dense head layer.

    Returns:
        A compiled ``keras.Model`` taking ``[token_ids, aux_feats]``.
    """
    layers = keras.layers

    token_input = keras.Input(shape=(SEQ_LEN,), dtype="int32", name="tok_ids")
    aux_input = keras.Input(shape=(n_aux,), dtype="float32", name="aux_feats")

    embedded = layers.Embedding(
        vocab_size, embed_dim, mask_zero=False, embeddings_regularizer=L2, name="emb"
    )(token_input)

    first_gru = layers.GRU(gru_units, return_sequences=True, dropout=0.2)
    seq = layers.Bidirectional(first_gru, name="bigru_1")(embedded)
    second_gru = layers.GRU(gru_units // 2, return_sequences=True, dropout=0.2)
    seq = layers.Bidirectional(second_gru, name="bigru_2")(seq)

    # Attention with a single query (the mean over time) against the full sequence.
    query = layers.GlobalAveragePooling1D(name="attn_query_pool")(seq)
    query = layers.Reshape((1, query.shape[-1]))(query)
    context = layers.Attention(name="attn")([query, seq])
    context = layers.Flatten()(context)

    aux_branch = layers.Dense(64, activation="relu", kernel_regularizer=L2)(aux_input)
    aux_branch = layers.Dropout(0.2)(aux_branch)

    fused = layers.Concatenate(name="fusion")([context, aux_branch])
    fused = layers.Dropout(dropout)(fused)
    fused = layers.Dense(192, activation="relu", kernel_regularizer=L2)(fused)
    fused = layers.Dropout(dropout)(fused)
    prob = layers.Dense(1, activation="sigmoid", name="out")(fused)

    net = keras.Model([token_input, aux_input], prob, name="V1_StackedBiGRU_Attn")
    net.compile(
        optimizer=keras.optimizers.Adam(2e-3),
        loss="binary_crossentropy",
        metrics=[
            keras.metrics.AUC(name="auc"),
            keras.metrics.Precision(name="prec"),
            keras.metrics.Recall(name="rec"),
        ],
    )
    return net

def build_v2_cnn_bigru_mha(vocab_size, n_aux, embed_dim=128, gru_units=96, dropout=0.35, heads=4):
    """Build and compile variant 2: a CNN branch plus a BiGRU + multi-head attention branch.

    Both text branches start from the same embedding. The CNN branch applies
    spatial dropout and two convolutions, then concatenates max- and
    average-pooled outputs. The recurrent branch runs a bidirectional GRU,
    self-attention over its output, and max pooling. The auxiliary features go
    through a small dense layer; all three are concatenated into a dense head
    with a sigmoid output.

    The token input length is the module-level ``SEQ_LEN`` at call time.

    Args:
        vocab_size: Embedding input dimension.
        n_aux: Number of auxiliary features per example.
        embed_dim: Embedding width; also sets the attention ``key_dim``
            (``embed_dim // heads``).
        gru_units: Units per direction in the GRU.
        dropout: Dropout rate applied before and after the dense head layer.
        heads: Number of attention heads.

    Returns:
        A compiled ``keras.Model`` taking ``[token_ids, aux_feats]``.
    """
    layers = keras.layers

    token_input = keras.Input(shape=(SEQ_LEN,), dtype="int32", name="tok_ids")
    aux_input = keras.Input(shape=(n_aux,), dtype="float32", name="aux_feats")

    embedded = layers.Embedding(
        vocab_size, embed_dim, mask_zero=False, embeddings_regularizer=L2, name="emb"
    )(token_input)

    # Convolutional branch
    conv = layers.SpatialDropout1D(0.2)(embedded)
    conv = layers.Conv1D(128, 3, padding="same", activation="relu", kernel_regularizer=L2)(conv)
    conv = layers.Conv1D(128, 5, padding="same", activation="relu", kernel_regularizer=L2)(conv)
    pooled_max = layers.GlobalMaxPooling1D()(conv)
    pooled_avg = layers.GlobalAveragePooling1D()(conv)
    conv_vec = layers.Concatenate()([pooled_max, pooled_avg])

    # Recurrent branch with self-attention (reads the embedding, not the dropout output)
    recurrent_cell = layers.GRU(gru_units, return_sequences=True, dropout=0.2)
    seq = layers.Bidirectional(recurrent_cell)(embedded)
    seq = layers.MultiHeadAttention(
        num_heads=heads, key_dim=embed_dim // heads, dropout=0.1
    )(seq, seq)
    seq_vec = layers.GlobalMaxPooling1D()(seq)

    aux_branch = layers.Dense(64, activation="relu", kernel_regularizer=L2)(aux_input)
    aux_branch = layers.Dropout(0.2)(aux_branch)

    fused = layers.Concatenate()([conv_vec, seq_vec, aux_branch])
    fused = layers.Dropout(dropout)(fused)
    fused = layers.Dense(192, activation="relu", kernel_regularizer=L2)(fused)
    fused = layers.Dropout(dropout)(fused)
    prob = layers.Dense(1, activation="sigmoid", name="out")(fused)

    net = keras.Model([token_input, aux_input], prob, name="V2_CNN_BiGRU_MHA")
    net.compile(
        optimizer=keras.optimizers.Adam(2e-3),
        loss="binary_crossentropy",
        metrics=[
            keras.metrics.AUC(name="auc"),
            keras.metrics.Precision(name="prec"),
            keras.metrics.Recall(name="rec"),
        ],
    )
    return net

# ========= 9) Tiny hyperparam sweep driver =========
# Size the embedding from the adapted vocabulary. `max_tokens` already includes the
# padding and OOV entries, so the vectorizer never emits an id >= vocabulary_size();
# MAX_TOKENS + 2 only added embedding rows (and L2-regularised weights) that no
# token can ever index.
VOCAB_SIZE = text_vec.vocabulary_size()
N_AUX = feats_tr_all.shape[1]

grid = [
    # (variant, seq_len, batch, dropout)
    ("V1_StackedBiGRU_Attn", SEQ_LEN, BATCH, 0.35),
    ("V2_CNN_BiGRU_MHA",    SEQ_LEN, BATCH, 0.35),
]
# Optionally expand grid when not in FAST_DEBUG:
if not FAST_DEBUG:
    grid += [
        ("V1_StackedBiGRU_Attn", SEQ_LEN, max(16, BATCH//2), 0.35),
        ("V2_CNN_BiGRU_MHA",    SEQ_LEN, max(16, BATCH//2), 0.30),
    ]

def train_and_eval(variant, seq_len, batch, dropout):
    """Train one sweep configuration and tune its decision threshold on validation.

    Side effects: overwrites the module-level ``SEQ_LEN`` and ``BATCH`` (read by
    the dataset and model builders), may replace the module-level ``text_vec``
    when ``seq_len`` differs from its output length, and writes a checkpoint
    file.

    Args:
        variant: ``"V1_StackedBiGRU_Attn"`` selects variant 1; any other value
            selects the CNN + BiGRU + MHA variant.
        seq_len: Token sequence length for this run.
        batch: Batch size for this run.
        dropout: Dropout rate passed to the model builder.

    Returns:
        ``(model, val_f1, best_thr)``: the trained model, the best macro-F1 on
        the validation split, and the threshold that achieved it.
    """
    global SEQ_LEN, BATCH, text_vec
    SEQ_LEN, BATCH = seq_len, batch

    # The vectorizer's output length is fixed at construction. If this run asks for a
    # different length, rebuild it with the same settings and vocabulary; otherwise the
    # datasets would still emit the old length and no longer match the model input.
    vec_cfg = text_vec.get_config()
    if vec_cfg.get("output_sequence_length") != seq_len:
        resized_vec = keras.layers.TextVectorization(
            max_tokens=vec_cfg.get("max_tokens"),
            standardize=vec_cfg.get("standardize"),
            split=vec_cfg.get("split"),
            ngrams=vec_cfg.get("ngrams"),
            output_mode=vec_cfg.get("output_mode", "int"),
            output_sequence_length=int(seq_len),
        )
        # Reuse the learned vocabulary so token ids stay identical across runs.
        resized_vec.set_vocabulary(text_vec.get_vocabulary())
        text_vec = resized_vec

    # Rebuild datasets with the new SEQ_LEN/BATCH
    ds_tr_local = make_ds_with_feats(X_tr_text, feats_tr_split, y_tr, train=True)
    ds_va_local = make_ds_with_feats(X_va_text, feats_va_split, y_va, train=False)

    # Build model
    if variant == "V1_StackedBiGRU_Attn":
        model = build_v1_stacked_bigru_attn(VOCAB_SIZE, N_AUX, embed_dim=128, gru_units=96, dropout=dropout)
    else:
        model = build_v2_cnn_bigru_mha(VOCAB_SIZE, N_AUX, embed_dim=128, gru_units=96, dropout=dropout, heads=4)

    # Warm-up (compile kernels)
    _ = model.predict(ds_tr_local.take(1), verbose=0)

    ckpt_dir = (KAGGLE_WORKING if IS_KAGGLE else "models")
    os.makedirs(ckpt_dir, exist_ok=True)
    # NOTE(review): the file name depends on the variant only, so a later run of the same
    # variant overwrites the earlier checkpoint. The model returned below is unaffected.
    ckpt_path = os.path.join(ckpt_dir, f"{variant}_best.keras")

    callbacks = [
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True, verbose=1),
        keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=5e-5, verbose=1),
        keras.callbacks.ModelCheckpoint(ckpt_path, monitor="val_loss", save_best_only=True, verbose=1)
    ]

    model.fit(
        ds_tr_local, validation_data=ds_va_local, epochs=EPOCHS,
        class_weight=CLASS_WEIGHTS, callbacks=callbacks, verbose=1
    )

    # Tune threshold on validation: scan 0.30..0.70 in 0.005 steps and keep the best macro-F1.
    va_probs = model.predict(ds_va_local, verbose=0).ravel()
    thr_grid = np.linspace(0.30, 0.70, 81)
    f1s = [f1_score(y_va, (va_probs >= t).astype(int), average="macro") for t in thr_grid]
    best_idx = int(np.argmax(f1s))
    best_thr = float(thr_grid[best_idx])
    val_f1 = float(f1s[best_idx])

    print(f"[{variant}] SEQ_LEN={seq_len} BATCH={batch} drop={dropout} | Val F1(macro)={val_f1:.4f} @ thr={best_thr:.3f}")
    return model, val_f1, best_thr

# Run the sweep and keep the best
# `best` starts with f1 = -1.0 so the first configuration always replaces it.
best = {"variant": None, "f1": -1.0, "thr": 0.5, "seq_len": SEQ_LEN, "batch": BATCH, "dropout": 0.35, "model": None}
for (v, sl, bs, dr) in grid:
    model, f1v, thr = train_and_eval(v, sl, bs, dr)
    # Strict ">" keeps the earlier configuration on ties.
    if f1v > best["f1"]:
        best.update({"variant": v, "f1": f1v, "thr": thr, "seq_len": sl, "batch": bs, "dropout": dr, "model": model})

print("\n=== BEST CONFIG ===")
print(best)

# ========= 10) Predict test explicitly (two-input) & build submission =========
print("Predicting test with best model …")
# Re-tokenise test with the best configuration's sequence length
# (repeats the TensorFlow import from earlier in the cell; harmless)
import tensorflow as tf

# Vectorizer whose settings and vocabulary are copied for test-time tokenisation.
base_vec = text_vec  # same object as the module-level text_vec at this point

def clone_vectorizer_with_length(base_vec, seq_len: int):
    """Return a new ``TextVectorization`` layer like ``base_vec`` but with a different output length.

    The preprocessing settings are read from ``base_vec.get_config()`` and the
    vocabulary is copied from ``base_vec``, so token ids match the original layer.

    Args:
        base_vec: An already-adapted ``TextVectorization`` layer.
        seq_len: Output sequence length for the clone.

    Returns:
        The new vectorizer; ``base_vec`` itself is not modified.
    """
    settings = dict(base_vec.get_config())
    # The output length is the only setting that differs from the source layer.
    settings["output_sequence_length"] = int(seq_len)

    resized = tf.keras.layers.TextVectorization(
        vocabulary=None,  # assigned explicitly below
        max_tokens=settings.get("max_tokens", None),
        standardize=settings.get("standardize", None),
        split=settings.get("split", None),
        ngrams=settings.get("ngrams", None),
        output_mode=settings.get("output_mode", "int"),
        output_sequence_length=settings.get("output_sequence_length", None),
        pad_to_max_tokens=settings.get("pad_to_max_tokens", False),
    )
    # Copy the learned vocabulary instead of re-adapting.
    learned_vocab = base_vec.get_vocabulary()
    resized.set_vocabulary(learned_vocab)
    return resized

# Rebuild a vectorizer that matches the best model’s expected seq length
text_vec_te = clone_vectorizer_with_length(base_vec, best["seq_len"])

# Vectorise the test texts using the cloned vectorizer
X_te_tok = text_vec_te(tf.constant(X_test_text))

# Predict using the best model
test_probs = best["model"].predict([X_te_tok, feats_te], batch_size=best["batch"], verbose=0).ravel()

test_pred  = (test_probs >= best["thr"]).astype(int)

# Build the submission in sample_submission order.
# Predictions are in test.csv row order, so they are matched to the sample rows by ID.
# Pasting them positionally into a copy of `sample` would make every ID check below
# compare `sample` with itself and hide a real ordering or coverage problem.
errors = []
submission = sample.copy()
if ID_COL in test_df.columns:
    test_ids = test_df[ID_COL]
    if test_ids.duplicated().any():
        errors.append(f"Duplicate {ID_COL} values in test data; cannot align predictions by ID.")
    else:
        sample_ids = set(sample[ID_COL])
        missing = sorted(sample_ids - set(test_ids))[:5]
        extra   = sorted(set(test_ids) - sample_ids)[:5]
        if missing or extra:
            errors.append(f"ID set differs. Missing: {missing} | Extra: {extra}")
        # reindex puts predictions in sample order; IDs without a prediction become NaN
        aligned = pd.Series(test_pred, index=test_ids.values).reindex(sample[ID_COL].values)
        submission[TARGET_OUT] = aligned.values
elif len(test_pred) == len(sample):
    # No ID column in the test data: positional assignment is the only option.
    submission[TARGET_OUT] = test_pred
else:
    errors.append(f"Row count mismatch. Expected {len(sample)}, got {len(test_pred)}")

# Validate submission
if list(submission.columns) != list(sample.columns):
    errors.append(f"Columns mismatch. Expected {list(sample.columns)}, got {list(submission.columns)}")
if submission[TARGET_OUT].isna().any():
    errors.append("Target has NaNs.")
u = set(np.unique(submission[TARGET_OUT].dropna()))
if not u.issubset({0,1}):
    errors.append(f"Target invalid values {sorted(u)}; must be 0/1.")
if errors:
    print("❌ Submission invalid:")
    for err in errors:
        print(" -", err)
    raise SystemExit(1)

# Safe after validation: every value is 0 or 1 and none is missing.
submission[TARGET_OUT] = submission[TARGET_OUT].astype(int)

if IS_KAGGLE:
    submission.to_csv(OUT_KAGGLE, index=False)
    print(f"✅ Saved Kaggle file: {OUT_KAGGLE}")
submission.to_csv(OUT_LOCAL, index=False)
print(f"✅ Saved local copy : {OUT_LOCAL}")

# ========= 11) Log run info =========
# JSON-serialisable summary of the winning configuration; the model object itself
# is left out, and values are cast to plain Python types.
run_info = {
    "task": "3.2_neural_advanced",
    "best": {
        "variant": best["variant"],
        "val_f1_macro": float(best["f1"]),
        "threshold": float(best["thr"]),
        "seq_len": int(best["seq_len"]),
        "batch": int(best["batch"]),
        "dropout": float(best["dropout"]),
    },
    "seed": SEED,
    # Local (naive) timestamp, second resolution.
    "time": datetime.now().isoformat(timespec="seconds")
}
# Overwrites any previous run summary at this path.
with open("results/run_06_neural_advanced.json","w") as f:
    json.dump(run_info, f, indent=2)

print("\nModel used:", best["variant"])
print(f"Validation F1 (macro): {best['f1']:.4f} at threshold {best['thr']:.3f}")
print("Final submission head:\n", submission.head())


Python: 3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]
NumPy : 1.26.4
Pandas: 2.2.3
TensorFlow: 2.16.2
Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
📄 Loading train.csv from: /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/train.csv
📄 Loading test.csv from: /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/test.csv
📄 Loading sample_submission.csv from: /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/sample_submission.csv
Train shape: (2029, 9)
Test  shape: (10, 8)
Sample shape: (10, 2)
TEXT_COL  = body
TARGET_COL= rule_violation
ID_COL    = row_id | TARGET_OUT = rule_violation
Class weights: {0: 1.0169172932330828, 1: 0.9836363636363636}
✅ Loaded engineered features from 03.
Features ready. Train: (2029, 4