In [4]:
# 03_feature_engineering.ipynb — Jigsaw Agile Community Rules (Local + Kaggle safe)
# - Auto-finds data files locally or in Kaggle
# - Builds lightweight features (regex + pandas/NumPy only; no NLP libraries)
# - Persists features + MI ranking for Week 2 notes

# ===== 0) Imports & env info =====
import sys, os, glob, re
import numpy as np
import pandas as pd
# Report interpreter / library versions so a run can be reproduced later.
for _label, _version in (
    ("Python:", sys.version),
    ("NumPy :", np.__version__),
    ("Pandas:", pd.__version__),
):
    print(_label, _version)

# Output folders for this notebook (no error if they already exist)
for _out_dir in ("data/processed", "results"):
    os.makedirs(_out_dir, exist_ok=True)

# ===== 1) Data file discovery (NO need to restructure) =====
# Optional override: point DATA_ROOT at the folder that holds the CSVs.
DATA_ROOT = ""  # e.g. r"/Users/michael/projects/jigsaw-competition/data/raw"  (leave empty to auto-detect)

KAGGLE_DIR = "/kaggle/input/jigsaw-agile-community-rules"

# Local folders probed in order; an explicit DATA_ROOT, when set, is tried first.
CANDIDATE_DIRS = ([DATA_ROOT] if DATA_ROOT else []) + [
    ".", "..", "../..", "../../..",
    "data/raw", "../data/raw", "../../data/raw",
    "jigsaw-agile-community-rules", "../jigsaw-agile-community-rules",
]

def _candidate_paths(filename: str):
    """Return absolute paths of every existing copy of *filename*, in priority order.

    Probes the Kaggle input folder (only when that folder exists), then each
    entry of CANDIDATE_DIRS, then a recursive glob below the current working
    directory. Paths that do not exist are dropped, and duplicates (compared
    by absolute path) keep their first position.
    """
    probes = [os.path.join(KAGGLE_DIR, filename)] if os.path.exists(KAGGLE_DIR) else []
    probes += [os.path.join(folder, filename) for folder in CANDIDATE_DIRS]
    probes += glob.glob(f"**/{filename}", recursive=True)  # last resort

    # A dict keeps insertion order, so it doubles as an ordered set here.
    ordered = {}
    for probe in probes:
        resolved = os.path.abspath(probe)
        if resolved in ordered or not os.path.exists(resolved):
            continue
        ordered[resolved] = None
    return list(ordered)

def read_first_csv(filename: str):
    """Load the highest-priority copy of *filename* as a DataFrame.

    Raises:
        FileNotFoundError: when no copy is found; search hints are printed first.
    """
    matches = _candidate_paths(filename)
    if matches:
        best = matches[0]
        print(f"📄 Loading {filename} from: {best}")
        return pd.read_csv(best)

    # Nothing found: explain where we looked and how to fix it, then fail.
    print(f"\n❌ Could not find '{filename}'.")
    print("Searched relative to:", os.getcwd())
    print("Tried these dirs:\n" + "\n".join(" - " + os.path.abspath(d) for d in CANDIDATE_DIRS))
    print("\nFix options:")
    print("  1) Put train.csv/test.csv/sample_submission.csv under 'data/raw/'")
    print("  2) Or set DATA_ROOT above to the folder containing them")
    raise FileNotFoundError(filename)

train_df = read_first_csv("train.csv")
test_df  = read_first_csv("test.csv")
sample   = read_first_csv("sample_submission.csv")

for _label, _frame in (
    ("Train shape:", train_df),
    ("Test  shape:", test_df),
    ("Sample shape:", sample),
):
    print(_label, _frame.shape)


# ===== 2) Robust column detection =====
def _pick_column(frame, options):
    """Return the first entry of *options* that is a column of *frame*, else None."""
    for name in options:
        if name in frame.columns:
            return name
    return None


TEXT_COL = _pick_column(train_df, ["comment_text", "body", "text"])
TARGET_COL = _pick_column(train_df, ["rule_violation", "target", "label"])
ID_COL = _pick_column(sample, ["row_id", "id", "ID"])

# Fail fast, in a fixed order, on the first column that could not be resolved.
for _resolved, _problem in (
    (TEXT_COL, "Couldn't find a text column in train_df (expected one of: comment_text/body/text)."),
    (TARGET_COL, "Couldn't find a target column in train_df (expected one of: rule_violation/target/label)."),
    (ID_COL, "Couldn't find an ID column in sample_submission (expected row_id/id/ID)."),
):
    if _resolved is None:
        raise ValueError(_problem)

print(f"TEXT_COL  = {TEXT_COL}")
print(f"TARGET_COL= {TARGET_COL}")
print(f"ID_COL    = {ID_COL}")

# ===== 3) Lightweight, dependency-free feature extractor =====
from sklearn.feature_selection import mutual_info_classif

# Lowercase lexicon: moderation / enforcement vocabulary plus common insults.
NEG_WORDS = {
    "ban","banned","remove","removed","delete","deleted","violation","warn","warning",
    "report","flag","hate","toxic","idiot","stupid","dumb","trash","nonsense","shut","shutup",
    "racist","sexist","harass","abuse","spam","brigade","rule","rules","automod","mod","moderator"
}
# Lowercase lexicon: politeness markers.
POS_WORDS = {"please","thanks","thank","appreciate","sorry","kindly","cheers"}
# Lowercase lexicon: interrogative words.
QUESTION_WORDS = {"why","how","what","when","where","which","who"}
# Lowercase lexicon: negation markers.
# NOTE(review): "n't" is a suffix rather than a standalone word — confirm the
# tokenizer used with this set can actually produce it.
NEGATIONS = {"not","no","never","n't"}

# Matches one code point in the range U+1F300–U+1FAFF.
EMOJI_RE = re.compile(r"[\U0001F300-\U0001FAFF]")
# Matches a run of the same character repeated three or more times.
REPEAT_CHAR_RE = re.compile(r"(.)\1{2,}")         # loooool, ???!!!
# Matches an inline markdown link.
MD_LINK_RE = re.compile(r"\[[^\]]+\]\([^)]+\)")   # [text](url)

def _safe_div(num, den):
    den = np.maximum(den, 1)
    return num / den

def _count_tokens(text, vocab):
    toks = text.lower().split()
    return sum(t in vocab for t in toks)

def extract_features_v2(df, text_col):
    """Build the hand-crafted text feature table for *df*.

    Args:
        df: frame holding the raw text.
        text_col: name of the text column; missing values are treated as "".

    Returns:
        A DataFrame indexed like *df* with one column per feature
        (lengths, casing, punctuation, markdown/Reddit cues, URLs, emoji,
        lexicon counts, pronouns and derived ratios). Infinite and missing
        values are replaced by 0.
    """
    text = df[text_col].fillna("").astype(str)
    out = pd.DataFrame(index=df.index)
    count = text.str.count

    def per(numer, denom):
        # Ratio of two already-computed feature columns.
        return _safe_div(out[numer], out[denom])

    def flag(pattern):
        # 0/1 indicator for a case-insensitive regex hit anywhere in the text.
        return text.str.contains(pattern, case=False, regex=True).astype("int8")

    def regex_hits(compiled):
        # Number of matches of a pre-compiled pattern per row.
        return text.apply(lambda t: len(compiled.findall(t)))

    def lexicon_hits(vocab):
        # Number of tokens per row that belong to the given lexicon.
        return text.apply(lambda t: _count_tokens(t, vocab)).astype("int16")

    # --- length / density ---
    out["char_count"] = text.str.len()
    out["word_count"] = text.str.split().str.len().astype("int64")
    out["uniq_word_count"] = text.apply(lambda t: len(set(t.lower().split())))
    out["lexical_diversity"] = per("uniq_word_count", "word_count")
    out["avg_word_len"] = per("char_count", "word_count")

    # --- casing ---
    out["upper_count"] = count(r"[A-Z]")
    out["caps_ratio"] = per("upper_count", "char_count")
    out["all_caps_words"] = count(r"\b[A-Z]{2,}\b")

    # --- punctuation / structure ---
    out["excl_count"] = count("!")
    out["ques_count"] = count(r"\?")
    out["dots_count"] = count(r"\.")
    out["ellipsis_count"] = count(r"\.\.\.")
    out["multi_excl"] = count(r"!!+")
    out["multi_ques"] = count(r"\?\?+")
    out["mix_punct"] = count(r"[!?]{2,}")
    punct_total = out["excl_count"] + out["ques_count"] + out["dots_count"]
    out["punct_ratio"] = _safe_div(punct_total, out["char_count"])

    # --- repeats / elongations ---
    out["repeat_char"] = regex_hits(REPEAT_CHAR_RE)

    # --- Reddit / markdown cues ---
    out["has_user_mention"] = flag(r"u/\w+")
    out["has_subreddit_mention"] = flag(r"r/\w+")
    out["quote_count"] = count(r"^>|\n>", flags=re.MULTILINE)
    out["code_ticks"] = count(r"`")
    out["md_links"] = regex_hits(MD_LINK_RE).astype("int16")

    # --- URLs / emails / numbers ---
    out["has_url"] = flag(r"http[s]?://")
    out["email_count"] = count(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
    out["digit_count"] = count(r"\d")
    out["num_ratio"] = per("digit_count", "char_count")

    # --- emoji ---
    out["emoji_count"] = regex_hits(EMOJI_RE).astype("int16")

    # --- lexicon cues ---
    out["neg_lex_count"] = lexicon_hits(NEG_WORDS)
    out["pos_lex_count"] = lexicon_hits(POS_WORDS)
    out["neg_lex_ratio"] = per("neg_lex_count", "word_count")
    out["pos_lex_ratio"] = per("pos_lex_count", "word_count")

    # --- pronouns / stance ---
    out["you_count"] = count(r"\byou\b", flags=re.IGNORECASE)
    out["i_count"] = count(r"\bi\b", flags=re.IGNORECASE)
    out["you_ratio"] = per("you_count", "word_count")
    out["i_ratio"] = per("i_count", "word_count")

    # --- questions & negations ---
    out["wh_q_count"] = lexicon_hits(QUESTION_WORDS)
    out["negate_count"] = lexicon_hits(NEGATIONS)

    # --- start / end cues ---
    out["starts_with_quote"] = text.str.match(r'^\s*["\']').astype("int8")
    out["ends_with_q"] = text.str.endswith("?").astype("int8")
    out["ends_with_excl"] = text.str.endswith("!").astype("int8")

    # --- remaining ratios ---
    out["excl_ratio"] = per("excl_count", "char_count")
    out["ques_ratio"] = per("ques_count", "char_count")

    # Final cleanup: no infinities or NaNs may leave this function.
    return out.replace([np.inf, -np.inf], 0).fillna(0)

# ===== 4) Extract features and persist =====
train_features = extract_features_v2(train_df, TEXT_COL)
test_features = extract_features_v2(test_df, TEXT_COL)

# Pickle both tables (train first, then test).
for _features, _pickle_path in (
    (train_features, "data/processed/train_features.pkl"),
    (test_features, "data/processed/test_features.pkl"),
):
    _features.to_pickle(_pickle_path)

print("✅ Train features:", train_features.shape)
print("✅ Test  features:", test_features.shape)

# ===== 5) Mutual Information ranking =====
y = train_df[TARGET_COL].astype(int).values
mi = mutual_info_classif(train_features.values, y, discrete_features=False, random_state=42)
mi_rank = (
    pd.DataFrame({"feature": train_features.columns, "mi": mi})
    .sort_values("mi", ascending=False)
)
mi_rank.to_csv("results/feature_mi_rank.csv", index=False)

_top15 = mi_rank.head(15)
print("\nTop 15 features by mutual information:")
print(_top15)

# Optional: summary file for your project doc
_summary_lines = [
    f"TEXT_COL={TEXT_COL}\nTARGET_COL={TARGET_COL}\nID_COL={ID_COL}\n",
    f"n_train_features={train_features.shape[1]}\n",
    "TOP_15_MI_FEATURES:\n",
]
for row in _top15.itertuples(index=False):
    _summary_lines.append(f"- {row.feature}: {row.mi:.6f}\n")
with open("results/FEATURES_SUMMARY.txt", "w") as f:
    f.writelines(_summary_lines)

print("\nSaved:")
for _saved_line in (
    " - data/processed/train_features.pkl",
    " - data/processed/test_features.pkl",
    " - results/feature_mi_rank.csv",
    " - results/FEATURES_SUMMARY.txt",
):
    print(_saved_line)


Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
NumPy : 1.26.4
Pandas: 2.2.3
📄 Loading train.csv from: /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/train.csv
📄 Loading test.csv from: /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/test.csv
📄 Loading sample_submission.csv from: /Users/michaelmaclennan/Documents/Learning & Education/2025-04 AI & ML/jigsaw-competition/data/raw/sample_submission.csv
Train shape: (2029, 9)
Test  shape: (10, 8)
Sample shape: (10, 2)
TEXT_COL  = body
TARGET_COL= rule_violation
ID_COL    = row_id
✅ Train features: (2029, 42)
✅ Test  features: (10, 42)

Top 15 features by mutual information:
              feature        mi
4        avg_word_len  0.067606
21           md_links  0.044872
0          char_count  0.041142
6          caps_ratio  0.040016
40         excl_ratio  0.036055
41         ques_ratio  0.035856
15 

In [10]:
# ===== 6) Build and save Kaggle-ready submission.csv (works local + Kaggle) =====
import os, glob, re
import numpy as np
import pandas as pd

# 6.1 — Ensure we know where to save
KAGGLE_WORKING = "/kaggle/working"
IS_KAGGLE = os.path.exists("/kaggle/input")

# The Kaggle destination only exists when running inside a Kaggle kernel.
OUT_KAGGLE = None
if IS_KAGGLE:
    OUT_KAGGLE = os.path.join(KAGGLE_WORKING, "submission.csv")

# A local copy is always written under ./submissions
OUT_LOCAL = "submissions/submission.csv"
os.makedirs("submissions", exist_ok=True)

# We expect these from earlier cells; but re-detect defensively if needed
if "sample" not in globals():
    def _find_one(name):
        """Read the first existing copy of *name* (Kaggle dir, cwd, data/raw, then recursive glob)."""
        kaggle_root = "/kaggle/input/jigsaw-agile-community-rules"
        probes = [f"/kaggle/input/jigsaw-agile-community-rules/{name}"] if os.path.exists(kaggle_root) else []
        probes.extend([name, f"data/raw/{name}"])
        probes.extend(glob.glob(f"**/{name}", recursive=True))
        hit = next((p for p in probes if os.path.exists(p)), None)
        if hit is None:
            raise FileNotFoundError(name)
        return pd.read_csv(hit)
    sample = _find_one("sample_submission.csv")

# The sample submission defines the output schema: (id column, target column).
ID_COL, TARGET_OUT = sample.columns.tolist()


def _first_train_column(options):
    """Return the first name in *options* that is a column of train_df, else None."""
    for name in options:
        if name in train_df.columns:
            return name
    return None


if "TEXT_COL" not in globals():
    TEXT_COL = _first_train_column(["comment_text","body","text"])
if "TARGET_COL" not in globals():
    TARGET_COL = _first_train_column(["rule_violation","target","label"])
assert not (TEXT_COL is None or TARGET_COL is None), "Need TEXT_COL and TARGET_COL."

# 6.2 — Try to use an uploaded predictions CSV (fast path on Kaggle)
preds_df = None
preds_paths_priority = [
    "/kaggle/input/submission-v1-baseline/submission_v1_baseline.csv",  # your known upload
]


def _looks_like_predictions(filename):
    """True for CSV names mentioning 'submission' or 'pred', except the sample submission."""
    lowered = filename.lower()
    if not lowered.endswith(".csv") or "sample_submission" in lowered:
        return False
    return "submission" in lowered or "pred" in lowered


# generic search under /kaggle/input for any submission-like csv (excluding sample_submission)
if IS_KAGGLE:
    for _folder, _subdirs, _filenames in os.walk("/kaggle/input"):
        preds_paths_priority.extend(
            os.path.join(_folder, _filename)
            for _filename in _filenames
            if _looks_like_predictions(_filename)
        )

def _load_first_existing(paths):
    for p in paths:
        if os.path.exists(p):
            try:
                df = pd.read_csv(p)
                print(f"Using predictions from: {p} | shape={df.shape} | cols={list(df.columns)}")
                return df
            except Exception as e:
                print(f"Skip {p} ({e})")
    return None

# First readable predictions CSV from the priority list, or None when none could be loaded.
preds_df = _load_first_existing(preds_paths_priority)

def _build_submission_from_preds(sample_df, preds_df):
    """Map arbitrary preds file into sample schema robustly."""
    id_col_candidates = [ID_COL, "row_id","id","ID","Row_ID"]
    target_col_candidates = [TARGET_OUT,"rule_violation","target","prediction","pred","prob","label"]
    id_in = next((c for c in id_col_candidates if c in preds_df.columns), None)
    tgt_in = next((c for c in target_col_candidates if c in preds_df.columns), None)

    if id_in is not None and tgt_in is not None:
        # Dedup by ID (take last)
        if preds_df[id_in].duplicated().any():
            preds_base = preds_df.drop_duplicates(subset=[id_in], keep="last").copy()
        else:
            preds_base = preds_df.copy()
        tmp = preds_base[[id_in, tgt_in]].rename(columns={id_in: ID_COL, tgt_in: TARGET_OUT})
        sub = sample_df[[ID_COL]].merge(tmp, on=ID_COL, how="left")
    else:
        # No ID in preds: fallback to by-order (must match length)
        assert len(preds_df) == len(sample_df), "Preds length must match sample_submission for order-based mapping."
        # choose first numeric/boolean-like column
        use_col = None
        for c in preds_df.columns:
            if pd.api.types.is_numeric_dtype(preds_df[c]) or preds_df[c].dtype == bool:
                use_col = c; break
        if use_col is None:
            use_col = preds_df.columns[0]
        sub = sample_df.copy()
        sub[TARGET_OUT] = preds_df[use_col].values

    # Coerce to 0/1 ints
    vals = pd.to_numeric(sub[TARGET_OUT], errors="coerce").fillna(0)
    if not np.array_equal(np.unique(vals), [0,1]):
        if vals.min() >= 0 and vals.max() <= 1:
            vals = (vals >= 0.5).astype(int)
        else:
            vals = vals.round().clip(0,1).astype(int)
    sub[TARGET_OUT] = vals
    return sub

submission = None
if preds_df is not None:
    try:
        submission = _build_submission_from_preds(sample, preds_df)
        print("Built submission from uploaded predictions.")
    except Exception as e:
        # Deliberate best-effort: any failure on the fast path falls through
        # to the baseline below (submission stays None).
        print("Failed to build from uploaded predictions, will train a quick baseline instead:", e)

# 6.3 — If no preds available, train a tiny baseline (TF-IDF + LR, scale only engineered features) and predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy import sparse as sp
import numpy as np

# Only train when the fast path did not already produce a submission;
# without this guard the baseline silently overwrote uploaded predictions.
if submission is None:
    print("Training quick baseline (TF-IDF + LR; scaling engineered features only) to produce submission.csv ...")

    # Ensure features exist (if this block is run standalone)
    if "train_features" not in globals() or "test_features" not in globals():
        train_features = extract_features_v2(train_df, TEXT_COL)
        test_features  = extract_features_v2(test_df,  TEXT_COL)

    # 1) TF-IDF on text (sparse); the vocabulary is fitted on train only
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=3, max_df=0.9)
    X_text      = tfidf.fit_transform(train_df[TEXT_COL].fillna("").astype(str))     # sparse CSR
    X_text_test = tfidf.transform( test_df[TEXT_COL].fillna("").astype(str))

    # 2) Scale ONLY the engineered features (they're small & dense)
    feats_train = train_features.values.astype(np.float32)
    feats_test  = test_features.values.astype(np.float32)

    scaler = StandardScaler()      # dense scaler is fine here (a few dozen cols)
    feats_train_scaled = scaler.fit_transform(feats_train)
    feats_test_scaled  = scaler.transform(feats_test)

    # 3) Convert scaled features to sparse and hstack with TF-IDF
    X_feat      = sp.csr_matrix(feats_train_scaled)
    X_feat_test = sp.csr_matrix(feats_test_scaled)

    X      = sp.hstack([X_text, X_feat], format="csr")   # ensure CSR
    X_test = sp.hstack([X_text_test, X_feat_test], format="csr")

    y = train_df[TARGET_COL].astype(int).values

    # 4) Logistic Regression tuned for sparse, high-dim data
    lr = LogisticRegression(
        solver="saga",            # robust for large sparse design matrices
        penalty="l2",
        class_weight="balanced",
        max_iter=3000,
        n_jobs=-1,
        random_state=42
    )

    lr.fit(X, y)
    preds = lr.predict(X_test).astype(int)

    # 5) Build submission in sample order/IDs (test rows are assumed to be
    #    in the same order as sample_submission — TODO confirm)
    submission = sample.copy()
    submission[TARGET_OUT] = preds


# 6.4 — Validate strictly
errors = []

_expected_cols, _actual_cols = list(sample.columns), list(submission.columns)
if _actual_cols != _expected_cols:
    errors.append(f"Columns mismatch. Expected {list(sample.columns)}, got {list(submission.columns)}")

if len(submission) != len(sample):
    errors.append(f"Row count mismatch. Expected {len(sample)}, got {len(submission)}")

# IDs must match the sample exactly: same values, same order.
if not submission[ID_COL].equals(sample[ID_COL]):
    _sub_ids, _ref_ids = set(submission[ID_COL]), set(sample[ID_COL])
    if _sub_ids == _ref_ids:
        errors.append("ID order differs from sample. Must match sample_submission order.")
    else:
        missing = sorted(_ref_ids - _sub_ids)[:5]
        extra   = sorted(_sub_ids - _ref_ids)[:5]
        errors.append(f"ID set differs. Missing: {missing} | Extra: {extra}")

if submission[TARGET_OUT].isna().any():
    errors.append("Target has NaNs.")

u = set(np.unique(submission[TARGET_OUT]))
if not u <= {0, 1}:
    errors.append(f"Target has invalid values {sorted(u)}; must be 0/1.")

# Report every problem at once, then abort the cell.
if errors:
    print("❌ Submission invalid:")
    for _problem in errors:
        print(" -", _problem)
    raise SystemExit(1)

# 6.5 — Save to Kaggle working (if present) and local
def _write_submission(dest, banner):
    """Write the module-level `submission` to *dest* as CSV (no index), then print *banner*."""
    submission.to_csv(dest, index=False)
    print(banner)


if IS_KAGGLE:
    _write_submission(OUT_KAGGLE, f"✅ Saved Kaggle file: {OUT_KAGGLE}")
if OUT_LOCAL:
    _write_submission(OUT_LOCAL, f"✅ Saved local copy : {OUT_LOCAL}")

# 6.6 — Final sanity print
_target = submission[TARGET_OUT]
print("Final submission shape/cols:", submission.shape, list(submission.columns))
print("Target dtype/unique:", _target.dtype, sorted(_target.unique()))
print("Head:\n", submission.head())


Training quick baseline (TF-IDF + LR; scaling engineered features only) to produce submission.csv ...
✅ Saved local copy : submissions/submission.csv
Final submission shape/cols: (10, 2) ['row_id', 'rule_violation']
Target dtype/unique: int64 [0, 1]
Head:
    row_id  rule_violation
0    2029               0
1    2030               0
2    2031               0
3    2032               1
4    2033               1
