<a href="https://colab.research.google.com/github/parhamalikhan/Credit-Card-Fraud-Detection/blob/main/Untitled20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
from pathlib import Path

# Mount Google Drive so the dataset folders below are reachable under /content/drive.
drive.mount("/content/drive")

# Root folders of the two document collections. build_df() labels files found
# under REAL_DIR with y=0 and files found under AI_DIR with y=1.
REAL_DIR = "/content/drive/MyDrive/Master's and PhD applications for INDIMA"
AI_DIR   = "/content/drive/MyDrive/ai generated"

# Sanity check: report whether each root exists and is a directory.
print("REAL exists?", Path(REAL_DIR).exists(), "| is_dir?", Path(REAL_DIR).is_dir())
print("AI   exists?", Path(AI_DIR).exists(),   "| is_dir?", Path(AI_DIR).is_dir())


In [None]:
# Remove any previously installed fitz / PyMuPDF distributions (all output discarded).
!pip -q uninstall -y fitz pymupdf PyMuPDF > /dev/null 2>&1
# Install the pinned PyMuPDF build plus the OCR / ML packages imported below.
!pip -q install "PyMuPDF==1.26.7" "pillow<12" pytesseract tqdm scikit-learn joblib
# Install the Tesseract binary that pytesseract calls.
!apt-get -y install tesseract-ocr > /dev/null


In [None]:
import re, json, joblib
import numpy as np
import pandas as pd
from pathlib import Path

# Progress bars are optional: when tqdm cannot be imported, substitute a
# pass-through with the same call shape.
try:
    from tqdm import tqdm
except Exception:
    def tqdm(x, **kwargs):
        """Stand-in for tqdm: return the iterable unchanged, ignoring options."""
        return x

import fitz  # PyMuPDF
from PIL import Image
import pytesseract

# File suffixes (lower-case, with the leading dot) that discover_files() accepts.
ALLOWED_EXT = {".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}

# Extraction config
# OCR_PDF_PAGES: how many leading PDF pages are OCR'd when OCR is needed.
# MIN_DIGITAL_CHARS: a PDF with fewer embedded-text characters than this falls back to OCR.
# PDF_RENDER_DPI: resolution used when rendering PDF pages for OCR.
OCR_PDF_PAGES     = 5
MIN_DIGITAL_CHARS = 80
PDF_RENDER_DPI    = 250  # try 300 if OCR weak

# Environment check: print identifying/version info for the PDF and OCR stack.
print("PyMuPDF OK:", fitz.__doc__[:60])
print("Tesseract:", pytesseract.get_tesseract_version())
print("Pillow:", Image.__version__)


In [None]:
def discover_files(root_dir: str):
    """Recursively list the supported files below *root_dir*.

    A path is kept when it is a regular file and its suffix, compared
    case-insensitively, is a member of the module-level ``ALLOWED_EXT`` set.

    Returns:
        A list of ``Path`` objects in the order ``rglob`` yields them.
    """
    return [
        candidate
        for candidate in Path(root_dir).rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in ALLOWED_EXT
    ]

def infer_domain_from_path(file_path: str, root_dir: str) -> str:
    """Return the first-level sub-folder of *root_dir* that contains *file_path*.

    Args:
        file_path: Path of a discovered file.
        root_dir: Root folder the file was discovered under.

    Returns:
        The name of the top-level sub-folder below *root_dir*, ``"root"`` when
        the file sits directly in *root_dir* (or the path is the root itself),
        or ``"unknown"`` when the path is not below *root_dir* or cannot be
        resolved.
    """
    try:
        rel = Path(file_path).resolve().relative_to(Path(root_dir).resolve())
    except Exception:
        return "unknown"
    # The last component of `rel` is the file name itself, so a sub-folder is
    # present only when there are at least two components. Previously a file
    # stored directly in the root reported its own file name as the domain.
    return rel.parts[0] if len(rel.parts) > 1 else "root"

def infer_doc_type_from_path(p: str) -> str:
    """Guess the document type from keywords anywhere in the path string.

    Matching is case-insensitive and runs over the whole path, folders
    included. Rules are tried in order and the first hit wins.

    Returns:
        ``"handbook"``, ``"transcript"``, ``"degree"`` or ``"unknown"``.
    """
    lowered = p.lower()
    # (label, keywords) pairs; order encodes precedence.
    rules = (
        ("handbook", ("handbook",)),
        ("transcript", ("transcript",)),
        ("degree", ("degree", "certificate", "diploma")),
    )
    for label, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "unknown"

def build_df(real_dir: str, ai_dir: str) -> pd.DataFrame:
    """Build the labelled file inventory for both document collections.

    Args:
        real_dir: Root folder of genuine documents (label ``y=0``, ``source_dir="REAL"``).
        ai_dir: Root folder of generated documents (label ``y=1``, ``source_dir="AI"``).

    Returns:
        A DataFrame with one row per discovered file and the columns
        ``path``, ``y``, ``file_type`` (suffix without the dot, lower-case),
        ``source_dir``, ``domain`` and ``doc_type``. REAL rows come first.
        The columns are present even when no file is found, so downstream
        column lookups do not fail with ``KeyError`` on an empty inventory.
    """
    columns = ["path", "y", "file_type", "source_dir", "domain", "doc_type"]

    rows = []
    # One pass per collection; the tuple carries the label and source tag.
    for root_dir, label, source in ((real_dir, 0, "REAL"), (ai_dir, 1, "AI")):
        for p in discover_files(root_dir):
            rows.append({
                "path": str(p),
                "y": label,
                "file_type": p.suffix.lower().lstrip("."),
                "source_dir": source,
                "domain": infer_domain_from_path(str(p), root_dir),
                "doc_type": infer_doc_type_from_path(str(p)),
            })

    return pd.DataFrame(rows, columns=columns)

# Build the labelled inventory of every supported file under both roots.
df = build_df(REAL_DIR, AI_DIR)

# Overview: frame size, a few sample rows, file-type counts and doc_type by label.
print("DF shape:", df.shape)
print(df.head(3))
print("\nfile_type counts:\n", df["file_type"].value_counts(dropna=False))
print("\ndoc_type x y:\n", pd.crosstab(df["doc_type"], df["y"], dropna=False))


In [None]:
import re
import numpy as np
import pandas as pd

# Optional dependency: use tqdm when available, otherwise a no-op wrapper.
try:
    from tqdm import tqdm
except Exception:
    def tqdm(x, **kwargs):
        """Return *x* untouched; keyword options such as ``desc`` are ignored."""
        return x

# PyMuPDF + OCR imports
import fitz
from PIL import Image
import pytesseract

# Extraction settings read by extract_text_one():
# pages to OCR per PDF, minimum embedded-text length before OCR fallback, render DPI.
# NOTE(review): presumably re-declared here so this cell can run on its own.
OCR_PDF_PAGES     = 5
MIN_DIGITAL_CHARS = 80
PDF_RENDER_DPI    = 250

def normalize_text(t: str) -> str:
    """Tidy extracted text while keeping paragraph breaks.

    NUL characters become spaces, runs of spaces/tabs collapse to one space,
    three or more consecutive newlines collapse to two, and the result is
    stripped. Falsy input yields ``""``.
    """
    if not t:
        return ""
    cleaned = t.replace("\x00", " ")
    # Applied in order: horizontal whitespace first, then blank-line runs.
    for pattern, replacement in ((r"[ \t]+", " "), (r"\n{3,}", "\n\n")):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def word_count(text: str) -> int:
    """Count the ``\\w+`` tokens in *text*; falsy input counts as zero."""
    return sum(1 for _match in re.finditer(r"\w+", text)) if text else 0

def extract_pdf_digital(pdf_path: str) -> str:
    """Return the embedded (digital) text of every page of a PDF.

    Best-effort: any failure (unreadable file, parser error, ...) yields an
    empty string so the caller can fall back to OCR.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        Page texts joined with newlines and passed through ``normalize_text``,
        or ``""`` on any error.
    """
    try:
        # The context manager closes the document, and its file handle, even
        # when a page fails. Previously every processed PDF stayed open.
        with fitz.open(pdf_path) as doc:
            parts = [(page.get_text("text") or "") for page in doc]
        return normalize_text("\n".join(parts))
    except Exception:
        return ""

def ocr_image_pil(img) -> str:
    """OCR one image with Tesseract and normalize the result.

    Tesseract is invoked with ``--oem 3 --psm 6``. Best-effort: any exception
    raised during OCR or normalization results in ``""``.
    """
    tesseract_flags = "--oem 3 --psm 6"
    try:
        raw_text = pytesseract.image_to_string(img, config=tesseract_flags)
        return normalize_text(raw_text)
    except Exception:
        return ""

def ocr_pdf_first_pages(pdf_path: str, max_pages=5, dpi=250) -> str:
    """Render the first pages of a PDF and OCR them.

    Args:
        pdf_path: Path of the PDF.
        max_pages: Upper bound on the number of leading pages to OCR.
        dpi: Resolution used when rendering each page.

    Returns:
        The non-empty page texts joined with newlines and normalized, or
        ``""`` when the PDF cannot be opened/rendered or nothing is recognised.
    """
    try:
        page_texts = []
        # Context manager guarantees the document is closed; it was
        # previously left open after every call.
        with fitz.open(pdf_path) as doc:
            for page_no in range(min(len(doc), max_pages)):
                pix = doc[page_no].get_pixmap(dpi=dpi)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_texts.append(ocr_image_pil(img))
        return normalize_text("\n".join(text for text in page_texts if text))
    except Exception:
        return ""

def extract_text_one(path: str, file_type: str):
    """Extract text from one file, choosing the method by file type.

    Images are always OCR'd. PDFs use their embedded text when it has at
    least ``MIN_DIGITAL_CHARS`` characters and otherwise fall back to OCR of
    the first ``OCR_PDF_PAGES`` pages rendered at ``PDF_RENDER_DPI``.

    Args:
        path: Path of the file.
        file_type: Extension without the dot (case-insensitive); may be None.

    Returns:
        A 5-tuple ``(text, text_source, is_scanned, word_count, drop_reason)``
        where ``text_source`` is ``"digital"``, ``"ocr"`` or ``"none"`` and
        ``drop_reason`` is ``""`` when text was obtained.
    """
    ft = (file_type or "").lower()

    if ft in {"png","jpg","jpeg","tif","tiff"}:
        try:
            # Close the image file as soon as OCR is done; the handle was
            # previously left open for every image processed.
            with Image.open(path) as img:
                txt = ocr_image_pil(img)
            wc = word_count(txt)
            if wc == 0:
                return "", "none", True, 0, "image_ocr_empty"
            return txt, "ocr", True, wc, ""
        except Exception as e:
            return "", "none", True, 0, f"image_exception:{type(e).__name__}"

    if ft == "pdf":
        digital = extract_pdf_digital(path)
        if len(digital) >= MIN_DIGITAL_CHARS:
            return digital, "digital", False, word_count(digital), ""

        # Too little embedded text: treat the PDF as scanned and OCR it.
        ocr_txt = ocr_pdf_first_pages(path, max_pages=OCR_PDF_PAGES, dpi=PDF_RENDER_DPI)
        wc = word_count(ocr_txt)
        if wc == 0:
            return "", "none", True, 0, "pdf_digital_low_and_ocr_empty"
        return ocr_txt, "ocr", True, wc, ""

    return "", "none", False, 0, "unsupported_file_type"

def run_extraction(df_in: pd.DataFrame) -> pd.DataFrame:
    """Run ``extract_text_one`` on every row and attach the results.

    Args:
        df_in: Frame with at least the columns ``path`` and ``file_type``.
            It is not modified.

    Returns:
        A copy of *df_in* with the columns ``doc_text``, ``text_source``,
        ``is_scanned`` (bool), ``word_count`` (int64) and ``drop_reason``
        added or overwritten.
    """
    df_out = df_in.copy()

    pairs = zip(df_out["path"], df_out["file_type"])
    results = [
        extract_text_one(path, file_type)
        for path, file_type in tqdm(pairs, total=len(df_out), desc="Extracting")
    ]

    if results:
        texts, sources, scanned, counts, reasons = map(list, zip(*results))
    else:
        texts, sources, scanned, counts, reasons = [], [], [], [], []

    # Whole-column assignment is positional, so it works with a non-unique
    # index, and it gives the numeric/boolean columns real dtypes instead of
    # the object dtype produced by None-initialised columns.
    df_out["doc_text"] = texts
    df_out["text_source"] = sources
    df_out["is_scanned"] = np.asarray(scanned, dtype=bool)
    df_out["word_count"] = np.asarray(counts, dtype="int64")
    df_out["drop_reason"] = [reason or "" for reason in reasons]
    return df_out

print("✅ run_extraction is defined")


In [None]:
# Names of the columns that run_extraction() adds to the frame.
# NOTE(review): `cols` is not referenced again within this cell.
cols = ["doc_text","text_source","is_scanned","word_count","drop_reason"]

# Extract text for every discovered file (overwrites `df` with the enriched copy).
df = run_extraction(df)

# Where the text came from, and the most frequent reasons for getting none.
print(df["text_source"].value_counts(dropna=False))
print(df["drop_reason"].value_counts(dropna=False).head(10))


In [None]:
# Finalize the document type: start from the path-inferred doc_type and relabel
# every file whose type could not be inferred ("unknown") as "degree".
df["doc_type_final"] = df["doc_type"].copy()
df.loc[df["doc_type_final"]=="unknown", "doc_type_final"] = "degree"

# Resulting type counts, overall and split by label.
print("doc_type_final counts:\n", df["doc_type_final"].value_counts(dropna=False))
print("\ndoc_type_final x y:\n", pd.crosstab(df["doc_type_final"], df["y"], dropna=False))


In [None]:
# Extraction summary over all files: source of text, word-count distribution, drop reasons.
print("TOTAL files:", len(df))
print("\ntext_source:\n", df["text_source"].value_counts(dropna=False))
print("\nword_count:\n", df["word_count"].describe(percentiles=[.1,.25,.5,.75,.9]))
print("\ndrop_reason top:\n", df["drop_reason"].value_counts(dropna=False).head(20))

# Debug export (never lose "why we got fewer docs"):
# every row that produced no text is written to CSV together with its drop_reason.
df_debug_none = df[df["text_source"]=="none"][["path","y","file_type","source_dir","domain","doc_type_final","drop_reason","word_count"]].copy()
df_debug_none.to_csv("dropped_files_debug.csv", index=False)
print("\nSaved: dropped_files_debug.csv | rows:", len(df_debug_none))


In [None]:
# Minimum number of extracted words for a document to be used for text modelling.
MIN_WORDS = 10  # start with 10; later you can try 30

# df_degree_all: every degree document; df_degree_text: those with >= MIN_WORDS words
# (a missing word_count is treated as 0).
df_degree_all  = df[df["doc_type_final"]=="degree"].copy()
df_degree_text = df_degree_all[df_degree_all["word_count"].fillna(0) >= MIN_WORDS].copy()

print("degree_all :", df_degree_all.shape)
print("degree_text:", df_degree_text.shape)

print("\ny counts (degree_text):\n", df_degree_text["y"].value_counts(dropna=False))
print("\ntext_source (degree_text):\n", df_degree_text["text_source"].value_counts(dropna=False))

# Fail fast: the modelling cells need at least one text sample.
if len(df_degree_text) == 0:
    raise ValueError("No degree samples with enough text. Lower MIN_WORDS or improve OCR.")


In [None]:
import numpy as np
import pandas as pd

# Fix dtype issues that cause the FutureWarning:
# coerce word_count to numeric, replace unparseable/missing values with 0, store as int.
df_degree_all["word_count"]  = pd.to_numeric(df_degree_all["word_count"], errors="coerce").fillna(0).astype(int)
df_degree_text["word_count"] = pd.to_numeric(df_degree_text["word_count"], errors="coerce").fillna(0).astype(int)

# Make sure domain exists (filled with a constant when the column is missing).
if "domain" not in df_degree_text.columns:
    df_degree_text["domain"] = "unknown"

# Sizes, number of distinct domains, and text_source share within each label
# (normalize="columns" makes every label column sum to 1).
print("degree_all:", df_degree_all.shape, "| text:", df_degree_text.shape)
print("domain unique:", df_degree_text["domain"].nunique())
print(pd.crosstab(df_degree_text["text_source"], df_degree_text["y"], normalize="columns"))


In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report

def threshold_at_fpr(y_true, y_score, target_fpr=0.05):
    """Pick a score threshold from the negative-class score distribution.

    Returns the ``1 - target_fpr`` quantile of the scores whose label is 0,
    as a Python float. When there are no negative samples the neutral
    threshold ``0.5`` is returned.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)
    negative_scores = scores[labels == 0]
    if len(negative_scores) == 0:
        return 0.5
    return float(np.quantile(negative_scores, 1 - target_fpr))

def groupkfold_oof_proba(model, X, y, groups, n_splits=5):
    """Collect out-of-fold positive-class probabilities with ``GroupKFold``.

    For each fold, *model* is refit in place on the training indices and the
    second column of ``predict_proba`` on the held-out indices is stored.
    ``X`` and ``y`` are indexed with integer index arrays.

    Returns:
        A float array of length ``len(y)``; positions never assigned to a
        test fold keep their initial value 0.
    """
    splitter = GroupKFold(n_splits=n_splits)
    scores = np.zeros(len(y), dtype=float)
    for train_idx, test_idx in splitter.split(X, y, groups=groups):
        model.fit(X[train_idx], y[train_idx])
        fold_proba = model.predict_proba(X[test_idx])
        scores[test_idx] = fold_proba[:, 1]
    return scores


MODEL 1 — TF-IDF (word + char n-grams) + logistic regression, evaluated with group-held-out CV (StratifiedGroupKFold on group_id)


In [None]:
import numpy as np
import pandas as pd

# Class balance per domain: n = rows, n_ai = sum of y (rows with y == 1),
# n_human = rows with y == 0; sorted so the domains with fewest human rows come first.
dom = df_degree_text.groupby("domain")["y"].agg(
    n="count",
    n_ai="sum",
    n_human=lambda s: (s==0).sum()
).sort_values(["n_human","n_ai"], ascending=True)

print(dom.head(30))
print("\n#domains:", dom.shape[0])
# Domains that contain only one of the two classes.
print("#single-class domains:", ( (dom["n_human"]==0) | (dom["n_ai"]==0) ).sum())


In [None]:
import re
from pathlib import Path

# Resolved dataset roots. group_id_from_path() reads REAL_ROOT;
# AI_ROOT is not referenced by the function below.
REAL_ROOT = Path(REAL_DIR).resolve()
AI_ROOT   = Path(AI_DIR).resolve()

def group_id_from_path(path_str: str, source_dir: str) -> str:
    """Derive the cross-validation group key for one file.

    * ``source_dir == "AI"``: ``AI_CERT_<digits>`` when the file name contains
      ``CERT-<digits>``, otherwise ``AI_<parent folder name>``.
    * ``source_dir == "REAL"``: ``REAL_<component>`` for the first path
      component (relative to ``REAL_ROOT`` when possible) that starts with
      "student" (case-insensitive), otherwise ``REAL_<parent folder name>``.
    * anything else: ``"unknown"``.
    """
    resolved = Path(path_str).resolve()

    if source_dir == "AI":
        cert = re.search(r"CERT-(\d+)", resolved.name)
        if cert:
            return f"AI_CERT_{cert.group(1)}"
        return f"AI_{resolved.parent.name}"

    if source_dir != "REAL":
        return "unknown"

    # Prefer components relative to the REAL root; fall back to the absolute
    # components when the path is not below it.
    try:
        components = list(resolved.relative_to(REAL_ROOT).parts)
    except Exception:
        components = list(resolved.parts)

    # NOTE(review): `components` includes the file name, so a file called
    # "student*.pdf" outside a student folder also matches here.
    student = next((c for c in components if re.match(r"(?i)^student", c)), None)
    if student is not None:
        return f"REAL_{student}"
    return f"REAL_{resolved.parent.name}"

# Attach the leakage-control group key to every text sample.
df_degree_text["group_id"] = [
    group_id_from_path(p, s) for p, s in zip(df_degree_text["path"], df_degree_text["source_dir"])
]

# Group diagnostics: number of groups, how many contain a single class, and the largest groups.
print("group_id unique:", df_degree_text["group_id"].nunique())
tmp = df_degree_text.groupby("group_id")["y"].agg(n="count", n_ai="sum", n_human=lambda x: (x==0).sum())
print("single-class groups:", ((tmp["n_ai"]==0) | (tmp["n_human"]==0)).sum(), " / ", tmp.shape[0])
print(tmp.sort_values("n", ascending=False).head(10))


In [None]:
# Padding token; passed as a stop word to the word-level TF-IDF vectorizer so it is ignored.
PAD = "zzpadzz"
print("✅ PAD defined:", PAD)


In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Feature extractor: word-level (1-2 gram) and character-level (3-5 gram, word-bounded)
# TF-IDF blocks concatenated side by side. Both use binary term presence and
# drop terms that appear in fewer than 2 documents.
# Uses the PAD token (zzpadzz) as the only word-level stop word.
# NOTE(review): the "fixed-length step" that inserts PAD is not visible in this part of the file.
tfidf_feats = FeatureUnion([
    ("word", TfidfVectorizer(
        max_features=20000,
        ngram_range=(1,2),
        min_df=2,
        binary=True,
        stop_words=[PAD],   # ignore padding token
    )),
    ("char", TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3,5),
        min_df=2,
        binary=True,
    )),
])

# Full model: TF-IDF features followed by logistic regression.
tfidf_lr = Pipeline([
    ("tfidf", tfidf_feats),
    ("lr", LogisticRegression(max_iter=4000))
])

print("✅ tfidf_lr is defined")


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report

# Model inputs: extracted text (missing -> ""), integer label, and group key as string.
X = df_degree_text["doc_text"].fillna("").astype(str).values
y = df_degree_text["y"].astype(int).values
groups = df_degree_text["group_id"].astype(str).values

# your existing tfidf_lr pipeline is assumed defined
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold scores: the pipeline is refit per fold and only scores the held-out
# rows, whose groups do not occur in that fold's training part.
oof = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X, y, groups=groups):
    tfidf_lr.fit(X[tr], y[tr])
    oof[te] = tfidf_lr.predict_proba(X[te])[:,1]

print("TFIDF StratifiedGroupKFold ROC-AUC:", roc_auc_score(y, oof))
print("TFIDF StratifiedGroupKFold PR-AUC :", average_precision_score(y, oof))

def threshold_at_fpr(y_true, y_score, target_fpr=0.05):
    """Return the score at the ``1 - target_fpr`` quantile of the negatives.

    "Negatives" are the entries of *y_score* whose label in *y_true* is 0.
    Falls back to ``0.5`` when no negative sample exists.
    """
    negatives = np.asarray(y_score)[np.asarray(y_true) == 0]
    if len(negatives):
        return float(np.quantile(negatives, 1 - target_fpr))
    return 0.5

# Operating point: threshold taken from the out-of-fold scores of the human (y == 0) class,
# then applied to the same out-of-fold scores (score >= thr is predicted as AI).
thr = threshold_at_fpr(y, oof, 0.05)
pred = (oof >= thr).astype(int)

print("thr (~5% FPR):", thr)
print("Confusion:\n", confusion_matrix(y, pred))
print(classification_report(y, pred, digits=3))


Fit final TFIDF model + save (deployment model)

In [None]:
import joblib, json

# Final (deployment) model: refit the TF-IDF + LR pipeline on every text sample.
X_all = df_degree_text["doc_text"].fillna("").astype(str).values
y_all = df_degree_text["y"].astype(int).values

tfidf_lr.fit(X_all, y_all)
joblib.dump(tfidf_lr, "tfidf_lr_degree.joblib")

# Metadata stored next to the model. `thr` is the threshold derived from the
# out-of-fold scores of the cross-validation run, not from this final fit.
tfidf_meta = {"min_words": MIN_WORDS, "thr_fpr5": float(thr), "eval": "StratifiedGroupKFold(group_id)"}

# Write through a context manager so the file is flushed and closed
# deterministically; the handle was previously never closed.
with open("tfidf_lr_degree_meta.json", "w") as fh:
    json.dump(tfidf_meta, fh, indent=2)
print("Saved final TFIDF: tfidf_lr_degree.joblib + meta")


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

def grouped_oof_proba(model, X, y, groups, n_splits=5):
    """Out-of-fold positive-class probabilities under ``StratifiedGroupKFold``.

    The splitter shuffles with ``random_state=42``. *model* is refit in place
    for every fold; ``X`` and ``y`` are indexed with integer index arrays.

    Returns:
        A float array of length ``len(y)`` holding column 1 of
        ``predict_proba`` for each row's held-out fold.
    """
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = np.zeros(len(y), dtype=float)
    for train_idx, test_idx in splitter.split(X, y, groups=groups):
        model.fit(X[train_idx], y[train_idx])
        fold_proba = model.predict_proba(X[test_idx])
        scores[test_idx] = fold_proba[:, 1]
    return scores

def report(name, y, proba):
    """Print *name* followed by ROC-AUC and PR-AUC of *proba* against *y*."""
    print(name)
    for label, metric in (
        ("  ROC-AUC:", roc_auc_score),
        ("  PR-AUC :", average_precision_score),
    ):
        print(label, metric(y, proba))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Metadata-only baseline: predict the label from word_count, file_type and text_source
# alone, i.e. without looking at the text.
# NOTE(review): presumably a shortcut/leakage probe — a high score here would mean the
# label is predictable from extraction metadata.
meta = df_degree_text.copy()
meta["file_type"] = meta["file_type"].astype(str)
meta["text_source"] = meta["text_source"].astype(str)

X_meta = meta[["word_count","file_type","text_source"]]
y = meta["y"].astype(int).values
groups = meta["group_id"].astype(str).values

# One-hot encode the categorical columns; word_count is passed through unscaled.
pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["file_type","text_source"]),
        ("num", "passthrough", ["word_count"]),
    ]
)

meta_lr = Pipeline([("pre", pre), ("lr", LogisticRegression(max_iter=5000))])

# grouped CV (out-of-fold probabilities; X_meta is a DataFrame, hence .iloc)
from sklearn.model_selection import StratifiedGroupKFold
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X_meta, y, groups=groups):
    meta_lr.fit(X_meta.iloc[tr], y[tr])
    oof[te] = meta_lr.predict_proba(X_meta.iloc[te])[:,1]

report("META-ONLY (word_count + file_type + text_source)", y, oof)


In [None]:
# Subset of the text samples whose text came from OCR; print its shape and label counts.
df_degree_ocr = df_degree_text[df_degree_text["text_source"]=="ocr"].copy()
print(df_degree_ocr.shape, df_degree_ocr["y"].value_counts())


In [None]:
import re

def clean_ocr_text(t: str) -> str:
    """Lower-case OCR text, mask long digit runs and squeeze whitespace.

    Stand-alone runs of four or more digits (IDs, years, serial numbers) are
    replaced by the placeholder ``<num>``; all whitespace runs collapse to a
    single space and the result is stripped. Falsy input yields ``""``.
    """
    lowered = t.lower() if t else ""
    masked = re.sub(r"\b\d{4,}\b", " <num> ", lowered)
    return re.sub(r"\s+", " ", masked).strip()

# Add a cleaned copy of the text (lower-cased, long digit runs masked, whitespace squeezed).
df_degree_text["doc_text_clean"] = df_degree_text["doc_text"].apply(clean_ocr_text)


OCR-only extraction for ALL degree files (no digital extract)

In [None]:
import re
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract
import fitz

# Settings read by extract_ocr_only(): PDF pages to OCR, render resolution,
# and the word cap applied to every document's text.
OCR_PDF_PAGES = 3      # start with 3 (faster); try 5 if needed
PDF_RENDER_DPI = 250
MAX_WORDS = 400        # length normalization (removes word_count shortcut)

def normalize_text(t: str) -> str:
    """Flatten text to a single line.

    NUL characters become spaces, every whitespace run (newlines included)
    collapses to one space, and the result is stripped. Falsy input yields ``""``.
    """
    if not t:
        return ""
    without_nul = t.replace("\x00", " ")
    return re.sub(r"\s+", " ", without_nul).strip()

def clean_text_for_fairness(t: str) -> str:
    """Lower-case the text and mask obvious identifiers.

    Stand-alone runs of four or more digits are replaced with ``<num>``
    (optional but helps, per the original note), whitespace runs collapse to
    a single space and the result is stripped. Falsy input yields ``""``.
    """
    lowered = t.lower() if t else ""
    masked = re.sub(r"\b\d{4,}\b", " <num> ", lowered)
    return re.sub(r"\s+", " ", masked).strip()

def truncate_words(t: str, max_words: int) -> str:
    """Keep the first *max_words* whitespace-separated tokens of *t*.

    The kept tokens are re-joined with single spaces; falsy input yields ``""``.
    """
    tokens = t.split() if t else []
    return " ".join(tokens[:max_words])

def ocr_image_pil(img: Image.Image) -> str:
    """OCR one PIL image (Tesseract ``--oem 3 --psm 6``) and normalize the text.

    Unlike a best-effort wrapper, exceptions from Tesseract propagate to the caller.
    """
    tesseract_flags = "--oem 3 --psm 6"
    raw_text = pytesseract.image_to_string(img, config=tesseract_flags)
    return normalize_text(raw_text)

def ocr_pdf_pages(pdf_path: str, max_pages=3, dpi=250) -> str:
    """Render the first pages of a PDF and OCR them.

    Args:
        pdf_path: Path of the PDF.
        max_pages: Upper bound on the number of leading pages to OCR.
        dpi: Resolution used when rendering each page.

    Returns:
        The non-empty page texts joined with spaces and normalized.

    Raises:
        Whatever PyMuPDF, Pillow or Tesseract raise; errors are not swallowed here.
    """
    page_texts = []
    # Context manager closes the document even when rendering or OCR raises;
    # it was previously left open.
    with fitz.open(pdf_path) as doc:
        for page_no in range(min(len(doc), max_pages)):
            pix = doc[page_no].get_pixmap(dpi=dpi)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            page_texts.append(ocr_image_pil(img))
    return normalize_text(" ".join(text for text in page_texts if text))

def extract_ocr_only(path: str, file_type: str):
    """OCR a file regardless of any embedded text, then clean and truncate it.

    PDFs are rendered (first ``OCR_PDF_PAGES`` pages at ``PDF_RENDER_DPI``);
    images are converted to RGB. The OCR text is passed through
    ``clean_text_for_fairness`` and cut to ``MAX_WORDS`` words.

    Args:
        path: Path of the file.
        file_type: Extension without the dot (case-insensitive); may be None.

    Returns:
        A 4-tuple ``(text, text_source, word_count, drop_reason)`` where
        ``text_source`` is ``"ocr"`` or ``"none"``. Failures are reported via
        ``drop_reason`` (``"unsupported_file_type"``, ``"ocr_empty"`` or
        ``"ocr_exception:<ExceptionName>"``) instead of being raised.
    """
    ft = (file_type or "").lower()
    try:
        if ft == "pdf":
            t = ocr_pdf_pages(path, max_pages=OCR_PDF_PAGES, dpi=PDF_RENDER_DPI)
        elif ft in {"png","jpg","jpeg","tif","tiff"}:
            # Close the source file once OCR is done; the handle was
            # previously left open for every image.
            with Image.open(path) as src:
                t = ocr_image_pil(src.convert("RGB"))
        else:
            return "", "none", 0, "unsupported_file_type"

        t = clean_text_for_fairness(t)
        t = truncate_words(t, MAX_WORDS)

        wc = len(re.findall(r"\w+", t))
        if wc == 0:
            return "", "none", 0, "ocr_empty"
        return t, "ocr", wc, ""

    except Exception as e:
        return "", "none", 0, f"ocr_exception:{type(e).__name__}"


In [None]:
from tqdm import tqdm

# Rebuild df_degree_all from the master frame `df`; this replaces any earlier df_degree_all.
df_degree_all = df[df["doc_type_final"]=="degree"].copy()

# Result columns of the OCR-only pass, initialised to "nothing extracted".
df_degree_all["doc_text_ocrall"] = ""
df_degree_all["text_source_ocrall"] = "none"
df_degree_all["word_count_ocrall"] = 0
df_degree_all["drop_reason_ocrall"] = ""

# OCR every degree file (PDFs included) and store the result row by row.
for i, row in tqdm(df_degree_all.iterrows(), total=len(df_degree_all), desc="OCR-all degrees"):
    t, src, wc, reason = extract_ocr_only(row["path"], row["file_type"])
    df_degree_all.at[i, "doc_text_ocrall"] = t
    df_degree_all.at[i, "text_source_ocrall"] = src
    df_degree_all.at[i, "word_count_ocrall"] = int(wc)
    df_degree_all.at[i, "drop_reason_ocrall"] = reason

print(df_degree_all["text_source_ocrall"].value_counts(dropna=False))


In [None]:
# Minimum OCR word count for a document to be kept in the OCR-only experiment.
MIN_WORDS_OCRALL = 10

df_degree_text_ocrall = df_degree_all[df_degree_all["word_count_ocrall"] >= MIN_WORDS_OCRALL].copy()

# Sizes, label counts, and file_type share within each label (columns sum to 1).
print("degree_all:", df_degree_all.shape)
print("degree_text_ocrall:", df_degree_text_ocrall.shape)
print(df_degree_text_ocrall["y"].value_counts())
print(pd.crosstab(df_degree_text_ocrall["file_type"], df_degree_text_ocrall["y"], normalize="columns"))


In [None]:
import re
from pathlib import Path

# Resolved dataset roots. group_id_from_path() reads REAL_ROOT;
# AI_ROOT is not referenced by the functions in this cell.
REAL_ROOT = Path(REAL_DIR).resolve()
AI_ROOT   = Path(AI_DIR).resolve()

def group_id_from_path(path_str: str, source_dir: str) -> str:
    """Map one file to its cross-validation group key.

    AI files are grouped by the ``CERT-<digits>`` number in the file name
    (``AI_CERT_<digits>``) or, failing that, by parent folder (``AI_<folder>``).
    REAL files are grouped by the first path component starting with
    "student" (case-insensitive; ``REAL_<component>``) or, failing that, by
    parent folder (``REAL_<folder>``). Any other *source_dir* gives ``"unknown"``.
    """
    resolved = Path(path_str).resolve()

    if source_dir == "AI":
        cert = re.search(r"CERT-(\d+)", resolved.name)
        if cert:
            return f"AI_CERT_{cert.group(1)}"
        return f"AI_{resolved.parent.name}"

    if source_dir != "REAL":
        return "unknown"

    # Components relative to the REAL root when possible, absolute otherwise.
    try:
        components = list(resolved.relative_to(REAL_ROOT).parts)
    except Exception:
        components = list(resolved.parts)

    # NOTE(review): the file name is one of the components, so it can match too.
    student = next((c for c in components if re.match(r"(?i)^student", c)), None)
    if student is not None:
        return f"REAL_{student}"
    return f"REAL_{resolved.parent.name}"

def ensure_group_id(df_):
    """Add a ``group_id`` column to *df_* in place when it is missing.

    ``None`` and empty frames are returned untouched, as are frames that
    already have the column. Otherwise the column is computed from the
    ``path`` and ``source_dir`` columns. The same object is returned.
    """
    has_rows = df_ is not None and len(df_) != 0
    if has_rows and "group_id" not in df_.columns:
        df_["group_id"] = [
            group_id_from_path(path, source)
            for path, source in zip(df_["path"], df_["source_dir"])
        ]
    return df_

# Make sure every working frame carries group_id.
df_degree_text = ensure_group_id(df_degree_text)

# The two frames below only exist when the OCR-only cells have been run.
# Test for them with a real `if`: the previous conditional expression
# evaluated the undefined name in its else-branch and raised NameError in
# exactly the case it was meant to guard against.
if "df_degree_all" in globals():
    df_degree_all = ensure_group_id(df_degree_all)
if "df_degree_text_ocrall" in globals():
    df_degree_text_ocrall = ensure_group_id(df_degree_text_ocrall)

print("group_id added.")
print("df_degree_text has group_id?", "group_id" in df_degree_text.columns)
if "df_degree_text_ocrall" in globals():
    print("df_degree_text_ocrall has group_id?", "group_id" in df_degree_text_ocrall.columns)


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Metadata-only baseline on the OCR-only dataset: predict the label from
# word_count_ocrall and file_type alone, without the text.
meta = df_degree_text_ocrall.copy()
meta["file_type"] = meta["file_type"].astype(str)

X_meta = meta[["word_count_ocrall","file_type"]]
y = meta["y"].astype(int).values
groups = meta["group_id"].astype(str).values

# One-hot encode file_type; the word count is passed through unscaled.
pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["file_type"]),
    ("num", "passthrough", ["word_count_ocrall"]),
])

meta_lr = Pipeline([("pre", pre), ("lr", LogisticRegression(max_iter=5000))])

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

# Out-of-fold probabilities with groups kept out of each fold's training part.
for tr, te in sgkf.split(X_meta, y, groups=groups):
    meta_lr.fit(X_meta.iloc[tr], y[tr])
    oof[te] = meta_lr.predict_proba(X_meta.iloc[te])[:,1]

print("META-only (word_count_ocrall + file_type) ROC-AUC:", roc_auc_score(y, oof))
print("META-only PR-AUC:", average_precision_score(y, oof))


In [None]:
# PDF-only slice of the OCR-only dataset, with its shape and label counts.
df_pdf = df_degree_text_ocrall[df_degree_text_ocrall["file_type"]=="pdf"].copy()
print("PDF-only:", df_pdf.shape)
print(df_pdf["y"].value_counts())


In [None]:
import re
from pathlib import Path

# Attach group_id to the master frame `df`; the whole block (including the
# function definition) is skipped when the column already exists.
if "group_id" not in df.columns:
    REAL_ROOT = Path(REAL_DIR).resolve()
    AI_ROOT   = Path(AI_DIR).resolve()

    def group_id_from_path(path_str: str, source_dir: str) -> str:
        """Return the cross-validation group key for one file.

        REAL: ``REAL_<first path component starting with "student">`` or
        ``REAL_<parent folder>``; AI: ``AI_CERT_<digits>`` from the file name
        or ``AI_<parent folder>``; anything else: ``"unknown"``.
        """
        p = Path(path_str).resolve()
        if source_dir == "REAL":
            # Prefer components relative to REAL_ROOT; fall back to absolute ones.
            try:
                rel = p.relative_to(REAL_ROOT)
                parts = list(rel.parts)
            except Exception:
                parts = list(p.parts)
            for part in parts:
                if re.match(r"(?i)^student", part):
                    return f"REAL_{part}"
            return f"REAL_{p.parent.name}"
        if source_dir == "AI":
            m = re.search(r"CERT-(\d+)", p.name)
            if m:
                return f"AI_CERT_{m.group(1)}"
            return f"AI_{p.parent.name}"
        return "unknown"

    df["group_id"] = [group_id_from_path(p, s) for p, s in zip(df["path"], df["source_dir"])]

print("group_id exists?", "group_id" in df.columns, "| unique:", df["group_id"].nunique())


In [None]:
from pathlib import Path
from PIL import Image
import fitz

# Folder that holds one rendered PNG ("proxy image") per source document.
RENDER_DIR = Path("degree_proxy_images")
RENDER_DIR.mkdir(exist_ok=True)

def make_proxy_image(path_str: str, file_type: str, dpi=200) -> str:
    """Render a document to a single RGB PNG and return that PNG's path.

    PDFs are represented by their first page rendered at *dpi*; images are
    converted to RGB and re-saved. Results are cached in ``RENDER_DIR``.

    Args:
        path_str: Path of the source document.
        file_type: Extension without the dot (case-insensitive); may be None.
        dpi: Render resolution for PDFs. It is not part of the cache key.

    Returns:
        The path of the PNG as a string, or ``""`` when the type is
        unsupported, the PDF has no pages, or rendering fails.
    """
    import hashlib  # local import so this cell needs no extra top-level import

    p = Path(path_str)
    # Key the cache on the full path, not only the stem. With the stem alone,
    # two files called e.g. "degree.pdf" in different folders (or in the REAL
    # and AI collections) shared one cache file, so the second silently
    # received the first one's image.
    digest = hashlib.sha256(str(p.absolute()).encode("utf-8")).hexdigest()[:12]
    out = RENDER_DIR / f"{p.stem}_{digest}_p0.png"
    if out.exists():
        return str(out)

    ft = (file_type or "").lower()
    try:
        if ft == "pdf":
            # Context manager closes the PDF whether or not rendering succeeds.
            with fitz.open(str(p)) as doc:
                if len(doc) == 0:
                    return ""
                pix = doc[0].get_pixmap(dpi=dpi)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img.save(out)
            return str(out)

        if ft in {"png","jpg","jpeg","tif","tiff"}:
            with Image.open(str(p)) as src:
                src.convert("RGB").save(out)
            return str(out)

        return ""
    except Exception:
        # Do not leave a partially written PNG behind: the next call would
        # treat it as a valid cache hit.
        if out.exists():
            out.unlink()
        return ""

# Working frame for the normalized experiment: all degree rows of the master frame.
deg_all = df[df["doc_type_final"]=="degree"].copy()

if "proxy_img_path" not in deg_all.columns:
    deg_all["proxy_img_path"] = ""

# Render a proxy image for every row that does not have one yet;
# rows with a non-empty path keep it.
if (deg_all["proxy_img_path"] == "").any():
    deg_all["proxy_img_path"] = [
        (pp if isinstance(pp, str) and pp else make_proxy_image(p, ft, dpi=200))
        for p, ft, pp in zip(deg_all["path"], deg_all["file_type"], deg_all["proxy_img_path"])
    ]

# Rows whose proxy image could not be produced are dropped.
print("deg_all:", deg_all.shape, "| proxy empty:", (deg_all["proxy_img_path"]=="").sum())
deg_all = deg_all[deg_all["proxy_img_path"]!=""].copy()
print("deg_all usable:", deg_all.shape)


In [None]:
from tqdm import tqdm
import pandas as pd

# Require extract_ocr_only to exist in this runtime (it is defined in an earlier cell).
assert "extract_ocr_only" in globals(), "extract_ocr_only() is not defined in this runtime."

# Create the result columns of the normalized OCR pass when they are missing.
for col, default in [
    ("doc_text_norm",""),
    ("word_count_norm",0),
    ("drop_reason_norm",""),
]:
    if col not in deg_all.columns:
        deg_all[col] = default

# Rows still lacking a result: zero words AND empty text. Only those are OCR'd,
# so re-running the cell does not repeat finished work.
need = (deg_all["word_count_norm"].fillna(0).astype(int) == 0) & (deg_all["doc_text_norm"].fillna("") == "")
print("Need OCR-normalization rows:", int(need.sum()))

for i, row in tqdm(deg_all[need].iterrows(), total=int(need.sum()), desc="OCR normalized degrees"):
    # Call the existing OCR extractor on the proxy image; the type is always
    # passed as "png" because every proxy image is a PNG.
    t, src, wc, reason = extract_ocr_only(row["proxy_img_path"], "png")
    deg_all.at[i, "doc_text_norm"] = t
    deg_all.at[i, "word_count_norm"] = int(wc)
    deg_all.at[i, "drop_reason_norm"] = reason or ""

# Keep documents with at least MIN_WORDS_NORM OCR words.
MIN_WORDS_NORM = 10
deg_text_norm = deg_all[deg_all["word_count_norm"].fillna(0).astype(int) >= MIN_WORDS_NORM].copy()

print("deg_text_norm:", deg_text_norm.shape)
print("y counts:\n", deg_text_norm["y"].value_counts(dropna=False))


LEAKAGE CHECKS

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Leakage check 1: can the label be predicted from the normalized word count alone?
X_meta = deg_text_norm[["word_count_norm"]]
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

# Out-of-fold probabilities from a logistic regression on the single length feature.
clf = LogisticRegression(max_iter=5000)
for tr, te in sgkf.split(X_meta, y, groups=groups):
    clf.fit(X_meta.iloc[tr], y[tr])
    oof[te] = clf.predict_proba(X_meta.iloc[te])[:,1]

print("META-only (length only) ROC-AUC:", roc_auc_score(y, oof))
print("META-only (length only) PR-AUC :", average_precision_score(y, oof))


In [None]:
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Leakage check 2: label-shuffle test. The TF-IDF model is trained and scored
# against randomly permuted labels; texts and groups are left untouched.
X = deg_text_norm["doc_text_norm"].fillna("").astype(str).values
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# Fixed seed so the permutation is reproducible.
rng = np.random.RandomState(42)
y_shuf = rng.permutation(y)

tfidf = FeatureUnion([
    ("word", TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)),
    ("char", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=2)),
])
model = Pipeline([("tfidf", tfidf), ("lr", LogisticRegression(max_iter=4000))])

# Folds are stratified on the shuffled labels; both fitting and scoring use y_shuf.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X, y_shuf, groups=groups):
    model.fit(X[tr], y_shuf[tr])
    oof[te] = model.predict_proba(X[te])[:,1]

print("SHUFFLE ROC-AUC:", roc_auc_score(y_shuf, oof))
print("SHUFFLE PR-AUC :", average_precision_score(y_shuf, oof))


TFIDF model on normalized text (grouped CV + threshold @5% FPR)

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report

def threshold_at_fpr(y_true, y_score, target_fpr=0.05):
    """Threshold at the ``1 - target_fpr`` quantile of the label-0 scores.

    Returns a Python float; ``0.5`` when *y_true* contains no label 0.
    """
    is_negative = np.asarray(y_true) == 0
    negative_scores = np.asarray(y_score)[is_negative]
    if not len(negative_scores):
        return 0.5
    quantile_level = 1 - target_fpr
    return float(np.quantile(negative_scores, quantile_level))

# Inputs: normalized OCR text of the proxy images, label, and group key.
X = deg_text_norm["doc_text_norm"].fillna("").astype(str).values
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# Word (1-2 gram) + char_wb (3-5 gram) TF-IDF followed by logistic regression.
tfidf = FeatureUnion([
    ("word", TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)),
    ("char", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=2)),
])
tfidf_lr_norm = Pipeline([("tfidf", tfidf), ("lr", LogisticRegression(max_iter=4000))])

# Grouped, stratified 5-fold CV producing out-of-fold probabilities.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X, y, groups=groups):
    tfidf_lr_norm.fit(X[tr], y[tr])
    oof[te] = tfidf_lr_norm.predict_proba(X[te])[:,1]

print("TFIDF normalized ROC-AUC:", roc_auc_score(y, oof))
print("TFIDF normalized PR-AUC :", average_precision_score(y, oof))

# Threshold from the out-of-fold scores of the y == 0 class, applied to the same scores.
thr_norm = threshold_at_fpr(y, oof, 0.05)
pred = (oof >= thr_norm).astype(int)

print("thr (~5% FPR):", thr_norm)
print("Confusion:\n", confusion_matrix(y, pred))
print(classification_report(y, pred, digits=3))


In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# choose the dataset you want to analyze:
#   - df_degree_text        (original)
#   - deg_text_norm         (proxy+OCR normalized)
# NOTE: the column names used below (word_count_norm) match deg_text_norm only.
meta_df = deg_text_norm.copy()  # change if needed

# build metadata table
meta_df["file_type"] = meta_df["file_type"].astype(str)
X_meta = meta_df[["word_count_norm", "file_type"]]  # for normalized run
y = meta_df["y"].astype(int).values

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["file_type"]),
    ("num", "passthrough", ["word_count_norm"]),
])

# Fit once on all rows (no cross-validation): this cell inspects coefficients only.
meta_lr = Pipeline([("pre", pre), ("lr", LogisticRegression(max_iter=5000))])
meta_lr.fit(X_meta, y)

# Feature names in the same order as the transformers above: one-hot columns, then the count.
ohe = meta_lr.named_steps["pre"].named_transformers_["cat"]
feat_names = list(ohe.get_feature_names_out(["file_type"])) + ["word_count_norm"]

# Coefficients ranked by absolute size. word_count_norm is unscaled, so its
# coefficient is not directly comparable with those of the 0/1 indicator columns.
coefs = meta_lr.named_steps["lr"].coef_.ravel()
imp = pd.DataFrame({"feature": feat_names, "coef": coefs, "abs_coef": np.abs(coefs)}).sort_values("abs_coef", ascending=False)

print(imp.head(30))


In [None]:
# Ablation dataset: work on a copy and add a file-type column that is identical for
# every row ("img").
deg_text_norm = deg_text_norm.copy()
deg_text_norm["file_type_const"] = "img"  # constant for all

# Sanity: should be only one value
print(deg_text_norm["file_type_const"].value_counts())


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Length-only baseline: the single feature is the normalized word count.
X_meta = deg_text_norm[["word_count_norm"]]  # remove file_type entirely
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

# Out-of-fold probabilities under grouped, stratified 5-fold CV.
lr = LogisticRegression(max_iter=5000)
for tr, te in sgkf.split(X_meta, y, groups=groups):
    lr.fit(X_meta.iloc[tr], y[tr])
    oof[te] = lr.predict_proba(X_meta.iloc[te])[:,1]

print("META-only (length only; file_type removed) ROC-AUC:", roc_auc_score(y, oof))
print("META-only (length only; file_type removed) PR-AUC :", average_precision_score(y, oof))


In [None]:
import pandas as pd
import numpy as np

# Build a length-matched subsample: within each word-count bin keep the same
# number of y==0 and y==1 rows, so length alone cannot separate the classes.
tmp = deg_text_norm.copy()
# Six quantile bins on word count (capped at 4000); duplicate edges are merged.
tmp["wc_bin"] = pd.qcut(tmp["word_count_norm"].clip(upper=4000), q=6, duplicates="drop")

parts = []
# observed=True: iterate only over bins that actually contain rows (also avoids
# the pandas FutureWarning for categorical groupers).
for b, g in tmp.groupby("wc_bin", observed=True):
    neg_rows = g[g["y"] == 0]
    pos_rows = g[g["y"] == 1]
    n = min(len(neg_rows), len(pos_rows))
    # Bins with fewer than 3 rows of either class are dropped entirely.
    if n >= 3:
        parts.append(neg_rows.sample(n, random_state=42))
        parts.append(pos_rows.sample(n, random_state=42))

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not parts:
    raise ValueError(
        "No word-count bin has at least 3 rows of each class; "
        "cannot build a length-matched sample."
    )

# Shuffle the balanced rows and renumber the index.
deg_len_matched = pd.concat(parts).sample(frac=1, random_state=42).reset_index(drop=True)

print("deg_len_matched:", deg_len_matched.shape)
print(deg_len_matched["y"].value_counts())
print(deg_len_matched.groupby("y")["word_count_norm"].describe()[["mean","std","min","max"]])


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Same length-only OOF baseline, now on the length-matched subsample.
X_meta = deg_len_matched[["word_count_norm"]]
y = deg_len_matched["y"].astype(int).values
groups = deg_len_matched["group_id"].astype(str).values

# Grouped + stratified folds: rows sharing a group_id stay in the same fold.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

lr = LogisticRegression(max_iter=5000)
for tr, te in sgkf.split(X_meta, y, groups=groups):
    lr.fit(X_meta.iloc[tr], y[tr])
    oof[te] = lr.predict_proba(X_meta.iloc[te])[:,1]

# Pooled OOF metrics.
print("META-only (length-matched) ROC-AUC:", roc_auc_score(y, oof))
print("META-only (length-matched) PR-AUC :", average_precision_score(y, oof))


In [None]:
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Label-permutation sanity check on the length-matched data: the text model is
# trained and scored against shuffled labels.
X = deg_len_matched["doc_text_norm"].fillna("").astype(str).values
y = deg_len_matched["y"].astype(int).values
groups = deg_len_matched["group_id"].astype(str).values

# Permute labels with a fixed seed; texts and groups keep their original order.
rng = np.random.RandomState(42)
y_shuf = rng.permutation(y)

# Word (1-2 gram) and character (3-5 gram, word-boundary) TF-IDF features side by side.
tfidf = FeatureUnion([
    ("word", TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)),
    ("char", TfidfVectorizer(analyzer="char_wb", max_features=40000, ngram_range=(3,5), min_df=2)),
])
model = Pipeline([("tfidf", tfidf), ("lr", LogisticRegression(max_iter=4000))])

# Folds are stratified on the shuffled labels; the vectorizers are re-fitted per fold
# because they live inside the pipeline.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X, y_shuf, groups=groups):
    model.fit(X[tr], y_shuf[tr])
    oof[te] = model.predict_proba(X[te])[:,1]

# Scored against the shuffled labels as well.
print("SHUFFLE (len-matched) ROC-AUC:", roc_auc_score(y_shuf, oof))
print("SHUFFLE (len-matched) PR-AUC :", average_precision_score(y_shuf, oof))


In [None]:
import re

K_WORDS = 200
PAD = "zzpadzz"  # will be ignored by vectorizer via stop_words

def fixed_len_text(t: str, k=200, pad=PAD) -> str:
    """Return *t* as exactly *k* space-separated tokens.

    The text is stripped and lower-cased, then reduced to runs of ASCII
    letters plus the literal ``<num>`` placeholder. Longer inputs are cut
    to the first *k* tokens; shorter ones are right-padded with *pad*.
    A falsy *t* (``None`` or ``""``) yields *k* padding tokens.
    """
    cleaned = (t or "").strip().lower()
    # keep words + <num> if you already use it
    tokens = re.findall(r"[a-zA-Z]+|<num>", cleaned)[:k]
    # A non-positive repeat count yields an empty list, so nothing is added
    # once the token list already has k entries.
    tokens.extend([pad] * (k - len(tokens)))
    return " ".join(tokens)

# Use the ablation dataset with proxy OCR:
# deg_text_norm (you already have)
# Every document becomes exactly K_WORDS tokens (truncated or PAD-filled).
deg_text_norm["doc_text_fixed"] = deg_text_norm["doc_text_norm"].fillna("").astype(str).apply(lambda x: fixed_len_text(x, K_WORDS, PAD))

# Token counts here include PAD tokens, so every row should report K_WORDS.
print("doc_text_fixed created. Example word_count_fixed:",
      deg_text_norm["doc_text_fixed"].str.split().str.len().value_counts().head())


In [None]:
import numpy as np
import pandas as pd

# Whitespace token count of the fixed-length text (PAD tokens are counted too).
deg_text_norm["word_count_fixed"] = deg_text_norm["doc_text_fixed"].str.split().str.len().astype(int)

# Both summaries should show a single value if the padding worked.
print(deg_text_norm["word_count_fixed"].describe())
print(deg_text_norm["word_count_fixed"].value_counts().head())


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Control run: the only feature is the fixed-length token count.
X_meta = deg_text_norm[["word_count_fixed"]]  # should be constant => near random
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# Grouped + stratified folds: rows sharing a group_id stay in the same fold.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

lr = LogisticRegression(max_iter=5000)
for tr, te in sgkf.split(X_meta, y, groups=groups):
    lr.fit(X_meta.iloc[tr], y[tr])
    oof[te] = lr.predict_proba(X_meta.iloc[te])[:,1]

# Pooled OOF metrics.
print("META-only (word_count_fixed) ROC-AUC:", roc_auc_score(y, oof))
print("META-only (word_count_fixed) PR-AUC :", average_precision_score(y, oof))


In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Label-permutation sanity check on the fixed-length text of the full dataset.
X = deg_text_norm["doc_text_fixed"].fillna("").astype(str).values
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# Permute labels with a fixed seed; texts and groups keep their original order.
rng = np.random.RandomState(42)
y_shuf = rng.permutation(y)

# Binary TF + ignore PAD token => reduces length/frequency artifacts
vec = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=1,
    binary=True,
    stop_words=[PAD],
    norm="l2"
)

model = Pipeline([("tfidf", vec), ("lr", LogisticRegression(max_iter=4000))])

# Folds are stratified on the shuffled labels.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

for tr, te in sgkf.split(X, y_shuf, groups=groups):
    model.fit(X[tr], y_shuf[tr])
    oof[te] = model.predict_proba(X[te])[:,1]

# Scored against the shuffled labels as well.
print("SHUFFLE (fixed-length full) ROC-AUC:", roc_auc_score(y_shuf, oof))
print("SHUFFLE (fixed-length full) PR-AUC :", average_precision_score(y_shuf, oof))


TFIDF + LR (word+char) on doc_text_fixed

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report

# TF-IDF (word + char) + logistic regression, scored out-of-fold on the
# fixed-length text.
X = deg_text_norm["doc_text_fixed"].fillna("").astype(str).values
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values


def _strip_pad(text):
    """Blank out the padding token before character n-grams are extracted.

    ``stop_words`` only applies to ``analyzer="word"``, so without this the
    char vectorizer would build n-grams from PAD and could tell padded (short)
    documents from unpadded ones. Supplying a preprocessor replaces the
    built-in lower-casing step; ``doc_text_fixed`` is already lower-cased by
    ``fixed_len_text``.
    """
    return text.replace(PAD, " ")


tfidf = FeatureUnion([
    # Word 1-2 grams; PAD is removed as a stop word before n-grams are formed.
    ("word", TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2,
                             binary=True, stop_words=[PAD])),
    # Character 3-5 grams inside word boundaries; PAD is stripped by the preprocessor.
    ("char", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=2,
                             binary=True, preprocessor=_strip_pad)),
])

clf = Pipeline([("tfidf", tfidf), ("lr", LogisticRegression(max_iter=4000))])

# Grouped + stratified folds; the vectorizers are re-fitted on each training fold.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

for tr, te in sgkf.split(X, y, groups=groups):
    clf.fit(X[tr], y[tr])
    oof[te] = clf.predict_proba(X[te])[:,1]

print("TFIDF (fixed-length) ROC-AUC:", roc_auc_score(y, oof))
print("TFIDF (fixed-length) PR-AUC :", average_precision_score(y, oof))

# threshold at ~5% FPR
# 95th percentile of the negative-class OOF scores, chosen on the same scores
# it is then evaluated on.
thr = float(np.quantile(oof[y==0], 0.95))
pred = (oof >= thr).astype(int)

print("thr (~5% FPR):", thr)
print("Confusion:\n", confusion_matrix(y, pred))
print(classification_report(y, pred, digits=3))


PLM Embeddings + LR on doc_text_fixed

In [None]:
!pip -q install -U sentence-transformers

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Sentence-embedding baseline: encode each fixed-length text once with a frozen
# MiniLM model, then cross-validate a logistic regression on the embeddings.
emb = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# NOTE(review): doc_text_fixed still contains the PAD filler tokens, so the
# encoder sees them — confirm this does not reintroduce a length cue.
texts = deg_text_norm["doc_text_fixed"].fillna("").astype(str).tolist()
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# Embeddings are L2-normalized and computed once for all rows (no fitting involved).
X_emb = emb.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)

clf = LogisticRegression(max_iter=5000)
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# Only the classifier is re-fitted per fold.
oof = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X_emb, y, groups=groups):
    clf.fit(X_emb[tr], y[tr])
    oof[te] = clf.predict_proba(X_emb[te])[:,1]

print("EMB (fixed-length) ROC-AUC:", roc_auc_score(y, oof))
print("EMB (fixed-length) PR-AUC :", average_precision_score(y, oof))


RoBERTa fine-tune (group holdout) on doc_text_fixed

In [None]:
!pip -q install -U transformers datasets accelerate

import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import roc_auc_score, average_precision_score

# Fine-tune roberta-base on doc_text_fixed with a single group-aware 80/20 holdout.
model_ckpt = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

idx = np.arange(len(deg_text_norm))
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# One split; all rows of a group_id land on the same side.
# NOTE(review): nothing here checks that both classes appear in the test side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(idx, y, groups=groups))

# Rename columns to the names used below ("doc_text" for tokenization, "label" for the Trainer).
train_df = deg_text_norm.iloc[train_idx][["doc_text_fixed","y"]].rename(columns={"doc_text_fixed":"doc_text","y":"label"})
test_df  = deg_text_norm.iloc[test_idx][["doc_text_fixed","y"]].rename(columns={"doc_text_fixed":"doc_text","y":"label"})

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds  = Dataset.from_pandas(test_df, preserve_index=False)

def tok(batch):
    """Tokenize a batch of documents, truncating each to at most 512 tokens."""
    return tokenizer(batch["doc_text"], truncation=True, max_length=512)

# The raw text column is dropped after tokenization.
train_ds = train_ds.map(tok, batched=True).remove_columns(["doc_text"])
test_ds  = test_ds.map(tok, batched=True).remove_columns(["doc_text"])

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)

# NOTE(review): eval_dataset below is the test split, and load_best_model_at_end
# picks the checkpoint by eval_loss on it — so the reported test metrics are
# measured on the same data used for checkpoint selection.
args = TrainingArguments(
    output_dir="ft_out_fixed",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=test_ds, tokenizer=tokenizer)
trainer.train()

# predictions are raw logits; convert to the class-1 probability with a softmax.
pred = trainer.predict(test_ds).predictions
proba = np.exp(pred)[:,1] / np.exp(pred).sum(axis=1)
y_test = test_df["label"].astype(int).values

print("RoBERTa (fixed-length) ROC-AUC:", roc_auc_score(y_test, proba))
print("RoBERTa (fixed-length) PR-AUC :", average_precision_score(y_test, proba))


In [None]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

# Arrays the split search below reads as globals.
X_all = deg_text_norm["doc_text_fixed"].fillna("").astype(str).values
y_all = deg_text_norm["y"].astype(int).values
groups_all = deg_text_norm["group_id"].astype(str).values
idx = np.arange(len(deg_text_norm))

def find_good_group_split(test_size=0.2, max_tries=200, seed=42):
    """Search for a group-aware holdout split that has both classes on each side.

    Candidate ``GroupShuffleSplit`` seeds are drawn from a ``RandomState(seed)``
    stream, so the search is reproducible. A candidate is accepted only when
    both the train and the test labels contain two distinct classes; a
    single-class train side could not fit a classifier and a single-class
    test side cannot be scored with ROC-AUC.

    Args:
        test_size: Passed to ``GroupShuffleSplit`` as ``test_size``.
        max_tries: Maximum number of candidate seeds to try.
        seed: Seed for the stream of candidate seeds.

    Returns:
        ``(train_idx, test_idx, split_seed)`` for the first accepted candidate.

    Raises:
        RuntimeError: If no candidate within ``max_tries`` is accepted.
    """
    rng = np.random.RandomState(seed)
    for _ in range(max_tries):
        rs = int(rng.randint(0, 10_000_000))
        gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=rs)
        tr, te = next(gss.split(idx, y_all, groups=groups_all))
        if len(np.unique(y_all[te])) == 2 and len(np.unique(y_all[tr])) == 2:
            return tr, te, rs
    raise RuntimeError("Could not find a group split with both classes in train and test. Reduce test_size or change group_id strategy.")

train_idx, test_idx, used_seed = find_good_group_split(test_size=0.2, seed=42)
print("✅ Found split seed:", used_seed)
print("Train size:", len(train_idx), "Test size:", len(test_idx))
print("Test y counts:", dict(zip(*np.unique(y_all[test_idx], return_counts=True))))


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

# Lighter RoBERTa run on the train/test indices already present in the kernel
# (train_idx / test_idx are not recomputed in this cell).
model_ckpt = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Rename columns to the names used below ("doc_text" for tokenization, "label" for the Trainer).
train_df = deg_text_norm.iloc[train_idx][["doc_text_fixed","y"]].rename(columns={"doc_text_fixed":"doc_text","y":"label"})
test_df  = deg_text_norm.iloc[test_idx][["doc_text_fixed","y"]].rename(columns={"doc_text_fixed":"doc_text","y":"label"})

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds  = Dataset.from_pandas(test_df, preserve_index=False)

def tok(batch):
    """Tokenize a batch of documents, truncating each to at most 256 tokens."""
    return tokenizer(batch["doc_text"], truncation=True, max_length=256)

# The raw text column is dropped after tokenization.
train_ds = train_ds.map(tok, batched=True).remove_columns(["doc_text"])
test_ds  = test_ds.map(tok, batched=True).remove_columns(["doc_text"])

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)

# No checkpoints are written (save_strategy="no"); the test split is still
# evaluated after the epoch but is not used to pick a checkpoint here.
args = TrainingArguments(
    output_dir="ft_out_fixed_quick",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,        # lighter than 2
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",          # avoid wandb prompt
)

trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=test_ds, tokenizer=tokenizer)
trainer.train()

# predictions are raw logits; convert to the class-1 probability with a softmax.
pred = trainer.predict(test_ds).predictions
proba = np.exp(pred)[:,1] / np.exp(pred).sum(axis=1)
y_test = test_df["label"].astype(int).values

print("RoBERTa (fixed-length) ROC-AUC:", roc_auc_score(y_test, proba))
print("RoBERTa (fixed-length) PR-AUC :", average_precision_score(y_test, proba))


In [None]:
import json, os
from pathlib import Path

# Persist the fine-tuned RoBERTa model, its tokenizer and a small metadata file.
from sklearn.metrics import roc_auc_score, average_precision_score

outdir = Path("artifacts_roberta_fixed")
outdir.mkdir(exist_ok=True)

# save model + tokenizer
trainer.save_model(str(outdir))
tokenizer.save_pretrained(str(outdir))

# save metadata
# Metrics are computed from the holdout scores currently in the kernel
# (y_test / proba from the fine-tuning cell) rather than typed in by hand,
# so meta.json always describes the model that was just saved.
meta = {
    "model_ckpt": model_ckpt,
    "text_col": "doc_text_fixed",
    "label_col": "y",
    "group_col": "group_id",
    "train_size": int(len(train_idx)),
    "test_size": int(len(test_idx)),
    "roc_auc": float(roc_auc_score(y_test, proba)),
    "pr_auc": float(average_precision_score(y_test, proba)),
    "split_seed": int(used_seed),
}
with open(outdir/"meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved to:", outdir)


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# OOF evaluation of the pre-defined `tfidf_lr` pipeline on the fixed-length text.
# `tfidf_lr` is not created in this cell; it must already exist in the kernel.
X = deg_text_norm["doc_text_fixed"].fillna("").astype(str).values
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# Grouped + stratified folds: rows sharing a group_id stay in the same fold.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(y), dtype=float)

# After the loop, tfidf_lr holds the fit from the last fold only.
for tr, te in sgkf.split(X, y, groups=groups):
    tfidf_lr.fit(X[tr], y[tr])
    oof[te] = tfidf_lr.predict_proba(X[te])[:,1]

print("TFIDF (fixed-length) ROC-AUC:", roc_auc_score(y, oof))
print("TFIDF (fixed-length) PR-AUC :", average_precision_score(y, oof))


In [None]:
!pip -q install -U sentence-transformers

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# MiniLM sentence embeddings + logistic regression, scored out-of-fold.
# Same procedure as the earlier embedding cell; only the print labels differ.
emb = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

texts = deg_text_norm["doc_text_fixed"].fillna("").astype(str).tolist()
y = deg_text_norm["y"].astype(int).values
groups = deg_text_norm["group_id"].astype(str).values

# Embeddings are L2-normalized and computed once for all rows.
X_emb = emb.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)

clf = LogisticRegression(max_iter=5000)
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# Only the classifier is re-fitted per fold.
oof = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X_emb, y, groups=groups):
    clf.fit(X_emb[tr], y[tr])
    oof[te] = clf.predict_proba(X_emb[te])[:,1]

print("MiniLM-EMB (fixed-length) ROC-AUC:", roc_auc_score(y, oof))
print("MiniLM-EMB (fixed-length) PR-AUC :", average_precision_score(y, oof))


In [None]:
import numpy as np
import pandas as pd

# Build the multimodal table: rows that have both fixed-length text and a proxy image path.
need_cols = ["doc_text_fixed","proxy_img_path","y","group_id"]
missing = [c for c in need_cols if c not in deg_text_norm.columns]
assert not missing, f"Missing columns in deg_text_norm: {missing}"

# drop rows with missing proxy images (should be none if you created proxies)
# Only NaN / empty-string paths are filtered; the files themselves are not checked here.
deg_mm = deg_text_norm[deg_text_norm["proxy_img_path"].fillna("") != ""].copy()
print("deg_mm:", deg_mm.shape, "| y:", deg_mm["y"].value_counts().to_dict())


Extract CLIP image embeddings for proxy_img_path

In [None]:
!pip -q install -U transformers pillow

import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load CLIP once and embed every proxy image listed in deg_mm.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_name = "openai/clip-vit-base-patch32"

clip_model = CLIPModel.from_pretrained(clip_name).to(device)
clip_proc  = CLIPProcessor.from_pretrained(clip_name)

# Row order of `paths` (and therefore of X_img) follows deg_mm.
paths = deg_mm["proxy_img_path"].astype(str).tolist()

@torch.no_grad()
def clip_encode_images(paths, batch_size=32):
    """Return L2-normalized CLIP image embeddings for *paths*, one row per path.

    Images are loaded and encoded in batches of *batch_size*. Each file is
    opened in a ``with`` block so its handle is released as soon as the RGB
    copy has been made. An empty *paths* list returns an empty
    ``(0, projection_dim)`` array instead of failing inside ``np.vstack``.

    Args:
        paths: Image file paths, in the order the rows should be returned.
        batch_size: Number of images sent through the model at once.

    Returns:
        A NumPy array with ``len(paths)`` rows.
    """
    if not paths:
        return np.empty((0, clip_model.config.projection_dim), dtype=np.float32)

    embs = []
    for i in range(0, len(paths), batch_size):
        imgs = []
        for p in paths[i:i+batch_size]:
            # convert() returns an independent in-memory image, so the file can be closed.
            with Image.open(p) as im:
                imgs.append(im.convert("RGB"))
        inputs = clip_proc(images=imgs, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        feats = clip_model.get_image_features(**inputs)
        # Unit-length rows, matching how the embeddings are used downstream.
        feats = torch.nn.functional.normalize(feats, p=2, dim=1)
        embs.append(feats.cpu().numpy())
    return np.vstack(embs)

X_img = clip_encode_images(paths, batch_size=32)
print("X_img:", X_img.shape)


Image-only model (LR) with StratifiedGroupKFold OOF

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score

# Image-only baseline: logistic regression on the CLIP embeddings, scored out-of-fold.
# y and groups are taken from deg_mm, the same frame X_img was built from.
y = deg_mm["y"].astype(int).values
groups = deg_mm["group_id"].astype(str).values

img_lr = LogisticRegression(max_iter=5000)
# This splitter is reused by the text-only cell that follows.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

oof_img = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X_img, y, groups=groups):
    img_lr.fit(X_img[tr], y[tr])
    oof_img[te] = img_lr.predict_proba(X_img[te])[:,1]

print("IMG-only ROC-AUC:", roc_auc_score(y, oof_img))
print("IMG-only PR-AUC :", average_precision_score(y, oof_img))


Text-only TFIDF OOF on the SAME deg_mm order

In [None]:
# If you ever re-ran and lost it, run:
# PAD = "zzpadzz"
# (and your tfidf_lr definition cell)

from sklearn.metrics import roc_auc_score, average_precision_score

# Text-only OOF scores on deg_mm, reusing y / groups / sgkf from the image-only cell
# so oof_txt is row-aligned with oof_img.
X_txt = deg_mm["doc_text_fixed"].fillna("").astype(str).values

oof_txt = np.zeros(len(y), dtype=float)
for tr, te in sgkf.split(X_txt, y, groups=groups):
    tfidf_lr.fit(X_txt[tr], y[tr])
    oof_txt[te] = tfidf_lr.predict_proba(X_txt[te])[:,1]

print("TFIDF-only ROC-AUC:", roc_auc_score(y, oof_txt))
print("TFIDF-only PR-AUC :", average_precision_score(y, oof_txt))


Late fusion (no extra training): best weighted average

In [None]:
# Late fusion (no extra training): best weighted average

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

def eval_fusion(alpha):
    """Score a weighted average of the text and image OOF probabilities.

    *alpha* is the weight on ``oof_txt``; ``1 - alpha`` goes to ``oof_img``.
    Reads ``oof_txt``, ``oof_img`` and ``y`` from the notebook globals.

    Returns:
        ``(roc_auc, pr_auc, fused_scores)``.
    """
    blended = alpha*oof_txt + (1-alpha)*oof_img
    roc = roc_auc_score(y, blended)
    ap = average_precision_score(y, blended)
    return (roc, ap, blended)

# Sweep the text weight and keep the alpha with the best PR-AUC.
alphas = np.linspace(0, 1, 21)  # 0.0..1.0 step 0.05
best = None
for a in alphas:
    ra, pr, _ = eval_fusion(a)
    # Tuple comparison: PR-AUC first, then ROC-AUC, then the larger alpha breaks ties.
    cand = (pr, ra, a)
    if best is None or cand > best:
        best = cand

best_pr, best_roc, best_a = best
# Recompute the fused scores for the winning alpha so they can be reused downstream.
_, _, oof_fused = eval_fusion(best_a)

# NOTE(review): alpha is selected on the same OOF scores these metrics are
# reported on, so the fused numbers are optimistic relative to the single models.
print("Best alpha (txt weight):", best_a)
print("FUSED ROC-AUC:", best_roc)
print("FUSED PR-AUC :", best_pr)


Threshold @ ~5% FPR + confusion (using fused OOF)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Decision threshold = 95th percentile of the fused OOF scores of the negative class.
thr = float(np.quantile(oof_fused[y==0], 0.95))  # ~5% FPR target
# Scores at or above the threshold are predicted as class 1.
pred = (oof_fused >= thr).astype(int)

print("thr (~5% FPR):", thr)
print("Confusion:\n", confusion_matrix(y, pred))
print(classification_report(y, pred, digits=3))


Train final models on full data + save artifact

In [None]:
import joblib, json
from pathlib import Path

# Refit both single-modality models on all of deg_mm and write them to disk
# together with the fusion settings.
# Train final text model
tfidf_lr.fit(X_txt, y)

# Train final image model
img_lr.fit(X_img, y)

outdir = Path("artifacts_multimodal_fusion")
outdir.mkdir(exist_ok=True)

joblib.dump(tfidf_lr, outdir/"tfidf_lr_fixed.joblib")
joblib.dump(img_lr,  outdir/"clip_img_lr.joblib")

# alpha_text and thr_fpr05 come from the OOF sweep / threshold cells, not from
# the refitted models.
meta = {
    "dataset_rows": int(len(deg_mm)),
    "text_col": "doc_text_fixed",
    "img_col": "proxy_img_path",
    "label_col": "y",
    "group_col": "group_id",
    "clip_model": clip_name,
    "fusion": {"type": "weighted_average", "alpha_text": float(best_a), "thr_fpr05": float(thr)},
}

with open(outdir/"fusion_meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved to:", outdir)


In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Recap of the image-only and fused OOF metrics, recomputed from the stored score arrays.
print("IMG-only ROC-AUC:", roc_auc_score(y, oof_img))
print("IMG-only PR-AUC :", average_precision_score(y, oof_img))

print("FUSED ROC-AUC:", roc_auc_score(y, oof_fused))
print("FUSED PR-AUC :", average_precision_score(y, oof_fused))
print("Best alpha (txt weight):", best_a)


In [None]:
import joblib, json
from pathlib import Path
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

# Same artifact export as the earlier save cell, writing to the same directory
# (files are overwritten); this version also records the OOF metrics.
outdir = Path("artifacts_multimodal_fusion")
outdir.mkdir(exist_ok=True)

# Train final models on full data
tfidf_lr.fit(X_txt, y)
img_lr.fit(X_img, y)

joblib.dump(tfidf_lr, outdir/"tfidf_lr_fixed.joblib")
joblib.dump(img_lr,  outdir/"clip_img_lr.joblib")

# threshold @ ~5% FPR on OOF fused
thr_fpr05 = float(np.quantile(oof_fused[y==0], 0.95))

meta = {
    "dataset_rows": int(len(y)),
    "label_col": "y",
    "group_col": "group_id",
    "text_col": "doc_text_fixed",
    "img_col": "proxy_img_path",
    "fusion": {
        "type": "weighted_average",
        "alpha_text": float(best_a),     # sweep result (0.05 in the run the original note refers to)
        "alpha_img": float(1 - best_a),  # complement of alpha_text (0.95 in that run)
        "thr_fpr05": thr_fpr05
    },
    # Metrics are recomputed here from the OOF arrays, so they match the stored scores.
    "oof_metrics": {
        "tfidf": {"roc_auc": float(roc_auc_score(y, oof_txt)), "pr_auc": float(average_precision_score(y, oof_txt))},
        "img":   {"roc_auc": float(roc_auc_score(y, oof_img)), "pr_auc": float(average_precision_score(y, oof_img))},
        "fused": {"roc_auc": float(roc_auc_score(y, oof_fused)), "pr_auc": float(average_precision_score(y, oof_fused))}
    }
}

with open(outdir/"fusion_meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("✅ Saved to:", outdir)
print("thr_fpr05:", thr_fpr05)


In [None]:
import numpy as np
import joblib
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel

# assumes: clip_model, clip_proc already loaded; if not, reload them:
# clip_name = "openai/clip-vit-base-patch32"
# clip_model = CLIPModel.from_pretrained(clip_name).to(device)
# clip_proc = CLIPProcessor.from_pretrained(clip_name)

# Fusion weight and decision threshold used by the inference helpers below.
# Both are derived from the OOF results currently in the kernel.
alpha = float(best_a)  # text weight from the sweep (0.05 in the run the original note refers to)
thr = float(np.quantile(oof_fused[y==0], 0.95))

@torch.no_grad()
def clip_embed_one(img_path: str):
    """Return the L2-normalized CLIP embedding of one image as a ``(1, D)`` array.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the RGB copy exists. Uses the ``clip_proc``, ``clip_model`` and ``device``
    globals.
    """
    # convert() yields an independent in-memory image, so the file can be closed.
    with Image.open(img_path) as src:
        img = src.convert("RGB")
    inputs = clip_proc(images=[img], return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    feat = clip_model.get_image_features(**inputs)
    feat = torch.nn.functional.normalize(feat, p=2, dim=1)
    return feat.cpu().numpy()

def fused_predict_one(doc_text_fixed: str, proxy_img_path: str):
    """Score one document with the text model, the image model and their fusion.

    Reads ``tfidf_lr``, ``img_lr``, ``alpha`` and ``thr`` from the notebook
    globals at call time.

    Returns:
        Dict with the text probability, the image probability, the weighted
        average, the thresholded 0/1 prediction and the threshold used.
    """
    # Class-1 probability from the text pipeline (single-item batch).
    text_prob = float(tfidf_lr.predict_proba([doc_text_fixed])[0,1])

    # Class-1 probability from the image classifier on the CLIP embedding.
    image_vec = clip_embed_one(proxy_img_path)
    image_prob = float(img_lr.predict_proba(image_vec)[0,1])

    # Weighted average: alpha on text, the remainder on image.
    fused_prob = alpha * text_prob + (1-alpha) * image_prob

    result = {"p_txt": text_prob, "p_img": image_prob, "p_fused": fused_prob}
    result["pred"] = int(fused_prob >= thr)
    result["thr"] = thr
    return result

print("✅ fused_predict_one ready")


Sanity: dataset + alignment checks

In [None]:
import numpy as np
import pandas as pd

# Use the dataset that matches your OOF arrays (deg_mm in multimodal section)
assert "deg_mm" in globals(), "deg_mm not found. Use the dataframe you used for oof_txt/oof_img/oof_fused."
# Re-derive labels and groups from deg_mm so later cells compare against the right rows.
y = deg_mm["y"].astype(int).values
groups = deg_mm["group_id"].astype(str).values

print("N:", len(y), "| y counts:", dict(zip(*np.unique(y, return_counts=True))))
print("#groups:", len(np.unique(groups)))

# Alignment checks
# Only existence and length are verified; row order is not checked here.
for name in ["oof_txt", "oof_img", "oof_fused"]:
    assert name in globals(), f"{name} not found in globals()"
    arr = globals()[name]
    assert len(arr) == len(y), f"{name} length {len(arr)} != y length {len(y)}"

print("✅ OOF arrays aligned with y")


Common evaluator (ROC/PR + threshold @ 5% FPR + confusion stats)

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix

def thr_at_fpr(y_true, proba, target_fpr=0.05):
    """Return the score threshold that targets *target_fpr* on the negatives.

    The threshold is the ``1 - target_fpr`` quantile of the scores whose
    label is 0.

    Args:
        y_true: Array-like of labels; entries equal to 0 are the negatives.
        proba: Array-like of scores, aligned with *y_true*.
        target_fpr: Desired false-positive rate.

    Returns:
        The threshold as a Python float.

    Raises:
        ValueError: If *y_true* contains no negatives. Without this check the
            quantile of an empty array yields NaN or an IndexError, depending
            on the NumPy version.
    """
    y_true = np.asarray(y_true)
    proba = np.asarray(proba)
    neg = proba[y_true == 0]
    if neg.size == 0:
        raise ValueError("thr_at_fpr needs at least one negative (y_true == 0) sample.")
    return float(np.quantile(neg, 1 - target_fpr))

def eval_probs(y_true, proba, target_fpr=0.05):
    """Summarize scores: ranking metrics plus confusion stats at an FPR-based threshold.

    The threshold comes from ``thr_at_fpr``; scores at or above it count as
    positive predictions.

    Returns:
        Dict with ``roc_auc``, ``pr_auc``, the threshold (always stored under
        ``thr_fpr05``, whatever *target_fpr* is), the realized ``FPR`` /
        ``TPR`` / ``precision@thr``, the four confusion counts and ``N``.
    """
    def _ratio(numer, denom):
        # An undefined rate (zero denominator) is reported as NaN.
        return numer / denom if denom else np.nan

    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(proba).astype(float)

    cutoff = thr_at_fpr(labels, scores, target_fpr=target_fpr)
    hard = (scores >= cutoff).astype(int)
    tn, fp, fn, tp = confusion_matrix(labels, hard).ravel()

    false_pos_rate = _ratio(fp, fp + tn)
    true_pos_rate = _ratio(tp, tp + fn)
    prec = _ratio(tp, tp + fp)

    summary = {
        "roc_auc": float(roc_auc_score(labels, scores)),
        "pr_auc": float(average_precision_score(labels, scores)),
        "thr_fpr05": float(cutoff),
        "FPR": float(false_pos_rate),
        "TPR": float(true_pos_rate),
        "precision@thr": float(prec),
        "TN": int(tn), "FP": int(fp), "FN": int(fn), "TP": int(tp),
        "N": int(len(labels))
    }
    return summary

# Evaluate the three OOF score vectors at the same 5% target FPR; each gets its
# own threshold, derived from its own negative-class scores.
res_txt   = eval_probs(y, oof_txt,   0.05)
res_img   = eval_probs(y, oof_img,   0.05)
res_fused = eval_probs(y, oof_fused, 0.05)

print("✅ eval functions ready")


Compare all models in one table (+ add RoBERTa holdout as separate row)


In [None]:
# Assemble one comparison table: three OOF (grouped CV) models plus the RoBERTa
# holdout result as a separate, non-OOF row.
rows = []

rows.append({"model":"TFIDF-only (OOF, GroupCV)", **res_txt})
rows.append({"model":"IMG-only CLIP+LR (OOF, GroupCV)", **res_img})
rows.append({"model":"FUSED (OOF, GroupCV)", **res_fused, "alpha_text": float(best_a) if "best_a" in globals() else np.nan})

# Add RoBERTa holdout metrics you reported (not OOF)
# If you have the numbers stored, fill them; otherwise keep what you typed.
# These two values are typed in by hand; they are not recomputed in this cell.
roberta_row = {
    "model": "RoBERTa FT (GroupHoldout)",
    "roc_auc": 0.8997668997668997,
    "pr_auc": 0.9510900050081298,
    "thr_fpr05": np.nan,
    "FPR": np.nan, "TPR": np.nan, "precision@thr": np.nan,
    "TN": np.nan, "FP": np.nan, "FN": np.nan, "TP": np.nan,
    "N": np.nan
}
rows.append(roberta_row)

cmp = pd.DataFrame(rows)
# alpha_text is kept in the selection so the fusion weight recorded on the
# fused row is actually shown; it is NaN for every other model.
cmp = cmp[["model","roc_auc","pr_auc","FPR","TPR","precision@thr","thr_fpr05","TN","FP","FN","TP","N","alpha_text"]]
cmp = cmp.sort_values(["pr_auc","roc_auc"], ascending=False).reset_index(drop=True)

cmp
