###WEEK 38

In [None]:
# === Week 38 (k=3): ar/ko/te — 3 classifiers for "answerable" (binary) ===
# Loads the coastalcph/tydi_xor_rc parquet splits, keeps the ar/ko/te rows, and builds train_ds / val_ds.

import numpy as np, pandas as pd, torch
from datasets import Dataset
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack


langs = ["ar", "ko", "te"]
splits = {"train": "train.parquet", "validation": "validation.parquet"}

# Read each parquet split from the Hugging Face hub, keep only the rows whose
# language is in `langs`, and renumber the index from zero.
df_train = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
    .loc[lambda frame: frame["lang"].isin(langs)]
    .reset_index(drop=True)
)
df_val = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
    .loc[lambda frame: frame["lang"].isin(langs)]
    .reset_index(drop=True)
)

# Wrap the six columns used downstream as `datasets.Dataset` objects.
train_ds = Dataset.from_pandas(
    df_train.loc[:, ["lang", "question", "context", "answerable", "answer_start", "answer"]],
    preserve_index=False,
)
val_ds = Dataset.from_pandas(
    df_val.loc[:, ["lang", "question", "context", "answerable", "answer_start", "answer"]],
    preserve_index=False,
)

In [None]:
import numpy as np, torch
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ---------- New: class-distribution check ----------
def label_stats(ds, name=""):
    """Print the label balance of a split and return its 0/1 labels.

    Every entry of ``ds["answerable"]`` is coerced with ``bool`` and stored as
    int64. The printed line reports the sample count, the number of positives
    with their fraction, and the *fraction* (not the count) of negatives.

    Args:
        ds: Mapping-like object exposing an ``"answerable"`` column.
        name: Tag shown in square brackets at the start of the printed line.

    Returns:
        1-D int64 NumPy array of labels.
    """
    flags = [bool(item) for item in ds["answerable"]]
    y = np.array(flags, dtype=np.int64)
    p = y.mean()
    print(f"[{name}] n={len(y)}  positives={y.sum()} ({p:.3f})  negatives={(1-p):.3f}")
    return y

# ---------- Unchanged ----------
def to_numpy_labels(ds):
    """Return the ``"answerable"`` column of *ds* as a 1-D int64 array of 0/1."""
    truthy = [bool(value) for value in ds["answerable"]]
    return np.array(truthy, dtype=np.int64)

def metrics(y_true, y_pred):
    """Return ``(accuracy, TPR, FPR)`` for binary 0/1 predictions.

    The confusion matrix is built with the label order fixed to ``[0, 1]`` so
    it is always 2x2. A rate whose denominator is zero is reported as ``0.0``.
    """
    acc = accuracy_score(y_true, y_pred)
    (tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred, labels=[0, 1])
    actual_pos = tp + fn
    actual_neg = fp + tn
    tpr = tp / actual_pos if actual_pos > 0 else 0.0
    fpr = fp / actual_neg if actual_neg > 0 else 0.0
    return acc, tpr, fpr

# ---------- New: choose the best decision threshold ----------
def pick_threshold(y_true, y_score, goal="f1"):
    """Choose a decision threshold for positive-class scores.

    Args:
        y_true: 1-D array-like of 0/1 ground-truth labels.
        y_score: 1-D array-like of scores for class 1. The F1 search scans the
            interval [0, 1], so scores are expected to be probabilities
            (raw logits would not be covered by that grid).
        goal: ``"tpr>=0.9_min_fpr"`` selects, among ROC operating points with
            TPR >= 0.90, the one with the smallest FPR. Any other value
            (default ``"f1"``) selects the grid threshold with the highest F1.

    Returns:
        The selected threshold; predictions are ``y_score >= threshold``.
    """
    y_score = np.asarray(y_score)  # allow plain lists in the comparison below

    if goal == "tpr>=0.9_min_fpr":
        # The ROC curve is only needed for this goal, so it is computed here
        # instead of unconditionally.
        fpr, tpr, thr = roc_curve(y_true, y_score)
        ok = tpr >= 0.90
        if ok.any():
            i = np.argmin(fpr[ok])
            return thr[ok][i]
        # No operating point reaches the TPR target: fall through to max F1.

    # Default: maximise F1 over 101 evenly spaced thresholds. The strict ">"
    # keeps the lowest threshold when several reach the same F1.
    best_t, best_f1 = 0.5, -1
    for t in np.linspace(0.0, 1.0, 101):
        y_pred = (y_score >= t).astype(int)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_t

# ---------- FFN: with class weighting, returning scores ----------
class FFN(torch.nn.Module):
    """Feed-forward scorer: ``Linear(d, h) -> ReLU -> Linear(h, 1)``.

    ``forward`` returns one raw logit per input row (the trailing size-1
    dimension is squeezed away).
    """

    def __init__(self, d=1536, h=128):
        super().__init__()
        hidden = torch.nn.Linear(d, h)
        head = torch.nn.Linear(h, 1)
        self.f = torch.nn.Sequential(hidden, torch.nn.ReLU(), head)

    def forward(self, x):
        logits = self.f(x)
        return logits.squeeze(-1)

def train_ffn_get_scores(Xtr, ytr, Xva, epochs=5, lr=1e-3, bs=64, device="cpu"):
    """Train a fresh ``FFN`` on ``(Xtr, ytr)`` and score ``Xva``.

    The loss is ``BCEWithLogitsLoss`` with ``pos_weight = N_neg / N_pos`` to
    counter class imbalance. Shuffling and weight initialisation draw from
    torch's global RNG, so results vary between runs unless the caller seeds it.

    Args:
        Xtr: 2-D array-like of training features (rows are samples).
        ytr: 1-D array-like of 0/1 training labels.
        Xva: 2-D array-like of validation features; may have zero rows.
        epochs: Number of passes over the training data.
        lr: Adam learning rate.
        bs: Mini-batch size used during training.
        device: Torch device on which the network and batches are placed.

    Returns:
        1-D NumPy array with ``sigmoid(logit)`` for every row of ``Xva``
        (an empty float32 array when ``Xva`` has no rows).
    """
    ytr = np.asarray(ytr)
    Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
    ytr_t = torch.tensor(ytr, dtype=torch.float32)
    net = FFN(d=Xtr_t.shape[1]).to(device)

    n_pos = int(ytr.sum())
    n_neg = len(ytr) - n_pos
    # With only one class present the ratio degenerates: no negatives would
    # give pos_weight=0, i.e. an identically-zero loss and no learning.
    # Fall back to an unweighted loss in that case.
    ratio = n_neg / n_pos if (n_pos > 0 and n_neg > 0) else 1.0
    pos_w = torch.tensor([ratio], dtype=torch.float32).to(device)
    lossf = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w)
    opt = torch.optim.Adam(net.parameters(), lr=lr)

    net.train()
    for _ in range(epochs):
        idx = torch.randperm(len(Xtr_t))  # new sample order every epoch
        for i in range(0, len(Xtr_t), bs):
            b = idx[i:i+bs]
            xb, yb = Xtr_t[b].to(device), ytr_t[b].to(device)
            opt.zero_grad()
            lossf(net(xb), yb).backward()
            opt.step()

    # Score the validation rows in chunks of 2048 without tracking gradients.
    net.eval()
    scores = []
    with torch.no_grad():
        for i in range(0, len(Xva), 2048):
            xb = torch.tensor(Xva[i:i+2048], dtype=torch.float32).to(device)
            scores.append(torch.sigmoid(net(xb)).cpu().numpy())
    if not scores:
        # np.concatenate raises on an empty list; return an empty result.
        return np.empty(0, dtype=np.float32)
    return np.concatenate(scores)

# ---------- RF / LR: with class_weight, returning scores ----------
def train_rf_get_scores(Xtr, ytr, Xva):
    """Fit a class-balanced random forest and return P(class == 1) for ``Xva``.

    Args:
        Xtr: Training feature matrix.
        ytr: 0/1 training labels.
        Xva: Validation feature matrix.

    Returns:
        1-D array with the predicted probability of class 1 for each row of
        ``Xva``. If class 1 never occurred in ``ytr`` the result is all zeros.
    """
    rf = RandomForestClassifier(n_estimators=300, random_state=0, n_jobs=-1,
                                class_weight="balanced")
    rf.fit(Xtr, ytr)
    proba = rf.predict_proba(Xva)
    # predict_proba has one column per class seen during fit, ordered like
    # rf.classes_. Looking the column up (instead of hard-coding index 1)
    # avoids an IndexError when the training labels contain a single class.
    pos_cols = np.flatnonzero(rf.classes_ == 1)
    if pos_cols.size == 0:
        return np.zeros(proba.shape[0])
    return proba[:, pos_cols[0]]

def train_logreg_get_scores(Xtr_bow, ytr, Xva_bow):
    """Fit a class-balanced liblinear logistic regression; return column 1 of
    ``predict_proba`` for the validation bag-of-words matrix."""
    clf = LogisticRegression(
        class_weight="balanced",
        solver="liblinear",
        max_iter=1000,
    )
    clf.fit(Xtr_bow, ytr)
    proba = clf.predict_proba(Xva_bow)
    return proba[:, 1]

# ---------- Run per language (now also prints the label distribution) ----------
# For each language: filter both splits, print the class balance, build the
# embedding features once, then train three classifiers and report
# accuracy / TPR / FPR at a per-model tuned threshold.
#
# NOTE(review): `emb_features`, `fit_bow` and `device` are not defined in this
# cell; presumably they come from another cell that was run earlier - confirm.
# NOTE(review): every threshold is picked with the validation labels (`y_va`)
# and the metrics are then computed on the same validation set, so the
# reported numbers are optimistic.
for L in ["ar","ko","te"]:
    trL = train_ds.filter(lambda ex: ex["lang"] == L)
    vaL = val_ds.filter(  lambda ex: ex["lang"] == L)

    print(f"\n=== {L} ===  train={len(trL)}  val={len(vaL)}")
    y_tr = label_stats(trL, name=f"{L}-train")   # New: prints the distribution, returns 0/1 labels
    y_va = label_stats(vaL,  name=f"{L}-val")

    # === Build the embeddings as before (Xtr_emb/Xva_emb) ===
    Xtr_emb = emb_features(trL)
    Xva_emb = emb_features(vaL)

    # MODEL 1: FFN on embeddings (scores -> best threshold)
    s_ffn = train_ffn_get_scores(Xtr_emb, y_tr, Xva_emb, epochs=6, device=device)
    t1 = pick_threshold(y_va, s_ffn, goal="f1")
    y_ffn = (s_ffn >= t1).astype(int)
    acc,tpr,fpr = metrics(y_va, y_ffn)
    print(f"[MODEL 1: FFN-MeanEmb]   thr={t1:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")

    # MODEL 2: random forest on the same embeddings
    s_rf = train_rf_get_scores(Xtr_emb, y_tr, Xva_emb)
    t2 = pick_threshold(y_va, s_rf, goal="f1")
    y_rf = (s_rf >= t2).astype(int)
    acc,tpr,fpr = metrics(y_va, y_rf)
    print(f"[MODEL 2: RF-MeanEmb]    thr={t2:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")

    # MODEL 3: bag-of-words features + logistic regression
    Xtr_bow, Xva_bow = fit_bow(trL, vaL)
    s_lr = train_logreg_get_scores(Xtr_bow, y_tr, Xva_bow)
    t3 = pick_threshold(y_va, s_lr, goal="f1")
    y_lr = (s_lr >= t3).astype(int)
    acc,tpr,fpr = metrics(y_va, y_lr)
    print(f"[MODEL 3: BoW+LogReg]    thr={t3:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]


=== ar ===  train=2558  val=415
[ar-train] n=2558  positives=2303 (0.900)  negatives=0.100
[ar-val] n=415  positives=363 (0.875)  negatives=0.125
[MODEL 1: FFN-MeanEmb]   thr=0.26  Acc=0.973  TPR=0.978  FPR=0.058
[MODEL 2: RF-MeanEmb]    thr=0.70  Acc=0.940  TPR=0.997  FPR=0.462
[MODEL 3: BoW+LogReg]    thr=0.49  Acc=0.978  TPR=0.983  FPR=0.058


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]


=== ko ===  train=2422  val=356
[ko-train] n=2422  positives=2359 (0.974)  negatives=0.026
[ko-val] n=356  positives=337 (0.947)  negatives=0.053
[MODEL 1: FFN-MeanEmb]   thr=0.16  Acc=0.955  TPR=0.988  FPR=0.632
[MODEL 2: RF-MeanEmb]    thr=0.82  Acc=0.955  TPR=1.000  FPR=0.842
[MODEL 3: BoW+LogReg]    thr=0.65  Acc=0.955  TPR=0.988  FPR=0.632


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]


=== te ===  train=1355  val=384
[te-train] n=1355  positives=1310 (0.967)  negatives=0.033
[te-val] n=384  positives=291 (0.758)  negatives=0.242
[MODEL 1: FFN-MeanEmb]   thr=0.15  Acc=0.826  TPR=0.973  FPR=0.634
[MODEL 2: RF-MeanEmb]    thr=0.80  Acc=0.849  TPR=0.931  FPR=0.409
[MODEL 3: BoW+LogReg]    thr=0.56  Acc=0.812  TPR=0.866  FPR=0.355


ANOTHER VERSION OF SAME TASK

In [None]:
# === Week 38 (k=3) with threshold tuning: ar/ko/te ===
# Loads the data and trains 3 models per language.
# The threshold is chosen on the validation split, per model and language, by maximising F1.

import numpy as np, pandas as pd, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# ---------------------------
# 0) Load the data
# ---------------------------
langs = ["ar", "ko", "te"]
splits = {"train": "train.parquet", "validation": "validation.parquet"}

# Fetch both parquet splits, restrict them to the languages in `langs`, and
# reset the row index so it starts at zero again.
df_train = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
    .loc[lambda frame: frame["lang"].isin(langs)]
    .reset_index(drop=True)
)
df_val = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
    .loc[lambda frame: frame["lang"].isin(langs)]
    .reset_index(drop=True)
)

# Keep only the six columns used downstream when converting to `Dataset`.
train_ds = Dataset.from_pandas(
    df_train.loc[:, ["lang", "question", "context", "answerable", "answer_start", "answer"]],
    preserve_index=False,
)
val_ds = Dataset.from_pandas(
    df_val.loc[:, ["lang", "question", "context", "answerable", "answer_start", "answer"]],
    preserve_index=False,
)

# ---------------------------
# 1) Helpers / Metrics / Threshold
# ---------------------------
# Use the GPU when torch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def to_numpy_labels(ds):
    """Convert ``ds["answerable"]`` to a 1-D int64 array of 0/1 via ``bool``."""
    as_ints = (int(bool(flag)) for flag in ds["answerable"])
    return np.fromiter(as_ints, dtype=np.int64)

def label_stats(y, name=""):
    """Print the class balance of a 0/1 label array.

    The line shows the sample count, the number of positives with their
    fraction, and the fraction (not the count) of negatives. Returns ``None``.
    """
    p = np.mean(y)
    summary = f"[{name}] n={len(y)}  positives={y.sum()} ({p:.3f})  negatives={(1-p):.3f}"
    print(summary)

def metrics(y_true, y_pred):
    """Compute ``(accuracy, TPR, FPR)`` for binary 0/1 predictions.

    The confusion matrix uses the fixed label order ``[0, 1]``; a rate with a
    zero denominator is returned as ``0.0``.
    """
    counts = confusion_matrix(y_true, y_pred, labels=[0, 1])
    (tn, fp), (fn, tp) = counts
    acc = accuracy_score(y_true, y_pred)
    n_pos = tp + fn
    n_neg = fp + tn
    tpr = tp / n_pos if n_pos > 0 else 0.0
    fpr = fp / n_neg if n_neg > 0 else 0.0
    return acc, tpr, fpr

def pick_threshold(y_true, y_score, goal="f1"):
    """Select a decision threshold for positive-class scores.

    With ``goal="f1"`` (default) the threshold with the highest F1 among 101
    evenly spaced values in [0, 1] is returned; the lowest one wins ties.
    Any other ``goal`` picks, among ROC points with TPR >= 0.90, the one with
    the smallest FPR, or 0.5 when no point qualifies.
    """
    if goal != "f1":
        # Alternative: smallest FPR subject to TPR >= 0.9.
        fpr, tpr, thr = roc_curve(y_true, y_score)
        mask = tpr >= 0.90
        if not mask.any():
            return 0.5
        return thr[mask][np.argmin(fpr[mask])]

    def f1_at(cut):
        return f1_score(y_true, (y_score >= cut).astype(int), zero_division=0)

    # max() returns the first maximiser, matching a strict ">" update loop.
    return max(np.linspace(0.0, 1.0, 101), key=f1_at)

# ---------------------------
# 2) 1536-D features: mean-pooled DistilBERT (Q768 + C768 concatenated)
# ---------------------------
# Load the multilingual DistilBERT tokenizer and encoder, move the encoder to
# `device`, and switch it to inference mode.
tok = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
bert = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
bert = bert.to(device)
bert.eval()
print("hidden_size from config:", bert.config.hidden_size)  # -> 768

@torch.no_grad()
def mean_embed(texts, batch_size=16, max_length=256):
    """Embed each text as the attention-masked mean of the encoder's last layer.

    Uses the module-level ``tok``, ``bert`` and ``device``.

    Args:
        texts: Sequence of strings; any iterable is materialised to a list.
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit; longer inputs are truncated.

    Returns:
        NumPy array with one row per text and ``bert.config.hidden_size``
        columns. An empty input gives an array with zero rows.
    """
    texts = list(texts)  # the tokenizer is given list slices below
    if not texts:
        # torch.cat raises on an empty list, so short-circuit here.
        # assumes the encoder emits float32 - TODO confirm if run in half precision
        return np.zeros((0, bert.config.hidden_size), dtype=np.float32)

    vecs = []
    for i in range(0, len(texts), batch_size):
        enc = tok(texts[i:i+batch_size], padding=True, truncation=True,
                  max_length=max_length, return_tensors="pt").to(device)
        hs = bert(**enc).last_hidden_state                    # [B, T, H]
        mask = enc.attention_mask.unsqueeze(-1)               # [B, T, 1]
        # Sum only the non-padded token vectors and divide by their count;
        # the clamp guards against division by zero.
        mean = (hs * mask).sum(1) / mask.sum(1).clamp(min=1)  # [B, H]
        vecs.append(mean.cpu())
    return torch.cat(vecs, 0).numpy()

def emb_features(ds_lang):
    """Return the question and context mean-embeddings joined column-wise.

    The question vector comes first in each row, followed by the context
    vector.
    """
    question_vecs = mean_embed(ds_lang["question"])
    context_vecs = mean_embed(ds_lang["context"])
    return np.concatenate((question_vecs, context_vecs), axis=1)

# ---------------------------
# 3) MODEL 1: FFN on mean embeddings (scores -> tuned threshold)
# ---------------------------
class FFN(torch.nn.Module):
    """Small MLP scorer: ``Linear(d, h) -> ReLU -> Linear(h, 1)``.

    ``forward`` yields one raw logit per input row, with the trailing size-1
    dimension removed.
    """

    def __init__(self, d=1536, h=128):
        super().__init__()
        layers = [
            torch.nn.Linear(d, h),
            torch.nn.ReLU(),
            torch.nn.Linear(h, 1),
        ]
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(-1)

def train_ffn_get_scores(Xtr, ytr, Xva, epochs=6, lr=1e-3, bs=64):
    """Train a fresh ``FFN`` on ``(Xtr, ytr)`` and score ``Xva``.

    Runs on the module-level ``device``. The loss is ``BCEWithLogitsLoss``
    with ``pos_weight = N_neg / N_pos`` to counter class imbalance. Shuffling
    and weight initialisation use torch's global RNG, so results vary between
    runs unless the caller seeds it.

    Args:
        Xtr: 2-D array-like of training features (rows are samples).
        ytr: 1-D array-like of 0/1 training labels.
        Xva: 2-D array-like of validation features; may have zero rows.
        epochs: Number of passes over the training data.
        lr: Adam learning rate.
        bs: Mini-batch size used during training.

    Returns:
        1-D NumPy array with ``sigmoid(logit)`` for every row of ``Xva``
        (an empty float32 array when ``Xva`` has no rows).
    """
    ytr = np.asarray(ytr)
    Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
    ytr_t = torch.tensor(ytr, dtype=torch.float32)
    net = FFN(d=Xtr_t.shape[1]).to(device)

    n_pos = int(ytr.sum())
    n_neg = len(ytr) - n_pos
    # With only one class present the ratio degenerates: no negatives would
    # give pos_weight=0, i.e. an identically-zero loss and no learning.
    # Fall back to an unweighted loss in that case.
    ratio = n_neg / n_pos if (n_pos > 0 and n_neg > 0) else 1.0
    pos_w = torch.tensor([ratio], dtype=torch.float32).to(device)
    lossf = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w)
    opt = torch.optim.Adam(net.parameters(), lr=lr)

    net.train()
    for _ in range(epochs):
        idx = torch.randperm(len(Xtr_t))  # new sample order every epoch
        for i in range(0, len(Xtr_t), bs):
            b = idx[i:i+bs]
            xb, yb = Xtr_t[b].to(device), ytr_t[b].to(device)
            opt.zero_grad()
            loss = lossf(net(xb), yb)
            loss.backward()
            opt.step()

    # Score the validation rows in chunks of 2048 without tracking gradients.
    net.eval()
    scores = []
    with torch.no_grad():
        for i in range(0, len(Xva), 2048):
            xb = torch.tensor(Xva[i:i+2048], dtype=torch.float32).to(device)
            scores.append(torch.sigmoid(net(xb)).cpu().numpy())
    if not scores:
        # np.concatenate raises on an empty list; return an empty result.
        return np.empty(0, dtype=np.float32)
    return np.concatenate(scores)

# ---------------------------
# 4) MODEL 2: RandomForest on mean embeddings (prob -> tuned threshold)
# ---------------------------
def rf_get_scores(Xtr, ytr, Xva):
    """Fit a class-balanced random forest and return P(class == 1) for ``Xva``.

    Args:
        Xtr: Training feature matrix.
        ytr: 0/1 training labels.
        Xva: Validation feature matrix.

    Returns:
        1-D array with the predicted probability of class 1 for each row of
        ``Xva``. If class 1 never occurred in ``ytr`` the result is all zeros.
    """
    rf = RandomForestClassifier(n_estimators=300, random_state=0, n_jobs=-1,
                                class_weight="balanced")  # more robust under imbalance
    rf.fit(Xtr, ytr)
    proba = rf.predict_proba(Xva)
    # predict_proba has one column per class seen during fit, ordered like
    # rf.classes_. Looking the column up (instead of hard-coding index 1)
    # avoids an IndexError when the training labels contain a single class.
    pos_cols = np.flatnonzero(rf.classes_ == 1)
    if pos_cols.size == 0:
        return np.zeros(proba.shape[0])
    return proba[:, pos_cols[0]]

# ---------------------------
# 5) MODEL 3: BoW (TF-IDF Q + TF-IDF C, concatenated) + Logistic Regression (prob -> tuned)
# ---------------------------
def fit_bow(train_lang, val_lang, max_features=20000):
    """Build TF-IDF features for questions and contexts.

    Two separate vectorizers (uni- and bigrams, vocabulary capped at
    ``max_features`` each) are fitted on the training questions and training
    contexts; the validation texts are only transformed.

    Returns:
        ``(X_train, X_val)`` sparse matrices, each the horizontal stack of the
        question block followed by the context block.
    """
    tfidf_kwargs = {"max_features": max_features, "ngram_range": (1, 2)}
    question_vec = TfidfVectorizer(**tfidf_kwargs)
    context_vec = TfidfVectorizer(**tfidf_kwargs)

    train_blocks = [
        question_vec.fit_transform(train_lang["question"]),
        context_vec.fit_transform(train_lang["context"]),
    ]
    val_blocks = [
        question_vec.transform(val_lang["question"]),
        context_vec.transform(val_lang["context"]),
    ]
    return hstack(train_blocks), hstack(val_blocks)

def lr_get_scores(Xtr_bow, ytr, Xva_bow):
    """Fit a class-balanced liblinear logistic regression; return column 1 of
    ``predict_proba`` for the validation bag-of-words matrix."""
    model = LogisticRegression(
        class_weight="balanced",
        solver="liblinear",
        max_iter=1000,
    )
    model.fit(Xtr_bow, ytr)
    val_proba = model.predict_proba(Xva_bow)
    return val_proba[:, 1]  # P(class=1)

# ---------------------------
# 6) Run per language (threshold tuning: goal="f1")
# ---------------------------
# For each language: filter both splits, print the class balance, build the
# embedding features once, then train three classifiers and report
# accuracy / TPR / FPR at a per-model tuned threshold.
#
# NOTE(review): every threshold is picked with the validation labels (`y_va`)
# and the metrics are then computed on the same validation set, so the
# reported numbers are optimistic.
for L in langs:
    trL = train_ds.filter(lambda ex: ex["lang"] == L)
    vaL = val_ds.filter(  lambda ex: ex["lang"] == L)
    y_tr, y_va = to_numpy_labels(trL), to_numpy_labels(vaL)
    print(f"\n=== {L} ===  train={len(trL)}  val={len(vaL)}")
    label_stats(y_tr, f"{L}-train"); label_stats(y_va, f"{L}-val")

    # 1536-D embeddings (built once, shared by models 1 and 2)
    Xtr_emb = emb_features(trL)
    Xva_emb = emb_features(vaL)
    print("Emb shape:", Xtr_emb.shape, Xva_emb.shape)  # -> (*,1536) (*,1536)

    # MODEL 1: FFN on mean embeddings
    s_ffn = train_ffn_get_scores(Xtr_emb, y_tr, Xva_emb, epochs=6)
    thr1  = pick_threshold(y_va, s_ffn, goal="f1")
    y_ffn = (s_ffn >= thr1).astype(int)
    acc,tpr,fpr = metrics(y_va, y_ffn)
    print(f"[MODEL 1: FFN-MeanEmb]   thr={thr1:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")

    # MODEL 2: random forest on the same embeddings
    s_rf = rf_get_scores(Xtr_emb, y_tr, Xva_emb)
    thr2 = pick_threshold(y_va, s_rf, goal="f1")
    y_rf = (s_rf >= thr2).astype(int)
    acc,tpr,fpr = metrics(y_va, y_rf)
    print(f"[MODEL 2: RF-MeanEmb]    thr={thr2:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")

    # MODEL 3: TF-IDF bag-of-words + logistic regression
    Xtr_bow, Xva_bow = fit_bow(trL, vaL)
    s_lr = lr_get_scores(Xtr_bow, y_tr, Xva_bow)
    thr3 = pick_threshold(y_va, s_lr, goal="f1")
    y_lr = (s_lr >= thr3).astype(int)
    acc,tpr,fpr = metrics(y_va, y_lr)
    print(f"[MODEL 3: BoW+LogReg]    thr={thr3:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")


hidden_size from config: 768


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]


=== ar ===  train=2558  val=415
[ar-train] n=2558  positives=2303 (0.900)  negatives=0.100
[ar-val] n=415  positives=363 (0.875)  negatives=0.125
Emb shape: (2558, 1536) (415, 1536)
[MODEL 1: FFN-MeanEmb]   thr=0.36  Acc=0.976  TPR=0.978  FPR=0.038
[MODEL 2: RF-MeanEmb]    thr=0.70  Acc=0.940  TPR=0.997  FPR=0.462
[MODEL 3: BoW+LogReg]    thr=0.49  Acc=0.978  TPR=0.983  FPR=0.058


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]


=== ko ===  train=2422  val=356
[ko-train] n=2422  positives=2359 (0.974)  negatives=0.026
[ko-val] n=356  positives=337 (0.947)  negatives=0.053
Emb shape: (2422, 1536) (356, 1536)
[MODEL 1: FFN-MeanEmb]   thr=0.21  Acc=0.952  TPR=0.988  FPR=0.684
[MODEL 2: RF-MeanEmb]    thr=0.82  Acc=0.955  TPR=1.000  FPR=0.842
[MODEL 3: BoW+LogReg]    thr=0.65  Acc=0.955  TPR=0.988  FPR=0.632


Filter:   0%|          | 0/6335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]


=== te ===  train=1355  val=384
[te-train] n=1355  positives=1310 (0.967)  negatives=0.033
[te-val] n=384  positives=291 (0.758)  negatives=0.242
Emb shape: (1355, 1536) (384, 1536)
[MODEL 1: FFN-MeanEmb]   thr=0.13  Acc=0.826  TPR=0.983  FPR=0.667
[MODEL 2: RF-MeanEmb]    thr=0.80  Acc=0.849  TPR=0.931  FPR=0.409
[MODEL 3: BoW+LogReg]    thr=0.56  Acc=0.812  TPR=0.866  FPR=0.355
