In [1]:
##IMPORTS
from collections import Counter
from googletrans import Translator
from nltk.corpus import stopwords
from nltk import word_tokenize


from unidecode import unidecode
import torch
import pickle
import string
import nltk
import numpy as np 
import pandas as pd
import polars as pl
import regex as re
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [2]:
# ---------------------------
# 0) Daten laden (wie bei dir)
# ---------------------------
langs  = ["ko"]
splits = {'train': 'train.parquet', 'validation': 'validation.parquet', 'test': 'test.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val   = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
df_test = pd.read_json('week41qa.json') #pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["test"])
df_train = df_train[df_train.lang.isin(langs)].reset_index(drop=True)
df_val   = df_val[df_val.lang.isin(langs)].reset_index(drop=True)

train_ds = Dataset.from_pandas(df_train[["lang","question","context","answerable","answer_start","answer"]], preserve_index=False)
val_ds   = Dataset.from_pandas(df_val[  ["lang","question","context","answerable","answer_start","answer"]], preserve_index=False)
test_ds   = Dataset.from_pandas(df_test[  ["lang","question","context","answerable","answer_start","answer"]], preserve_index=False)

### Week 36 (Part 1)

In [6]:
## Each language total words (not counting punctuation)
# tokenizer: split on \W+ (non-word chars); protect hyphens between letters/digits
# safeguard: build punctuation set from training+validation data, do not count these tokens as well
# KOREAN
ko_train_q = df_train[df_train["lang"] == "ko"]["question"].astype(str)
ko_val_q   = df_val[df_val["lang"] == "ko"]["question"].astype(str)
ko_test_q   = df_test[df_test["lang"] == "ko"]["question"].astype(str)

PUNCT_RE = re.compile(r"\p{P}", re.UNICODE)
SPLIT_RE = re.compile(r"\W+", re.UNICODE)          # tokenizer


ko_train_punct = Counter(ch for q in ko_train_q for ch in PUNCT_RE.findall(q))
ko_val_punct   = Counter(ch for q in ko_val_q for ch in PUNCT_RE.findall(q))
ko_test_punct   = Counter(ch for q in ko_val_q for ch in PUNCT_RE.findall(q))

print("Korean — TRAIN punctuation (char -> count):")
print(ko_train_punct.most_common())
print("Korean — VAL punctuation (char -> count):")
print(ko_val_punct.most_common())
print("Korean — TEST punctuation (char -> count):")
print(ko_test_punct.most_common())

HY = "HYPHENJOIN"                                  # placeholder for protected hyphens
PROTECT_HYPHEN = re.compile(r"(?<=[\p{L}\p{N}])-(?=[\p{L}\p{N}])", re.UNICODE)  # hyphen between letters/digits

# KOREAN
ko_train_q = df_train[df_train["lang"] == "ko"]["question"].astype(str)
ko_val_q   = df_val[df_val["lang"] == "ko"]["question"].astype(str)
ko_test_q   = df_test[df_test["lang"] == "ko"]["question"].astype(str)

ko_punct_set = set(ch for q in pd.concat([ko_train_q, ko_val_q]) for ch in PUNCT_RE.findall(q))

ko_train_tokens = []
for q in ko_train_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ko_punct_set]
    ko_train_tokens.extend(toks)

ko_val_tokens = []
for q in ko_val_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ko_punct_set]
    ko_val_tokens.extend(toks)

ko_test_tokens = []
for q in ko_test_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ko_punct_set]
    ko_test_tokens.extend(toks)

print()
print("Korean — TRAIN total words:", len(ko_train_tokens))
print("Korean — VAL total words:",   len(ko_val_tokens))
print("Korean — TEST total words:",   len(ko_test_tokens))

# ---- After tokenization for Korean ----
ko_numbers_train = sum(1 for t in ko_train_tokens if t.isdigit())
ko_numbers_val   = sum(1 for t in ko_val_tokens if t.isdigit())
ko_numbers_test   = sum(1 for t in ko_test_tokens if t.isdigit())

ko_hyphen_train = sum(1 for t in ko_train_tokens if "-" in t)
ko_hyphen_val   = sum(1 for t in ko_val_tokens if "-" in t)
ko_hyphen_test   = sum(1 for t in ko_test_tokens if "-" in t)

print()
print("Korean — numeric tokens (train):", ko_numbers_train)
print("Korean — numeric tokens (val):",   ko_numbers_val)
print("Korean — numeric tokens (test):",   ko_numbers_test)
print()
print("Korean — hyphenated tokens (train):", ko_hyphen_train)
print("Korean — hyphenated tokens (val):",   ko_hyphen_val)
print("Korean — hyphenated tokens (test):",   ko_hyphen_test)

Korean — TRAIN punctuation (char -> count):
[('?', 2420), (',', 23), ('.', 16), ("'", 6), ('"', 6), ('-', 5), (':', 2), ('/', 1), ('\\', 1), ('(', 1), (')', 1)]
Korean — VAL punctuation (char -> count):
[('?', 356), ('.', 9), (',', 3), ('-', 1)]
Korean — TEST punctuation (char -> count):
[('?', 356), ('.', 9), (',', 3), ('-', 1)]

Korean — TRAIN total words: 11858
Korean — VAL total words: 1736
Korean — TEST total words: 195

Korean — numeric tokens (train): 9
Korean — numeric tokens (val): 1
Korean — numeric tokens (test): 0

Korean — hyphenated tokens (train): 5
Korean — hyphenated tokens (val): 1
Korean — hyphenated tokens (test): 0


In [7]:
# Stats about answerable vs unanswerable questions

# Define languages and splits

split_dfs = {
    "train": df_train,
    "val":   df_val,
    "test":  df_test
}


rows = []
for split_name, df in split_dfs.items():
    for lang in langs:
        total = df[df["lang"] == lang].shape[0]
        ans   = df[(df["lang"] == lang) & (df["answerable"])].shape[0]
        unans = total - ans
        ratio = ans / total if total > 0 else 0
        rows.append([split_name, lang, total, ans, unans, ratio])

# Create summary DataFrame
summary = pd.DataFrame(rows, columns=["Split", "Language", "Total", "Answerable", "Unanswerable", "Answerable Ratio"])
print(summary.to_string(index=False))


Split Language  Total  Answerable  Unanswerable  Answerable Ratio
train       ko   2422        2359            63          0.973988
  val       ko    356         337            19          0.946629
 test       ko     45          36             9          0.800000


#### Week 36 - Rule-based classifier

In [10]:
nltk.download("stopwords", quiet=True)
# Needed for word_tokenize
try:
    nltk.download("punkt", quiet=True)
    # Some NLTK builds need this extra package
    nltk.download("punkt_tab", quiet=True)
except Exception:
    pass

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

EN_STOP = set(stopwords.words('english')) | set(string.punctuation)

# If you haven't already created these:
# df_train = pd.read_csv("path/to/train.csv")  # or load however you have it
# df_val   = pd.read_csv("path/to/val.csv")

# Languages you care about
LANGS = ["ko"]

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
DEVICE = 0 if torch.cuda.is_available() else -1
print("Using device idx for HF pipeline:", DEVICE)

MODEL_ID = "facebook/nllb-200-distilled-600M"
SRC_CODES = {"ar": "arb_Arab", "ko": "kor_Hang", "te": "tel_Telu"}
TGT_CODE = "eng_Latn"

# One translator pipeline reused for all batches
nllb = pipeline("translation", model=MODEL_ID, tokenizer=MODEL_ID, device=DEVICE)

PyTorch: 2.7.1+cu118
CUDA available: True
Using device idx for HF pipeline: 0


Device set to use cuda:0


In [11]:
import pandas as pd
import numpy as np
import string
import regex as re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# Setup
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
STOP_WORDS = set(stopwords.words('english')) | set(string.punctuation)

# Language-specific mapping
SRC_CODES = {'ko': 'kor_Hang'}
TGT_CODE = 'eng_Latn'

def translate_list_pipe(texts, src_lang='ko', batch_size=128, max_length=320):
    outputs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Translating ko->EN"):
        batch = [str(x) if isinstance(x, str) else "" for x in texts[i:i+batch_size]]
        try:
            preds = nllb(batch, src_lang=SRC_CODES[src_lang], tgt_lang=TGT_CODE,
                         truncation=True, max_length=max_length)
            outputs.extend([p.get("translation_text", "") for p in preds])
        except Exception as e:
            print(f"[WARN] Batch {i}:{i+len(batch)} failed: {type(e).__name__}: {e}")
            outputs.extend([""] * len(batch))
    return outputs

def ensure_column(df, col):
    if col not in df.columns:
        df[col] = np.nan

def cache_korean_translation(df, text_col="question", out_col="question_en", batch_size=128):
    ensure_column(df, out_col)
    mask = (df["lang"] == "ko") & (df[out_col].isna() | df[out_col].astype(str).str.strip().eq(""))
    if mask.any():
        print(f"Translating {mask.sum()} Korean questions...")
        texts = df.loc[mask, text_col].astype(str).tolist()
        df.loc[mask, out_col] = translate_list_pipe(texts)

def tokenize(text):
    tokens = re.split(r'\W+', str(text))
    return [t.lower() for t in tokens if t and t.lower() not in STOP_WORDS]

def overlap_score(question, context):
    q_toks = tokenize(question)
    c_toks = tokenize(context)
    matched = set(q for q in q_toks if any(q == c or q in c or c in q for c in c_toks))
    return len(matched) / max(1, len(q_toks)), len(matched)

def tune_parameters(df, q_col, c_col, match_grid=range(1, 11), thr_grid=[0.3, 0.4, 0.5, 0.6, 0.7]):
    data = [(overlap_score(r[q_col], r[c_col]), int(r.answerable)) for _, r in df.iterrows()]
    best = max(((m, t, sum(int((m_ >= m) and (r_ >= t)) == y for (r_, m_), y in data))
                for m in match_grid for t in thr_grid), key=lambda x: x[2])
    return {"min_match_count": best[0], "min_ratio_threshold": best[1], "best_train_acc": best[2] / len(data)}

def eval_metrics(df, q_col, c_col, min_m, thr):
    y_true, y_pred = [], []
    for _, r in df.iterrows():
        ratio, m = overlap_score(r[q_col], r[c_col])
        y_true.append(int(r.answerable))
        y_pred.append(int((m >= min_m) and (ratio >= thr)))
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    tp = int(((y_pred==1) & (y_true==1)).sum())
    fp = int(((y_pred==1) & (y_true==0)).sum())
    fn = int(((y_pred==0) & (y_true==1)).sum())
    tn = int(((y_pred==0) & (y_true==0)).sum())
    acc  = (tp+tn)/max(1, tp+tn+fp+fn)
    prec = tp/max(1, tp+fp)
    rec  = tp/max(1, tp+fn)
    f1   = 0.0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)
    return {"acc": acc, "prec": prec, "rec": rec, "f1": f1, "cm": {"TP":tp, "FP":fp, "FN":fn, "TN":tn}}

def run_korean_classifier(df_train, df_val):
    tr = df_train[df_train["lang"] == "ko"].copy()
    va = df_val[df_val["lang"] == "ko"].copy()
    if tr.empty or va.empty:
        print("No Korean data found.")
        return

    cache_korean_translation(tr)
    cache_korean_translation(va)

    q_col, c_col = "question_en", "context"
    params = tune_parameters(tr, q_col, c_col)
    metrics = eval_metrics(va, q_col, c_col, params["min_match_count"], params["min_ratio_threshold"])

    print("\n=== Korean Rule-Based Classifier Results ===")
    print(f"Train Accuracy: {params['best_train_acc']:.4f}")
    print(f"Val Accuracy  : {metrics['acc']:.4f}")
    print(f"Precision     : {metrics['prec']:.4f}")
    print(f"Recall        : {metrics['rec']:.4f}")
    print(f"F1 Score      : {metrics['f1']:.4f}")
    print(f"Confusion     : {metrics['cm']}")

# Call with your DataFrames
# run_korean_classifier(df_train, df_val)


In [12]:
run_korean_classifier(df_train, df_val)

Translating 2422 Korean questions...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset 51.24s/it]
Translating ko->EN: 100%|██████████████████████████████████████████████████████████████| 19/19 [15:45<00:00, 49.76s/it]
  df.loc[mask, out_col] = translate_list_pipe(texts)


Translating 356 Korean questions...


Translating ko->EN: 100%|████████████████████████████████████████████████████████████████| 3/3 [02:17<00:00, 45.87s/it]
  df.loc[mask, out_col] = translate_list_pipe(texts)



=== Korean Rule-Based Classifier Results ===
Train Accuracy: 0.8365
Val Accuracy  : 0.7949
Precision     : 0.9521
Recall        : 0.8249
F1 Score      : 0.8839
Confusion     : {'TP': 278, 'FP': 14, 'FN': 59, 'TN': 5}


### Week 38 (Part 3)

In [3]:
# ---------------------------
# 1) Helpers / Metrics / Threshold
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

def to_numpy_labels(ds):
    return np.array([1 if bool(x) else 0 for x in ds["answerable"]], dtype=np.int64)

def label_stats(y, name=""):
    p = y.mean()
    print(f"[{name}] n={len(y)}  positives={y.sum()} ({p:.3f})  negatives={(1-p):.3f}")

def metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    acc = accuracy_score(y_true, y_pred)
    tpr = tp/(tp+fn) if (tp+fn)>0 else 0.0
    fpr = fp/(fp+tn) if (fp+tn)>0 else 0.0
    return acc, tpr, fpr

def pick_threshold(y_true, y_score, goal="f1"):
    # Default: wähle Threshold mit maximalem F1
    if goal == "f1":
        best_t, best_f1 = 0.5, -1.0
        for t in np.linspace(0.0, 1.0, 101):
            y_pred = (y_score >= t).astype(int)
            f1 = f1_score(y_true, y_pred, zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        return best_t
    # Alternative (falls gewünscht): kleinste FPR bei TPR >= 0.9
    fpr, tpr, thr = roc_curve(y_true, y_score)
    mask = tpr >= 0.90
    return (thr[mask][np.argmin(fpr[mask])]) if mask.any() else 0.5


In [4]:
# ---------------------------
# 2) 1536-D Features: Mean-Pool DistilBERT (Q768 ⊕ C768)
# ---------------------------
tok  = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
bert = AutoModel.from_pretrained("distilbert-base-multilingual-cased").to(device)
bert.eval()
print("hidden_size from config:", bert.config.hidden_size)  # -> 768

@torch.no_grad()
def mean_embed(texts, batch_size=16, max_length=256):
    vecs = []
    for i in range(0, len(texts), batch_size):
        enc = tok(texts[i:i+batch_size], padding=True, truncation=True,
                  max_length=max_length, return_tensors="pt").to(device)
        hs   = bert(**enc).last_hidden_state                # [B,T,768]
        mask = enc.attention_mask.unsqueeze(-1)             # [B,T,1]
        mean = (hs * mask).sum(1) / mask.sum(1).clamp(min=1) # [B,768]
        vecs.append(mean.cpu())
    return torch.cat(vecs, 0).numpy()

def emb_features(ds_lang):
    q = mean_embed(ds_lang["question"])   # [N,768]
    c = mean_embed(ds_lang["context"])    # [N,768]
    feats = np.concatenate([q, c], axis=1)  # [N,1536]
    return feats


hidden_size from config: 768


In [5]:
# ---------------------------
# 3) MODEL 1: FFN on Mean-Embeddings (scores -> tuned threshold)
# ---------------------------
class FFN(torch.nn.Module):
    def __init__(self, d=1536, h=128):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(d, h), torch.nn.ReLU(), torch.nn.Linear(h, 1)
        )
    def forward(self, x): return self.net(x).squeeze(-1)

def train_ffn_get_scores(Xtr, ytr, Xva, epochs=6, lr=1e-3, bs=64):
    Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
    ytr_t = torch.tensor(ytr, dtype=torch.float32)
    net   = FFN(d=Xtr.shape[1]).to(device)
    # pos_weight = N_neg / N_pos (robuster bei Imbalance)
    n_pos, n_neg = ytr.sum(), len(ytr) - ytr.sum()
    pos_w = torch.tensor([ (n_neg / max(1, n_pos)) ], dtype=torch.float32).to(device)
    lossf = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w)
    opt   = torch.optim.Adam(net.parameters(), lr=lr)

    net.train()
    for _ in range(epochs):
        idx = torch.randperm(len(Xtr_t))
        for i in range(0, len(Xtr_t), bs):
            b = idx[i:i+bs]
            xb, yb = Xtr_t[b].to(device), ytr_t[b].to(device)
            opt.zero_grad(); loss = lossf(net(xb), yb); loss.backward(); opt.step()

    net.eval(); scores = []
    with torch.no_grad():
        for i in range(0, len(Xva), 2048):
            xb = torch.tensor(Xva[i:i+2048], dtype=torch.float32).to(device)
            scores.append(torch.sigmoid(net(xb)).cpu().numpy())
    return np.concatenate(scores)

# ---------------------------
# 4) MODEL 2: RandomForest on Mean-Embeddings (prob -> tuned threshold)
# ---------------------------
def rf_get_scores(Xtr, ytr, Xva):
    rf = RandomForestClassifier(n_estimators=300, random_state=0, n_jobs=-1,
                               class_weight="balanced")  # robuster bei Imbalance
    rf.fit(Xtr, ytr)
    return rf.predict_proba(Xva)[:, 1]  # P(class=1)

# ---------------------------
# 5) MODEL 3: BoW (TF-IDF Q ⊕ TF-IDF C) + Logistic Regression (prob -> tuned)
# ---------------------------
def fit_bow(train_lang, val_lang, max_features=20000):
    qv = TfidfVectorizer(max_features=max_features, ngram_range=(1,2))
    cv = TfidfVectorizer(max_features=max_features, ngram_range=(1,2))
    Xq_tr = qv.fit_transform(train_lang["question"])
    Xc_tr = cv.fit_transform(train_lang["context"])
    Xq_va = qv.transform(val_lang["question"])
    Xc_va = cv.transform(val_lang["context"])
    return hstack([Xq_tr, Xc_tr]), hstack([Xq_va, Xc_va])

def lr_get_scores(Xtr_bow, ytr, Xva_bow):
    lr = LogisticRegression(max_iter=1000, solver="liblinear",
                            class_weight="balanced")
    lr.fit(Xtr_bow, ytr)
    return lr.predict_proba(Xva_bow)[:, 1]  # P(class=1)

In [6]:
# ---------------------------
# 6) Run per language (Threshold-Tuning: goal="f1")
# ---------------------------
for L in langs:
    trL = train_ds.filter(lambda ex: ex["lang"] == L)
    vaL = val_ds.filter(  lambda ex: ex["lang"] == L)
    teL = test_ds.filter( lambda ex: ex["lang"] == L)
    y_tr, y_va, y_te = to_numpy_labels(trL), to_numpy_labels(vaL), to_numpy_labels(teL)
    print(f"\n=== {L} ===  train={len(trL)}  val={len(vaL)} test={len(teL)}")
    label_stats(y_tr, f"{L}-train"); label_stats(y_va, f"{L}-val"); label_stats(y_te, f"{L}-test")

    # 1536-D Embeddings (einmal bauen)
    Xtr_emb = emb_features(trL)
    Xva_emb = emb_features(vaL)
    Xte_emb = emb_features(teL)
    print("Emb shape:", Xtr_emb.shape, Xva_emb.shape, Xte_emb.shape)  # -> (*,1536) (*,1536)

    # MODEL 1: FFN-MeanEmb
    s_ffn = train_ffn_get_scores(Xtr_emb, y_tr, Xte_emb, epochs=6)
    thr1  = pick_threshold(y_te, s_ffn, goal="f1")
    y_ffn = (s_ffn >= thr1).astype(int)
    acc,tpr,fpr = metrics(y_te, y_ffn)
    print(f"[MODEL 1: FFN-MeanEmb]   thr={thr1:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")

    # MODEL 2: RF-MeanEmb
    s_rf = rf_get_scores(Xtr_emb, y_tr, Xte_emb)
    thr2 = pick_threshold(y_te, s_rf, goal="f1")
    y_rf = (s_rf >= thr2).astype(int)
    acc,tpr,fpr = metrics(y_te, y_rf)
    print(f"[MODEL 2: RF-MeanEmb]    thr={thr2:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")

    # MODEL 3: BoW+LogReg
    Xtr_bow, Xte_bow = fit_bow(trL, teL)
    s_lr = lr_get_scores(Xtr_bow, y_tr, Xte_bow)
    thr3 = pick_threshold(y_te, s_lr, goal="f1")
    y_lr = (s_lr >= thr3).astype(int)
    acc,tpr,fpr = metrics(y_te, y_lr)
    print(f"[MODEL 3: BoW+LogReg]    thr={thr3:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f}")


Filter:   0%|          | 0/2422 [00:00<?, ? examples/s]

Filter:   0%|          | 0/356 [00:00<?, ? examples/s]

Filter:   0%|          | 0/45 [00:00<?, ? examples/s]


=== ko ===  train=2422  val=356 test=45
[ko-train] n=2422  positives=2359 (0.974)  negatives=0.026
[ko-val] n=356  positives=337 (0.947)  negatives=0.053
[ko-test] n=45  positives=36 (0.800)  negatives=0.200
Emb shape: (2422, 1536) (356, 1536) (45, 1536)
[MODEL 1: FFN-MeanEmb]   thr=0.06  Acc=0.822  TPR=1.000  FPR=0.889
[MODEL 2: RF-MeanEmb]    thr=0.91  Acc=0.822  TPR=1.000  FPR=0.889
[MODEL 3: BoW+LogReg]    thr=0.78  Acc=0.822  TPR=1.000  FPR=0.889


### Week 39 (Part 4 - Missing)

### Week 40 (Part 5 - Missing)