In [72]:
import os
import re
import unicodedata
import json
from pathlib import Path
import numpy as np
import pandas as pd

# ---------------------------
# Config (EDIT THESE)
# ---------------------------

# Unified predictions CSV (semicolon-separated)
UNIFIED_CSV = Path("/home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/quant_all_val_predictions_new.csv")
# If you want to run against the attached file in this chat instead:
# UNIFIED_CSV = Path("/mnt/data/quant_all_val_predictions_new.csv")

OUT_DIR = Path("/home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/collision_redo")
OUT_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 1337

print("UNIFIED_CSV exists:", UNIFIED_CSV.exists(), "|", UNIFIED_CSV)
print("OUT_DIR:", OUT_DIR)


UNIFIED_CSV exists: True | /home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/quant_all_val_predictions_new.csv
OUT_DIR: /home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/collision_redo


Cell 2 — Load unified CSV and build normalized strings + UID

This normalization is intentionally minimal (and consistent with the sent_collisions_norm.csv logic): lowercasing + whitespace collapse + Unicode normalization. We also define “error” as pred_norm != lab_norm (this matches the behavior that excludes pure case/formatting differences).

In [73]:
def norm_str(s: object) -> str:
    """Minimal normalization used for collision matching."""
    if pd.isna(s):
        return ""
    s = str(s)
    s = unicodedata.normalize("NFKC", s)
    s = s.strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

df = pd.read_csv(UNIFIED_CSV, sep=";")

required_cols = {"task", "fold", "sample_index", "prediction", "label"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Unified CSV is missing columns: {missing}. Found: {list(df.columns)}")

# Create UID identical in spirit to existing outputs: task|fold|sample_index
df["fold"] = df["fold"].astype(int)
df["sample_index"] = df["sample_index"].astype(int)
df["uid"] = df.apply(lambda r: f"{r['task']}|{r['fold']}|{r['sample_index']}", axis=1)

df["pred_norm"] = df["prediction"].map(norm_str)
df["lab_norm"]  = df["label"].map(norm_str)

# Optional (kept if available)
if "levenshtein_distance" in df.columns:
    df["lev_pred_gt"] = df["levenshtein_distance"].astype(int)
else:
    df["lev_pred_gt"] = np.nan  # keep column for schema compatibility

print("Rows:", len(df))
print(df.head(3))


Rows: 44802
   task  fold                                          json_path  \
0  sent     0  /home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...   
1  sent     0  /home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...   
2  sent     0  /home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...   

   sample_index                                     prediction  \
0             0                                  Du bist dran.   
1             1  Lass/Lasst uns hingehen!; Los, gehen wir hin!   
2             2                              Ich hole sie/ihn.   

                                           label  levenshtein_distance  \
0                                  Du bist dran.                     0   
1  Lass/Lasst uns hingehen!; Los, gehen wir hin!                     0   
2                              Ich hole sie/ihn.                     0   

        uid                                      pred_norm  \
0  sent|0|0                                  du bist dran.   
1  sent|0|1  lass/lasst uns h

Cell 3 — Build collisions CSV for BOTH tasks with the SAME schema as sent_collisions_norm.csv

This produces a single event table with columns:
fold, sample_index, uid, prediction, label, lev_pred_gt, match_fold, match_sample_index, match_uid, match_label, match_type

In [74]:
# Error samples are defined by normalized mismatch (matches original collision logic)
err = df[df["pred_norm"] != df["lab_norm"]].copy()

# Candidate match table: any ground-truth label can be a collision target
match_tbl = df[["task", "fold", "sample_index", "uid", "label", "lab_norm"]].copy()
match_tbl = match_tbl.rename(columns={
    "fold": "match_fold",
    "sample_index": "match_sample_index",
    "uid": "match_uid",
    "label": "match_label",
    "lab_norm": "match_lab_norm",
})

# Collision event: pred_norm of an error equals lab_norm of some other sample in same task
coll = err.merge(
    match_tbl,
    left_on=["task", "pred_norm"],
    right_on=["task", "match_lab_norm"],
    how="inner",
)

# Keep EXACT schema (plus match_type)
coll_out = coll.rename(columns={"levenshtein_distance": "lev_pred_gt"})  # if present already
coll_out["match_type"] = "pred_norm == lab_norm"

# Column order identical to sent_collisions_norm.csv
cols = [
    "fold", "sample_index", "uid", "prediction", "label", "lev_pred_gt",
    "match_fold", "match_sample_index", "match_uid", "match_label", "match_type"
]
coll_out = coll_out[cols].copy()

print("Collision events:", len(coll_out))
print("By task (inferred from uid prefix):")
print(coll_out["uid"].str.split("|", n=1, expand=True)[0].value_counts())

# Save
COLLISIONS_CSV = OUT_DIR / "collisions_norm_all_tasks.csv"
coll_out.to_csv(COLLISIONS_CSV, index=False)
print("Wrote:", COLLISIONS_CSV)


Collision events: 66043
By task (inferred from uid prefix):
0
word    41576
sent    24467
Name: count, dtype: int64
Wrote: /home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/collision_redo/collisions_norm_all_tasks.csv


In [75]:
# If match_type exists, keep only raw collisions (as per your decision).
coll_raw = coll.copy()
print("coll_raw shape:", coll_raw.shape)


coll_raw shape: (66043, 16)


Cell 4 — Check 1: collision label-frequency concentration (UID-weighted + event-weighted)

This regenerates the exact distributional table you described, and saves:

collision_gtfreq_summary.csv

a LaTeX table snippet you can paste into the report.

In [76]:
# Ground-truth label frequency per task (on normalized labels)
label_counts = (
    df.groupby(["task", "lab_norm"])
      .size()
      .rename("gt_label_count")
      .reset_index()
)

# Attach pred_norm and gt_label_count(pred_norm) to collision events
coll_aug = coll_out.merge(df[["uid", "task", "pred_norm"]], on="uid", how="left")
coll_aug = coll_aug.merge(
    label_counts,
    left_on=["task", "pred_norm"],
    right_on=["task", "lab_norm"],
    how="left",
)

assert coll_aug["gt_label_count"].isna().sum() == 0, "Unexpected: collision pred not found in label counts."

def summarize_counts(values: np.ndarray) -> dict:
    values = np.asarray(values, dtype=float)
    return {
        "n": int(values.size),
        "Median": int(np.median(values)),
        "p90": int(np.percentile(values, 90)),
        "p95": int(np.percentile(values, 95)),
    }

rows = []
for task in ["sent", "word"]:
    name = "Sentence" if task == "sent" else "Word"

    # UID-weighted: one per colliding error sample (uid)
    uid_vals = (
        coll_aug[coll_aug["task"] == task]
        .drop_duplicates("uid")["gt_label_count"]
        .to_numpy()
    )
    rows.append({"Dataset": name, "View": "UID-weighted", **summarize_counts(uid_vals)})

    # Event-weighted: one per collision event
    ev_vals = coll_aug[coll_aug["task"] == task]["gt_label_count"].to_numpy()
    rows.append({"Dataset": name, "View": "Event-weighted", **summarize_counts(ev_vals)})

summary_df = pd.DataFrame(rows)[["Dataset", "View", "n", "Median", "p90", "p95"]]
display(summary_df)

# Save CSV for report
SUMMARY_CSV = OUT_DIR / "collision_gtfreq_summary.csv"
summary_df.to_csv(SUMMARY_CSV, index=False)
print("Wrote:", SUMMARY_CSV)


Unnamed: 0,Dataset,View,n,Median,p90,p95
0,Sentence,UID-weighted,1712,15,21,23
1,Sentence,Event-weighted,24467,17,23,25
2,Word,UID-weighted,2845,14,26,29
3,Word,Event-weighted,41576,19,31,40


Wrote: /home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/collision_redo/collision_gtfreq_summary.csv


Cell 5 — “Most frequent collided predictions” for your “next steps” list

This produces a per-task ranking of collided predicted strings, together with gt_label_count.

In [77]:
top = (
    coll_aug.groupby(["task", "pred_norm", "gt_label_count"])
            .size()
            .rename("collision_event_count")
            .reset_index()
            .sort_values(["task", "collision_event_count"], ascending=[True, False])
)

TOP_CSV = OUT_DIR / "top_collided_predictions_by_task.csv"
top.to_csv(TOP_CSV, index=False)
print("Wrote:", TOP_CSV)

# Show top 15 per task
for task in ["sent", "word"]:
    print("\n=== Top collided predictions:", task, "===")
    display(top[top["task"] == task].head(15))


Wrote: /home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/collision_redo/top_collided_predictions_by_task.csv

=== Top collided predictions: sent ===


Unnamed: 0,task,pred_norm,gt_label_count,collision_event_count
344,sent,inlineskatesfahren ist toll.,17,187
561,sent,soziales netzwerk,21,168
481,sent,rappe/rappt wie ich.,27,162
259,sent,handlungsort; lage; standort,17,153
405,sent,los; ticket; eintrittskarte,17,153
717,sent,tour; fahrt; rundgang,19,152
460,sent,paket; päckchen,25,150
748,sent,verletzen; weh tun,18,144
470,sent,problem; schwierigkeit,23,138
384,sent,klub; verein; ag,27,135



=== Top collided predictions: word ===


Unnamed: 0,task,pred_norm,gt_label_count,collision_event_count
1435,word,land,67,1139
1994,word,uhr,40,400
1831,word,sehr,29,319
2049,word,vor,28,308
1391,word,klein,27,270
1398,word,kommen,26,260
1506,word,material,28,252
1395,word,kochen,34,238
1120,word,erkältung,23,230
1805,word,schnell,46,230


Cell 7 — Curate pairs: collision (i,j) vs random baseline (i,k), same fold and task

This matches your stated protocol (same fold + dataset for the random baseline).

Cell 8 — Compute similarities (with caching) and summarize Check 2 outputs

In [78]:
def concentration_table(cdf: pd.DataFrame, topk=20):
    total_events = len(cdf)
    total_uids   = cdf["uid"].nunique()

    by_events = cdf["prediction"].value_counts().head(topk).reset_index()
    by_events.columns = ["prediction", "collision_events"]
    by_events["event_share_%"] = 100 * by_events["collision_events"] / total_events

    by_uids = (cdf.groupby("prediction")["uid"].nunique()
               .sort_values(ascending=False).head(topk).reset_index())
    by_uids.columns = ["prediction", "unique_error_samples"]
    by_uids["uid_share_%"] = 100 * by_uids["unique_error_samples"] / total_uids

    out = by_events.merge(by_uids, on="prediction", how="outer").fillna(0)
    return out.sort_values("collision_events", ascending=False), total_events, total_uids

for t in ["sent", "word"]:
    cdf = coll[coll["task"] == t].copy()
    tab, n_events, n_uids = concentration_table(cdf, topk=20)

    print(f"\n=== {t.upper()} ===")
    print(f"collision events={n_events} | unique colliding error samples={n_uids}")
    display(tab.head(10))

    top10_event_share = cdf["prediction"].value_counts().head(10).sum() / n_events * 100
    top10_uid_share = (cdf.groupby("prediction")["uid"].nunique()
                       .sort_values(ascending=False).head(10).sum() / n_uids * 100)
    print(f"Top-10 share (events): {top10_event_share:.2f}%")
    print(f"Top-10 share (unique errors): {top10_uid_share:.2f}%")



=== SENT ===
collision events=24467 | unique colliding error samples=1712


Unnamed: 0,prediction,collision_events,event_share_%,unique_error_samples,uid_share_%
6,Inlineskatesfahren ist toll.,187.0,0.764295,11.0,0.642523
23,soziales Netzwerk,168.0,0.686639,8.0,0.46729
11,Rappe/Rappt wie ich.,162.0,0.662116,0.0,0.0
4,Handlungsort; Lage; Standort,153.0,0.625332,9.0,0.525701
8,Los; Ticket; Eintrittskarte,153.0,0.625332,9.0,0.525701
13,Tour; Fahrt; Rundgang,152.0,0.621245,8.0,0.46729
9,Paket; Päckchen,150.0,0.613071,6.0,0.350467
24,verletzen; weh tun,144.0,0.588548,8.0,0.46729
10,Problem; Schwierigkeit,138.0,0.564025,6.0,0.350467
7,Klub; Verein; AG,135.0,0.551764,0.0,0.0


Top-10 share (events): 6.30%
Top-10 share (unique errors): 4.96%

=== WORD ===
collision events=41576 | unique colliding error samples=2845


Unnamed: 0,prediction,collision_events,event_share_%,unique_error_samples,uid_share_%
6,Land,871.0,2.094959,13.0,0.456942
12,Uhr,400.0,0.962094,10.0,0.351494
21,sehr,319.0,0.76727,11.0,0.386643
25,vor,308.0,0.740812,11.0,0.386643
16,klein,270.0,0.649413,10.0,0.351494
18,land,268.0,0.644603,0.0,0.0
17,kommen,234.0,0.562825,9.0,0.316344
2,Erkältung,230.0,0.553204,10.0,0.351494
20,schnell,230.0,0.553204,0.0,0.0
15,gestern,224.0,0.538772,8.0,0.281195


Top-10 share (events): 8.07%
Top-10 share (unique errors): 3.55%


In [79]:
label_freq = (df.groupby(["task", "label"]).size()
              .reset_index(name="gt_label_count"))

coll_freq = coll.merge(
    label_freq,
    left_on=["task", "prediction"],
    right_on=["task", "label"],
    how="left",
    suffixes=("", "_gt")   # <- important: makes the right-side label "label_gt"
)

# Now drop only the right-side join key (optional)
if "label_gt" in coll_freq.columns:
    coll_freq = coll_freq.drop(columns=["label_gt"])

coll_freq["gt_label_count"] = coll_freq["gt_label_count"].fillna(0).astype(int)

for t in ["sent", "word"]:
    x = coll_freq[coll_freq["task"] == t]["gt_label_count"]
    print(f"\n=== {t.upper()} gt_label_count stats for collided predictions ===")
    print(x.describe(percentiles=[.5,.75,.9,.95,.99]))



=== SENT gt_label_count stats for collided predictions ===
count    24467.000000
mean        16.489271
std          5.047093
min          1.000000
50%         17.000000
75%         19.000000
90%         23.000000
95%         25.000000
99%         28.000000
max         38.000000
Name: gt_label_count, dtype: float64

=== WORD gt_label_count stats for collided predictions ===
count    41576.000000
mean        19.174187
std          9.946837
min          0.000000
50%         18.000000
75%         24.000000
90%         29.000000
95%         37.000000
99%         58.000000
max         58.000000
Name: gt_label_count, dtype: float64


In [80]:
print(coll_freq.columns.tolist())


['task', 'fold', 'json_path', 'sample_index', 'prediction', 'label', 'levenshtein_distance', 'uid', 'pred_norm', 'lab_norm', 'lev_pred_gt', 'match_fold', 'match_sample_index', 'match_uid', 'match_label', 'match_lab_norm', 'gt_label_count']


Check 2 — Input similarity on curated collision pairs (Notebook implementation)

This part depends on access to your IMU JSON files (paths in json_path). The code below is robust to common JSON layouts and implements the “dynamics-based” representation:

first-order differences (Δx)

inter-channel correlation (on x and Δx)

cosine similarity of the resulting feature vectors

It outputs:

collision_similarity_pairs.csv (per-pair similarity values)

collision_similarity_summary.csv (delta + probability > chance)

In [81]:
import argparse
import yaml

def load_cfg(yaml_path: str) -> argparse.Namespace:
    with open(yaml_path, "r") as f:
        cfg = yaml.safe_load(f)
    return argparse.Namespace(**cfg)

CFG_SENT_PATH = "/home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/configs/test_sent.yaml"
CFG_WORD_PATH = "/home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/configs/test.yaml"

cfg_sent = load_cfg(CFG_SENT_PATH)
cfg_word = load_cfg(CFG_WORD_PATH)


In [82]:
from rewi.model import build_encoder

def encoder_ratio_ds(cfg: argparse.Namespace) -> int:
    enc = build_encoder(cfg.num_channel, cfg.arch_en, getattr(cfg, "len_seq", 0))
    return int(enc.ratio_ds)

ratio_sent = encoder_ratio_ds(cfg_sent)
ratio_word = encoder_ratio_ds(cfg_word)

print("ratio_ds sent:", ratio_sent)
print("ratio_ds word:", ratio_word)


ratio_ds sent: 8
ratio_ds word: 8


In [83]:
from rewi.dataset import HRDataset

DATASET_CACHE = {}

def get_val_dataset(task: str, fold: int) -> HRDataset:
    key = (task, fold)
    if key in DATASET_CACHE:
        return DATASET_CACHE[key]

    if task == "sent":
        cfg = cfg_sent
        ratio_ds = ratio_sent
    elif task == "word":
        cfg = cfg_word
        ratio_ds = ratio_word
    else:
        raise ValueError(task)

    ds = HRDataset(
        path_anno=os.path.join(cfg.dir_dataset, "val.json"),
        categories=cfg.categories,
        ratio_ds=ratio_ds,
        idx_fold=fold,
        len_seq=getattr(cfg, "len_seq", 0),
        aug=False,
        cache=getattr(cfg, "cache", False),
    )
    DATASET_CACHE[key] = ds
    return ds

def extract_x(item):
    # HRDataset returns (seq, label)
    x = item[0]
    if hasattr(x, "detach"):
        x = x.detach().cpu().numpy()
    x = np.asarray(x)
    if x.ndim != 2:
        raise ValueError(f"Expected (T,C), got {x.shape}")
    return x

def load_x(task: str, fold: int, sample_index: int) -> np.ndarray:
    ds = get_val_dataset(task, fold)
    return extract_x(ds[int(sample_index)])


In [84]:
# One matched sample per error uid (stable selection of one match per uid)
pairs = (coll.sort_values(["uid", "match_uid"])
         .drop_duplicates(subset=["uid"])
         .copy())

# Diversify within each task: at most 2 per prediction
curated = (pairs.groupby(["task", "prediction"], group_keys=False)
           .head(2))

curated200 = pd.concat([
    curated[curated["task"]=="sent"].head(200),
    curated[curated["task"]=="word"].head(200),
], ignore_index=True)


print(curated200["task"].value_counts())
curated200[["task","fold","sample_index","match_fold","match_sample_index",
           "prediction","label","match_label","lev_pred_gt"]]



task
sent    200
word    200
Name: count, dtype: int64


Unnamed: 0,task,fold,sample_index,match_fold,match_sample_index,prediction,label,match_label,lev_pred_gt
0,sent,0,1016,0,2443,Souvenir; Andenken,Good morning.,Souvenir; Andenken,14
1,sent,0,1026,0,1220,fast; annähernd,la reine,fast; annähernd,11
2,sent,0,1027,1,1156,Und ein Angeber!,un avion,Und ein Angeber!,13
3,sent,0,1074,0,1895,einschenken; eingießen; schütten,einsteigen; hineingelangen,einschenken; eingießen; schütten,17
4,sent,0,108,0,1111,Region; Gegend,They're my friends.,Region; Gegend,14
...,...,...,...,...,...,...,...,...,...
395,word,0,234,0,2827,Rolltreppe,Rolltreppen,Rolltreppe,1
396,word,0,236,3,1332,Steuer,Staus,Steuer,3
397,word,0,2364,0,2212,cannot,court,cannot,4
398,word,0,237,0,2880,DVD,DTM,DVD,2


In [85]:
def resample_to_fixed_length(x: np.ndarray, L: int = 200) -> np.ndarray:
    T, C = x.shape
    if T == L:
        return x
    t_old = np.linspace(0, 1, T)
    t_new = np.linspace(0, 1, L)
    out = np.zeros((L, C), dtype=float)
    for c in range(C):
        out[:, c] = np.interp(t_new, t_old, x[:, c])
    return out

def zscore(x: np.ndarray) -> np.ndarray:
    mu = x.mean(axis=0, keepdims=True)
    sd = x.std(axis=0, keepdims=True) + 1e-8
    return (x - mu) / sd

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    a = a.astype(float); b = b.astype(float)
    return float(np.dot(a, b) / ((np.linalg.norm(a)+1e-12) * (np.linalg.norm(b)+1e-12)))

def feature_vector(x: np.ndarray) -> np.ndarray:
    mu  = x.mean(axis=0)
    std = x.std(axis=0)
    mn  = x.min(axis=0)
    mx  = x.max(axis=0)
    eng = (x**2).mean(axis=0)
    return np.concatenate([mu, std, mn, mx, eng], axis=0)

def dyn_feature_vector(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x, dtype=float)
    dx = np.diff(x, axis=0)  # dynamics

    dx_mu  = dx.mean(axis=0)
    dx_std = dx.std(axis=0)
    dx_eng = (dx**2).mean(axis=0)

    # channel correlation structure
    C = x.shape[1]
    corr = np.corrcoef(x.T)
    triu = corr[np.triu_indices(C, k=1)]

    return np.concatenate([dx_mu, dx_std, dx_eng, triu], axis=0)

results = []
for r in curated200.itertuples(index=False):
    xi = load_x(r.task, int(r.fold), int(r.sample_index))
    xj = load_x(r.task, int(r.match_fold), int(r.match_sample_index))

    # Dynamics-based similarity (meaningful after per-sample normalization)
    feat_cos_dyn = cosine(dyn_feature_vector(xi), dyn_feature_vector(xj))

    # Sequence similarity (time-ordered, crude baseline)
    xi_r = resample_to_fixed_length(zscore(xi), L=200).reshape(-1)
    xj_r = resample_to_fixed_length(zscore(xj), L=200).reshape(-1)
    seq_cos = cosine(xi_r, xj_r)

    results.append({
        "task": r.task,
        "uid": r.uid,
        "match_uid": r.match_uid,
        "prediction": r.prediction,
        "gt_i": r.label,
        "gt_j": r.match_label,
        "lev_pred_gt": int(r.lev_pred_gt),
        "Ti": xi.shape[0], "Tj": xj.shape[0],
        "feat_cos_dyn": feat_cos_dyn,
        "seq_cos": seq_cos,
    })

sim_df = pd.DataFrame(results).sort_values(["task","seq_cos"], ascending=[True, False])
sim_df

Unnamed: 0,task,uid,match_uid,prediction,gt_i,gt_j,lev_pred_gt,Ti,Tj,feat_cos_dyn,seq_cos
47,sent,sent|0|2020,sent|0|167,to save,to add,to save,3,436,256,0.512806,0.189861
130,sent,sent|0|2902,sent|1|159,to put through,"to put up, put up",to put through,9,754,838,0.902871,0.181181
120,sent,sent|0|2663,sent|0|110,to ask,to cost,to ask,3,231,170,0.874548,0.177116
15,sent,sent|0|127,sent|0|140,private detective,point of view,private detective,12,334,576,0.950929,0.164751
19,sent,sent|0|1393,sent|0|1853,als Nächstes,alarm clock,als Nächstes,9,356,440,0.870247,0.163395
...,...,...,...,...,...,...,...,...,...,...,...
269,word,word|0|1543,word|0|137,wichtig,dreckig,wichtig,5,695,196,0.791522,-0.124254
363,word,word|0|2098,word|0|1942,Weg,cent,Weg,3,130,118,0.535442,-0.130057
318,word,word|0|1704,word|0|71,nein,voir,nein,3,242,120,0.765943,-0.131371
211,word,word|0|1062,word|0|868,drücken,drucken,drücken,1,466,551,0.753236,-0.150784


In [86]:
sim_df.groupby("task")[["feat_cos_dyn","seq_cos"]].describe()

Unnamed: 0_level_0,feat_cos_dyn,feat_cos_dyn,feat_cos_dyn,feat_cos_dyn,feat_cos_dyn,feat_cos_dyn,feat_cos_dyn,feat_cos_dyn,seq_cos,seq_cos,seq_cos,seq_cos,seq_cos,seq_cos,seq_cos,seq_cos
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
sent,200.0,0.700605,0.162864,0.098255,0.601864,0.734659,0.816192,0.950929,200.0,0.020515,0.065303,-0.160135,-0.022135,0.017278,0.062851,0.189861
word,200.0,0.707805,0.152743,0.114101,0.647254,0.749687,0.804714,0.934405,200.0,0.020934,0.087505,-0.161111,-0.025709,0.005775,0.059482,0.352767


In [87]:
for t in ["sent","word"]:
    sub = coll[coll["task"] == t]
    for fold in sorted(sub["fold"].unique())[:5]:
        ds = get_val_dataset(t, int(fold))
        mx = sub[sub["fold"] == fold]["sample_index"].max()
        print(t, "fold", fold, "len(ds)=", len(ds), "max sample_index in collisions=", mx)


sent fold 0 len(ds)= 4188 max sample_index in collisions= 4172
sent fold 1 len(ds)= 3684 max sample_index in collisions= 3681
sent fold 2 len(ds)= 4416 max sample_index in collisions= 4415
sent fold 3 len(ds)= 3871 max sample_index in collisions= 3867
sent fold 4 len(ds)= 4480 max sample_index in collisions= 4479
word fold 0 len(ds)= 4734 max sample_index in collisions= 4731
word fold 1 len(ds)= 5002 max sample_index in collisions= 4995
word fold 2 len(ds)= 4694 max sample_index in collisions= 4669
word fold 3 len(ds)= 4656 max sample_index in collisions= 4655
word fold 4 len(ds)= 5077 max sample_index in collisions= 5075


In [88]:
import random
random.seed(42)

def sample_random_idx(task: str, fold: int, avoid_idx: int) -> int:
    ds = get_val_dataset(task, fold)
    n = len(ds)
    while True:
        k = random.randint(0, n - 1)
        if k != avoid_idx:
            return k

baseline_rows = []
for r in curated200.itertuples(index=False):
    task = r.task
    fold = int(r.fold)
    i_idx = int(r.sample_index)

    xi = load_x(task, fold, i_idx)

    # collision match
    xj = load_x(task, int(r.match_fold), int(r.match_sample_index))
    sim_collision = cosine(dyn_feature_vector(xi), dyn_feature_vector(xj))

    # random match (same fold/task)
    k_idx = sample_random_idx(task, fold, i_idx)
    xk = load_x(task, fold, k_idx)
    sim_random = cosine(dyn_feature_vector(xi), dyn_feature_vector(xk))

    baseline_rows.append({
        "task": task,
        "uid": r.uid,
        "prediction": r.prediction,
        "sim_collision": sim_collision,
        "sim_random": sim_random,
        "delta": sim_collision - sim_random
    })

base_df = pd.DataFrame(baseline_rows)
display(base_df.groupby("task")[["sim_collision","sim_random","delta"]].describe())

# How often is collision similarity higher than random?
print("\nP(sim_collision > sim_random) by task:")
print(base_df.groupby("task").apply(lambda g: (g["delta"] > 0).mean()))


Unnamed: 0_level_0,sim_collision,sim_collision,sim_collision,sim_collision,sim_collision,sim_collision,sim_collision,sim_collision,sim_random,sim_random,sim_random,sim_random,sim_random,delta,delta,delta,delta,delta,delta,delta,delta
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
sent,200.0,0.700605,0.162864,0.098255,0.601864,0.734659,0.816192,0.950929,200.0,0.689004,...,0.809876,0.935645,200.0,0.011601,0.174156,-0.53428,-0.083545,0.008773,0.107422,0.60021
word,200.0,0.707805,0.152743,0.114101,0.647254,0.749687,0.804714,0.934405,200.0,0.709889,...,0.819757,0.935397,200.0,-0.002084,0.111945,-0.441257,-0.066629,-0.004052,0.063119,0.306242



P(sim_collision > sim_random) by task:
task
sent    0.535
word    0.490
dtype: float64


  print(base_df.groupby("task").apply(lambda g: (g["delta"] > 0).mean()))


In [89]:
summary = base_df.groupby("task").agg(
    mean_collision=("sim_collision", "mean"),
    mean_random=("sim_random", "mean"),
    mean_delta=("delta", "mean"),
    median_delta=("delta", "median"),
    p_collision_gt_random=("delta", lambda s: (s > 0).mean()),
    std_delta=("delta", "std"),
    n=("delta", "size"),
)
print(summary.to_string())


      mean_collision  mean_random  mean_delta  median_delta  p_collision_gt_random  std_delta    n
task                                                                                              
sent        0.700605     0.689004    0.011601      0.008773                  0.535   0.174156  200
word        0.707805     0.709889   -0.002084     -0.004052                  0.490   0.111945  200


In [90]:
desc = base_df.groupby("task")[["sim_collision","sim_random","delta"]].describe()
out_path = "./collision_redo/baseline_similarity_summary_new.csv"
desc.to_csv(out_path)
print("Wrote:", out_path)


Wrote: ./collision_redo/baseline_similarity_summary_new.csv


Check 3: Writer based analysis

In [91]:

# Minimal normalization used in your collision analysis (match what you used earlier)
_WS_RE = re.compile(r"\s+")
def norm_text(s):
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    s = str(s).strip()
    return _WS_RE.sub(" ", s)

# Ensure df has uid
if "uid" not in df.columns:
    df["uid"] = df.apply(lambda r: f"{r['task']}|{int(r['fold'])}|{int(r['sample_index'])}", axis=1)

# Make sure df has normalized label for counts
if "lab_norm" not in df.columns:
    df["lab_norm"] = df["label"].map(norm_text)

# Build per-task ground-truth label frequency
gt_counts = {t: g["lab_norm"].value_counts() for t, g in df.groupby("task")}

# --- coll_freq: event-weighted (one row per collision event) ---
# coll_out should already have uid + match_uid; add task/fold/sample_index by joining to df if needed.
base_cols = ["uid", "match_uid"]
missing_cols = [c for c in base_cols if c not in coll_out.columns]
if missing_cols:
    raise ValueError(f"coll_out missing required columns: {missing_cols}")

# Attach metadata for the error uid (i)
meta_i = df[["uid", "task", "fold", "sample_index", "prediction", "label"]].copy()
meta_i = meta_i.rename(columns={
    "task": "task",
    "fold": "fold",
    "sample_index": "sample_index",
    "prediction": "prediction",
    "label": "label",
})

# Attach matched label for the match uid (j)
meta_j = df[["uid", "label"]].copy().rename(columns={"uid": "match_uid", "label": "match_label"})

coll_freq = (
    coll_out[["uid", "match_uid"]]
    .merge(meta_i, on="uid", how="left")
    .merge(meta_j, on="match_uid", how="left")
)

# Add gt_label_count for the collided prediction (per task)
coll_freq["pred_norm"] = coll_freq["prediction"].map(norm_text)
coll_freq["gt_label_count"] = [
    int(gt_counts[t].get(pn, 0)) if pd.notna(t) else 0
    for t, pn in zip(coll_freq["task"], coll_freq["pred_norm"])
]

# --- uid_level: UID-weighted (one row per colliding error uid) ---
uid_level = (
    coll_freq.sort_values(["uid", "match_uid"])
             .drop_duplicates(subset=["uid"])
             .copy()
)

print("coll_freq rows (event-weighted):", len(coll_freq))
print("coll_freq rows (UID-weighted):", len(uid_level))
display(coll_freq.head(3))
display(uid_level.head(3))


coll_freq rows (event-weighted): 66043
coll_freq rows (UID-weighted): 4557


Unnamed: 0,uid,match_uid,task,fold,sample_index,prediction,label,match_label,pred_norm,gt_label_count
0,sent|0|21,sent|0|1209,sent,0,21,sich handeln um,sich freuen auf,sich handeln um,sich handeln um,19
1,sent|0|21,sent|0|3282,sent,0,21,sich handeln um,sich freuen auf,sich handeln um,sich handeln um,19
2,sent|0|21,sent|0|3539,sent,0,21,sich handeln um,sich freuen auf,sich handeln um,sich handeln um,19


Unnamed: 0,uid,match_uid,task,fold,sample_index,prediction,label,match_label,pred_norm,gt_label_count
1006,sent|0|1016,sent|0|2443,sent,0,1016,Souvenir; Andenken,Good morning.,Souvenir; Andenken,Souvenir; Andenken,0
1018,sent|0|1026,sent|0|1220,sent,0,1026,fast; annähernd,la reine,fast; annähernd,fast; annähernd,12
1030,sent|0|1027,sent|1|1156,sent,0,1027,Und ein Angeber!,un avion,Und ein Angeber!,Und ein Angeber!,0


In [92]:

def build_writer_lookup(dir_dataset: str, task_name: str) -> dict:
    """
    Returns dict keyed by (task, fold, sample_index) -> id_writer
    """
    path_val = os.path.join(dir_dataset, "val.json")
    with open(path_val, "r") as f:
        ann = json.load(f)

    lookup = {}
    folds = ann["annotations"].keys()
    for fold_str in folds:
        items = ann["annotations"][fold_str]
        for idx, a in enumerate(items):
            # expecting a["id_writer"] exists
            lookup[(task_name, int(fold_str), int(idx))] = a.get("id_writer", None)
    return lookup

writer_lookup = {}
writer_lookup.update(build_writer_lookup(cfg_sent.dir_dataset, "sent"))
writer_lookup.update(build_writer_lookup(cfg_word.dir_dataset, "word"))

print("writer_lookup entries:", len(writer_lookup))


writer_lookup entries: 44802


In [93]:
# example
k = ("sent", 0, 0)
print(k, "->", writer_lookup.get(k))


('sent', 0, 0) -> bu/7A0aK


In [94]:
df_w = df.copy()
df_w["id_writer"] = [
    writer_lookup.get((r.task, int(r.fold), int(r.sample_index)), None)
    for r in df_w.itertuples(index=False)
]

missing = df_w["id_writer"].isna().mean()
print(f"Missing writer fraction: {missing:.4f}")
print(df_w["id_writer"].dropna().astype(str).head())


Missing writer fraction: 0.0000
0    bu/7A0aK
1    bu/7A0aK
2    bu/7A0aK
3    bu/7A0aK
4    bu/7A0aK
Name: id_writer, dtype: object


In [95]:
coll_freq_w = coll_freq.copy()
coll_freq_w["id_writer"] = [
    writer_lookup.get((r.task, int(r.fold), int(r.sample_index)), None)
    for r in coll_freq_w.itertuples(index=False)
]

uid_level_w = uid_level.copy()
uid_level_w["id_writer"] = [
    writer_lookup.get((r.task, int(r.fold), int(r.sample_index)), None)
    for r in uid_level_w.itertuples(index=False)
]

print("coll_freq_w missing writer:", coll_freq_w["id_writer"].isna().mean())
print("uid_level_w missing writer:", uid_level_w["id_writer"].isna().mean())


coll_freq_w missing writer: 0.0
uid_level_w missing writer: 0.0


In [96]:
# total samples per writer
totals = (df_w.groupby(["task","id_writer"])
          .agg(n_samples=("uid","size"),
               n_errors=("levenshtein_distance", lambda s: int((s > 0).sum())))
          .reset_index())

# colliding uids per writer (UID-weighted)
coll_uids = (uid_level_w.groupby(["task","id_writer"])
             .agg(n_collision_uids=("uid","nunique"),
                  median_gt_label_count=("gt_label_count","median"),
                  p90_gt_label_count=("gt_label_count", lambda s: float(s.quantile(0.90))))
             .reset_index())

# collision events per writer (event-weighted)
coll_events = (coll_freq_w.groupby(["task","id_writer"])
               .agg(n_collision_events=("uid","size"))
               .reset_index())

writer_stats = (totals.merge(coll_uids, on=["task","id_writer"], how="left")
                      .merge(coll_events, on=["task","id_writer"], how="left"))

writer_stats[["n_collision_uids","n_collision_events"]] = writer_stats[["n_collision_uids","n_collision_events"]].fillna(0).astype(int)

# rates
writer_stats["error_rate"] = writer_stats["n_errors"] / writer_stats["n_samples"]
writer_stats["collision_rate"] = writer_stats["n_collision_uids"] / writer_stats["n_samples"]
writer_stats["collision_given_error"] = writer_stats["n_collision_uids"] / writer_stats["n_errors"].replace(0, np.nan)

# sort for inspection
writer_stats = writer_stats.sort_values(["task","collision_given_error","n_samples"], ascending=[True, False, False])

writer_stats.head(20)


Unnamed: 0,task,id_writer,n_samples,n_errors,n_collision_uids,median_gt_label_count,p90_gt_label_count,n_collision_events,error_rate,collision_rate,collision_given_error
292,sent,ZbuF!g6b,73,4,4,7.0,14.0,51,0.054795,0.054795,1.0
85,sent,28KQnY]1,64,1,1,0.0,0.0,17,0.015625,0.015625,1.0
253,sent,R8qJ5QkV,62,3,3,0.0,11.2,50,0.048387,0.048387,1.0
451,sent,§A§=?Vw!,61,1,1,0.0,0.0,18,0.016393,0.016393,1.0
98,sent,4k/+bh9§,60,1,1,0.0,0.0,12,0.016667,0.016667,1.0
93,sent,47HF=)!p,59,2,2,16.0,18.4,32,0.033898,0.033898,1.0
227,sent,MSkMRb93,58,4,4,0.0,14.7,65,0.068966,0.068966,1.0
126,sent,8hke§sGT,57,1,1,15.0,15.0,15,0.017544,0.017544,1.0
14,sent,"&,![]z5w",56,1,1,0.0,0.0,12,0.017857,0.017857,1.0
82,sent,2#4[A2RD,56,2,2,20.0,21.6,40,0.035714,0.035714,1.0


In [103]:
def topk_writer_summary(writer_stats, task, top_ks=[5, 10, 20]):
    results = []
    df_task = writer_stats[writer_stats["task"] == task].copy()
    df_task = df_task.sort_values("n_collision_uids", ascending=False)

    total_collisions = df_task["n_collision_uids"].sum()
    total_samples = df_task["n_samples"].sum()

    for k in top_ks:
        topk = df_task.head(k)

        # shares
        coll_share = 100 * topk["n_collision_uids"].sum() / total_collisions
        sample_share = 100 * topk["n_samples"].sum() / total_samples
        lift = coll_share / sample_share if sample_share > 0 else np.nan

        # collision given error (weighted)
        total_errors_topk = topk["n_errors"].sum()
        total_collisions_topk = topk["n_collision_uids"].sum()
        collision_given_error = (
            total_collisions_topk / total_errors_topk
            if total_errors_topk > 0 else np.nan
        )

        results.append({
            "task": task,
            "top_k": k,
            "collision_share_%": round(coll_share, 6),
            "sample_share_%": round(sample_share, 6),
            "lift": round(lift, 6),
            "collision_given_error": round(collision_given_error, 6)
        })

    return pd.DataFrame(results)

# Run for both tasks
summary_sent = topk_writer_summary(writer_stats, "sent")
summary_word = topk_writer_summary(writer_stats, "word")

topk_summary = pd.concat([summary_sent, summary_word], ignore_index=True)
topk_summary.head(10)


Unnamed: 0,task,top_k,collision_share_%,sample_share_%,lift,collision_given_error
0,sent,5,8.64486,2.500121,3.457776,0.494983
1,sent,10,14.42757,4.82097,2.99267,0.542857
2,sent,20,23.948598,7.723242,3.100848,0.543767
3,word,5,9.173989,4.122005,2.225614,0.516832
4,word,10,15.219684,7.577701,2.008483,0.522316
5,word,20,25.13181,10.673344,2.354633,0.581301


In [None]:
def concentration_collision_vs_samples(writer_stats: pd.DataFrame, task: str, topk=(5,10,20)):
    sub = writer_stats[writer_stats["task"] == task].copy()

    # Total collisions and samples
    total_coll = sub["n_collision_uids"].sum()
    total_samp = sub["n_samples"].sum()

    # Rank writers by collisions
    sub = sub.sort_values("n_collision_uids", ascending=False)

    out = []
    for k in topk:
        top = sub.head(k)
        coll_share = top["n_collision_uids"].sum() / total_coll if total_coll > 0 else np.nan
        samp_share = top["n_samples"].sum() / total_samp if total_samp > 0 else np.nan
        out.append({
            "task": task,
            "top_k": k,
            "collision_share": coll_share,
            "sample_share": samp_share,
            "lift": (coll_share / samp_share) if (samp_share and samp_share > 0) else np.nan
        })
    return pd.DataFrame(out)

conc2 = pd.concat([
    concentration_collision_vs_samples(writer_stats, "sent"),
    concentration_collision_vs_samples(writer_stats, "word")
], ignore_index=True)

conc2["collision_share_%"] = 100 * conc2["collision_share"]
conc2["sample_share_%"]    = 100 * conc2["sample_share"]
conc2 = conc2.drop(columns=["collision_share","sample_share"])

display(conc2.sort_values(["task","top_k"]))
print(conc2.sort_values(["task","top_k"]).to_string(index=False))


Unnamed: 0,task,top_k,lift,collision_share_%,sample_share_%
0,sent,5,3.457776,8.64486,2.500121
1,sent,10,2.99267,14.42757,4.82097
2,sent,20,3.100848,23.948598,7.723242
3,word,5,2.225614,9.173989,4.122005
4,word,10,2.008483,15.219684,7.577701
5,word,20,2.354633,25.13181,10.673344


task  top_k     lift  collision_share_%  sample_share_%
sent      5 3.457776           8.644860        2.500121
sent     10 2.992670          14.427570        4.820970
sent     20 3.100848          23.948598        7.723242
word      5 2.225614           9.173989        4.122005
word     10 2.008483          15.219684        7.577701
word     20 2.354633          25.131810       10.673344


In [100]:
top_writers = (writer_stats.sort_values(["task","n_collision_uids"], ascending=[True, False])
               .groupby("task", group_keys=False)
               .head(15)[["task","id_writer","n_samples","n_errors","n_collision_uids",
                          "error_rate","collision_rate","collision_given_error"]])

display(top_writers)


Unnamed: 0,task,id_writer,n_samples,n_errors,n_collision_uids,error_rate,collision_rate,collision_given_error
68,sent,1001,233,140,49,0.600858,0.2103,0.35
405,sent,s=4_k-]/,117,37,27,0.316239,0.230769,0.72973
170,sent,"E/r9K[,0",58,28,24,0.482759,0.413793,0.857143
177,sent,FxdJc.NU,48,45,24,0.9375,0.5,0.533333
127,sent,8r/s0Q_2,60,49,24,0.816667,0.4,0.489796
232,sent,N)H]ne[#,49,28,22,0.571429,0.44898,0.785714
7,sent,$0u2cYZw,46,28,22,0.608696,0.478261,0.785714
395,sent,pzT(/e8a,52,39,19,0.75,0.365385,0.487179
301,sent,[h(NNQxU,57,29,18,0.508772,0.315789,0.62069
334,sent,dUa!]%&2,73,30,18,0.410959,0.246575,0.6
