Cell 1 — Imports and global configuration

In [None]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd

# Headless-safe plotting (saves PNGs without needing a display)
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# =========================
# CONFIG
# Every tunable value of the notebook lives here; later cells read these names.
# =========================
# Input predictions table and its field separator (passed to pd.read_csv).
CSV_PATH = "./quant_all_val_predictions_new.csv"
CSV_SEP = ";"
# Directory that receives every CSV and PNG written by this notebook.
# NOTE(review): the outputs recorded in this notebook print
# "./quant_analysis_outputs_new", so they were produced with a different
# OUT_DIR than the value set here — confirm which one is intended.
OUT_DIR = "./quant_analysis_outputs"
# Created eagerly so later cells can write without checking for existence.
os.makedirs(OUT_DIR, exist_ok=True)

# Plot controls
# X_CLIP: upper bound of the clipped histogram and of the windowed CDF.
X_CLIP = 1.5
# HIST_BINS: number of bins used by both histograms.
HIST_BINS = 80

# Regime definition: choose ONE
REGIME_MODE = "quantiles"   # "quantiles" OR "thresholds"
# Upper bounds of the Near-miss / Moderate / Moderate-high regimes.
THR = [0.10, 0.30, 0.60]    # only used if REGIME_MODE == "thresholds"

# Quantile-example extraction
# K_PER_QUANTILE: rows picked closest to each target quantile value.
K_PER_QUANTILE = 5
# QUANTILES: target quantiles, evaluated on rows with lev_norm > 0 only.
QUANTILES = [0.50, 0.90, 0.99]
# TOPK_TAIL: number of largest-lev_norm rows exported per task.
TOPK_TAIL = 8
# MAX_SHOW: maximum characters kept by trunc() before "..." is appended.
MAX_SHOW = 35

# Suspicious row verification
# When True, the Levenshtein distance is recomputed for rows flagged as suspicious.
VERIFY_D_WITH_RECOMPUTE = True

print("OUT_DIR:", OUT_DIR)


OUT_DIR: ./quant_analysis_outputs_new


Cell 2 — Utility helpers (slug + column standardization)

In [4]:
def _slug(s: str) -> str:
    s = str(s).strip().lower()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^a-z0-9_\-]+", "", s)
    return s

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the input columns to the lowercase canonical names used downstream.

    Accepts either:
      - capitalized: "Task","Fold","Json_path","Sample_index","Prediction","Label","Levenshtein_distance"
      - lowercase:   "task","fold","json_path","sample_index","prediction","label","levenshtein_distance"
    plus the aliases listed in ``targets`` below. Matching ignores case and
    any whitespace inside the column name. Columns that match no alias are
    kept under their original name.

    Args:
        df: Frame as loaded from the predictions file. It is not modified.

    Returns:
        A renamed copy of ``df`` that contains every required column.

    Raises:
        ValueError: If two or more input columns map to the same canonical
            name (the rename would create duplicate column labels), or if a
            required column is missing after normalization.
    """
    targets = {
        "task": ["task"],
        "fold": ["fold"],
        "json_path": ["json_path", "jsonpath"],
        "sample_index": ["sample_index", "sampleindex"],
        "prediction": ["prediction", "pred"],
        "label": ["label", "gt", "groundtruth"],
        "levenshtein_distance": ["levenshtein_distance", "levenshteindistance", "lev", "distance"],
    }
    alias_to_canon = {
        alias: canon for canon, aliases in targets.items() for alias in aliases
    }

    rename = {}
    claimed = {}  # canonical name -> original columns that map onto it
    for original in df.columns:
        # str() so non-string labels (e.g. integer headers) do not crash re.sub.
        norm = re.sub(r"\s+", "", str(original)).lower()
        canon = alias_to_canon.get(norm)
        if canon is None:
            continue
        rename[original] = canon
        claimed.setdefault(canon, []).append(original)

    # Two sources for one canonical name would yield duplicate column labels,
    # after which df["label"] returns a DataFrame and later cells fail obscurely.
    ambiguous = {canon: cols for canon, cols in claimed.items() if len(cols) > 1}
    if ambiguous:
        raise ValueError(
            f"Ambiguous columns, several inputs map to the same canonical name: {ambiguous}\n"
            f"Found columns: {list(df.columns)}"
        )

    df = df.rename(columns=rename)

    required = ["task", "fold", "json_path", "sample_index", "prediction", "label", "levenshtein_distance"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(
            f"Missing required columns after normalization: {missing}\n"
            f"Found columns: {list(df.columns)}"
        )
    return df


Cell 3 — String sanitization helpers + optional Levenshtein recompute

In [5]:
def nfc(s: str) -> str:
    """Return ``s`` as an NFC-normalized string; missing values become ``""``.

    Args:
        s: Text (or any scalar; it is stringified). ``None``/NaN count as missing.

    Returns:
        The NFC-normalized text.
    """
    if pd.isna(s):
        # Normalizing the empty string yields the empty string.
        return ""
    return unicodedata.normalize("NFC", str(s))

def show_repr(s: str) -> str:
    """Return ``repr`` of the stringified value so hidden characters become visible.

    Args:
        s: Text (or any scalar). ``None``/NaN are treated as the empty string.

    Returns:
        The ``repr`` of the text, quotes included.
    """
    if pd.isna(s):
        return repr("")
    return repr(str(s))

def trunc(s: str, n: int = MAX_SHOW) -> str:
    """Shorten text to at most ``n`` characters, marking any cut with ``"..."``.

    Args:
        s: Text (or any scalar). ``None``/NaN are treated as the empty string.
        n: Maximum number of characters kept before the ellipsis.

    Returns:
        The text unchanged if it has at most ``n`` characters, otherwise its
        first ``n`` characters followed by ``"..."``.
    """
    text = "" if pd.isna(s) else str(s)
    if len(text) > n:
        return text[:n] + "..."
    return text

def levenshtein(a: str, b: str) -> int:
    """Edit distance between ``a`` and ``b`` (unit-cost insert, delete, substitute).

    Pure-Python dynamic programming with a single rolling row; intended for
    small or suspicious subsets only, not for bulk use.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The minimum number of single-character edits turning ``a`` into ``b``.
    """
    if a == b:
        return 0
    if not a or not b:
        # One side is empty: the distance is the length of the other side.
        return max(len(a), len(b))

    # Iterate rows over the longer string so the rolling row has the shorter width.
    longer, shorter = (b, a) if len(b) > len(a) else (a, b)
    width = len(shorter)

    row = list(range(width + 1))
    for i, ch_long in enumerate(longer, start=1):
        diagonal = row[0]  # value of the previous row at column j - 1
        row[0] = i
        for j in range(1, width + 1):
            above = row[j]  # value of the previous row at column j
            substitution = diagonal + (0 if ch_long == shorter[j - 1] else 1)
            row[j] = min(above + 1, row[j - 1] + 1, substitution)
            diagonal = above
    return row[width]


Cell 4 — Summary and regime functions

In [6]:
def summarize_group(g: pd.DataFrame) -> pd.Series:
    """Compute the summary statistics of one group of prediction rows.

    Args:
        g: Rows of one group. Reads the columns ``levenshtein_distance``,
            ``label_chars``, ``label_words``, ``lev_norm`` and ``exact``.

    Returns:
        A Series with the row count, exact-match rate (%), mean label length
        in characters and words, micro- and macro-averaged normalized
        Levenshtein, its 50/90/95/99 percentiles (over finite values) and the
        share (%) of rows with ``lev_norm`` above 0.5 and above 1.0.
    """
    lev_norm = g["lev_norm"]

    # Percentiles are taken over finite values only; NaN when none remain.
    finite = lev_norm.to_numpy()
    finite = finite[np.isfinite(finite)]
    levels = (0.5, 0.9, 0.95, 0.99)
    if len(finite) == 0:
        pct = dict.fromkeys(levels, np.nan)
    else:
        pct = {level: np.quantile(finite, level) for level in levels}

    # Micro average: total edits over total label characters (denominator >= 1).
    total_edits = g["levenshtein_distance"].sum()
    total_chars = g["label_chars"].sum()

    stats = {
        "N": len(g),
        "Exact_match_%": 100.0 * g["exact"].mean(),
        "Mean_label_chars": g["label_chars"].mean(),
        "Mean_label_words": g["label_words"].mean(),
        "LevNorm_micro": total_edits / max(1, total_chars),
        "LevNorm_macro": lev_norm.mean(),
        "LevNorm_p50": pct[0.5],
        "LevNorm_p90": pct[0.9],
        "LevNorm_p95": pct[0.95],
        "LevNorm_p99": pct[0.99],
        "Tail_P(LevNorm>0.5)%": 100.0 * (lev_norm > 0.5).mean(),
        "Tail_P(LevNorm>1.0)%": 100.0 * (lev_norm > 1.0).mean(),
    }
    return pd.Series(stats)

def regimes_quantiles(g: pd.DataFrame) -> pd.Series:
    """Split a group into error regimes using quantiles of its non-zero errors.

    The 50th, 90th and 99th percentiles of ``lev_norm`` over rows with
    ``lev_norm > 0`` serve as cut points. Each regime is reported as a
    percentage of all rows in the group.

    Args:
        g: Rows of one group; only the ``lev_norm`` column is read.

    Returns:
        A Series with the shares ``Exact``, ``Near-miss``, ``Moderate``,
        ``Moderate-high`` and ``Catastrophic`` plus the three cut points
        (NaN, with ``Exact`` set to 100, when no row has ``lev_norm > 0``).
    """
    score = g["lev_norm"]
    n_rows = len(g)
    positive = score[score > 0].to_numpy()

    if len(positive) == 0:
        return pd.Series({
            "Exact": 100.0, "Near-miss": 0.0, "Moderate": 0.0, "Moderate-high": 0.0, "Catastrophic": 0.0,
            "q50_nonzero": np.nan, "q90_nonzero": np.nan, "q99_nonzero": np.nan
        })

    cut50, cut90, cut99 = (np.quantile(positive, level) for level in (0.50, 0.90, 0.99))

    def share(mask: pd.Series):
        """Percentage of the group's rows selected by ``mask``."""
        return 100.0 * mask.sum() / n_rows

    def band(low: float, high: float) -> pd.Series:
        """Rows with ``low < lev_norm <= high``."""
        return (score > low) & (score <= high)

    return pd.Series({
        "Exact": share(score == 0),
        "Near-miss": share(band(0, cut50)),
        "Moderate": share(band(cut50, cut90)),
        "Moderate-high": share(band(cut90, cut99)),
        "Catastrophic": share(score > cut99),
        "q50_nonzero": cut50,
        "q90_nonzero": cut90,
        "q99_nonzero": cut99,
    })

def regimes_thresholds(g: pd.DataFrame, thr=(0.10, 0.30, 0.60)) -> pd.Series:
    """Split a group into error regimes using fixed ``lev_norm`` thresholds.

    Args:
        g: Rows of one group; only the ``lev_norm`` column is read.
        thr: Three cut points ``(t1, t2, t3)``: the upper bounds of the
            Near-miss, Moderate and Moderate-high regimes.

    Returns:
        A Series with the shares (% of the group's rows) ``Exact``
        (``lev_norm == 0``), ``Near-miss`` (``0 < x <= t1``), ``Moderate``
        (``t1 < x <= t2``), ``Moderate-high`` (``t2 < x <= t3``) and
        ``Catastrophic`` (``x > t3``), followed by the thresholds used.
    """
    low, mid, high = thr
    score = g["lev_norm"]
    n_rows = len(g)

    def share(mask: pd.Series):
        """Percentage of the group's rows selected by ``mask``."""
        return 100.0 * mask.sum() / n_rows

    shares = {
        "Exact": share(score == 0),
        "Near-miss": share((score > 0) & (score <= low)),
        "Moderate": share((score > low) & (score <= mid)),
        "Moderate-high": share((score > mid) & (score <= high)),
        "Catastrophic": share(score > high),
    }
    return pd.Series({**shares, "thr1": low, "thr2": mid, "thr3": high})


Cell 5 — Plotting functions

In [7]:
def plot_exact_rate_by_fold(task: str, g: pd.DataFrame, out_dir: str):
    """Save a bar chart of the exact-match rate (%) per fold for one task.

    Args:
        task: Task name; used in the title and, slugified, in the file name.
        g: Rows of that task; reads the ``fold`` and ``exact`` columns.
        out_dir: Directory receiving ``<task-slug>_exact_rate_by_fold.png``.
    """
    fold_pct = 100.0 * g.groupby("fold")["exact"].mean().sort_index()
    # String tick labels make the bars categorical, one per fold.
    fold_labels = fold_pct.index.astype(int).astype(str)

    fig, ax = plt.subplots()
    ax.bar(fold_labels, fold_pct.values)
    ax.set_xlabel("Fold")
    ax.set_ylabel("Exact match (%)")
    ax.set_title(f"{task}: Exact match rate by fold")

    target = os.path.join(out_dir, f"{_slug(task)}_exact_rate_by_fold.png")
    fig.savefig(target, dpi=200, bbox_inches="tight")
    # Close explicitly so figures do not accumulate across tasks.
    plt.close(fig)

def plot_histograms_and_cdfs(task: str, g: pd.DataFrame, out_dir: str, x_clip: float = 1.5):
    """Save four distribution plots of normalized Levenshtein for one task.

    Written to ``out_dir``, each prefixed with the slugified task name:
    an unclipped histogram, a histogram with values clipped to
    ``[0, x_clip]``, the empirical CDF, and the CDF restricted to
    ``x <= x_clip``. Non-finite ``lev_norm`` values are ignored.

    Args:
        task: Task name; used in titles and, slugified, in file names.
        g: Rows of that task; only the ``lev_norm`` column is read.
        out_dir: Output directory for the PNG files.
        x_clip: Clip value of the second histogram and right edge of the
            windowed CDF.
    """
    finite = g["lev_norm"].to_numpy()
    finite = finite[np.isfinite(finite)]
    prefix = _slug(task)

    def _label_and_save(ax, xlabel: str, ylabel: str, title: str, filename: str) -> None:
        """Label the axes, write the figure into ``out_dir`` and close it."""
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.figure.savefig(os.path.join(out_dir, filename), dpi=200, bbox_inches="tight")
        plt.close(ax.figure)

    # Unclipped histogram
    _, ax = plt.subplots()
    ax.hist(finite, bins=HIST_BINS)
    _label_and_save(
        ax,
        "Normalized Levenshtein (unclipped)",
        "Count",
        f"{task}: histogram of normalized Levenshtein (unclipped)",
        f"{prefix}_levnorm_hist_unclipped.png",
    )

    # Clipped histogram: values above x_clip pile up in the last bin
    _, ax = plt.subplots()
    ax.hist(np.clip(finite, 0, x_clip), bins=HIST_BINS, range=(0, x_clip))
    _label_and_save(
        ax,
        f"Normalized Levenshtein (clipped at {x_clip})",
        "Count",
        f"{task}: histogram of normalized Levenshtein (clipped)",
        f"{prefix}_levnorm_hist_clip{x_clip}.png",
    )

    # Empirical CDF: the i-th smallest value is paired with i / n
    ordered = np.sort(finite)
    cum_frac = np.arange(1, len(ordered) + 1) / len(ordered)

    # Unclipped CDF
    _, ax = plt.subplots()
    ax.plot(ordered, cum_frac)
    _label_and_save(
        ax,
        "Normalized Levenshtein (unclipped)",
        "CDF",
        f"{task}: CDF of normalized Levenshtein (unclipped)",
        f"{prefix}_levnorm_cdf_unclipped.png",
    )

    # Clipped-window CDF: same curve, points with x > x_clip left out
    in_window = ordered <= x_clip
    _, ax = plt.subplots()
    ax.plot(ordered[in_window], cum_frac[in_window])
    _label_and_save(
        ax,
        f"Normalized Levenshtein (x ≤ {x_clip})",
        "CDF",
        f"{task}: CDF of normalized Levenshtein (x ≤ {x_clip})",
        f"{prefix}_levnorm_cdf_xmax{x_clip}.png",
    )


Cell 6 — Quantile-example extraction functions

In [8]:
def pick_k_closest(nonzero_df: pd.DataFrame, target: float, k: int, used_uids: set) -> pd.DataFrame:
    """Pick the ``k`` not-yet-used rows whose ``lev_norm`` is closest to ``target``.

    Args:
        nonzero_df: Candidate rows; reads the ``uid`` and ``lev_norm`` columns.
        target: Value to approach.
        k: Maximum number of rows to return.
        used_uids: ``uid`` values that must not be picked again.

    Returns:
        Up to ``k`` rows ordered by absolute distance to ``target`` (ties by
        ascending ``lev_norm``) with an added ``abs_diff`` column. If no
        candidate is left, the empty selection is returned without that column.
    """
    still_free = ~nonzero_df["uid"].isin(used_uids)
    candidates = nonzero_df.loc[still_free].copy()
    if candidates.empty:
        return candidates

    candidates = candidates.assign(abs_diff=(candidates["lev_norm"] - target).abs())
    ranked = candidates.sort_values(by=["abs_diff", "lev_norm"], ascending=[True, True])
    return ranked.head(k).copy()

def quantile_examples_for_task(task_df: pd.DataFrame, k_per_q: int, quantiles: list[float]):
    """Collect example rows near each requested quantile of the non-zero errors.

    Quantile values are computed over rows with ``lev_norm > 0``. For each
    quantile, in the given order, up to ``k_per_q`` rows closest to that value
    are picked; a row picked for one quantile is not reused for a later one.

    Args:
        task_df: Rows of one task; reads ``lev_norm`` and ``uid``.
        k_per_q: Maximum number of examples per quantile.
        quantiles: Quantile levels in ``[0, 1]``.

    Returns:
        ``(examples, qvals)``: the picked rows with added ``target_quantile``
        and ``target_value`` columns, and a dict mapping each quantile level
        to its value. Both are empty when the task has no non-zero error.
    """
    errors = task_df.loc[task_df["lev_norm"] > 0].copy()
    if len(errors) == 0:
        return pd.DataFrame(), {}

    scores = errors["lev_norm"].to_numpy()
    qvals = {}
    for level in quantiles:
        qvals[level] = float(np.quantile(scores, level))

    taken = set()  # uids already handed out to an earlier quantile
    chunks = []
    for level in quantiles:
        goal = qvals[level]
        chunk = pick_k_closest(errors, goal, k_per_q, taken)
        if chunk.empty:
            continue
        chunk["target_quantile"] = level
        chunk["target_value"] = goal
        chunks.append(chunk)
        taken.update(chunk["uid"].tolist())

    if not chunks:
        return pd.DataFrame(), qvals
    return pd.concat(chunks, ignore_index=True), qvals

def tail_topk_for_task(task_df: pd.DataFrame, topk: int) -> pd.DataFrame:
    """Return the ``topk`` rows with the largest non-zero ``lev_norm``.

    Args:
        task_df: Rows of one task; reads the ``lev_norm`` column.
        topk: Number of rows to keep.

    Returns:
        The worst rows in descending ``lev_norm`` order, or an empty frame
        without columns when no row has ``lev_norm > 0``.
    """
    errors = task_df.loc[task_df["lev_norm"] > 0].copy()
    if errors.empty:
        return pd.DataFrame()
    worst_first = errors.sort_values(by="lev_norm", ascending=False)
    return worst_first.head(topk).copy()


Cell 7 — Load CSV and build core features

In [9]:
# Read text fields verbatim. With pandas' default NA handling, empty strings and
# tokens such as "NA", "null" or "nan" in the prediction/label columns are parsed
# as missing values, and the astype(str) below would then turn them into the
# literal string "nan" (3 characters), corrupting label_chars and lev_norm.
df = pd.read_csv(CSV_PATH, sep=CSV_SEP, keep_default_na=False)
df = standardize_columns(df)

# Types
df["task"] = df["task"].astype(str)
# errors="coerce" turns unparsable or empty numeric cells into NaN so they can be dropped.
df["fold"] = pd.to_numeric(df["fold"], errors="coerce")
df["sample_index"] = pd.to_numeric(df["sample_index"], errors="coerce")
df["levenshtein_distance"] = pd.to_numeric(df["levenshtein_distance"], errors="coerce")

n_loaded = len(df)
df = df.dropna(subset=["fold", "sample_index", "levenshtein_distance"]).copy()
n_dropped = n_loaded - len(df)
if n_dropped:
    # Report discarded rows so they do not disappear from the analysis unnoticed.
    print(f"Dropped {n_dropped} of {n_loaded} rows with non-numeric fold/sample_index/levenshtein_distance")

df["fold"] = df["fold"].astype(int)
df["sample_index"] = df["sample_index"].astype(int)
df["levenshtein_distance"] = df["levenshtein_distance"].astype(int)

df["prediction"] = df["prediction"].astype(str)
df["label"] = df["label"].astype(str)

# NFC versions (for stable length + debug checks)
df["pred_nfc"] = df["prediction"].map(nfc)
df["label_nfc"] = df["label"].map(nfc)

# Lengths and metrics
# Both lengths are floored at 1 so the normalization below never divides by zero.
df["label_chars"] = df["label_nfc"].str.len().clip(lower=1)
df["label_words"] = df["label_nfc"].str.split().apply(len).clip(lower=1)

# Normalized Levenshtein: edit distance per label character (can exceed 1).
df["lev_norm"] = df["levenshtein_distance"] / df["label_chars"]
df["exact"] = (df["levenshtein_distance"] == 0)

df.head()


Unnamed: 0,task,fold,json_path,sample_index,prediction,label,levenshtein_distance,pred_nfc,label_nfc,label_chars,label_words,lev_norm,exact
0,sent,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,0,Du bist dran.,Du bist dran.,0,Du bist dran.,Du bist dran.,13,3,0.0,True
1,sent,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,1,"Lass/Lasst uns hingehen!; Los, gehen wir hin!","Lass/Lasst uns hingehen!; Los, gehen wir hin!",0,"Lass/Lasst uns hingehen!; Los, gehen wir hin!","Lass/Lasst uns hingehen!; Los, gehen wir hin!",45,7,0.0,True
2,sent,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,2,Ich hole sie/ihn.,Ich hole sie/ihn.,0,Ich hole sie/ihn.,Ich hole sie/ihn.,17,3,0.0,True
3,sent,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,3,Hospital; Krankenhaus,Hospital; Krankenhaus,0,Hospital; Krankenhaus,Hospital; Krankenhaus,21,2,0.0,True
4,sent,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,4,schon einmal; vorher; zuvor,schon einmal; vorher; zuvor,0,schon einmal; vorher; zuvor,schon einmal; vorher; zuvor,27,4,0.0,True


Cell 8 — Overall + foldwise summary CSVs

In [10]:
# Columns read by summarize_group. Selecting them explicitly keeps the grouping
# keys out of the frames passed to apply(); letting apply() operate on the
# grouping columns is deprecated in recent pandas and emits a warning.
summary_cols = ["levenshtein_distance", "label_chars", "label_words", "lev_norm", "exact"]

# One row per task, and one row per (task, fold).
overall = df.groupby("task")[summary_cols].apply(summarize_group).sort_index()
foldwise = df.groupby(["task", "fold"])[summary_cols].apply(summarize_group).sort_index()

overall_path = os.path.join(OUT_DIR, "overall_summary_norm.csv")
foldwise_path = os.path.join(OUT_DIR, "foldwise_summary_norm.csv")

# index=True keeps the task / (task, fold) keys as the leading CSV columns.
overall.to_csv(overall_path, index=True)
foldwise.to_csv(foldwise_path, index=True)

overall, foldwise.head()


  overall = df.groupby("task", as_index=True).apply(summarize_group).sort_index()
  foldwise = df.groupby(["task", "fold"], as_index=True).apply(summarize_group).sort_index()


(            N  Exact_match_%  Mean_label_chars  Mean_label_words  \
 task                                                               
 sent  20639.0      78.235380         18.932119          2.973497   
 word  24163.0      79.406531          6.525100          1.000000   
 
       LevNorm_micro  LevNorm_macro  LevNorm_p50  LevNorm_p90  LevNorm_p95  \
 task                                                                        
 sent       0.163636       0.169270          0.0     0.823529          1.0   
 word       0.128366       0.127702          0.0     0.625000          0.8   
 
       LevNorm_p99  Tail_P(LevNorm>0.5)%  Tail_P(LevNorm>1.0)%  
 task                                                           
 sent     1.428571             16.929115              4.384902  
 word     1.142857             12.030791              1.117411  ,
                 N  Exact_match_%  Mean_label_chars  Mean_label_words  \
 task fold                                                              
 

Cell 9 — Regime tables (overall + foldwise)

In [12]:
# Choose the regime function from the configured mode. An unknown mode is an
# error: falling through to thresholds would silently produce a table (and a
# file name) that does not match what was asked for.
if REGIME_MODE == "quantiles":
    regime_fn = regimes_quantiles
elif REGIME_MODE == "thresholds":
    def regime_fn(g: pd.DataFrame) -> pd.Series:
        """Threshold regimes using the configured THR cut points."""
        return regimes_thresholds(g, THR)
else:
    raise ValueError(
        f"Unknown REGIME_MODE {REGIME_MODE!r}; expected 'quantiles' or 'thresholds'"
    )

# Both regime functions read only lev_norm. Selecting it explicitly keeps the
# grouping keys out of apply(), which is deprecated in recent pandas.
regimes_overall = df.groupby("task")[["lev_norm"]].apply(regime_fn).sort_index()
regimes_fold = df.groupby(["task", "fold"])[["lev_norm"]].apply(regime_fn).sort_index()

reg_overall_path = os.path.join(OUT_DIR, f"regimes_overall_{REGIME_MODE}.csv")
reg_fold_path = os.path.join(OUT_DIR, f"regimes_fold_{REGIME_MODE}.csv")

regimes_overall.to_csv(reg_overall_path, index=True)
regimes_fold.to_csv(reg_fold_path, index=True)

regimes_overall


  regimes_overall = df.groupby("task", as_index=True).apply(regimes_quantiles).sort_index()
  regimes_fold = df.groupby(["task", "fold"], as_index=True).apply(regimes_quantiles).sort_index()


Unnamed: 0_level_0,Exact,Near-miss,Moderate,Moderate-high,Catastrophic,q50_nonzero,q90_nonzero,q99_nonzero
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
sent,78.23538,11.13426,8.479093,1.938078,0.213189,0.8,1.230769,1.777778
word,79.406531,10.437446,9.038613,0.984977,0.132434,0.6,1.0,1.5


Cell 10 — Correlations (simple + report-table style)

In [13]:
# Simple per-task Pearson correlations (raw d vs len, and lev_norm vs len)
def _pearson_or_nan(x: pd.Series, y: pd.Series) -> float:
    """Pearson correlation of two equally long series, NaN where it is undefined.

    The coefficient is undefined with fewer than two observations or when
    either series is constant. Those cases are detected explicitly rather
    than by catching every exception, so real problems (for example a
    missing column at the call site) still surface.
    """
    xs = x.to_numpy(dtype=float)
    ys = y.to_numpy(dtype=float)
    if len(xs) < 2 or np.ptp(xs) == 0 or np.ptp(ys) == 0:
        return np.nan
    return float(np.corrcoef(xs, ys)[0, 1])

corr_rows = []
for task, g in df.groupby("task"):
    corr_rows.append({
        "task": task,
        "pearson_rawLev_vs_len": _pearson_or_nan(g["levenshtein_distance"], g["label_chars"]),
        "pearson_normLev_vs_len": _pearson_or_nan(g["lev_norm"], g["label_chars"]),
    })

length_corr_simple = pd.DataFrame(corr_rows).sort_values("task")
length_corr_simple_path = os.path.join(OUT_DIR, "length_correlation_raw_vs_norm.csv")
length_corr_simple.to_csv(length_corr_simple_path, index=False)

# Report-table style (All vs d>0 only)
def corr_len_block(g: pd.DataFrame) -> dict:
    """Pearson correlations between label length and error for one group.

    Correlations are computed twice: over all rows, and over rows with
    ``levenshtein_distance > 0`` only (NaN when fewer than two such rows).

    Args:
        g: Rows of one group; reads ``levenshtein_distance``, ``lev_norm``
            and ``label_chars``.

    Returns:
        A dict with ``rho_d_*`` (raw distance vs length) and ``rho_dtilde_*``
        (normalized distance vs length) for the ``all`` and ``dpos`` subsets,
        plus the row counts ``n_all`` and ``n_dpos``.
    """
    def _rho(frame: pd.DataFrame, column: str) -> float:
        """Correlation of ``column`` with ``label_chars`` as a float (NaN if undefined)."""
        value = frame[column].corr(frame["label_chars"], method="pearson")
        return float(value) if pd.notna(value) else np.nan

    errors_only = g.loc[g["levenshtein_distance"] > 0]
    enough_errors = len(errors_only) >= 2

    return {
        "rho_d_all": _rho(g, "levenshtein_distance"),
        "rho_dtilde_all": _rho(g, "lev_norm"),
        "rho_d_dpos": _rho(errors_only, "levenshtein_distance") if enough_errors else np.nan,
        "rho_dtilde_dpos": _rho(errors_only, "lev_norm") if enough_errors else np.nan,
        "n_all": int(len(g)),
        "n_dpos": int(len(errors_only)),
    }

# One record per task: the correlation block plus the task name.
rows = [
    {**corr_len_block(task_rows), "task": task_name}
    for task_name, task_rows in df.groupby("task", sort=True)
]

# Put "task" first and keep the report's column order.
corr_columns = ["task", "rho_d_all", "rho_dtilde_all", "rho_d_dpos", "rho_dtilde_dpos", "n_all", "n_dpos"]
corr_df = pd.DataFrame(rows).loc[:, corr_columns].sort_values(by="task")

corr_df_path = os.path.join(OUT_DIR, "corr_len_vs_error_by_task.csv")
corr_df.to_csv(corr_df_path, index=False)

length_corr_simple, corr_df



(   task  pearson_rawLev_vs_len  pearson_normLev_vs_len
 0  sent               0.139599               -0.032001
 1  word               0.202630                0.005674,
    task  rho_d_all  rho_dtilde_all  rho_d_dpos  rho_dtilde_dpos  n_all  n_dpos
 0  sent   0.139599       -0.032001    0.663745        -0.079558  20639    4492
 1  word   0.202630        0.005674    0.623748        -0.104828  24163    4976)

Cell 11 — Generate plots per task (PNG files)

In [14]:
# Write the per-task figures: exact-match rate by fold, then the four
# lev_norm distribution plots.
for task_name, task_rows in df.groupby("task"):
    plot_exact_rate_by_fold(task_name, task_rows, OUT_DIR)
    plot_histograms_and_cdfs(task_name, task_rows, OUT_DIR, x_clip=X_CLIP)

print("Plots saved in:", OUT_DIR)


Plots saved in: ./quant_analysis_outputs_new


Cell 12 — Build UID, suspicious flags, and optional recompute check

In [15]:
# Row identifier: task, fold and sample index joined with "|".
fold_txt = df["fold"].astype(str)
sample_txt = df["sample_index"].astype(str)
df["uid"] = df["task"] + "|" + fold_txt + "|" + sample_txt

# Debug visibility: repr() exposes invisible characters, lengths expose padding.
nfc_sources = {"pred": "pred_nfc", "label": "label_nfc"}
for stem, source_col in nfc_sources.items():
    df[f"{stem}_repr"] = df[source_col].map(show_repr)
for stem, source_col in nfc_sources.items():
    df[f"{stem}_len"] = df[source_col].str.len()

# Suspicious flags: the stored distance disagrees with plain string equality
# of the NFC-normalized prediction and label.
same_text = df["pred_nfc"] == df["label_nfc"]
df["looks_equal_but_dpos"] = same_text & (df["levenshtein_distance"] > 0)
df["looks_unequal_but_dzero"] = ~same_text & (df["levenshtein_distance"] == 0)
df["is_suspicious"] = df["looks_equal_but_dpos"] | df["looks_unequal_but_dzero"]

# Optional recompute on suspicious rows only (the pure-Python DP is slow).
df["lev_recomputed_nfc"] = np.nan
df["d_mismatch_flag"] = False

if VERIFY_D_WITH_RECOMPUTE:
    flagged = df.index[df["is_suspicious"]].tolist()
    if flagged:
        df.loc[flagged, "lev_recomputed_nfc"] = [
            levenshtein(pred_text, label_text)
            for pred_text, label_text in zip(df.loc[flagged, "pred_nfc"], df.loc[flagged, "label_nfc"])
        ]
        recomputed_d = df.loc[flagged, "lev_recomputed_nfc"].astype(int)
        stored_d = df.loc[flagged, "levenshtein_distance"].astype(int)
        df.loc[flagged, "d_mismatch_flag"] = recomputed_d != stored_d

print("Suspicious rows:", int(df["is_suspicious"].sum()))
print("D mismatches among suspicious:", int(df["d_mismatch_flag"].sum()))


Suspicious rows: 0
D mismatches among suspicious: 0


Cell 13 — Create report-friendly truncated columns

In [16]:
# Full-text copies first, then versions truncated to MAX_SHOW characters.
for text_col in ("prediction", "label"):
    df[f"{text_col}_full"] = df[text_col]
for text_col in ("prediction", "label"):
    df[f"{text_col}_short"] = [trunc(text, MAX_SHOW) for text in df[text_col]]

preview_cols = ["task","fold","sample_index","levenshtein_distance","lev_norm","prediction_short","label_short"]
df[preview_cols].head()


Unnamed: 0,task,fold,sample_index,levenshtein_distance,lev_norm,prediction_short,label_short
0,sent,0,0,0,0.0,Du bist dran.,Du bist dran.
1,sent,0,1,0,0.0,"Lass/Lasst uns hingehen!; Los, gehe...","Lass/Lasst uns hingehen!; Los, gehe..."
2,sent,0,2,0,0.0,Ich hole sie/ihn.,Ich hole sie/ihn.
3,sent,0,3,0,0.0,Hospital; Krankenhaus,Hospital; Krankenhaus
4,sent,0,4,0,0.0,schon einmal; vorher; zuvor,schon einmal; vorher; zuvor


Cell 14 — Quantile examples + tail examples + quantile targets

In [17]:
# Per-task pieces, concatenated once after the loop.
all_examples, all_qvals_rows, all_tail = [], [], []

for task_name, task_rows in df.groupby("task", sort=True):
    ex_df, qvals = quantile_examples_for_task(task_rows, K_PER_QUANTILE, QUANTILES)
    tail_df = tail_topk_for_task(task_rows, TOPK_TAIL)

    # Task-level counts are identical for every quantile row of this task.
    task_counts = {
        "n_nonzero": int((task_rows["lev_norm"] > 0).sum()),
        "n_total": int(len(task_rows)),
        "exact_match_%": float(100.0 * (task_rows["levenshtein_distance"] == 0).mean()),
    }
    all_qvals_rows.extend(
        {"task": task_name, "quantile": level, "quantile_value": value, **task_counts}
        for level, value in qvals.items()
    )

    if not ex_df.empty:
        all_examples.append(ex_df)
    if not tail_df.empty:
        all_tail.append(tail_df)

# Fall back to empty frames when a list stayed empty (qvals keeps its schema).
if all_examples:
    examples_all = pd.concat(all_examples, ignore_index=True)
else:
    examples_all = pd.DataFrame()

if all_qvals_rows:
    qvals_all = pd.DataFrame(all_qvals_rows)
else:
    qvals_all = pd.DataFrame(
        columns=["task", "quantile", "quantile_value", "n_nonzero", "n_total", "exact_match_%"]
    )

if all_tail:
    tail_all = pd.concat(all_tail, ignore_index=True)
else:
    tail_all = pd.DataFrame()

print("examples_all:", examples_all.shape)
print("qvals_all:", qvals_all.shape)
print("tail_all:", tail_all.shape)


examples_all: (30, 30)
qvals_all: (6, 6)
tail_all: (16, 27)


Cell 15 — Select final columns and write CSV outputs (examples/tail/suspicious)

In [20]:
# =========================
# OUTPUT COLUMN SETS
# - Examples and tail tables carry the full prediction/label text and, of the
#   flag/debug columns, only is_suspicious.
# - The suspicious-rows table carries the truncated text plus all debug columns.
# =========================

examples_cols = [
    "task",
    "target_quantile", "target_value",
    "fold", "sample_index",
    "levenshtein_distance", "label_chars", "lev_norm",
    "prediction_full", "label_full",
    "is_suspicious",
    "json_path"
]

tail_cols = [
    "task", "fold", "sample_index",
    "levenshtein_distance", "label_chars", "lev_norm",
    "prediction_full", "label_full",
    "is_suspicious",
    "json_path"
]

sus_cols = [
    "task","fold","sample_index",
    "levenshtein_distance","lev_norm",
    "prediction_short","label_short",
    "pred_repr","label_repr",
    "looks_equal_but_dpos","looks_unequal_but_dzero",
    "lev_recomputed_nfc","d_mismatch_flag",
    "json_path"
]


def _project_and_sort(frame: pd.DataFrame, columns: list, sort_by: list, ascending: list) -> pd.DataFrame:
    """Restrict ``frame`` to ``columns`` (missing ones filled with NaN) and sort it.

    An empty input yields an empty frame that still has the requested
    columns, so the CSV written from it keeps its header.
    """
    if frame.empty:
        return pd.DataFrame(columns=columns)
    return frame.reindex(columns=columns).sort_values(sort_by, ascending=ascending)


examples_all = _project_and_sort(
    examples_all, examples_cols,
    ["task", "target_quantile", "lev_norm"], [True, True, True],
)
tail_all = _project_and_sort(
    tail_all, tail_cols,
    ["task", "lev_norm"], [True, False],
)
# Distance mismatches first, then ordered by task / fold / sample.
sus = _project_and_sort(
    df[df["is_suspicious"]].copy(), sus_cols,
    ["d_mismatch_flag", "task", "fold", "sample_index"], [False, True, True, True],
)

# Write outputs
out_examples = os.path.join(OUT_DIR, "examples_by_quantile_all_tasks.csv")
out_qvals = os.path.join(OUT_DIR, "quantile_targets_nonzero_by_task.csv")
out_tail = os.path.join(OUT_DIR, "examples_tail_topk_by_task.csv")
out_susp = os.path.join(OUT_DIR, "suspicious_rows_debug.csv")

for table, destination in (
    (examples_all, out_examples),
    (qvals_all, out_qvals),
    (tail_all, out_tail),
    (sus, out_susp),
):
    table.to_csv(destination, index=False)

for destination in (out_examples, out_qvals, out_tail, out_susp):
    print("Wrote:", destination)
print("Suspicious rows:", len(sus))


Wrote: ./quant_analysis_outputs_new/examples_by_quantile_all_tasks.csv
Wrote: ./quant_analysis_outputs_new/quantile_targets_nonzero_by_task.csv
Wrote: ./quant_analysis_outputs_new/examples_tail_topk_by_task.csv
Wrote: ./quant_analysis_outputs_new/suspicious_rows_debug.csv
Suspicious rows: 0


Cell 16 — Quick “what was created?” check

In [19]:
# Inventory of OUT_DIR: total file count, every CSV, and a sample of the PNGs.
files = sorted(os.listdir(OUT_DIR))
print("Total files in OUT_DIR:", len(files))

# CSVs, in sorted order
for csv_name in (name for name in files if name.endswith(".csv")):
    print("CSV:", csv_name)

# PNG count and the first few names
pngs = list(filter(lambda name: name.endswith(".png"), files))
print("\nPNG count:", len(pngs))
print("First 10 PNGs:", pngs[:10])


Total files in OUT_DIR: 20
CSV: corr_len_vs_error_by_task.csv
CSV: examples_by_quantile_all_tasks.csv
CSV: examples_tail_topk_by_task.csv
CSV: foldwise_summary_norm.csv
CSV: length_correlation_raw_vs_norm.csv
CSV: overall_summary_norm.csv
CSV: quantile_targets_nonzero_by_task.csv
CSV: regimes_fold_quantiles.csv
CSV: regimes_overall_quantiles.csv
CSV: suspicious_rows_debug.csv

PNG count: 10
First 10 PNGs: ['sent_exact_rate_by_fold.png', 'sent_levnorm_cdf_unclipped.png', 'sent_levnorm_cdf_xmax1.5.png', 'sent_levnorm_hist_clip1.5.png', 'sent_levnorm_hist_unclipped.png', 'word_exact_rate_by_fold.png', 'word_levnorm_cdf_unclipped.png', 'word_levnorm_cdf_xmax1.5.png', 'word_levnorm_hist_clip1.5.png', 'word_levnorm_hist_unclipped.png']
