In [1]:
# --- Notebook cell 1: imports + Levenshtein (fast if available, else fallback) ---
import os
import json
import pandas as pd

try:
    import Levenshtein  # pip install python-Levenshtein
except Exception:
    def lev(a: str, b: str) -> int:
        """Pure-Python Levenshtein distance, used when the C extension is unavailable.

        Row-by-row dynamic programme that keeps only the previous row, so
        memory is proportional to len(b) rather than len(a) * len(b).

        Args:
            a: First string.
            b: Second string.

        Returns:
            Minimum number of single-character insertions, deletions and
            substitutions turning ``a`` into ``b``.
        """
        if a == b:
            return 0
        if not a or not b:
            # One side is empty: the distance is the length of the other side.
            return len(b) if not a else len(a)

        above = list(range(len(b) + 1))
        for row_no, ch_a in enumerate(a, start=1):
            row = [row_no]
            for col_no, ch_b in enumerate(b, start=1):
                row.append(min(
                    row[-1] + 1,                          # insertion
                    above[col_no] + 1,                    # deletion
                    above[col_no - 1] + (ch_a != ch_b),   # substitution (free on match)
                ))
            above = row
        return above[-1]
else:
    def lev(a: str, b: str) -> int:
        """Levenshtein distance between ``a`` and ``b`` via the ``Levenshtein`` package."""
        return Levenshtein.distance(a, b)


In [2]:
# --- Notebook cell 2: set paths + metadata (edit if needed) ---
# Everything a reader may need to tune lives here. Both paths are derived from
# `work_root` and `fold`, so changing `fold` keeps the input JSON, the output
# CSV and the "Fold" column consistent with each other.
work_root = "/home/woody/iwso/iwso214h/imu-hwr/work/REWI_work"

# NOTE(review): task_name is "word" and the output directory is
# "quant_results_word", but the input directory and the CSV file name both say
# "sent" -- confirm which task label is intended for this export.
task_name = "word"
fold = 0

# Input: exported validation predictions/labels for this fold.
json_path = f"{work_root}/inference_results_sent/fold_{fold}/exports/val_full_fold{fold}_epoch0.json"

# Output: per-sample CSV written by the save cell.
out_csv = f"{work_root}/quant_results_word/fold_{fold}/exports/fold{fold}_predictions_sent_new.csv"
sep = ";"  # to match your unified CSV style


In [3]:
# --- Notebook cell 3: load json + basic sanity checks ---
# Parse the export file; the checks below read it as a mapping with
# "predictions" and "labels" entries of equal length.
with open(json_path, "r", encoding="utf-8") as fh:
    obj = json.load(fh)

# .get() yields None for an absent key, which the first assertion reports
# together with the keys that were actually found.
preds, labels = (obj.get(key) for key in ("predictions", "labels"))

assert not (preds is None or labels is None), f"Missing keys in JSON. Found keys: {list(obj.keys())}"
assert len(labels) == len(preds), f"Length mismatch: preds={len(preds)} labels={len(labels)}"

len(preds), len(labels)


(4188, 4188)

In [4]:
# --- Notebook cell 4: build dataframe (same schema as your unified CSV) ---
def _as_text(value):
    """Return "" for None, otherwise ``str(value)``."""
    return "" if value is None else str(value)


# The source path is the same for every row, so resolve it once.
source_json = os.path.abspath(json_path)

rows = []
for sample_idx, (raw_pred, raw_label) in enumerate(zip(preds, labels)):
    pred_text = _as_text(raw_pred)
    label_text = _as_text(raw_label)
    # One record per sample; key order defines the column order of the frame.
    rows.append({
        "Task": task_name,
        "Fold": fold,
        "Json_path": source_json,
        "Sample_index": sample_idx,
        "Prediction": pred_text,
        "Label": label_text,
        "Levenshtein_distance": int(lev(pred_text, label_text)),
    })

df = pd.DataFrame(rows)
df.head(10)


Unnamed: 0,Task,Fold,Json_path,Sample_index,Prediction,Label,Levenshtein_distance
0,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,0,Du bist dran.,Du bist dran.,0
1,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,1,"Lass/Lasst uns hingehen!; Los, gehen wir hin!","Lass/Lasst uns hingehen!; Los, gehen wir hin!",0
2,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,2,Ich hole sie/ihn.,Ich hole sie/ihn.,0
3,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,3,Hospital; Krankenhaus,Hospital; Krankenhaus,0
4,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,4,schon einmal; vorher; zuvor,schon einmal; vorher; zuvor,0
5,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,5,die Londoner U-Bahn,die Londoner U-Bahn,0
6,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,6,nächste/-r/-s; der/die Nächste(n),nächste/-r/-s; der/die Nächste(n),0
7,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,7,Angst; Furcht; Befürchtung,Angst; Furcht; Befürchtung,0
8,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,8,Für wen ...?,Für wen ...?,0
9,word,0,/home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...,9,Dekoration; Schmuck,Dekoration; Schmuck,0


In [5]:
# --- Notebook cell 5: save CSV ---
# Create the target directory if needed. os.path.dirname() returns "" when
# out_csv is a bare file name, and os.makedirs("") raises FileNotFoundError,
# so only create a directory when there is one to create.
out_dir = os.path.dirname(out_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# index=False: the positional index is already stored in the Sample_index column.
df.to_csv(out_csv, sep=sep, index=False)
out_csv


'/home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/quant_results_word/fold_0/exports/fold0_predictions_sent_new.csv'

In [6]:
# --- Notebook cell 6: verify the file was written correctly ---
# keep_default_na=False: Prediction/Label are free text. With pandas' default
# NA handling an empty prediction, or a literal "NA" / "null" / "None" / "nan",
# is read back as NaN, so the check would not reflect what was written.
df_check = pd.read_csv(out_csv, sep=sep, keep_default_na=False)
(df_check.shape, df_check.head(3))


((4188, 7),
    Task  Fold                                          Json_path  \
 0  word     0  /home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...   
 1  word     0  /home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...   
 2  word     0  /home/woody/iwso/iwso214h/imu-hwr/work/REWI_wo...   
 
    Sample_index                                     Prediction  \
 0             0                                  Du bist dran.   
 1             1  Lass/Lasst uns hingehen!; Los, gehen wir hin!   
 2             2                              Ich hole sie/ihn.   
 
                                            Label  Levenshtein_distance  
 0                                  Du bist dran.                     0  
 1  Lass/Lasst uns hingehen!; Los, gehen wir hin!                     0  
 2                              Ich hole sie/ihn.                     0  )

In [7]:
# --- Notebook cell 7: load fold-0 CSV ---
import pandas as pd

# Same file that the save cell writes (the out_csv value), repeated here as a literal.
fold0_csv = "/home/woody/iwso/iwso214h/imu-hwr/work/REWI_work/quant_results_word/fold_0/exports/fold0_predictions_sent_new.csv"

# keep_default_na=False keeps text columns as text: without it, empty
# predictions and tokens such as "NA", "null" or "None" are parsed as NaN.
df0 = pd.read_csv(fold0_csv, sep=";", keep_default_na=False)

df0.shape, df0.columns


((4188, 7),
 Index(['Task', 'Fold', 'Json_path', 'Sample_index', 'Prediction', 'Label',
        'Levenshtein_distance'],
       dtype='object'))

In [8]:
# --- Notebook cell 8: filter the specific indices ---
# Sample indices to inspect, and the columns worth showing for them.
indices = [1423, 1946, 130, 1443, 3006]
cols = ["Sample_index", "Prediction", "Label", "Levenshtein_distance"]

# Select the matching rows and the columns of interest in one .loc call, then
# order by sample index. Indices absent from the frame simply yield no row.
is_selected = df0["Sample_index"].isin(indices)
hit = df0.loc[is_selected, cols].sort_values("Sample_index")

hit


Unnamed: 0,Sample_index,Prediction,Label,Levenshtein_distance
130,130,Was um alles in der We,Wait and see!,16
1423,1423,not ony other boxy,round of boxing,12
1443,1443,SMS; Kurznachrich,single ticket,16
1946,1946,to inght way,mountain biking,12
3006,3006,Klatsch/Klatscht in die Hände.,klar; deutlich,25
