## 1) Imports and load duodata.csv

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory that holds both the input CSV and the CSV written later.
DATA_DIR = Path(".")

# Input file read in this cell.
duodata_path = DATA_DIR.joinpath("duodata.csv")
# Second CSV location; only defined here, not read in this cell.
duo_path = DATA_DIR.joinpath("duo.csv")

# Load the raw table into the working DataFrame.
duo = pd.read_csv(filepath_or_buffer=duodata_path)


## 2) Filter out rows with DE / FR as learning or UI language

In [None]:
# Language codes that disqualify a row when they appear in either column.
excluded_codes = ["de", "fr"]

# A row is rejected if its learning language OR its UI language is excluded;
# the allowed mask is the negation of that.
uses_excluded_language = (
    duo["learning_language"].isin(excluded_codes)
    | duo["ui_language"].isin(excluded_codes)
)
mask_allowed_languages = ~uses_excluded_language

# Keep only the allowed rows and renumber the index from 0.
duo = duo[mask_allowed_languages].reset_index(drop=True)

## 3) Add p_clipped (clipped version of p_recall)

In [None]:
# Margin that keeps the clipped values inside [eps, 1 - eps].
eps = 1e-4

# Copy of p_recall with values below eps raised to eps and values above
# 1 - eps lowered to 1 - eps.
duo["p_clipped"] = duo["p_recall"].clip(eps, 1 - eps)


## 4) Add half_life

In [None]:
# half_life = -delta / log2(p_clipped).
# NOTE(review): assumes p_clipped is strictly between 0 and 1 so the log is
# finite and non-zero - confirm against the cell that builds p_clipped.
log2_p_clipped = np.log2(duo["p_clipped"])
duo["half_life"] = (-duo["delta"]).div(log2_p_clipped)

## 5) Extract lemma from lexeme_string

In [None]:
def extract_lemma_from_lexeme_string(s):
    """Return the lemma portion of a lexeme string.

    The result is the text that follows the first "/" (the whole string
    when there is no "/"), cut off just before the first "<" that remains
    (or left untouched when there is no "<").

    Args:
        s: Value to parse. Anything that is not a ``str`` (e.g. ``None`` or
            a float NaN) is treated as missing.

    Returns:
        The extracted substring, or ``""`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ""

    # Drop everything up to and including the first "/", if one exists.
    _prefix, slash, remainder = s.partition("/")
    candidate = remainder if slash else s

    # Keep only what precedes the first "<"; partition returns the whole
    # string as the head when the separator is absent.
    return candidate.partition("<")[0]

# Derive the lemma column by running the extractor over every lexeme_string
# value (non-string entries come back as "").
duo["lemma"] = duo["lexeme_string"].map(extract_lemma_from_lexeme_string)



# Write the processed table to duo_path, omitting the index column, then
# report where it went.
duo.to_csv(path_or_buf=duo_path, index=False)
print("Saved:", duo_path)