## Cell 1 – Imports

In [1]:
import pandas as pd
from wordfreq import zipf_frequency
import nltk
from nltk.corpus import wordnet as wn
import os


## Cell 2 – Download NLTK resources

In [None]:
# Fetch the two NLTK packages needed by `nltk.corpus.wordnet`: the WordNet
# corpus itself and the 'omw-1.4' package. When a package is already present
# NLTK just reports it as up-to-date.
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paolo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\paolo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 📊 Cell 3 – Load Brysbaert concreteness norms

In [None]:
# Spreadsheet holding the concreteness ratings; the path is relative to the
# notebook's working directory.
BRYSBAERT_EXCEL = "Concreteness_ratings_Brysbaert_et_al_BRM.xlsx"


# Load the sheet; the "Word" and "Conc.M" columns are used below.
brys = pd.read_excel(BRYSBAERT_EXCEL)


# Lower-cased copy of the word, used as the lookup key.
# NOTE(review): astype(str) turns a missing cell into the text "nan"; pandas
# may also parse cells such as "null"/"NA" as missing by default - confirm no
# real word is lost this way.
brys["word_lower"] = brys["Word"].astype(str).str.lower()


# word (lower-case) -> value of the "Conc.M" column. If two rows share a key,
# the later row wins.
concreteness_dict = dict(zip(brys["word_lower"], brys["Conc.M"]))

# Sanity check: number of entries and the first five (word, rating) pairs.
len(concreteness_dict), list(concreteness_dict.items())[:5]


(39954,
 [('roadsweeper', 4.85),
  ('traindriver', 4.54),
  ('tush', 4.45),
  ('hairdress', 3.93),
  ('pharmaceutics', 3.77)])

## Cell 4 – Helper functions (length, concreteness, frequency, semantic field)

In [None]:
def get_concreteness(lemma: str) -> float | None:
    
    if not isinstance(lemma, str):
        return None
    return concreteness_dict.get(lemma.lower(), None)


def get_frequency(lemma: str) -> float:
    """Return the English Zipf frequency of ``lemma`` from ``wordfreq``.

    Anything that is not a string (for example a missing value) yields
    ``0.0`` without calling ``zipf_frequency``.
    """
    return zipf_frequency(lemma, "en") if isinstance(lemma, str) else 0.0


# WordNet POS tags in the order they are tried during a lookup:
# 'n' noun, 'v' verb, 'a' adjective, 'r' adverb.
PREF_POS_ORDER = ['n', 'v', 'a', 'r']  

def normalize_lemma_for_wn(lemma: str):
    """Put ``lemma`` into the form used for the WordNet lookup.

    Surrounding whitespace is removed, the text is lower-cased and every
    remaining space becomes an underscore. Non-string input returns ``None``.
    """
    if isinstance(lemma, str):
        cleaned = lemma.strip().lower()
        # Joining the space-separated pieces with "_" swaps each space 1:1.
        return "_".join(cleaned.split(" "))
    return None

def map_pos_hint(pos_hint):
    """Translate a part-of-speech label into a WordNet POS tag.

    Matching is case-insensitive and prefix-based: a label starting with
    ``N`` gives ``'n'``, ``V`` gives ``'v'``, ``ADJ`` gives ``'a'`` and
    ``ADV`` gives ``'r'``. Any other label, and any non-string input,
    returns ``None``.
    """
    if not isinstance(pos_hint, str):
        return None
    label = pos_hint.upper()
    # Prefixes are tested in exactly this order; the first hit wins.
    for prefix, wn_tag in (("N", 'n'), ("V", 'v'), ("ADJ", 'a'), ("ADV", 'r')):
        if label.startswith(prefix):
            return wn_tag
    return None

def get_semantic_field_improved(lemma: str, pos_hint=None):
    """Return the WordNet lexname of the first synset found for ``lemma``.

    The lemma is normalised with ``normalize_lemma_for_wn``. If ``pos_hint``
    maps to a WordNet tag (see ``map_pos_hint``), that tag is tried first and
    the remaining tags follow in ``PREF_POS_ORDER``; otherwise
    ``PREF_POS_ORDER`` is used as is.

    Returns:
        ``synsets[0].lexname()`` for the first POS with any synset, or
        ``None`` when the lemma is not a string or no synset exists.
    """
    wn_lemma = normalize_lemma_for_wn(lemma)
    if wn_lemma is None:
        return None

    hinted = map_pos_hint(pos_hint)
    if hinted:
        candidates = [hinted, *(tag for tag in PREF_POS_ORDER if tag != hinted)]
    else:
        candidates = PREF_POS_ORDER

    for tag in candidates:
        matches = wn.synsets(wn_lemma, pos=tag)
        if matches:
            return matches[0].lexname()
    return None


## Cell 5 – Function to process one CSV

In [None]:
def process_file(filename: str, save_suffix: str | None = None) -> pd.DataFrame:
    """Add lemma-level features to one CSV file and save the result.

    The file is read with ``pd.read_csv`` defaults and gains (or has
    overwritten) these columns:

    * ``src_lemma_length`` / ``tgt_lemma_length`` - character count of each
      lemma, ``<NA>`` where the lemma is missing.
    * ``concreteness`` - ``get_concreteness`` of ``src_lemma``.
    * ``frequency`` - ``get_frequency`` of ``src_lemma``.
    * ``semantic_field`` - the part after the last dot of the lexname
      returned by ``get_semantic_field_improved`` (``src_pos`` is passed as
      the POS hint when that column exists).

    Args:
        filename: Path of the CSV to process; must contain ``src_lemma`` and
            ``tgt_lemma``.
        save_suffix: Text inserted before the extension to build the output
            path. With ``None`` (the default) the input file is overwritten.

    Returns:
        The augmented DataFrame that was written to disk.

    Raises:
        FileNotFoundError: ``filename`` does not exist.
        ValueError: a required lemma column is absent.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(f"{filename} not found.")

    print(f"Processing {filename} ...")
    df = pd.read_csv(filename)

    for col in ["src_lemma", "tgt_lemma"]:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in {filename}")

    # Use the nullable string dtype so a missing lemma stays missing (<NA>)
    # instead of becoming the text "nan" and being counted as length 3.
    df["src_lemma_length"] = df["src_lemma"].astype("string").str.len()
    df["tgt_lemma_length"] = df["tgt_lemma"].astype("string").str.len()

    df["concreteness"] = df["src_lemma"].apply(get_concreteness)
    df["frequency"] = df["src_lemma"].apply(get_frequency)

    # One pass over (lemma, POS hint) pairs; without a src_pos column every
    # hint is None. This replaces the slower row-wise DataFrame.apply.
    if "src_pos" in df.columns:
        pos_hints = df["src_pos"]
    else:
        pos_hints = [None] * len(df)
    df["semantic_field"] = pd.Series(
        [
            get_semantic_field_improved(lemma, pos)
            for lemma, pos in zip(df["src_lemma"], pos_hints)
        ],
        index=df.index,
        dtype="object",
    )

    # Lexnames look like "noun.quantity"; keep only the part after the dot.
    df["semantic_field"] = df["semantic_field"].str.split(".").str[-1]

    base, ext = os.path.splitext(filename)
    if save_suffix is None:
        # No suffix requested: the input file itself is replaced.
        out_path = filename
    else:
        out_path = f"{base}{save_suffix}{ext}"

    df.to_csv(out_path, index=False)
    print(f"Saved processed file to: {out_path}\n")

    return df


## Cell 6 – Run on your three files

In [None]:
# The three language-pair lemma files to enrich (paths relative to the
# notebook's working directory).
files = [
    "en_it_prefrel_lemma_hf.csv",
    "en_es_prefrel_lemma_hf.csv",
    "en_pt_prefrel_lemma_hf.csv",
]



# None makes process_file write back to the input path, i.e. each file is
# overwritten in place. Set a suffix such as "_features" to keep the originals.
SAVE_SUFFIX = None  

# filename -> processed DataFrame, kept for inspection in later cells.
processed_dfs = {}

for f in files:
    df_proc = process_file(f, save_suffix=SAVE_SUFFIX)
    processed_dfs[f] = df_proc


# Preview the first rows of the EN-IT result.
processed_dfs["en_it_prefrel_lemma_hf.csv"].head()


Processing en_it_prefrel_lemma_hf.csv ...
Saved processed file to: en_it_prefrel_lemma_hf.csv

Processing en_es_prefrel_lemma_hf.csv ...
Saved processed file to: en_es_prefrel_lemma_hf.csv

Processing en_pt_prefrel_lemma_hf.csv ...
Saved processed file to: en_pt_prefrel_lemma_hf.csv



Unnamed: 0,src_lemma,tgt_lemma,src_pos,tgt_pos,src_median_hf,tgt_median_hf,src_lexeme_count,tgt_lexeme_count,grammatical_conflicts,meaning_conflicts,src_lemma_length,tgt_lemma_length,concreteness,frequency,semantic_field
0,a,a,det,prep,4110157.0,184180800.0,2,9,gender_number_forms,True,1,1,1.46,7.36,quantity
1,about,circa,adv,adv,271478300.0,521556800.0,2,2,no_conflicts,True,5,5,1.77,6.4,all
2,about,intorno,adv,adv,271478300.0,334613900.0,2,1,no_conflicts,True,5,7,1.77,6.4,all
3,absolutely,assolutamente,adv,adv,645662000.0,333941600.0,1,1,no_conflicts,True,10,13,1.97,4.98,all
4,absolutely,perfettamente,adv,adv,645662000.0,14635070.0,1,1,no_conflicts,True,10,13,1.97,4.98,all


In [None]:
import pandas as pd

# Same three lemma files as in Cell 6, declared again in this cell.
files = [
    "en_it_prefrel_lemma_hf.csv",
    "en_es_prefrel_lemma_hf.csv",
    "en_pt_prefrel_lemma_hf.csv",
]

def fix_grammatical_conflicts(filename):
    """Collapse the ``grammatical_conflicts`` labels of one file, in place.

    The file is read and written back with ``;`` as separator. Each label is
    converted to text, stripped and lower-cased, then mapped as follows:

    * ``gender_forms`` -> ``no_conflicts``
    * ``number_forms`` -> ``gender_number_forms``
    * everything else (``verb_forms``, unknown labels, missing cells)
      -> ``verb_forms``

    Because the outputs ``no_conflicts`` and ``gender_number_forms`` are not
    recognised as inputs, running this a second time on the same file turns
    them into ``verb_forms``.

    Raises:
        ValueError: the file has no ``grammatical_conflicts`` column.
    """
    print(f"Fixing grammatical_conflicts in {filename}...")

    table = pd.read_csv(filename, sep=";")

    if "grammatical_conflicts" not in table.columns:
        raise ValueError(f"'grammatical_conflicts' column not found in {filename}. "
                         f"Columns are: {table.columns.tolist()}")

    # Only these two labels are translated; the fallback covers the rest.
    relabel = {
        "gender_forms": "no_conflicts",
        "number_forms": "gender_number_forms",
    }

    def remap(value):
        text = value if isinstance(value, str) else str(value)
        return relabel.get(text.strip().lower(), "verb_forms")

    table["grammatical_conflicts"] = table["grammatical_conflicts"].map(remap)

    table.to_csv(filename, sep=";", index=False)
    print(f"Updated file saved: {filename}\n")


# Rewrite each file in place with the remapped labels.
for f in files:
    fix_grammatical_conflicts(f)


Fixing grammatical_conflicts in en_it_prefrel_lemma_hf.csv...
Updated file saved: en_it_prefrel_lemma_hf.csv

Fixing grammatical_conflicts in en_es_prefrel_lemma_hf.csv...
Updated file saved: en_es_prefrel_lemma_hf.csv

Fixing grammatical_conflicts in en_pt_prefrel_lemma_hf.csv...
Updated file saved: en_pt_prefrel_lemma_hf.csv



In [None]:
import pandas as pd
import numpy as np

# Same three lemma files as in Cell 6, declared again in this cell.
files = [
    "en_it_prefrel_lemma_hf.csv",
    "en_es_prefrel_lemma_hf.csv",
    "en_pt_prefrel_lemma_hf.csv",
]

def pos_category(src_pos):
    """Bucket a source POS label into one of five coarse categories.

    The label is converted to text, stripped and lower-cased first.

    Returns:
        ``"NA"`` for a missing value, an empty label or the text ``"nan"``;
        ``"adv"`` for labels starting with ``adv``; ``"prep"`` for labels
        starting with ``prep`` or equal to ``adp``; ``"verb"`` for labels
        starting with ``v``; ``"other"`` for anything else.
    """
    if pd.isna(src_pos):
        return "NA"
    label = str(src_pos).strip().lower()

    if label.startswith("adv"):
        return "adv"
    if label == "adp" or label.startswith("prep"):
        return "prep"
    if label.startswith("v"):
        return "verb"
    return "NA" if label in ("", "nan") else "other"


def refine_grammatical_conflicts(row):
    """Adjust one row's ``grammatical_conflicts`` label using its source POS.

    ``row`` must provide ``grammatical_conflicts`` and ``src_pos``. Rules, in
    order:

    1. Adverbs always get ``no_conflicts``.
    2. Any label other than ``verb_forms`` is returned unchanged.
    3. ``verb_forms`` stays only for verbs; prepositions and rows without a
       POS get ``no_conflicts``; every other POS gets
       ``gender_number_forms``.
    """
    label = row["grammatical_conflicts"]
    bucket = pos_category(row["src_pos"])

    if bucket == "adv":
        return "no_conflicts"
    if label != "verb_forms":
        return label

    # From here on the label is "verb_forms"; decide whether it may stay.
    if bucket == "verb":
        return "verb_forms"
    if bucket in ("prep", "NA"):
        return "no_conflicts"
    return "gender_number_forms"


def apply_refinement(filename):
    """Run ``refine_grammatical_conflicts`` over every row of one file.

    The file is read and written back in place with ``;`` as separator; only
    the ``grammatical_conflicts`` column is replaced.

    Raises:
        ValueError: ``grammatical_conflicts`` or ``src_pos`` is missing.
    """
    print(f"Refining grammatical_conflicts in {filename}...")

    table = pd.read_csv(filename, sep=";")

    has_label = "grammatical_conflicts" in table.columns
    has_pos = "src_pos" in table.columns
    if not (has_label and has_pos):
        raise ValueError(f"Missing 'grammatical_conflicts' or 'src_pos' in {filename}. "
                         f"Columns: {table.columns.tolist()}")

    refined = table.apply(refine_grammatical_conflicts, axis=1)
    table["grammatical_conflicts"] = refined

    table.to_csv(filename, sep=";", index=False)
    print(f"Saved updated file: {filename}\n")


# Refine the labels of each file in place.
for f in files:
    apply_refinement(f)


Refining grammatical_conflicts in en_it_prefrel_lemma_hf.csv...
Saved updated file: en_it_prefrel_lemma_hf.csv

Refining grammatical_conflicts in en_es_prefrel_lemma_hf.csv...
Saved updated file: en_es_prefrel_lemma_hf.csv

Refining grammatical_conflicts in en_pt_prefrel_lemma_hf.csv...
Saved updated file: en_pt_prefrel_lemma_hf.csv



In [None]:
import pandas as pd

# Same three lemma files as in Cell 6, declared again in this cell.
files = [
    "en_it_prefrel_lemma_hf.csv",
    "en_es_prefrel_lemma_hf.csv",
    "en_pt_prefrel_lemma_hf.csv",
]

def apply_additional_rules(filename):
    """Apply two extra ``grammatical_conflicts`` rules to one file, in place.

    The file is read and written back with ``;`` as separator. Rules:

    1. Rows whose ``src_pos`` starts with ``conj`` (case-insensitive) get
       ``no_conflicts``.
    2. Rows labelled ``gender_number_forms`` whose ``src_lexeme_count`` and
       ``tgt_lexeme_count`` both equal 1 get ``no_conflicts``.

    Only ``grammatical_conflicts`` is modified; the lexeme-count columns are
    written back exactly as they were read.

    Raises:
        ValueError: one of the four required columns is missing.
    """
    print(f"Applying additional grammatical_conflicts rules to {filename}...")

    df = pd.read_csv(filename, sep=";")

    required_cols = ["grammatical_conflicts", "src_pos", "src_lexeme_count", "tgt_lexeme_count"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing column '{col}' in {filename}. Columns: {df.columns.tolist()}")

    # Rule 1: conjunctions never carry a grammatical conflict.
    pos_norm = df["src_pos"].astype(str).str.strip().str.lower()
    mask_conj = pos_norm.str.startswith("conj")
    df.loc[mask_conj, "grammatical_conflicts"] = "no_conflicts"

    # Rule 2 needs numeric counts. Keep them in local Series rather than
    # assigning back to df: coercion would blank non-numeric cells and turn
    # the column into floats in the saved file.
    src_count = pd.to_numeric(df["src_lexeme_count"], errors="coerce")
    tgt_count = pd.to_numeric(df["tgt_lexeme_count"], errors="coerce")

    mask_both_one = (
        (df["grammatical_conflicts"] == "gender_number_forms") &
        (src_count == 1) &
        (tgt_count == 1)
    )
    df.loc[mask_both_one, "grammatical_conflicts"] = "no_conflicts"

    df.to_csv(filename, sep=";", index=False)
    print(f"Saved updated file: {filename}\n")


# Apply the conjunction and single-lexeme rules to each file in place.
for f in files:
    apply_additional_rules(f)


Applying additional grammatical_conflicts rules to en_it_prefrel_lemma_hf.csv...
Saved updated file: en_it_prefrel_lemma_hf.csv

Applying additional grammatical_conflicts rules to en_es_prefrel_lemma_hf.csv...
Saved updated file: en_es_prefrel_lemma_hf.csv

Applying additional grammatical_conflicts rules to en_pt_prefrel_lemma_hf.csv...
Saved updated file: en_pt_prefrel_lemma_hf.csv



In [None]:
import pandas as pd
import numpy as np


# Full-data inputs, one per language pair. The loop further down reads these
# with pandas' default (comma) separator.
in_files = [
    "fulldata_en_it.csv",
    "fulldata_en_es.csv",
    "fulldata_en_pt.csv",
]

def pos_category(src_pos):
    """Bucket a source POS label into one of five coarse categories.

    This re-defines the function of the same name from an earlier cell. The
    label is converted to text, stripped and lower-cased first.

    Returns:
        ``"NA"`` for a missing value, an empty label or the text ``"nan"``;
        ``"adv"`` for labels starting with ``adv``; ``"prep"`` for labels
        starting with ``prep`` or equal to ``adp``; ``"verb"`` for labels
        starting with ``v``; ``"other"`` for anything else.
    """
    if pd.isna(src_pos):
        return "NA"
    label = str(src_pos).strip().lower()

    if label.startswith("adv"):
        return "adv"
    if label == "adp" or label.startswith("prep"):
        return "prep"
    if label.startswith("v"):
        return "verb"
    return "NA" if label in ("", "nan") else "other"


def remap_base(value):
    """Collapse one raw ``grammatical_conflicts`` label.

    The value is converted to text, stripped and lower-cased, then:

    * ``gender_forms`` -> ``no_conflicts``
    * ``number_forms`` -> ``gender_number_forms``
    * anything else, including ``verb_forms`` and missing values
      -> ``verb_forms``
    """
    label = (value if isinstance(value, str) else str(value)).strip().lower()

    if label == "gender_forms":
        return "no_conflicts"
    if label == "number_forms":
        return "gender_number_forms"
    # "verb_forms" and every unrecognised label share the same result.
    return "verb_forms"


def refine_grammatical_conflicts(row):
    """Adjust one row's ``grammatical_conflicts`` label using its source POS.

    This re-defines the function of the same name from an earlier cell.
    ``row`` must provide ``grammatical_conflicts`` and ``src_pos``. Rules, in
    order:

    1. Adverbs always get ``no_conflicts``.
    2. Any label other than ``verb_forms`` is returned unchanged.
    3. ``verb_forms`` stays only for verbs; prepositions and rows without a
       POS get ``no_conflicts``; every other POS gets
       ``gender_number_forms``.
    """
    label = row["grammatical_conflicts"]
    bucket = pos_category(row["src_pos"])

    if bucket == "adv":
        return "no_conflicts"
    if label != "verb_forms":
        return label

    # From here on the label is "verb_forms"; decide whether it may stay.
    if bucket == "verb":
        return "verb_forms"
    if bucket in ("prep", "NA"):
        return "no_conflicts"
    return "gender_number_forms"


# Run the complete grammatical_conflicts clean-up on each full-data file:
# base remap -> POS-based refinement -> conjunction rule -> single-lexeme
# rule. The result is written to a new "<name>_gramfixed.csv" file.
for in_path in in_files:
    print(f"\n=== Processing {in_path} ===")

    # These inputs use pandas' default comma separator.
    df = pd.read_csv(in_path)

    # Step 1: collapse the raw labels.
    if "grammatical_conflicts" not in df.columns:
        raise ValueError(f"'grammatical_conflicts' column not found in {in_path}. "
                         f"Columns are: {df.columns.tolist()}")

    df["grammatical_conflicts"] = df["grammatical_conflicts"].apply(remap_base)

    # Step 2: refine the labels row by row using the source POS.
    if "src_pos" not in df.columns:
        raise ValueError(f"'src_pos' column not found in {in_path}. "
                         f"Columns are: {df.columns.tolist()}")

    df["grammatical_conflicts"] = df.apply(refine_grammatical_conflicts, axis=1)

    # Steps 3 and 4 also need the two lexeme-count columns.
    required_cols = ["grammatical_conflicts", "src_pos", "src_lexeme_count", "tgt_lexeme_count"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing column '{col}' in {in_path}. Columns: {df.columns.tolist()}")

    # Step 3: conjunctions never carry a grammatical conflict.
    pos_norm = df["src_pos"].astype(str).str.strip().str.lower()
    mask_conj = pos_norm.str.startswith("conj")
    df.loc[mask_conj, "grammatical_conflicts"] = "no_conflicts"

    # Step 4: a gender/number conflict is dropped when both sides have exactly
    # one lexeme. The numeric counts stay in local Series rather than being
    # assigned back to df: coercion would blank non-numeric cells and change
    # the column to floats in the saved file.
    src_count = pd.to_numeric(df["src_lexeme_count"], errors="coerce")
    tgt_count = pd.to_numeric(df["tgt_lexeme_count"], errors="coerce")

    mask_both_one = (
        (df["grammatical_conflicts"] == "gender_number_forms") &
        (src_count == 1) &
        (tgt_count == 1)
    )
    df.loc[mask_both_one, "grammatical_conflicts"] = "no_conflicts"

    # Save next to the input; the input file itself is left untouched.
    out_path = in_path.replace(".csv", "_gramfixed.csv")
    df.to_csv(out_path, index=False)
    print(f"Saved updated file: {out_path}")



=== Processing fulldata_en_it.csv ===
Saved updated file: fulldata_en_it_gramfixed.csv

=== Processing fulldata_en_es.csv ===
Saved updated file: fulldata_en_es_gramfixed.csv

=== Processing fulldata_en_pt.csv ===
Saved updated file: fulldata_en_pt_gramfixed.csv


In [None]:
import pandas as pd

# Copy the semantic_field column from the older per-language lemma file (the
# "donor") into the gram-fixed full-data file, and save the combination as
# fulldata_en_<xx>_complete.csv.
langs = ["it", "es", "pt"]

for xx in langs:
    full_path = f"fulldata_en_{xx}_gramfixed.csv"
    old_path  = f"en_{xx}_prefrel_lemma_hf.csv"
    out_path  = f"fulldata_en_{xx}_complete.csv"

    print(f"\n=== Transferring semantic_field for ENâ€“{xx.upper()} ===")
    print(f"Reading main file: {full_path}")
    print(f"Reading donor file: {old_path}")

    # Main file: default comma separator.
    df_full = pd.read_csv(full_path)

    # Donor file: semicolon-separated.
    df_old = pd.read_csv(old_path, sep=";")

    # Both frames need the lemma columns; the donor also needs semantic_field.
    for col in ["src_lemma", "tgt_lemma"]:
        if col not in df_full.columns:
            raise ValueError(f"Column '{col}' not found in {full_path}. "
                             f"Columns: {df_full.columns.tolist()}")
        if col not in df_old.columns:
            raise ValueError(f"Column '{col}' not found in {old_path}. "
                             f"Columns: {df_old.columns.tolist()}")
    if "semantic_field" not in df_old.columns:
        raise ValueError(f"'semantic_field' column not found in {old_path}. "
                         f"Columns: {df_old.columns.tolist()}")

    # Temporary join keys: stripped, lower-cased lemma text.
    df_full["src_norm"] = df_full["src_lemma"].astype(str).str.strip().str.lower()
    df_full["tgt_norm"] = df_full["tgt_lemma"].astype(str).str.strip().str.lower()

    df_old["src_norm"] = df_old["src_lemma"].astype(str).str.strip().str.lower()
    df_old["tgt_norm"] = df_old["tgt_lemma"].astype(str).str.strip().str.lower()

    # Sort both frames by the keys and renumber, so row i of one frame is
    # meant to describe the same pair as row i of the other.
    df_full_sorted = df_full.sort_values(["src_norm", "tgt_norm"]).reset_index(drop=True)
    df_old_sorted  = df_old.sort_values(["src_norm", "tgt_norm"]).reset_index(drop=True)

    # Guard: the key columns must be identical (same length, same values, same
    # order) before anything is copied by position.
    if not df_full_sorted["src_norm"].equals(df_old_sorted["src_norm"]):
        raise ValueError(f"src_norm mismatch after sorting for {xx}")
    if not df_full_sorted["tgt_norm"].equals(df_old_sorted["tgt_norm"]):
        raise ValueError(f"tgt_norm mismatch after sorting for {xx}")

    print("Keys align after sorting â€” safe to transfer semantic_field.")

    # Positional copy (.values drops the donor's index).
    # NOTE(review): if a (src, tgt) pair occurs more than once, which duplicate
    # receives which value depends on the row order within each file - confirm
    # duplicates share the same semantic_field.
    df_full_sorted["semantic_field"] = df_old_sorted["semantic_field"].values

    # Remove the temporary key columns before saving.
    df_full_sorted = df_full_sorted.drop(columns=["src_norm", "tgt_norm"])

    # The saved rows are in sorted key order, not the main file's original order.
    df_full_sorted.to_csv(out_path, index=False)
    print(f"Saved combined file with corrected semantic_field to: {out_path}")



=== Transferring semantic_field for ENâ€“IT ===
Reading main file: fulldata_en_it_gramfixed.csv
Reading donor file: en_it_prefrel_lemma_hf.csv
Keys align after sorting â€” safe to transfer semantic_field.
Saved combined file with corrected semantic_field to: fulldata_en_it_complete.csv

=== Transferring semantic_field for ENâ€“ES ===
Reading main file: fulldata_en_es_gramfixed.csv
Reading donor file: en_es_prefrel_lemma_hf.csv
Keys align after sorting â€” safe to transfer semantic_field.
Saved combined file with corrected semantic_field to: fulldata_en_es_complete.csv

=== Transferring semantic_field for ENâ€“PT ===
Reading main file: fulldata_en_pt_gramfixed.csv
Reading donor file: en_pt_prefrel_lemma_hf.csv
Keys align after sorting â€” safe to transfer semantic_field.
Saved combined file with corrected semantic_field to: fulldata_en_pt_complete.csv


In [None]:
import pandas as pd

# Remove the (src_lemma, tgt_lemma) pairs marked "no" in dupli_<xx>.csv from
# complete_en_<xx>.csv and save the remainder as complete_en_<xx>_clean.csv.
# NOTE(review): the previous cell writes fulldata_en_<xx>_complete.csv, while
# this cell reads complete_en_<xx>.csv - presumably renamed outside the
# notebook; confirm.
languages = ["it", "es", "pt"]

for xx in languages:
    dupli_path = f"dupli_{xx}.csv"
    full_path  = f"complete_en_{xx}.csv"
    out_path   = f"complete_en_{xx}_clean.csv"

    print(f"\n=== Cleaning ENâ€“{xx.upper()} using {dupli_path} ===")

    # Data set to clean (comma-separated). Its lemma columns are not validated
    # here; a missing column surfaces as a KeyError further down.
    df_full = pd.read_csv(full_path)

    # Review sheet holding a keep/drop decision per pair.
    df_dupli = pd.read_csv(dupli_path)

    # The review sheet must provide the pair columns and the decision column.
    for col in ["src_lemma", "tgt_lemma", "keep_drop"]:
        if col not in df_dupli.columns:
            raise ValueError(f"Column '{col}' not found in {dupli_path}. "
                             f"Columns: {df_dupli.columns.tolist()}")

    # Unique pairs whose decision is "no" (case-insensitive, but not stripped,
    # so "no " would not match).
    to_drop = (
        df_dupli[df_dupli["keep_drop"].str.lower() == "no"]
        [["src_lemma", "tgt_lemma"]]
        .drop_duplicates()
    )

    print(f"Pairs marked for removal: {len(to_drop)}")

    # Build one "src||tgt" text key per row on both sides. The lemmas are used
    # as written: no case or whitespace normalisation is applied.
    to_drop["drop_key"] = (
        to_drop["src_lemma"].astype(str) + "||" + to_drop["tgt_lemma"].astype(str)
    )
    df_full["drop_key"] = (
        df_full["src_lemma"].astype(str) + "||" + df_full["tgt_lemma"].astype(str)
    )

    # Keep every row whose key is not in the removal list.
    df_clean = df_full[~df_full["drop_key"].isin(to_drop["drop_key"])].copy()

    # The helper key is not part of the output.
    df_clean = df_clean.drop(columns=["drop_key"])

    # Write the cleaned data set.
    df_clean.to_csv(out_path, index=False)
    print(f"Clean file saved: {out_path}  (rows kept: {len(df_clean)})")



=== Cleaning ENâ€“IT using dupli_it.csv ===
Pairs marked for removal: 3
Clean file saved: complete_en_it_clean.csv  (rows kept: 959)

=== Cleaning ENâ€“ES using dupli_es.csv ===
Pairs marked for removal: 15
Clean file saved: complete_en_es_clean.csv  (rows kept: 1324)

=== Cleaning ENâ€“PT using dupli_pt.csv ===
Pairs marked for removal: 6
Clean file saved: complete_en_pt_clean.csv  (rows kept: 1144)


In [None]:
# CSV read into `duo` in the next cell; it must provide the columns "lemma",
# "learning_language" and "ui_language".
duo_path = "duo.csv"

# Input file per language code; session counts are added to these.
final_paths = {
    "es": "final_en_es.csv",
    "it": "final_en_it.csv",
    "pt": "final_en_pt.csv",
}

# Output file per language code, so the inputs above are not overwritten.
output_paths = {
    "es": "final_en_es_2.csv",
    "it": "final_en_it_2.csv",
    "pt": "final_en_pt_2.csv",
}


In [None]:
import pandas as pd

# Load the file named by duo_path (comma-separated).
duo = pd.read_csv(duo_path)


# Fail early if a column used by compute_lemma_counts_for_pair is missing.
assert "lemma" in duo.columns, "Expected 'lemma' column in duo.csv"
assert "learning_language" in duo.columns, "Expected 'learning_language' column in duo.csv"
assert "ui_language" in duo.columns, "Expected 'ui_language' column in duo.csv"


def compute_lemma_counts_for_pair(duo_df, xx_code):
    """Count the rows per lemma for the language pair English / ``xx_code``.

    A row is counted when both its ``learning_language`` and its
    ``ui_language`` are either ``"en"`` or ``xx_code``. Rows with a missing
    lemma are not counted.

    Counts are keyed by the lemma text only, so both learning directions are
    pooled: a spelling that occurs in both languages gets a single, combined
    count.

    Returns:
        dict mapping each lemma to its number of matching rows.
    """
    pair_codes = ["en", xx_code]
    learning_in_pair = duo_df["learning_language"].isin(pair_codes)
    ui_in_pair = duo_df["ui_language"].isin(pair_codes)

    pair_lemmas = duo_df.loc[learning_in_pair & ui_in_pair, "lemma"]
    return pair_lemmas.value_counts().to_dict()


# language code -> {lemma: row count} for the pair English / that language.
lemma_counts = {}
for xx in ["es", "it", "pt"]:
    print(f"Computing lemma counts for enâ€“{xx}...")
    lemma_counts[xx] = compute_lemma_counts_for_pair(duo, xx)


Computing lemma counts for enâ€“es...
Computing lemma counts for enâ€“it...
Computing lemma counts for enâ€“pt...


In [None]:
# Add src_session_count / tgt_session_count to each final file and save the
# result under the matching name in output_paths.
for xx, final_path in final_paths.items():
    print(f"Processing {final_path} for enâ€“{xx}...")

    df = pd.read_csv(final_path)

    # Both lemma columns are needed for the lookups below.
    assert "src_lemma" in df.columns, f"'src_lemma' not found in {final_path}"
    assert "tgt_lemma" in df.columns, f"'tgt_lemma' not found in {final_path}"

    counts_dict = lemma_counts[xx]

    # Exact-text lookup of each lemma in the pair's counts; lemmas without an
    # entry get 0. Source and target use the same dictionary.
    df["src_session_count"] = df["src_lemma"].map(counts_dict).fillna(0).astype(int)
    df["tgt_session_count"] = df["tgt_lemma"].map(counts_dict).fillna(0).astype(int)

    # Write to a separate output file.
    out_path = output_paths[xx]
    df.to_csv(out_path, index=False)
    print(f"Saved with session counts to: {out_path}")


Processing final_en_es.csv for enâ€“es...
Saved with session counts to: final_en_es_2.csv
Processing final_en_it.csv for enâ€“it...
Saved with session counts to: final_en_it_2.csv
Processing final_en_pt.csv for enâ€“pt...
Saved with session counts to: final_en_pt_2.csv
