# File 02/5

## DESCRIPTION:
- The file uses regex and other filter methods to extract prefixes
- (e.g. < po- >) and suffixes (e.g. < -yva- >/< -iva- > or < -nuti >) of verbs 
### INPUTFILE:
./OUTPUTS/dataframe_02_4.csv
### OUTPUTFILE 
- ./OUTPUTS/dataframe_02_5.csv
- ./VERBS_BY_TYPE/MAPPING_nu_from_NONE.csv
- ./VERBS_BY_TYPE/MAPPING_NONE_is_sub_of_prefix.csv
- ./VERBS_BY_TYPE/counts_VCOMB_*.csv

In [1]:
import os
import re

import pandas as pd 
import numpy as np

from typing import Tuple, Dict

In [2]:
# read input file 
df = pd.read_csv("OUTPUTS/dataframe_02_4.csv")

In [1]:
def drop_unnamed_columns(df):
    # keep only columns not starting with "Unnamed.."
    return df.loc[:, ~df.columns.str.startswith("Unnamed")]

df = drop_unnamed_columns(df)

NameError: name 'df' is not defined

In [4]:
df.columns

Index(['File', 'Text Title', 'Language', 'Sentence ID', 'Token ID', 'Form',
       'Lemma', 'Lemma_norm', 'POS', 'Morphology', 'Head ID', 'Relation',
       'Presentation After', 'Russian Translation', 'English Translation',
       'Type', 'century', 'exact', 'lang', 'region', 'Negation',
       'Negation_Marker', 'place', 'Sentence_Text'],
      dtype='object')

In [5]:
print(df.POS.unique())

['I-' 'Pp' 'Ne' 'A-' 'Nb' 'V-' 'R-' 'Pt' 'C-' 'G-' 'Px' 'Ps' 'Pd' nan 'Pk'
 'Df' 'Mo' 'Ma' 'Dq' 'Pr' 'Du' 'Pi' 'Pc' 'F-']


### From now on: use "Lemma_norm" instead of "Lemma" 
### -> INFO:  "Lemma_norm" is the normalized variant of "Lemma"

# A) Get the set of all verbs in "Lemma_norm"

In [2]:
##################################################################
# P01) 
# SET OF Lemma_norm (i.e.  POS="V-") 
##################################################################

# 1) List of Verbs, i.e. "POS" == "V-"
mask = df["POS"] == "V-"

# 2) get base_form column
base_forms_norm = df.loc[mask, "Lemma_norm"]

# 3) List of base_forms_norm 
base_forms_norm_list_all = base_forms_norm.tolist()

# 4) get list of uniq Lemma_norm entries
base_forms_norm_list_set = set(base_forms_norm_list_all)

NameError: name 'df' is not defined

In [7]:
print("Lemma_Norm:\n")
print(base_forms_norm_list_all[:10])
print(type(base_forms_norm_list_all))
verbs_altogether = len(base_forms_norm_list_all)
print(f"\nSum of entries 'Lemma_norm': {verbs_altogether}")

set_verbs_num = set(base_forms_norm_list_all)
print(f"\nUnique forms of 'Lemma_norm' (=SET): {len(set_verbs_num)}")
verb_list = list(set_verbs_num)
print(f"\nFirst 20 entries:\n{verb_list[:21]}")

Lemma_Norm:

['д_ржати', 'повелети', 'быти', 'от_дати', 'почати', 'хотети', 'от_яти', 'быти', 'от_имати', 'с_стояти']
<class 'list'>

Sum of entries 'Lemma_norm': 42652

Unique forms of 'Lemma_norm' (=SET): 3778

First 20 entries:
['уимати', 'обестити', 'приложити', 'урод_ствовати', 'об_говаривати', 'перекыдати', 'высягнути', 'прив_метати', 'от_врещи', 'наести', 'обламывати', 'пополаскывати', 'перекрепливати', 'нагрязнити', 'преложити', 'поругати', 'с_судити', 'с_бродити', 'в_ст_ргнути', 'с_грести', 'даровати']


# Which derivatives can presumably be found by deriving from simplex?
- D: читати -> simplex
 - -> I: читывати -> iteratives 
 - -> P: прочитати -> perfective form of simplex 
- P: прочитати -> perfective form of simplex 
 - -> PI: прочитывати -> secondary imperfectives  
 - -> PS: распрочитати -> double prefixed perfective form 
- PS: распрочитати -> double prefixed perfective form 
 - -> PSI: распрочитывати -> secondary imperfective with double prefix

# Create columns storing info about 
## - secondary imperfective verbs: {-yvati} / {-ivati} -> Output: Boolean (True/False) in Spalte "V_yva"
## - prefixed verb ->  Output: Boolean (True/False) in Spalte "V_prefix"

# A) I. Implement col "V_yya": 
## val == True, if verb contains suffix {-yva} or variant {-iva}; else False

In [8]:
def add_v_yva_column(
    df: pd.DataFrame,
    anchor: str = "Lemma_norm",
    new_col: str = "V_yva",
    suffixes: Tuple[str, ...] = ("ивати", "ывати"),
) -> pd.DataFrame:
    """
    Create column V_yva and assign Boolean True if valid suffix, else False
    """

    # 0) Show columns before modification (debugging / sanity check)
    df_cols_before = set(df.columns.tolist())

    # 1) Build mask: lemma ends with one of the suffixes
    mask = df[anchor].str.endswith(suffixes, na=False)

    # 2) Check if column already exists
    if new_col in df.columns:
        print("Column already exists")
        return df

    # 3) Insert column after anchor if possible
    if anchor in df.columns:
        insert_pos = df.columns.get_loc(anchor) + 1
        df.insert(insert_pos, new_col, mask)
    else:
        print(f"Column '{anchor}' not found – appending '{new_col}' at the end.")
        df[new_col] = mask

    # 4) Show newly added columns
    df_cols_after = set(df.columns.tolist())
    print(f"Newly created col: {df_cols_after - df_cols_before}")

    return df

In [9]:
# df ist dein bestehendes DataFrame
df = add_v_yva_column(df)

Newly created col: {'V_yva'}


# A) II a. To find prefixed verbs, i.e. in order to populate col "V_prefix":
## Implemen base_prefixes as regex variants 

In [10]:
# considers all prefixes mentioned in Zanchi & Naccarato (2016, 368) 
# as well as all prefixes known in Modern Standard Russian (MSR)
base_prefixes = [
    'без', 'вз', 'воз', 'вос', 'вс', 'въ', 'вы', 'до', 'за', 'из', 'изо',
    'ис', 'на', 'над', 'надо', 'о', 'об', 'обо', 'от', 'ото', 'пере', 'по',
    'под', 'подо', 'пре', 'пред', 'предо', 'при', 'про', 'прѣ',
    'раз', 'разо', 'рас', 'роз', 'рос', 'с', 'со', 'у', 'через'
]

# base_prefixes may have different graphemic variants in Old East Slavic 
variants = {
    "без":   r"бе[зѕс]ь?",
    "в":     r"в[ъьо]?",
    "вз":    r"в[ъьо][зс]?",
    "из":    r"и[зсѕ]ь?",
    "раз":   r"р[ао][зсѕ]ъ?",
    "с":     r"с[ъьо]?",
    "воз":   r"в[ъьо][сз]",
    "над":   r"над[ъьо]?",
    "об":    r"об[ъьо]?",
    "от":    r"от[ъьо]?",
    "через": r"чере[зс][ъьо]?",
}

# ---------- 1) build combined regex ----------
def build_combined_regex(bases, special):
    patterns = [
        special.get(p, re.escape(p))    # fallback = prefix itself
        # longest prefix first > so that e.g. {pred-} is found before {pre-} etc.
        # to guarantee that the correct prefix is found 
        for p in sorted(bases, key=len, reverse=True)  
    ]
    return r'^(' + '|'.join(patterns) + ')'

combined_regex = build_combined_regex(base_prefixes, variants)

# A) II b. Implement col "V_prefix" -> Value: Boolean (True/False) 
## True: if verb is prefixed, else False

In [11]:
def add_V_prefix_column(df: pd.DataFrame, combined_regex: str) -> pd.DataFrame:
    """
    Create col 'V_prefix',
    which is True for (POS=='V-') if one of the prefix patterns is found in col 'Lemma'.
    Insert col 'V_prefix' to the right of col 'V_yva'. 
    """
    df = df.copy()
    # 1) Initialize "V_prefix" with value False
    df['V_prefix'] = False

    # 2) Mask for verbs only (i.e. POS == "V-")
    mask_verbs = df['POS'] == 'V-'
    # Mask for regex match only 
    mask_prefix = df['Lemma'].astype(str).str.contains(combined_regex, regex=True, na=False)
    # set True in col 'V_prefix' if condition of both masks is True
    df.loc[mask_verbs & mask_prefix, 'V_prefix'] = True

    # 3) adjust column order: insert 'V_prefix' right to col 'V_yva'
    cols = list(df.columns)
    if 'V_yva' in cols:
        idx = cols.index('V_yva')
        cols.remove('V_prefix')
        cols.insert(idx + 1, 'V_prefix')
        df = df[cols]
    return df

In [12]:
df = add_V_prefix_column(df, combined_regex)

  mask_prefix = df['Lemma'].astype(str).str.contains(combined_regex, regex=True, na=False)


# A) III. Implement col "V_nuti" -> Value: Boolean (True/False)
## True: if verb ends in {-nuti}, else False 

In [13]:
def add_V_nuti_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create col 'V_nuti'. 
    For all (POS=='V-'): if 'Lemma_norm' ends in {-нути} ({-nuti}), 'V_yva' is True, else False. 
    Insert col right next to 'V_yva'.

    Args:
        df (pd.DataFrame)
    Returns:
        pd.DataFrame: with col 'V_nuti' 
    """
    # 1) filter for verbs only 
    df_verbs = df[df["POS"] == "V-"]

    # 2) build mask: Lemma_norm ends in "нуть"
    mask_nuti = df_verbs["Lemma_norm"].str.endswith("нути", na=False)

    # 3) Create new col for Boolean for all cols 
    # (not only cols containing verbs, i.e. "POS"=="V-")
    #  Defautlt: set all values to "False"
    df["V_nuti"] = False
    # reset those to True if Verb ends in {-nuti}
    df.loc[df_verbs.index[mask_nuti], "V_nuti"] = True

    # 4) Adjust col order: find index of col 'V_yva'
    cols = list(df.columns)
    if "V_yva" in cols:
        idx_v_yva = cols.index("V_yva")
        # del col 'V_nuti' (as it is at the end of df right now)
        cols.remove("V_nuti")
        # Insert 'V_nuti' right after 'V_yva'
        cols.insert(idx_v_yva + 1, "V_nuti")
        # Return DataFrame with new col order 
        return df[cols]
    else:
        # If no 'V_yva' in DF, 'V_nuti' stays at the end of the df 
        return df

In [14]:
df= add_V_nuti_column(df)

# B) Print combinations of "V_yva", "V_prefix", "V_nuti"

In [15]:
# 1) Define masks 
mask_v   = df["POS"]=="V-"
mask_yva = df.get("V_yva", False)==True
mask_pre = df.get("V_prefix", False)==True
mask_nut = df.get("V_nuti", False)==True

# 2) Exclusive Masks
only_yva       = mask_v & mask_yva & ~mask_pre & ~mask_nut
only_prefix    = mask_v & mask_pre & ~mask_yva & ~mask_nut
only_nuti      = mask_v & mask_nut & ~mask_yva & ~mask_pre

yva_and_pref   = mask_v & mask_yva & mask_pre & ~mask_nut
yva_and_nuti   = mask_v & mask_yva & mask_nut & ~mask_pre
pref_and_nuti  = mask_v & mask_pre & mask_nut & ~mask_yva

all_three      = mask_v & mask_yva & mask_pre & mask_nut

# 3) Count
def cnt(m):
    sub = df[m]
    return len(sub), sub["Lemma_norm"].nunique()


print("Entities: (Form: (verb form count, lemma count))")
print("Only yva:\t\t",       cnt(only_yva))
print("Only prefix:\t\t",    cnt(only_prefix))
print("Only nuti:\t\t",      cnt(only_nuti))
print("prefix ∧ yva:\t\t",  cnt(yva_and_pref))
print("yva ∧ nuti:\t\t",    cnt(yva_and_nuti))
print("prefix ∧ nuti:\t\t", cnt(pref_and_nuti))
print("prefix ∧ yva ∧ nuti:\t",     cnt(all_three))

# 4) Summen
total_exclusive = sum(cnt(m)[0] for m in [only_yva,only_prefix,only_nuti,
                                         yva_and_pref,yva_and_nuti,
                                         pref_and_nuti,all_three])
print()
print("Sum of verbs for which one of the above conditions is true:", total_exclusive)
print("All verbs:", len(df[mask_v]))

Entities: (Form: (verb form count, lemma count))
Only yva:		 (132, 10)
Only prefix:		 (23947, 2844)
Only nuti:		 (120, 38)
prefix ∧ yva:		 (471, 152)
yva ∧ nuti:		 (0, 0)
prefix ∧ nuti:		 (664, 157)
prefix ∧ yva ∧ nuti:	 (0, 0)

Sum of verbs for which one of the above conditions is true: 25334
All verbs: 42652


# Create a separate col where all above masks are summed up to one unique property 

In [16]:
def add_V_COMB_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a column "V_COMB" according to the logic below: 
      - POS != "V-" → "0"
      - POS == "V-" & no Flags True  → "1"
      - if entry is not a verb: df["V_COMB"] == 0
      - if entry is Verb and has none of the above = NONE 
      - other output in "V_COMB": 
        pref, pref_nu, pref_yva, yva, nu 

    - Insert Column right to "V_nuti" if the latter exists, else to the end of the df columns 
    - Important: Values can never overwrite each other, i.e. 
    "pref_nu" cannot be overwritten by "pref" or vice versa
    """
    def make_comb(row):
        # 1) If entry is not a verb: 
        if row.get("POS") != "V-":
            return "0"
        
        # 2) Collect all flags set in order: YVA (2), PREFIX (3), NUTI (4)
        comb = ""
        if row.get("V_prefix", False):
            comb += "_pref"
        if row.get("V_yva", False):
            comb += "_yva"
        if row.get("V_nuti", False):
            comb += "_nu"
        
        # 3) if none of the 3 flags is True, return "1"
        return comb[1:] if comb else "NONE"
    
    # Create new column "V_COMB":
    df["V_COMB"] = df.apply(make_comb, axis=1)
    
    # Adjust col order:  set col "V_COMB" on the right to col "V_nuti"
    cols = list(df.columns)
    if "V_nuti" in cols:
        idx_nuti = cols.index("V_nuti")
        cols.remove("V_COMB")         # V_COMB at the very right 
        cols.insert(idx_nuti + 1, "V_COMB")
        return df[cols]
    else:
        # If V_nuti not existent then V_COMB stays at the very right
        return df

In [17]:
df = add_V_COMB_column(df)

In [18]:
column_count = len(df.columns)

In [19]:
pd.set_option('display.max_columns',column_count)
pd.set_option('display.max_rows',100)
print(f"num of cols in df: ", column_count)
df.head(10)

num of cols in df:  28


Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,Lemma_norm,V_yva,V_nuti,V_COMB,V_prefix,POS,Morphology,Head ID,Relation,Presentation After,Russian Translation,English Translation,Type,century,exact,lang,region,Negation,Negation_Marker,place,Sentence_Text
0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,се,False,False,0,False,I-,---------n,,voc,,"вот, это","behold, here is",OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,аз_,False,False,0,False,Pp,1s---mn--i,2157784.0,sub,,я,I,OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
2,mst,Mstislav’s letter,orv,189407,2157775,мьстиславъ,мьстиславъ,м_стислав_,False,False,0,False,Ne,-s---mn--i,2157774.0,apos,,Мстислав,Mstislav,OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
3,mst,Mstislav’s letter,orv,189407,2157776,володимирь,володимирь,володимир_,False,False,0,False,A-,-s---mnpsi,2157777.0,atr,,Владимира,Vladimir's,OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
4,mst,Mstislav’s letter,orv,189407,2157777,сн҃ъ,сынъ,сын_,False,False,0,False,Nb,-s---mn--i,2157775.0,apos,,сын,son,OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
5,mst,Mstislav’s letter,orv,189407,2157778,дьржа,дьржати,д_ржати,False,False,NONE,False,V-,-sppamn-si,2157784.0,xadv,,"держать, задерживать, иметь, блюсти, возвр:дер...","hold, possess, keep, refl:keep",OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
6,mst,Mstislav’s letter,orv,189407,2157779,рѹськѹ,русьскыи,рус_скыи,False,False,0,False,A-,-s---fapsi,2157780.0,atr,,русский,Rusian,OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
7,mst,Mstislav’s letter,orv,189407,2157780,землю,земля,земля,False,False,0,False,Nb,-s---fa--i,2157778.0,obj,,земля,"land, soil, ground",OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
8,mst,Mstislav’s letter,orv,189407,2157781,въ,въ,в_,False,False,0,False,R-,---------n,2157778.0,adv,,"в, на, за","in, into, on, at, for",OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"
9,mst,Mstislav’s letter,orv,189407,2157782,своѥ,свои,свои,False,False,0,False,Pt,3s---na--i,2157783.0,atr,,свой,"self’s, own",OR,12,1130.0,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))"


# How many unique tokens exist (might be of the same type, here: Lemma)

In [20]:
df.V_COMB.value_counts()

V_COMB
0           192623
pref         23947
NONE         17318
pref_nu        664
pref_yva       471
yva            132
nu             120
Name: count, dtype: int64

In [21]:
df.V_COMB.value_counts().sum()

np.int64(235275)

In [22]:
df.to_csv("OUTPUTS/dataframe_02_5.csv")

# For each combination: get the according lemmas ("Lemma_norm") and count 
# their occurrences
## Reference:
df["Lemma_norm"], df["V_COMB"]

In [23]:
def lemma_counts_by_vcomb(df: pd.DataFrame) -> Dict[str, Dict[str, int]]:
    """
    Get all unique values from V_COMB, 

    Return a nested dictionary of the form:
    {
        V_COMB_value: {
            Lemma_norm: frequency,
            ...
        },
        ...
    }
    """
    return {
        comb: subdf["Lemma_norm"].value_counts().to_dict()
        for comb, subdf in df.groupby("V_COMB")
    }

# call function
counts = lemma_counts_by_vcomb(df)

In [24]:
# get list of unique V_COMB values 
vcomb_list = df["V_COMB"].unique().tolist()

# print first ten values per key to console: 
for v in sorted(vcomb_list):
    if v != "0":
        top10 = list(counts.get(v, {}).items())[:10]
        print(f"{v}: {top10}\n")

NONE: [('быти', 4817), ('рещи', 1131), ('ити', 754), ('глаголати', 573), ('хотети', 556), ('видети', 452), ('дати', 436), ('имети', 356), ('творити', 281), ('бити', 265)]

nu: [('минути', 24), ('д_рзнути', 15), ('кликнути', 13), ('кынути', 10), ('т_лкнути', 5), ('тонути', 4), ('коснути', 3), ('тянути', 3), ('треснути', 3), ('тронути', 3)]

pref: [('приити', 887), ('с_творити', 495), ('пос_лати', 491), ('поити', 463), ('в_зяти', 436), ('начати', 356), ('стати', 295), ('слышати', 280), ('прияти', 262), ('стояти', 228)]

pref_nu: [('побегнути', 73), ('погыбнути', 55), ('помянути', 34), ('прибегнути', 26), ('в_здвигнути', 24), ('ужаснути', 17), ('покынути', 16), ('ус_нути', 16), ('в_зд_хнути', 15), ('постигнути', 15)]

pref_yva: [('пребывати', 67), ('призывати', 34), ('наказывати', 34), ('с_казывати', 28), ('с_прашивати', 17), ('забывати', 15), ('убивати', 12), ('почивати', 11), ('избивати', 10), ('проливати', 8)]

yva: [('бывати', 122), ('давывати', 2), ('живати', 1), ('пуживати', 1), ('п

In [25]:
# 1. Sum of all counts
total_counts = sum(
    freq
    for subdict in counts.values()
    for freq in subdict.values()
)

# 2. Number of unique lemmas over all groups 
unique_lemmas = {
    lemma
    for subdict in counts.values()
    for lemma in subdict.keys()
}
num_unique_lemmas = len(unique_lemmas)

print(f"Sum of all counts in all groups V_COMBI: {total_counts:_}")
print(f"Sum of unique Lemma_norm: {num_unique_lemmas:_}")

Sum of all counts in all groups V_COMBI: 229_155
Sum of unique Lemma_norm: 12_768


## For each combination: write the verbs and frequencies to csv 

In [26]:
# 1. Create output directory
output_dir = "VERBS_BY_TYPE"
os.makedirs(output_dir, exist_ok=True)

# 2. und 3.: write each combi with lemmas and freqs to csv file
for vcomb, subdict in counts.items():
    # skip "0" -> that is simplex verbs 
    if vcomb == "0":
        continue

    # Convert Subdict (i.e. inner dictionary) to DataFrame
    df_group = pd.DataFrame(
        list(subdict.items()),
        columns=["Lemma_norm", "Count"]
    )

    # File name based on V_COMB entry 
    fname = f"counts_VCOMB_{vcomb}.csv"
    path = os.path.join(output_dir, fname)

    # Write to CSV
    df_group.to_csv(path, index=False)
    print(f"Wrote file: {path}")

Wrote file: VERBS_BY_TYPE/counts_VCOMB_NONE.csv
Wrote file: VERBS_BY_TYPE/counts_VCOMB_nu.csv
Wrote file: VERBS_BY_TYPE/counts_VCOMB_pref.csv
Wrote file: VERBS_BY_TYPE/counts_VCOMB_pref_nu.csv
Wrote file: VERBS_BY_TYPE/counts_VCOMB_pref_yva.csv
Wrote file: VERBS_BY_TYPE/counts_VCOMB_yva.csv


# Create FILE with entries where 
## a. MAPPING_NONE_is_sub_of_prefix
- expected entry:
  NONE (=simplex verb) ['pref (=prefixed variants of simplex verb)]
- e.g.: begati: ['pribegati', 'pobegati', 'ot_begati', 'prebegati']

In [27]:
# --- 1) Mapping of "NONE" and "pref" lemmas ---
none_lemmas = list(counts["NONE"].keys()) # e.g. [byti, rešči, iti, ...]
pref_lemmas = list(counts["pref"].keys()) # e.g. [priiti, s_tvoriti, poiti, ...]

mapping = []
for none in none_lemmas:
    # check if none (of none_lemmas list) is substring of p (here: pref_lemma entry)
    # e.g. none = "iti" is substring of p = "poiti" > match!
    matches = [p for p in pref_lemmas if none in p]
    # if match: append dictionary entry to list mapping 
    # Output: {"NONE": "iti", "pref": ["poiti", "zaiti", ...]}
    mapping.append({"NONE": none, "pref": matches})

# from List[Dict[str,  List[str]]] -> create dataframe df_non_pref 
df_none_pref = pd.DataFrame(mapping)

# --- 2) del duplicates ---
# a) Convert to dict
## 1. from df > create Series where index is NONE, values is pref
## e.g. iti [poiti, priiti]
##      byti [pobyti]... 
# 2. convert into normal dict: i.e. { "iti": ["poiti, priiti"], "byti": [...], ...}
none_to_prefs = df_none_pref.set_index("NONE")["pref"].to_dict()

# b) Invert: collect all NONEs for each pref 
pref_to_nones = {}
for none, prefs in none_to_prefs.items():
    for p in prefs:
        # {'prebyti': ['byti'], 's_byti': ['byti'], 'zabyti': ['byti'],
        pref_to_nones.setdefault(p, []).append(none)

# c) if a pref is list element of > 1 none, 
# delete the dublicate none entries, e.g.: 
## nones = ["by", "byti"] -> best_none = "byti"
## BEFORE: none_to_prefs = {"by": ["zabyti"], "byti": ["zabyti"]}
## AFTER: none_to_prefs = {"by": [], "byti": ["zabyti"]}
for p, nones in pref_to_nones.items():
    if len(nones) > 1:
        best_none = max(nones, key=len)
        for none in nones:
            if none != best_none:
                # delete p from the list of NONE
                none_to_prefs[none] = [x for x in none_to_prefs[none] if x != p]

# d) New DataFrame from cleaned dict, sorted by NONE
df_clean = pd.DataFrame([
    {"NONE": none, "pref": none_to_prefs[none]}
    for none in sorted(none_to_prefs.keys())
])

# --- 3) Create output dir and save new csv file inside of it  ---
output_dir = "VERBS_BY_TYPE"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "MAPPING_NONE_is_sub_of_prefix.csv")
df_clean.to_csv(output_path, index=False)
print(f"Wrote mapping to csv: {os.path.join(output_dir, 
                                           'MAPPING_NONE_is_sub_of_prefix.csv')}")

Wrote mapping to csv: VERBS_BY_TYPE/MAPPING_NONE_is_sub_of_prefix.csv


# b.MAPPING_pref_nu_from_none

In [28]:
# Define what has to be deleted in order to get the search string 
## (here: none_suffix_regex, consisting of 1 char + "-ti" at the end) 
## ex.: {-ati}, {-iti} etc. 
none_suffix_regex = re.compile(r'.{1}ти$')
# Regex for PREFIX for verbs of type pref_nu
prefix_regex = re.compile(
    r'^(?:бе[зс]|в_?з|из_?|раз_?|рас_?|с_?|вос_?|над_?|об_?|от_?|чере[зс])'
)
# Regex for SUFFIX for verbs of type pref_nu
suffix_regex = re.compile(r'нути$')

# create list of all "NONE" lemmas 
none_raw = list(counts["NONE"].keys())
# for NONE lemmas (as keys): 
## create a list of stems (w/o stem vowel and infinitive ending) of NONE lemmas  
## save the list as value of the corresponding key  
none_cores = {none: none_suffix_regex.sub("", none) for none in none_raw}

# 2. Roh-Mapping NONE -> pref_nu (Substring-Match auf den Kern)
# create list of all "pref_nu" lemmas 
pref_nu_lemmas = list(counts["pref_nu"].keys())
none_to_nus = {}
# Iterate over the list "none_raw" (i.e. non-prefixex verbs)
# (i.e. the list w/ all "none" lemmas, i.e ["iti", "dati",...])
for none in none_raw:
    core = none_cores[none]
    matches = [p for p in pref_nu_lemmas if core in p]
    none_to_nus[none] = matches

# 3. Duplikat-Bereinigung
# a) Invert: for each pref_nu: collect all NONE
nu_to_nones = {}
for none, nus in none_to_nus.items():
    for nu in nus:
        nu_to_nones.setdefault(nu, []).append(none)

# b) In case of multiple assignments, keep the pref_nu verb
#    only with the longest matching NONE base
for nu, nones in nu_to_nones.items():
    if len(nones) > 1:
        beste_none = max(nones, key=len)
        for none in nones:
            if none != beste_none:
                none_to_nus[none].remove(nu)

# 4. Create and save DataFrame (sorted alphabetically by verbs of columns NONE)
df_nu_clean = pd.DataFrame([
    {"NONE": none, "pref_nu": none_to_nus[none]}
    for none in sorted(none_to_nus.keys())
])

output_dir = "VERBS_BY_TYPE"
os.makedirs(output_dir, exist_ok=True)
df_nu_clean.to_csv(
    os.path.join(output_dir, "MAPPING_nu_from_NONE.csv"),
    index=False
)

print(f"Wrote mapping to csv: {os.path.join(output_dir, 'MAPPING_nu_from_NONE.csv')}")

Wrote mapping to csv: VERBS_BY_TYPE/MAPPING_nu_from_NONE.csv
