# FILE 02/1

## DESCRIPTION: 
### Adds the columns ["century","exact","lang","source","place"] to the DataFrame 

## INPUTFILES:
### ../01/OUTPUTS/dataframe_01_02.csv 
### ./metadata/metadata.yaml

## OUTPUTFILE: 
### ./OUTPUTS/dataframe_02_1.csv

In [1]:
import os 
import shutil
import re
import yaml
import pandas as pd 

## Delete old output files created with this script

In [2]:
output_dir = "./OUTPUTS"

# Start every run from an empty output directory: remove the existing one
# together with everything inside it, then create it anew.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

## Read DataFrame 

In [3]:
# Load the token table produced by step 01; both translation columns are
# read with the pandas "string" dtype.
df_ = pd.read_csv(
    "../01/OUTPUTS/dataframe_01_02.csv",
    dtype={"Russian Translation": "string", "English Translation": "string"},
)

# 'Head ID' has gaps, so it is converted to the nullable integer dtype;
# values that cannot be parsed as numbers become <NA>.
head_id_numeric = pd.to_numeric(df_['Head ID'], errors='coerce')
df_['Head ID'] = head_id_numeric.astype('Int64')
display(df_)

Unnamed: 0.1,Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,POS,Morphology,Head ID,Relation,Presentation After,Russian Translation,English Translation,Type
0,0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,I-,---------n,,voc,,"вот, это","behold, here is",OR
1,1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,Pp,1s---mn--i,2157784,sub,,я,I,OR
2,2,mst,Mstislav’s letter,orv,189407,2157775,мьстиславъ,мьстиславъ,Ne,-s---mn--i,2157774,apos,,Мстислав,Mstislav,OR
3,3,mst,Mstislav’s letter,orv,189407,2157776,володимирь,володимирь,A-,-s---mnpsi,2157777,atr,,Владимира,Vladimir's,OR
4,4,mst,Mstislav’s letter,orv,189407,2157777,сн҃ъ,сынъ,Nb,-s---mn--i,2157775,apos,,сын,son,OR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317277,317277,psal-sin,54,chu,219251,2339057,нізъведеши,низъвести,V-,2spia----i,,pred,,,,OCS
317278,317278,psal-sin,54,chu,219251,2339058,ѩ,и,Pp,3p---pa--i,2339057,obj,,,,OCS
317279,317279,psal-sin,54,chu,219251,2339059,въ,въ,R-,---------n,2339057,obl,,,,OCS
317280,317280,psal-sin,54,chu,219251,2339060,стѹденецъ,стоуденьць,Nb,-s---ma--i,2339059,obl,,,,OCS


## Remove files that are not needed (here: all files that are NOT Old East Slavic, i.e. the (Old) Church Slavonic codices)

In [4]:
# Files to exclude from the analysis
files_to_remove = ['supr', 'zogr', 'kiev-mis', 'vit-const', 'vit-meth', 'psal-sin']

# Keep only the rows whose 'File' is not on the exclusion list
is_removed_file = df_['File'].isin(files_to_remove)
df2 = df_.loc[~is_removed_file]

In [5]:
# Sanity check: none of the excluded files may be left in df2
assert not df2['File'].isin(files_to_remove).any()

In [6]:
# Show which files remain after filtering
all_files = set(df2["File"].unique())
print(all_files)

{'nov-sin', 'const', 'mstislav-col', 'vest-kur', 'ust-vlad', 'nov-marg', 'ostromir-col', 'pskov-ivan', 'riga-goth', 'usp-sbor', 'novgorod-jaroslav', 'nov-list', 'peter', 'dux-grjaz', 'sergrad', 'spi', 'suz-lav', 'smol-pol-lit', 'pvl-hyp', 'pskov', 'mst', 'schism', 'drac', 'birchbark', 'kiev-hyp', 'afnik', 'varlaam', 'domo', 'rusprav', 'avv', 'rig-smol1281', 'lav', 'luk-koloc', 'zadon'}


In [7]:
def _split_yaml_titles(yaml_entries):
    """Sort the titles of one file's YAML entries into (global wildcard?, exact titles, prefixes)."""
    wildcard_global = False
    exact_titles = set()
    prefix_titles = set()

    for entry in yaml_entries:
        # An entry is either a bare title string or a mapping carrying the
        # title under "title" (preferred) or "text"; anything else is ignored.
        if isinstance(entry, str):
            title = entry
        elif isinstance(entry, dict):
            title = entry.get("title") or entry.get("text")
        else:
            continue

        if not isinstance(title, str):
            continue

        title = title.strip()

        if title == "*":
            wildcard_global = True
        elif title.endswith(" *"):
            # Drop the trailing " *"; what remains is the prefix to match.
            prefix_titles.add(title[:-2])
        else:
            exact_titles.add(title)

    return wildcard_global, exact_titles, prefix_titles


def compare_titles(df, yaml_path):
    """
    Compare DF Text Titles with YAML metadata titles.

    Semantics:
    - Exact "*"            -> global wildcard: all DF titles are accepted
    - "<prefix> *"         -> prefix wildcard: matches all DF titles starting with <prefix>
    - everything else      -> exact title match

    Parameters:
    - df        : DataFrame with the columns "File" and "Text Title"
    - yaml_path : path of the YAML file mapping each file key to a list of entries

    Returns:
    - dict keyed by every value of df["File"]; each value is a dict of three sets:
        "only_in_df"   : DF titles matched by no YAML entry
        "only_in_yaml" : exact YAML titles absent from the DF, plus prefix
                         wildcards (reported as "<prefix> *") matching no DF title
        "in_both"      : DF titles matched by a YAML entry
      For a file with a global wildcard, both "only_in_*" sets are empty and
      "in_both" holds all DF titles.

    Missing or non-string DF titles are ignored. A file that is absent from
    the YAML, or whose YAML value is empty, has all its titles in "only_in_df".
    """

    # --- Load YAML ---
    with open(yaml_path, "r", encoding="utf-8") as f:
        # An empty YAML document loads as None; treat it as "no metadata".
        meta_info = yaml.safe_load(f) or {}

    result = {}

    # --- Iterate over files ---
    for file in df["File"].unique():

        # DF titles of this file
        df_titles = {
            title
            for title in df.loc[df["File"] == file, "Text Title"].dropna().unique()
            if isinstance(title, str)
        }

        # A key without a value loads as None; treat it like a missing key.
        yaml_entries = meta_info.get(file) or []
        wildcard_global, exact_titles, prefix_titles = _split_yaml_titles(yaml_entries)

        # --- Global wildcard: accept all DF titles ---
        if wildcard_global:
            result[file] = {
                "only_in_df": set(),
                "only_in_yaml": set(),
                "in_both": df_titles,
            }
            continue

        # --- Match DF titles ---
        # str.startswith accepts a tuple and returns False for an empty one.
        prefixes = tuple(prefix_titles)
        matched_df_titles = {
            t for t in df_titles
            if t in exact_titles or t.startswith(prefixes)
        }

        # Prefix wildcards that matched nothing are stale YAML entries too.
        unmatched_prefixes = {
            f"{p} *" for p in prefix_titles
            if not any(t.startswith(p) for t in df_titles)
        }

        result[file] = {
            "only_in_df": df_titles - matched_df_titles,
            "only_in_yaml": (exact_titles - matched_df_titles) | unmatched_prefixes,
            "in_both": matched_df_titles,
        }

    return result

# Function call
yaml_path  = "./metadata/metadata.yaml"
comparison = compare_titles(df2, yaml_path)

# Collect every file that has DataFrame titles not covered by the YAML.
# There is deliberately no assert inside this step: failing on the first
# mismatch would abort the cell before the report below is printed.
mismatches = [file for file in all_files if len( comparison[file]['only_in_df'] ) != 0]

# Print the details of all mismatching files
for file in mismatches:
    print(f"_______________FILE___ {file}______________________\n")
    print(f"{file} – ONLY IN DF:", comparison[file]['only_in_df'])
    print(f"{file} – ONLY IN YAML:", comparison[file]['only_in_yaml'])
    print("\n")


# Fail only after the full report has been shown
assert len(mismatches) == 0, "Error: MISMATCH: YAML <> DF"

## Join DataFrame and YAML-File: 

In [8]:
def enrich_with_wildcard(df_in, yaml_path):
    """
    Return a copy of df_in with the metadata columns
    "century", "exact", "lang", "source", "place" filled from the YAML file.

    The YAML maps each file key (e.g. afnik) to a list of entries. An entry
    is a mapping holding a title under "text" or "title" plus any of the
    metadata fields; its title selects the rows it applies to:
    - '*' or no title   -> all rows of the file
    - title with a '*'  -> all rows of the file whose "Text Title" starts with
                           the title once the '*' and trailing whitespace are
                           removed (e.g. 'Any Text *' covers 'Any Text 1',
                           'Any Text 2', ...)
    - anything else     -> rows of the file whose "Text Title" equals the title

    Entries are applied in YAML order, so a later entry overwrites the fields
    an earlier one set on the same rows. List values are stored as one
    ", "-joined string. Entries that are not mappings carry no metadata
    fields and are skipped. Rows matched by no entry keep None in the new
    columns. df_in itself is not modified.
    """
    meta_fields = ["century", "exact", "lang", "source", "place"]

    # Open and read from YAML file
    with open(yaml_path, "r", encoding="utf-8") as f:
        # An empty YAML document loads as None; treat it as "no metadata".
        meta_info = yaml.safe_load(f) or {}

    # Copy df, initialize empty df columns to store the metadata
    # stored in the YAML file
    df = df_in.copy()
    for col in meta_fields:
        df[col] = None

    # Iterate over the keys "file_key" of the meta_info
    for file_key, entries in meta_info.items():
        # Boolean mask selecting the rows of this file
        mask_file = df["File"] == file_key

        # A key without a value loads as None: nothing to assign.
        for e in entries or []:
            # Bare-string entries (accepted by compare_titles) have no fields.
            if not isinstance(e, dict):
                continue

            # Fetch the relevant title string (text or title)
            txt = e.get("text") or e.get("title")
            txt = txt.strip() if isinstance(txt, str) else None

            # Build the row mask for this entry
            if txt is None or txt == "*":
                # Universal wildcard: all rows of the file
                mask = mask_file
            elif "*" in txt:
                # Prefix wildcard: everything that starts with the prefix.
                # na=False keeps rows with a missing title out of the mask.
                prefix = txt.replace("*", "").rstrip()
                mask = mask_file & df["Text Title"].str.startswith(prefix, na=False)
            else:
                # Exact title
                mask = mask_file & (df["Text Title"] == txt)

            # Assign the fields present in this entry
            for field in meta_fields:
                if field in e:
                    val = e[field]
                    if isinstance(val, list):
                        # Items may be non-strings (e.g. unquoted numbers).
                        val = ", ".join(map(str, val))
                    df.loc[mask, field] = val

    return df

# Usage: add the YAML metadata columns to the filtered DataFrame
df_enriched = enrich_with_wildcard(df2, yaml_path)

inputfile = "nov-marg"
# Spot check: first distinct (title, century) pairs of the file named above
rows_of_inputfile = df_enriched["File"]==inputfile
print(df_enriched.loc[rows_of_inputfile, ["Text Title","century"]].drop_duplicates().head())

                  Text Title century
289148  September, folio 176      13
289179    October, folio 127      13
289192   September, folio 56      13
289209   September, folio 57      13
289214   September, folio 84      13


## ERROR CHECKING

In [9]:
# Every row must have a "place": collect the (File, Text Title) pairs of the
# rows where it is missing and fail if there are any.
place_is_missing = df_enriched['place'].isna()
missing_place = df_enriched.loc[place_is_missing, ['File', 'Text Title']]
assert missing_place.shape[0] == 0, "Information for 'place' missing"

In [10]:
# Every row must have a "century": list the files that contain rows where
# it is None/NaN and fail if that list is not empty.
century_is_missing = df_enriched['century'].isna()
files_missing_century = df_enriched.loc[century_is_missing, 'File'].unique().tolist()
assert not files_missing_century, "Information for 'century' missing"

In [11]:
# Show the first two rows of one file ("birchbark") to make sure the data look as expected
df_enriched.loc[df_enriched["File"].eq("birchbark")].head(2)

Unnamed: 0.1,Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,POS,Morphology,...,Relation,Presentation After,Russian Translation,English Translation,Type,century,exact,lang,source,place
79818,79818,birchbark,9,orv,210135,2287444,ѿ,отъ,R-,---------n,...,obl,,"от, из, с, у","from, of",OR,12,1160‒1180,birchbark,,Novgorod
79819,79819,birchbark,9,orv,210135,2287445,гостѧтꙑ,гостята,Ne,-s---fg--i,...,obl,,,,OR,12,1160‒1180,birchbark,,Novgorod


In [12]:
# Which distinct values occur in the column "century"?
which_centuries = pd.unique(df_enriched["century"])
print(which_centuries)

[12 '12' '14' '11' '13' '15' 15 '15/2' '17/2' '17' '16/2' 16 11 13]


# Replace century values of the form "16/2" in column "century" with "16"

In [13]:
# Normalise "century" to whole centuries stored as nullable integers.
df_enriched["century"] = (
    df_enriched["century"]
    .astype(str)               # mixed int/str values -> strings
    .str.split("/", n=1)       # "16/2" -> ["16", "2"]
    .str.get(0)                # keep only the part before the slash
    .replace("None", pd.NA)    # a stringified None becomes a proper missing value
    .astype("Int64")           # nullable integer dtype, can hold <NA>
)

## Verify century type casting

In [14]:
# Inspect the first two rows of one file to check the converted "century" column
filtered_df = df_enriched.loc[df_enriched["File"] == "dux-grjaz"]
display(filtered_df.head(2))

Unnamed: 0.1,Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,POS,Morphology,...,Relation,Presentation After,Russian Translation,English Translation,Type,century,exact,lang,source,place
235438,235438,dux-grjaz,Testament of Ivan Jurievich Grjaznoj,orv,213401,2306029,﻿Во,въ,R-,---------n,...,adv,,"в, на, за","in, into, on, at, for",OR,16,1579,OR,,Moscow
235439,235439,dux-grjaz,Testament of Ivan Jurievich Grjaznoj,orv,213401,2306030,имѧ,имя,Nb,-s---na--i,...,obl,,имя,name,OR,16,1579,OR,,Moscow


# Statistics 

In [15]:
# Number of rows (tokens of any part of speech) per "century", ordered by century
df_enriched["century"].value_counts().sort_index()

century
11    25932
12    93208
13    20447
14     3095
15    42789
16    23880
17    25924
Name: count, dtype: Int64

In [16]:
# Number of verb rows (POS == "V-") per century; this counts tokens, not distinct lemmas
df_enriched.loc[df_enriched["POS"].eq("V-"), "century"].value_counts().sort_index()

century
11     5145
12    17994
13     3115
14      455
15     7414
16     3623
17     4906
Name: count, dtype: Int64

## ERROR CHECKING: do all rows contain a value for "lang" (= language)?

In [17]:
# Rows without a "lang" value, and the files they belong to
filtered_df_lang_none = df_enriched.loc[df_enriched["lang"].isna()]
unique_files = filtered_df_lang_none["File"].unique()
print(unique_files)
nunique_files = filtered_df_lang_none["File"].nunique()
print(nunique_files)

# Fail if any row is missing its "lang" entry
assert filtered_df_lang_none.shape[0] == 0, "Error: missing entries for column 'lang'"

[]
0


# Write data to csv  

In [18]:
# Write the enriched DataFrame to the output CSV, without the index column.
# NOTE(review): the previous German note said this line was commented out on
# 12.05.2025 because the output file already existed; the line is active, so
# that note appears to be stale.
df_enriched.to_csv(path_or_buf="OUTPUTS/dataframe_02_1.csv", index=False)