# File 01/01

# DESCRIPTION: 

# INPUT_FILES:
- ../source_data/treebank-releases-20180919/ # xml files (afnik.xml usw. )
- ../source_data/translations/orv.xml # Translations for the Old East Slavic words
- ../source_data/translations/chu.xml # Translations for the (Old) Church Slavonic words
# OUTPUT_FILE:
- ./OUTPUTS/dataframe_01_02.csv'

In [1]:
import os
from pathlib import Path
import shutil
import glob
from typing import List, Tuple

import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm

In [2]:
output_dir = "./OUTPUTS"

if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

In [3]:
os.makedirs("./OUTPUTS", exist_ok=True)

In [None]:
# Define paths for translation files (containing the translations for the Old East Slavic texts of the treebank releases)
TRANSLATION_FILES = {
    "orv": "../source_data/translations/orv.xml",
    "chu": "../source_data/translations/chu.xml"
}


class Token:
    def __init__(self, token_id: str, form: str, lemma: str, pos: str,
                 morphology: str, head_id: str, relation: str, presentation_after: str):
        """
        Classes reflecting the PROIEL input files structure
        """
        self.token_id = token_id
        self.form = form
        self.lemma = lemma
        self.part_of_speech = pos
        self.morphology = morphology
        self.head_id = head_id
        self.relation = relation
        self.presentation_after = presentation_after

class Sentence:
    def __init__(self, sentence_id: str, tokens: List[Token]):
        self.sentence_id = sentence_id
        self.tokens = tokens

class Text:
    def __init__(self, title: str, sentences: List[Sentence], language: str):
        self.title = title
        self.sentences = sentences
        self.language = language


def parse_proiel_xml_with_language(file_path: str) -> Tuple[List[Text], str]:
    """
    Function to parse a PROIEL XML file and determine the language used in it
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    source_element = root.find("source")
    if source_element is None:
        raise ValueError("<source> Element nicht gefunden!")

    # Sprache aus dem <source>-Attribut extrahieren
    language = source_element.attrib.get("language", "unknown")

    texts = []
    for div in source_element.findall("div"):
        title = div.find("title").text if div.find("title") is not None else "Untitled"
        sentences = []
        
        for sentence_elem in div.findall("sentence"):
            sentence_id = sentence_elem.attrib.get("id", "Unknown")
            tokens = []
            
            for token_elem in sentence_elem.findall("token"):
                tokens.append(Token(
                    token_elem.attrib.get("id", "Unknown"),
                    token_elem.attrib.get("form", ""),
                    token_elem.attrib.get("lemma", ""),
                    token_elem.attrib.get("part-of-speech", ""),
                    token_elem.attrib.get("morphology", ""),
                    token_elem.attrib.get("head-id", ""),
                    token_elem.attrib.get("relation", ""),
                    token_elem.attrib.get("presentation-after", "")
                ))
            
            sentences.append(Sentence(sentence_id, tokens))
        
        texts.append(Text(title, sentences, language))
    
    return texts, language

def build_translation_dict(translation_root: ET.Element) -> dict:
    """
    Create a translation dictionary from one of the xml files 
    """
    translation_dict = {}
    for lemma in translation_root.findall(".//lemma[@lemma]"):
        lemma_word = lemma.attrib['lemma']
        pos = lemma.attrib.get('part-of-speech', '')
        key = (lemma_word, pos)
        glosses = lemma.find('glosses')
        russian_gloss = ""
        english_gloss = ""
        if glosses is not None:
            for gloss in glosses.findall('gloss'):
                lang = gloss.attrib.get('language')
                if lang == "rus":
                    russian_gloss = gloss.text or ""
                elif lang == "eng":
                    english_gloss = gloss.text or ""
        translation_dict[key] = (russian_gloss, english_gloss)
    return translation_dict


def load_translation_file(language: str) -> dict:
    """
    Function to load the translation files based on the language
    """
    translation_file = TRANSLATION_FILES.get(language)
    if not translation_file or not os.path.exists(translation_file):
        raise ValueError(f"Keine gültige Übersetzungsdatei für Sprache '{language}' gefunden!")

    # Parse translation file and create dictionary 
    tree = ET.parse(translation_file)
    root = tree.getroot()
    return build_translation_dict(root)


def texts_to_df(texts: List[Text], translation_dict: dict, file_stem: str) -> pd.DataFrame:
    """
    Create DataFrame from the parsed texts including the translations
    """
    data = []
    for text in texts:
        for sentence in text.sentences:
            for token in sentence.tokens:
                key = (token.lemma, token.part_of_speech)
                russian_gloss, english_gloss = translation_dict.get(key, ("", ""))
                data.append({
                    "File": file_stem,
                    "Text Title": text.title,
                    "Language": text.language,
                    "Sentence ID": sentence.sentence_id,
                    "Token ID": token.token_id,
                    "Form": token.form,
                    "Lemma": token.lemma,
                    "POS": token.part_of_speech,
                    "Morphology": token.morphology,
                    "Head ID": token.head_id,
                    "Relation": token.relation,
                    "Presentation After": token.presentation_after,
                    "Russian Translation": russian_gloss,
                    "English Translation": english_gloss
                })
    return pd.DataFrame(data)

# ---------------------------
# Main 
# ---------------------------
if __name__ == "__main__":
    # input file for Old East Slavic 
    input_directory = "../source_data/treebank-releases-20180919/"
    xml_files = glob.glob(os.path.join(input_directory, "*.xml"))

    # empty list
    df_list = []


    # Iterate over xml_files 
    ## Use tqdm to show process line
    for file in tqdm(xml_files, desc="Processing XML files"):
        try:
            # Parse file and recognize language
            texts, language = parse_proiel_xml_with_language(file)
            
            # Load corresponding translation file
            if language in TRANSLATION_FILES:
                translation_dict = load_translation_file(language)
            else:
                print(f"Warning: No file for translation for langauge '{language}' found,\
                No translation was added.")
                translation_dict = {}

            # Get file's basename -> no extension, no path
            file_stem = os.path.splitext(os.path.basename(file))[0]

            # Create DF 
            df_temp = texts_to_df(texts, translation_dict, file_stem)
            df_list.append(df_temp)

        # If exception -> throw error and print eror message   
        except Exception as e:
            print(f"Error handling file {file}: {e}")

    # Concatenate all dfs to one huge DataFrame
    if df_list:
        df_all = pd.concat(df_list, ignore_index=True)
        print(df_all)
    else:
        print("Files could not be processed.")


In [5]:
display(df_all[:5])
print(len(df_all))

Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,POS,Morphology,Head ID,Relation,Presentation After,Russian Translation,English Translation
0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,I-,---------n,,voc,,"вот, это","behold, here is"
1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,Pp,1s---mn--i,2157784.0,sub,,я,I
2,mst,Mstislav’s letter,orv,189407,2157775,мьстиславъ,мьстиславъ,Ne,-s---mn--i,2157774.0,apos,,Мстислав,Mstislav
3,mst,Mstislav’s letter,orv,189407,2157776,володимирь,володимирь,A-,-s---mnpsi,2157777.0,atr,,Владимира,Vladimir's
4,mst,Mstislav’s letter,orv,189407,2157777,сн҃ъ,сынъ,Nb,-s---mn--i,2157775.0,apos,,сын,son


317282


In [6]:
# Type mapping for "File" and "Lang" -> Langs: (OCS/CS: (Old) Church Slavonic, OR: Old Russian)
type_mapping = {
    "OCS": ["supr", "zogr", "kiev-mis", "psal-sin"],
    "CS": ["vit-const", "vit-meth"],
    "OR": [
        "afnik", "birchbark", "smol-pol-lit", "mstislav-col", "ostromir-col", "peter", "domo", "sergrad",
        "schism", "pskov-ivan", "rig-smol1281", "mst", "nov-marg", "novgorod-jaroslav", "rusprav",
        "ust-vlad", "dux-grjaz", "riga-goth", "nov-sin", "kiev-hyp", "avv", "nov-list", "pvl-hyp",
        "lav", "suz-lav", "drac", "spi", "luk-koloc", "pskov", "const", "usp-sbor", "varlaam",
        "vest-kur", "zadon"
    ]
}

# Fill new col "Type" containing the language of the File (UNK (unknown) if unknown)
def get_type(file_id):
    for lang, ids in type_mapping.items():
        if file_id in ids:
            return lang
    return "UNK" 
    
# Function call 
df_all["Type"] = df_all["File"].apply(get_type)

In [7]:
display(df_all[:5])
print(len(df_all))

Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,POS,Morphology,Head ID,Relation,Presentation After,Russian Translation,English Translation,Type
0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,I-,---------n,,voc,,"вот, это","behold, here is",OR
1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,Pp,1s---mn--i,2157784.0,sub,,я,I,OR
2,mst,Mstislav’s letter,orv,189407,2157775,мьстиславъ,мьстиславъ,Ne,-s---mn--i,2157774.0,apos,,Мстислав,Mstislav,OR
3,mst,Mstislav’s letter,orv,189407,2157776,володимирь,володимирь,A-,-s---mnpsi,2157777.0,atr,,Владимира,Vladimir's,OR
4,mst,Mstislav’s letter,orv,189407,2157777,сн҃ъ,сынъ,Nb,-s---mn--i,2157775.0,apos,,сын,son,OR


317282


In [8]:
unique_files         = df_all["File"].unique()
print(unique_files)
print(len(unique_files))

['mst' 'mstislav-col' 'supr' 'vit-meth' 'birchbark' 'pskov' 'const'
 'luk-koloc' 'lav' 'smol-pol-lit' 'nov-sin' 'avv' 'kiev-hyp' 'peter'
 'vest-kur' 'spi' 'zadon' 'rusprav' 'pskov-ivan' 'rig-smol1281' 'zogr'
 'drac' 'sergrad' 'nov-list' 'kiev-mis' 'ostromir-col' 'varlaam' 'afnik'
 'dux-grjaz' 'vit-const' 'ust-vlad' 'riga-goth' 'domo' 'usp-sbor' 'schism'
 'nov-marg' 'suz-lav' 'novgorod-jaroslav' 'pvl-hyp' 'psal-sin']
40


In [9]:
df_all.to_csv('OUTPUTS/dataframe_01_02.csv')

In [10]:
assert len(unique_files) == 40, "Unexpected number of input files"
assert Path('OUTPUTS/dataframe_01_02.csv').exists(), "Error creating output file"