# File 01/02

# Description
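- This file helps the user identify all values that can occur in the "morphology" attribute in the XML files, by collecting the value definitions from the `<annotation>`/`<morphology>` block of each file.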
- The "morphology" attributes are positional morphosyntactic tags,
i.e. each tag consists of exactly 10 characters. Each position encodes a specific
morphological feature.
- Position 0 encodes person (1 = first, 2 = second, 3 = third),
position 1 encodes number (s = singular, d = dual, etc.), and so on.
- Since different parts of speech encode different sets of features,
the placeholder "-" is used to ensure positional consistency across tags.
(See the output for further details.)

## INPUT_FILES:
- ../source_data/treebank-releases-20180919

## OUTPUT_FILE:
- NONE

In [1]:
import os
import glob
import xml.etree.ElementTree as ET
from collections import defaultdict

# Folder containing the treebank XML files
folder = "../source_data/treebank-releases-20180919"

# Sorted list of all XML files in "folder".
# glob.glob() returns paths in arbitrary order; sorting makes the
# "first definition wins" rule and the conflict warnings reproducible.
xml_files = sorted(glob.glob(os.path.join(folder, "*.xml")))


def collect_morphology_values(xml_paths):
    """Collect the morphology value definitions from the given XML files.

    For every file, the <field> elements below <annotation>/<morphology>
    are read. Each <value> child contributes its "tag" attribute (the
    abbreviated code) and its "summary" attribute (the long form).

    Args:
        xml_paths: Iterable of paths to XML files, processed in the
            given order.

    Returns:
        A defaultdict(dict) mapping each field tag (e.g. "number") to a
        dict of code -> summary, e.g. {'s': 'singular', 'd': 'dual', ...}.
        If the same code appears with different summaries, the first one
        seen is kept and a warning is printed.

    Files that cannot be parsed are reported and skipped. Files without an
    <annotation>/<morphology> block, fields without a "tag", and values
    missing "tag" or "summary" are skipped silently.
    """
    collected = defaultdict(dict)

    for xml_file in xml_paths:
        try:
            root = ET.parse(xml_file).getroot()
        except ET.ParseError as e:
            print(f"Fehler beim Parsen von {xml_file}: {e}")
            continue

        # Locate the <morphology> element inside the <annotation> element
        annotation = root.find('annotation')
        if annotation is None:
            continue
        morphology = annotation.find('morphology')
        if morphology is None:
            continue

        # Iterate over all <field> elements in the <morphology> block
        for field in morphology.findall('field'):
            field_tag = field.get('tag')
            if not field_tag:
                continue

            # Iterate over all <value> elements in the field
            for value in field.findall('value'):
                code = value.get('tag')
                summary = value.get('summary')
                if not (code and summary):
                    continue

                # Only touch collected[field_tag] for usable values, so that
                # fields without any valid value do not create empty entries.
                known = collected[field_tag]
                if code not in known:
                    known[code] = summary
                elif known[code] != summary:
                    # Conflict: keep the first summary and print a warning
                    print(f"Warning: In Field '{field_tag}' -> for Code '{code}' -> there are different translations available: "
                          f"'{known[code]}' vs. '{summary}'")

    return collected


# Make a wrong or empty input folder visible instead of silently
# printing an empty report.
if not xml_files:
    print(f"Warning: no XML files found in '{folder}'")

# Dictionary storing another dictionary for each field (e.g. "person", "number" etc.)
# Each field collects key-value pairs where the key is the abbreviated form used in the XML
# and the value is the "long form" or "meaning" of the key,
# e.g. number: {'s': 'singular', 'd': 'dual', ...}
morphology_dict = collect_morphology_values(xml_files)

# Print all categories as "category: {'abbreviation': 'meaning', ...}"
print("Collected morphology data:")
for field, mapping in morphology_dict.items():
    print(f"{field}: {mapping}\n")

Collected morphology data:
person: {'1': 'first person', '2': 'second person', '3': 'third person', 'x': 'uncertain person'}

number: {'s': 'singular', 'd': 'dual', 'p': 'plural', 'x': 'uncertain number'}

tense: {'p': 'present', 'i': 'imperfect', 'r': 'perfect', 's': 'resultative', 'a': 'aorist', 'u': 'past', 'l': 'pluperfect', 'f': 'future', 't': 'future perfect', 'x': 'uncertain tense'}

mood: {'i': 'indicative', 's': 'subjunctive', 'm': 'imperative', 'o': 'optative', 'n': 'infinitive', 'p': 'participle', 'd': 'gerund', 'g': 'gerundive', 'u': 'supine', 'x': 'uncertain mood', 'y': 'finiteness unspecified', 'e': 'indicative or subjunctive', 'f': 'indicative or imperative', 'h': 'subjunctive or imperative', 't': 'finite'}

voice: {'a': 'active', 'm': 'middle', 'p': 'passive', 'e': 'middle or passive', 'x': 'unspecified'}

gender: {'m': 'masculine', 'f': 'feminine', 'n': 'neuter', 'p': 'masculine or feminine', 'o': 'masculine or neuter', 'r': 'feminine or neuter', 'q': 'masculine, femin