# Format Šolar corpus data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging
import xml.etree.ElementTree as ET

from utils.logging import get_logger
from solar_prepare_data import SOLAR_DIRECTORY, SOLAR_FILE


In [None]:
# Logger for this notebook, obtained from the project's get_logger helper
# under the name "Format Solar Corpus Data".
# NOTE(review): the logger is not referenced anywhere else in the visible code.
solar_data_logger = get_logger("Format Solar Corpus Data")


In [None]:
# Locations of the Šolar 3.0 TEI input files.
# SOLAR_SOURCE_FILE is the sub-directory (relative to SOLAR_DIRECTORY) that
# holds the TEI files; the remaining constants are full paths built from it.
SOLAR_SOURCE_FILE = "Developmental corpus Šolar 3.0/Solar.TEI/"
# Main document into which the other three files are merged
SOLAR_FILE_META = f"{SOLAR_DIRECTORY}{SOLAR_SOURCE_FILE}solar.xml"
# Original texts (presumably the uncorrected student writing -- confirm)
SOLAR_FILE_SOURCE = f"{SOLAR_DIRECTORY}{SOLAR_SOURCE_FILE}solar-orig.xml"
# Corrected texts
SOLAR_FILE_TARGET = f"{SOLAR_DIRECTORY}{SOLAR_SOURCE_FILE}solar-corr.xml"
# Error annotations linking the original and corrected texts
SOLAR_FILE_LINK = f"{SOLAR_DIRECTORY}{SOLAR_SOURCE_FILE}solar-errs.xml"


In [None]:
def main():
    """
    Merge the separate Šolar TEI XML files into one combined document.

    Parses the metadata, original-text, corrected-text and error-link XML
    files, places the roots of the latter three at fixed child positions of
    the metadata tree, and writes the combined tree to ``SOLAR_FILE``.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
        xml.etree.ElementTree.ParseError: If an input file is not
            well-formed XML.
        IndexError: If the metadata document does not have children at the
            positions that are replaced below.
    """
    # Parse the four input documents (metadata first, as it is the tree
    # that is eventually written back out)
    element_tree_meta = ET.parse(SOLAR_FILE_META)
    root_meta = element_tree_meta.getroot()
    root_source = ET.parse(SOLAR_FILE_SOURCE).getroot()
    root_target = ET.parse(SOLAR_FILE_TARGET).getroot()
    root_link = ET.parse(SOLAR_FILE_LINK).getroot()

    # Replace the placeholder children of the metadata document with the
    # actual content. NOTE(review): the positions are hard-coded and assume
    # the layout of solar.xml (include tags at [1][0][0], [1][0][1] and [2]).
    root_meta[1][0][0] = root_source
    root_meta[1][0][1] = root_target
    root_meta[2] = root_link

    # Serialize TEI elements without a prefix (instead of "ns0:").
    # register_namespace() returns None and works through its side effect on
    # the global prefix registry, so it must be called on its own rather
    # than passed as the ``default_namespace`` argument of write().
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Save the combined xml file
    element_tree_meta.write(
        SOLAR_FILE,
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
    )


In [None]:
# Run the Solar data formatting: main() merges the corpus XML files and
# writes the combined document to SOLAR_FILE.
main()
