# Prepare Šolar corpus text data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from utils.solar_enum import Solar
from utils.logging import get_logger
from solar_prepare_data import read_solar, SOLAR_DIRECTORY, SOLAR_TAG_SEG


In [None]:
# Logger used by the functions below for warnings and status messages; the
# string is the name handed to the project's get_logger helper.
solar_text_logger = get_logger("Prepare Solar Corpus Text Data")


In [None]:
def get_solar_word_text(data_xml, space_flag=False):
    """
    Returns the text of a single solar token, optionally followed by a space.

    A trailing space is appended unless the element carries join="right"
    or space_flag is set. An element without text content contributes an
    empty string instead of raising.

    @param data_xml: the token element; must expose .text and .get(name)
    @param space_flag: when True, the trailing space is never appended
    @return: the token text, with or without one trailing space
    """
    # .text is None for an element with no text content; fall back to ""
    # so the concatenation below cannot fail with a TypeError.
    word_text = data_xml.text or ""

    # Tokens joined to their right neighbour (or an explicit caller request)
    # are emitted without the separating space.
    if space_flag or data_xml.get("join") == "right":
        return word_text

    return word_text + " "


In [None]:
def get_solar_text(text=Solar.COMPLETE):
    """
    Gets the solar corpus data and returns it as a list of text strings.

    text = COMPLETE: source text followed by target text (two strings)
    text = SOURCE: source text only (one string)
    text = TARGET: target text only (one string)

    Any other value logs a warning and is treated like COMPLETE.

    Within each string, every paragraph is terminated by a newline and
    every div by one additional newline.

    @param text: corpus type (complete, source or target)
    @return: list with one string per selected text
    """
    index_text = [0, 1]  # default index_text - Solar.COMPLETE
    if text == Solar.SOURCE:
        index_text = [0]  # set index_text to Solar.SOURCE
    elif text == Solar.TARGET:
        index_text = [1]  # set index_text to Solar.TARGET
    elif not text == Solar.COMPLETE:
        solar_text_logger.warning("Invalid text parameter")

    # Read solar corpus data
    data_xml = read_solar()

    # Element that holds the source (child 0) and target (child 1) texts
    texts_xml = data_xml[1][0]

    data = []
    for index_i in index_text:
        # Collect the pieces and join once instead of growing a string
        parts = []

        # Loop through the list of div
        for div_xml in texts_xml[index_i][0]:
            # Loop through the list of p (first child is skipped - bibl)
            for p_xml in list(div_xml)[1:]:
                # Loop through the list of s
                for s_xml in p_xml:
                    # Loop through w, pc, seg (last child is skipped - linkGrp)
                    for token_xml in list(s_xml)[:-1]:
                        if token_xml.tag == SOLAR_TAG_SEG:
                            # A seg wraps further w, pc elements
                            parts.extend(
                                get_solar_word_text(inner_xml)
                                for inner_xml in token_xml
                            )
                        else:
                            parts.append(get_solar_word_text(token_xml))

                parts.append("\n")  # end of paragraph
            parts.append("\n")  # end of div

        data.append("".join(parts))

    return data


In [None]:
def save_solar_text():
    """
    Saves the solar corpus text as txt files.

    Writes three files under SOLAR_DIRECTORY:
    solar_source_text.txt, solar_target_text.txt and
    solar_complete_text.txt (source text followed by target text).
    Files are written as UTF-8 so the output does not depend on the
    platform's default encoding.
    """
    # Get complete text: data[0] is the source text, data[1] the target text
    data = get_solar_text(text=Solar.COMPLETE)

    # Write data in source and target files
    for index, filename in enumerate(
        ["solar_source_text.txt", "solar_target_text.txt"]
    ):
        # The with-statement closes the file; no explicit close() is needed
        with open(SOLAR_DIRECTORY + filename, "w", encoding="utf-8") as file:
            file.write(data[index])

    # Write data in solar file
    with open(
        SOLAR_DIRECTORY + "solar_complete_text.txt", "w", encoding="utf-8"
    ) as file:
        file.writelines(data)

    solar_text_logger.info("Solar data saved to txt file")
