# Prepare Šolar corpus token data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

from utils.solar_enum import Solar
from utils.logging import get_logger
from solar_prepare_data import (
    read_solar,
    SOLAR_DIRECTORY,
    SOLAR_ID,
    SOLAR_TAG_SEG,
    MSD_INDEX_FILE,
)


In [None]:
# Module-level logger used by the token preparation functions in this file
solar_token_logger = get_logger("Prepare Solar Corpus Token Data")


In [None]:
# Cache of loaded msd index arrays, keyed by index file path, so the index
# file is read from disk once per path instead of once per token.
_MSD_INDEX_CACHE = {}


def _load_msd_index(index_file):
    """
    Returns the msd index array stored in the given file, loading it only once per path.

    @param index_file: path of the msd index file
    @return: the array returned by np.load for that file
    """
    if index_file not in _MSD_INDEX_CACHE:
        # NOTE(review): allow_pickle=True can execute code embedded in a crafted
        # file, so the index file must come from a trusted source.
        _MSD_INDEX_CACHE[index_file] = np.load(index_file, allow_pickle=True)
    return _MSD_INDEX_CACHE[index_file]


def get_solar_word_token(data_xml, space_flag=False):
    """
    Returns the word data of a single solar token element.

    [id, text, lemma, msd (sl), msd (en), space]

    The space indicator is True unless the element has join="right" and
    space_flag is False.

    @param data_xml: token element of the solar data in xml format
    @param space_flag: when True, the space indicator is True regardless of
        the "join" attribute of the element
    @return: a list of word id, word text, lemma, msd in sl and en and space indicator
    @raise IndexError: if the msd (sl) tag of the element is not in the msd index
    """
    # Load msd index from slovene specification (cached after the first call)
    msd_index = _load_msd_index(MSD_INDEX_FILE)

    word_id = data_xml.get(SOLAR_ID)
    word_text = data_xml.text
    word_lemma = data_xml.get("lemma")
    # The msd (sl) tag is the part of the "ana" attribute after the last ":"
    word_msd_sl = data_xml.get("ana").split(":")[-1]

    # Column 1 of the index holds the msd (sl) tag, column 0 the msd (en) tag;
    # the first matching row wins
    matches = msd_index[msd_index[:, 1] == word_msd_sl]
    if len(matches) == 0:
        raise IndexError(
            "msd tag {!r} of token {!r} not found in {}".format(
                word_msd_sl, word_id, MSD_INDEX_FILE
            )
        )
    word_msd_en = matches[0, 0]

    word_space = data_xml.get("join") != "right" or space_flag

    return [word_id, word_text, word_lemma, word_msd_sl, word_msd_en, word_space]


In [None]:
def _iter_sentence_tokens(sentence):
    """
    Yields (token element, space_flag) pairs for the tokens of one sentence.

    The last child of the sentence is skipped (linkGrp). Children tagged
    SOLAR_TAG_SEG are expanded into their own children. space_flag is True
    for the last token of a seg and for the last non-skipped direct child
    of the sentence.

    @param sentence: sentence (s) element of the solar data
    """
    # Skip the trailing linkGrp
    children = list(sentence)[:-1]
    last_child = len(children) - 1

    for child_pos, child in enumerate(children):
        if child.tag == SOLAR_TAG_SEG:
            # Loop through the list of w, pc in seg
            last_in_seg = len(child) - 1
            for seg_pos, seg_token in enumerate(child):
                yield seg_token, seg_pos == last_in_seg
        else:
            yield child, child_pos == last_child


def get_solar_token(text=Solar.COMPLETE):
    """
    Gets the solar corpus data and returns a data frame of words, lemmas and msd.

    text = COMPLETE: complete text
    text = SOURCE: source text
    text = TARGET: target text

    Any other value logs a warning and is handled like COMPLETE.

    @param text: corpus type (complete, source or target)
    @return: a data frame with one row per token and the columns
        id, text, lemma, msd (sl), msd (en), space
    """
    index_text = [0, 1]  # default index_text - Solar.COMPLETE
    if text == Solar.SOURCE:
        index_text = [0]  # set index_text to Solar.SOURCE
    elif text == Solar.TARGET:
        index_text = [1]  # set index_text to Solar.TARGET
    elif not text == Solar.COMPLETE:
        solar_token_logger.warning("Invalid text parameter")

    # Read solar corpus data
    data_xml = read_solar()
    data = []

    # Loop through the list of text
    for index_i in index_text:
        # Loop through the list of div
        for div in data_xml[1][0][index_i][0]:
            # Loop through the list of p (skip bibl)
            for paragraph in list(div)[1:]:
                # Loop through the list of s
                for sentence in paragraph:
                    data.extend(
                        get_solar_word_token(token, space_flag)
                        for token, space_flag in _iter_sentence_tokens(sentence)
                    )

    df_columns = ["id", "text", "lemma", "msd (sl)", "msd (en)", "space"]
    return pd.DataFrame(data, columns=df_columns)


In [None]:
def save_solar_token():
    """
    Writes the complete, source and target solar corpus tokens to csv files.

    All three data frames are built first; the csv files are then written
    into SOLAR_DIRECTORY without the data frame index.
    """
    outputs = (
        (Solar.COMPLETE, "solar_complete_token.csv"),
        (Solar.SOURCE, "solar_source_token.csv"),
        (Solar.TARGET, "solar_target_token.csv"),
    )

    # Build every data frame before writing any file
    frames = [
        (get_solar_token(text=corpus_type), file_name)
        for corpus_type, file_name in outputs
    ]

    for frame, file_name in frames:
        frame.to_csv(SOLAR_DIRECTORY + file_name, index=False)

    solar_token_logger.info("Solar tokens saved to csv file")
