# Prepare Šolar corpus sentence data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

from utils.solar_enum import Solar
from utils.logging import get_logger
from solar_prepare_text import get_solar_word_text
from solar_prepare_data import read_solar, SOLAR_DIRECTORY, SOLAR_ID, SOLAR_TAG_SEG


In [None]:
# Module-level logger, obtained from utils.logging.get_logger; the functions
# below use it to report an invalid `text` argument and a finished csv export.
solar_sentence_logger = get_logger("Prepare Solar Corpus Sentence Data")


In [None]:
def get_solar_data_sentence(text=Solar.COMPLETE):
    """
    Gets the solar corpus data and returns a data frame of sentences,
    corresponding sentence ids and word ids.

    text = COMPLETE: complete text
    text = SOURCE: source text
    text = TARGET: target text

    Any other value is reported with a warning and handled like COMPLETE.

    Each row of the result describes one sentence (s element):
    - "id": the SOLAR_ID attribute of the sentence,
    - "sentence": the concatenated output of get_solar_word_text for every
      token of the sentence,
    - "word_id": the SOLAR_ID attributes of those tokens, separated by a
      single space.

    @param text: corpus type (complete, source or target)
    @return: data frame of sentences and corresponding sentence ids and word ids
    """
    if text == Solar.SOURCE:
        index_text = [0]  # source text only
    elif text == Solar.TARGET:
        index_text = [1]  # target text only
    else:
        index_text = [0, 1]  # Solar.COMPLETE: source and target text
        if not text == Solar.COMPLETE:
            # Best effort: warn and fall back to the complete corpus
            solar_sentence_logger.warning("Invalid text parameter")

    # Read solar corpus data
    data_xml = read_solar()
    # NOTE(review): assumes the nodes returned by read_solar() can be iterated
    # and sliced over their children (as ElementTree / lxml elements can) -
    # confirm against read_solar.
    text_nodes = data_xml[1][0]
    data = []

    # Loop through the list of text
    for index_i in index_text:
        # Loop through the list of div
        for div in text_nodes[index_i][0]:
            # Loop through the list of p (skip bibl, the first child)
            for paragraph in div[1:]:
                # Loop through the list of s
                for sentence in paragraph:
                    sentence_id = sentence.get(SOLAR_ID)

                    # w, pc and seg children; the last child (linkGrp) is skipped
                    tokens = sentence[:-1]
                    last_index = len(tokens) - 1

                    text_parts = []
                    word_ids = []

                    for index_m, token in enumerate(tokens):
                        if token.tag == SOLAR_TAG_SEG:
                            # Loop through the list of w, pc in seg
                            for word in token:
                                # NOTE(review): as in the original code, no
                                # end-of-sentence flag is passed for words
                                # inside a seg - confirm that the default of
                                # get_solar_word_text is intended here.
                                text_parts.append(get_solar_word_text(word))
                                word_ids.append(word.get(SOLAR_ID))
                        else:
                            # True when this is the last token of the sentence
                            space_flag = index_m == last_index
                            text_parts.append(get_solar_word_text(token, space_flag))
                            word_ids.append(token.get(SOLAR_ID))

                    # Joining the collected ids puts exactly one space between
                    # neighbours. Appending a separator per id dropped it after
                    # the last word of a seg, so that id ran into the next one
                    # whenever the seg was not the final token of the sentence.
                    data.append([sentence_id, "".join(text_parts), " ".join(word_ids)])

    df_columns = ["id", "sentence", "word_id"]
    return pd.DataFrame(data, columns=df_columns)


In [None]:
def save_solar_sentence():
    """
    Builds the complete, source and target sentence data frames and writes
    each of them to its own csv file in SOLAR_DIRECTORY (without the index).
    """
    # Corpus type paired with the csv file name it is written to
    exports = (
        (Solar.COMPLETE, "solar_complete_sentence.csv"),
        (Solar.SOURCE, "solar_source_sentence.csv"),
        (Solar.TARGET, "solar_target_sentence.csv"),
    )

    # Build every data frame first, so nothing is written if one of them fails
    frames = [
        (get_solar_data_sentence(text=corpus_type), file_name)
        for corpus_type, file_name in exports
    ]

    for frame, file_name in frames:
        frame.to_csv(SOLAR_DIRECTORY + file_name, index=False)

    solar_sentence_logger.info("Solar data saved to csv file")
