# Prepare Lektor corpus data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

from utils.reading import read_xml
from utils.logging import get_logger
from utils.lektor_enum import Lektor


In [None]:
# Logger shared by every function in this notebook; created through the
# project's get_logger helper under the name "Prepare Lektor Corpus Data".
lektor_data_logger = get_logger("Prepare Lektor Corpus Data")


In [None]:
# Constants
# File locations. The paths are relative, so they resolve against the
# current working directory.
LEKTOR_DIRECTORY = "../../data/lektor/"  # directory the generated csv files are written to
LEKTOR_FILE = "../../data/lektor/lektor_complete.xml"  # complete lektor data
MSD_INDEX_FILE = "../../slovene_specification/MSD_index.npy"  # MSD index

# Lektor Text Encoding Initiative (TEI) tag names
LEKTOR_TAG_ROOT = "root"
LEKTOR_TAG_HEAD = "head"  # header of a written work (id, izvor, ...)
LEKTOR_TAG_TEXT = "text"  # written work
LEKTOR_TAG_ID = "id"
LEKTOR_TAG_IZVOR = "izvor"
LEKTOR_TAG_SPOL_AVTORJA = "spol-avtorja"
LEKTOR_TAG_IZOBRAZBA_AVTORJA = "izobrazba-avtorja"
LEKTOR_TAG_STOPNJA_IZOBRAZBE = "stopnja-izobrazbe"
# NOTE(review): the identifier says IZBORNI while the value is
# "izvorni-jezik" -- looks like a typo for IZVORNI. Not renamed here because
# other code may reference the constant by its current name.
LEKTOR_TAG_IZBORNI_JEZIK = "izvorni-jezik"
LEKTOR_TAG_SPOL_LEKTORJA = "spol-lektorja"
LEKTOR_TAG_STAROST_LEKTORJA = "starost-lektorja"
LEKTOR_TAG_IZOBRAZBA_LEKTORJA = "izobrazba-lektorja"
LEKTOR_TAG_P = "p"  # paragraph
LEKTOR_TAG_S = "S"  # space (upper-case, unlike the other tag names)
LEKTOR_TAG_S0 = "s0"  # end of sentence
LEKTOR_TAG_S1 = "s1"  # beginning of sentence
LEKTOR_TAG_W = "w"  # word
LEKTOR_TAG_C = "c"  # punctuation
LEKTOR_TAG_LEKT = "lekt"  # correction (del + ins)
LEKTOR_TAG_DEL = "del"  # presumably the source (uncorrected) side of a lekt -- confirm
LEKTOR_TAG_INS = "ins"  # presumably the target (corrected) side of a lekt -- confirm

# Lektor attribute name for identifying the lektor text
# Lektor id: lektor{#div}{s-t}.{#p}.{#s}.{#w}
LEKTOR_ID = "LEKTOR_ID"


In [None]:
def read_lektor():
    """
    Loads the complete lektor corpus from LEKTOR_FILE and logs that it was
    read.

    Corpus layout as used by the functions in this file:
    -> head[30 = # of written works] -> id + izvor + spol-avtorja +
        + izobrazba-avtorja + stopnja-izobrazbe + izvorni-jezik +
        + spol-lektorja + starost-lektorja + izobrazba-lektorja
    -> text[30 = # of written works] -> p[...] -> w[...] + c[...]

    text = written work
    p = paragraph
    S = space
    w = word
    c = punctuation
    lekt = correction (del + ins)

    @return: the parsed corpus exactly as returned by read_xml (the callers
        index it positionally and read .tag / .text, so presumably an
        ElementTree-like root element -- confirm against utils.reading)
    """
    lektor_xml = read_xml(LEKTOR_FILE)
    lektor_data_logger.info("Lektor data read")
    return lektor_xml


In [None]:
def get_lektor_lekt_data(data_xml, text=Lektor.SOURCE):
    """
    Flattens one lekt (correction) element into plain text and collects the
    error types found in it.

    Child 0 of the element is read for the source text and child 1 for the
    target text. Words and punctuation contribute their text, space elements
    contribute a single " ", and nested lekt elements are resolved
    recursively on the same side.

    Error types are only reported for the source side: the element's own
    "tip" attribute comes last, and the error types of nested corrections are
    prepended to it, separated by ";". For the target side the error string
    is always empty.

    @param data_xml: lekt element of the lektor data in xml format
    @param text: lektor text (Lektor.SOURCE or Lektor.TARGET); any other
        value logs a warning and is handled like Lektor.SOURCE
    @return: tuple of (text of the selected side, error types)
    """
    index_text = 0
    if text == Lektor.TARGET:
        index_text = 1
    elif text != Lektor.SOURCE:
        lektor_data_logger.warning("Invalid text parameter")

    # A lekt element without a "tip" attribute makes .get() return None,
    # which would break the string concatenation below and in the callers;
    # treat a missing attribute as "no error type".
    sentence_error = "" if index_text else (data_xml.get("tip") or "")

    parts = []
    # Walk the w, c, S and lekt children of the selected side
    for child in data_xml[index_text]:
        tag = child.tag
        if tag == LEKTOR_TAG_W or tag == LEKTOR_TAG_C:
            # Words and punctuation; an element without text adds nothing
            parts.append(child.text or "")

        elif tag == LEKTOR_TAG_S:
            # Spaces
            parts.append(" ")

        elif LEKTOR_TAG_LEKT in tag:
            # Nested correction: resolve it recursively on the same side
            nested_text, nested_error = get_lektor_lekt_data(child, text)
            parts.append(nested_text)
            # Nested error types go in front of what was collected so far
            sentence_error = (
                nested_error + (";" if nested_error else "") + sentence_error
            )

    return "".join(parts), sentence_error


In [None]:
def get_lektor_data_multiple_error():
    """
    Builds the sentence table in which a sentence keeps all of its errors.

    The corpus is read with read_lektor and every paragraph of every written
    work is walked token by token. For each sentence one row with the source
    text and its error types is produced; the error types are concatenated
    and each one is followed by ";". If the sentence has any error, a second
    row with the same id, the target (corrected) text and an empty error
    column follows.

    Sentence ids have the form "{work}.{paragraph}.{sentence}", all counted
    from 1; the sentence number restarts in every paragraph.

    @return: data frame with the columns id, sentence and error
    """
    corpus = read_lektor()

    rows = []
    current_id = ""
    source_text = ""  # sentence with the source side of every correction
    target_text = ""  # same sentence with the target side instead
    errors = ""

    # Written works sit at the odd positions of the corpus (heads are skipped)
    for text_pos in range(1, len(corpus), 2):
        for paragraph_pos, paragraph in enumerate(corpus[text_pos]):
            sentence_counter = 0  # numbering restarts in each paragraph
            for token in paragraph:
                tag = token.tag

                if tag == LEKTOR_TAG_S1:
                    # A new sentence begins: flush the one collected so far
                    rows.append([current_id, source_text, errors])
                    if errors != "":
                        rows.append([current_id, target_text, ""])

                    sentence_counter += 1
                    current_id = "{}.{}.{}".format(
                        text_pos // 2 + 1, paragraph_pos + 1, sentence_counter
                    )
                    source_text = ""
                    target_text = ""
                    errors = ""
                    lektor_data_logger.info("Processing sentence: " + current_id)

                elif tag in (LEKTOR_TAG_W, LEKTOR_TAG_C):
                    # Words and punctuation are shared by both versions
                    piece = token.text if token.text else ""
                    source_text += piece
                    target_text += piece

                elif tag == LEKTOR_TAG_S:
                    # Spaces
                    source_text += " "
                    target_text += " "

                elif LEKTOR_TAG_LEKT in tag:
                    # Correction: source side plus error types for the
                    # erroneous sentence, target side for the correct one
                    lekt_source, lekt_error = get_lektor_lekt_data(
                        token, Lektor.SOURCE
                    )
                    source_text += lekt_source
                    errors += lekt_error + (";" if len(lekt_error) else "")
                    target_text += get_lektor_lekt_data(token, Lektor.TARGET)[0]

    # Flush the last sentence
    rows.append([current_id, source_text, errors])
    if errors != "":
        rows.append([current_id, target_text, ""])

    # The very first flush emitted the empty placeholder row; drop it
    return pd.DataFrame(rows[1:], columns=["id", "sentence", "error"])


In [None]:
def save_lektor_data_multiple_error():
    """
    Generates the multiple-error sentence table and writes it, without the
    index column, to "lektor_data_multiple_error_raw.csv" inside
    LEKTOR_DIRECTORY.
    """
    frame = get_lektor_data_multiple_error()
    output_path = LEKTOR_DIRECTORY + "lektor_data_multiple_error_raw.csv"

    frame.to_csv(output_path, index=False)
    lektor_data_logger.info("Lektor data with multiple errors saved to csv file")


In [None]:
def get_lektor_data_no_error(data_xml, text=Lektor.SOURCE):
    """
    Expands one lekt (correction) element into text variants and returns
    them together with their error types as two parallel lists.

    Child 0 of the element is read for the source text and child 1 for the
    target text. Entry 0 of the result is the selected side with every
    nested lekt replaced by its target text; its error type is the element's
    own "tip" attribute for the source side and "" for the target side.

    Every nested lekt adds further entries, one per variant of its source
    side: the text that preceded the nested lekt in entry 0, followed by
    that source variant. Tokens that come later are appended to all entries.
    The error types of such an entry are the nested ones followed by ";" and
    the error type of entry 0.

    @param data_xml: lekt element of the lektor data xml
    @param text: corpus type (Lektor.SOURCE or Lektor.TARGET); any other
        value logs a warning and is handled like Lektor.SOURCE
    @return: tuple of (list of text variants, list of their error types)
    """
    index_text = 0
    if text == Lektor.TARGET:
        index_text = 1
    elif text != Lektor.SOURCE:
        lektor_data_logger.warning("Invalid text parameter")

    sentence = [""]
    # A lekt element without a "tip" attribute makes .get() return None,
    # which would break the string concatenation below and in the callers;
    # treat a missing attribute as "no error type".
    sentence_error = ["" if index_text else (data_xml.get("tip") or "")]

    # Walk the w, c, S and lekt children of the selected side
    for child in data_xml[index_text]:
        tag = child.tag
        if tag == LEKTOR_TAG_W or tag == LEKTOR_TAG_C:
            # Words and punctuation extend every variant
            token_text = child.text or ""
            sentence = [variant + token_text for variant in sentence]

        elif tag == LEKTOR_TAG_S:
            # Spaces extend every variant
            sentence = [variant + " " for variant in sentence]

        elif LEKTOR_TAG_LEKT in tag:
            # Nested correction. Remember entry 0 as it was before this
            # lekt; the new variants are built on top of it.
            prefix = sentence[0]
            prefix_error = sentence_error[0]

            # Every existing variant continues with the target text
            target_variants, _ = get_lektor_data_no_error(child, Lektor.TARGET)
            sentence = [variant + target_variants[0] for variant in sentence]

            # One additional variant per source variant of the nested lekt
            source_variants, source_errors = get_lektor_data_no_error(
                child, Lektor.SOURCE
            )
            for source_text, source_error in zip(source_variants, source_errors):
                sentence.append(prefix + source_text)
                sentence_error.append(
                    source_error + (";" if source_error else "") + prefix_error
                )

    return sentence, sentence_error


In [None]:
def get_lektor_data_single_error():
    """
    Builds the sentence table in which every erroneous sentence variant
    keeps a single correction in its source form.

    The corpus is read with read_lektor and every paragraph of every written
    work is walked token by token. For each sentence a list of variants is
    kept: variant 0 uses the target text of every correction and has an
    empty error column. Each correction adds further variants that consist
    of variant 0 up to that point followed by one source variant of the
    correction, as delivered by get_lektor_data_no_error; their error types
    are each followed by ";". All variants of a sentence share one id.

    Sentence ids have the form "{work}.{paragraph}.{sentence}", all counted
    from 1; the sentence number restarts in every paragraph.

    @return: a data frame with the columns id, sentence and error
    """
    corpus = read_lektor()

    rows = []
    current_id = ""
    variants = [""]
    variant_errors = [""]

    # Written works sit at the odd positions of the corpus (heads are skipped)
    for text_pos in range(1, len(corpus), 2):
        for paragraph_pos, paragraph in enumerate(corpus[text_pos]):
            sentence_counter = 0  # numbering restarts in each paragraph
            for token in paragraph:
                tag = token.tag

                if tag == LEKTOR_TAG_S1:
                    # A new sentence begins: flush all collected variants
                    rows.extend(
                        [current_id, variant, error]
                        for variant, error in zip(variants, variant_errors)
                    )
                    sentence_counter += 1
                    current_id = "{}.{}.{}".format(
                        text_pos // 2 + 1, paragraph_pos + 1, sentence_counter
                    )
                    variants = [""]
                    variant_errors = [""]
                    lektor_data_logger.info("Processing sentence: " + current_id)

                elif tag in (LEKTOR_TAG_W, LEKTOR_TAG_C):
                    # Words and punctuation extend every variant
                    piece = token.text if token.text else ""
                    variants = [variant + piece for variant in variants]

                elif tag == LEKTOR_TAG_S:
                    # Spaces extend every variant
                    variants = [variant + " " for variant in variants]

                elif LEKTOR_TAG_LEKT in tag:
                    # Correction. Remember variant 0 as it was before this
                    # lekt; the new variants are built on top of it.
                    prefix = variants[0]
                    prefix_error = variant_errors[0]

                    # Every existing variant continues with the target text
                    target_variants, _ = get_lektor_data_no_error(
                        token, Lektor.TARGET
                    )
                    variants = [
                        variant + target_variants[0] for variant in variants
                    ]

                    # One new variant per source variant, last one first
                    source_variants, source_errors = get_lektor_data_no_error(
                        token, Lektor.SOURCE
                    )
                    for source_text, source_error in zip(
                        reversed(source_variants), reversed(source_errors)
                    ):
                        variants.append(prefix + source_text)
                        variant_errors.append(
                            prefix_error
                            + source_error
                            + (";" if source_error else "")
                        )

    # Flush the variants of the last sentence
    rows.extend(
        [current_id, variant, error]
        for variant, error in zip(variants, variant_errors)
    )

    # The very first flush emitted the empty placeholder row; drop it
    return pd.DataFrame(rows[1:], columns=["id", "sentence", "error"])


In [None]:
def save_lektor_data_single_error():
    """
    Generates the single-error sentence table and writes it, without the
    index column, to "lektor_data_single_error_raw.csv" inside
    LEKTOR_DIRECTORY.
    """
    frame = get_lektor_data_single_error()
    output_path = LEKTOR_DIRECTORY + "lektor_data_single_error_raw.csv"

    frame.to_csv(output_path, index=False)
    lektor_data_logger.info("Lektor data with single errors saved to csv file")
