# Prepare Lektor corpus token data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

from utils.logging import get_logger
from utils.lektor_enum import Lektor
from lektor_prepare_data import (
    read_lektor,
    LEKTOR_DIRECTORY,
    LEKTOR_TAG_W,
    LEKTOR_TAG_C,
    LEKTOR_TAG_LEKT,
    LEKTOR_TAG_S1,
    LEKTOR_TAG_S,
    MSD_INDEX_FILE,
)


In [None]:
# Module-level logger used by the token-preparation functions in this file
# (warnings about invalid `text` arguments and the final "saved" message).
lektor_token_logger = get_logger("Prepare Lektor Corpus Token Data")


In [None]:
def _load_msd_index():
    """
    Returns the MSD index array, reading MSD_INDEX_FILE only on the first call.

    The loaded array is kept as an attribute of this function so the file is not read
    from disk again for every word token.

    @return: array loaded from MSD_INDEX_FILE
    """
    msd_index = getattr(_load_msd_index, "cache", None)
    if msd_index is None:
        # NOTE(review): allow_pickle=True lets numpy unpickle arbitrary objects, so
        # MSD_INDEX_FILE must come from a trusted source.
        msd_index = np.load(MSD_INDEX_FILE, allow_pickle=True)
        _load_msd_index.cache = msd_index
    return msd_index


def get_lektor_word_token(
    data_xml,
    text_id,
    paragraph_id,
    sentence_id,
    word_id,
    space_flag=False,
    text=Lektor.SOURCE,
):
    """
    Appends the token data of one lektor element to the global token list.

    Each stored row has the form
    [id, text, lemma, msd (sl), msd (en), space_flag]

    Handling depends on the element tag:
    - LEKTOR_TAG_W: word; lemma and msd are read from the attributes and the english msd
      is looked up in the MSD index (None when there is no match). A word element
      without any attributes is skipped.
    - LEKTOR_TAG_C: punctuation; the lemma is the element text, msd is "U" / "Z".
    - tag containing LEKTOR_TAG_LEKT: the function recurses into the children of child 0
      (source) or child 1 (target).
    - any other tag: nothing is stored.

    @param data_xml: lektor data in xml format (a single element)
    @param text_id: text/article id
    @param paragraph_id: zero-based paragraph index (1 is added when building the id)
    @param sentence_id: sentence id
    @param word_id: zero-based word counter (1 is added when building the id)
    @param space_flag: value stored in the row; callers pass True when the next sibling
        element has the tag LEKTOR_TAG_S
    @param text: corpus type (source or target)
    @return: nothing - function is called recursively and the data is stored in the global variables
    """
    global lektor_data_global  # global data
    global lektor_counter_global  # global counter

    # Add 1 to the paragraph_id and word_id because the id should start with 1.
    # A separate name is used so the integer word_id parameter is not overwritten.
    token_id = "lektor{:n}{}.{:n}.{:n}.{:n}".format(
        text_id,
        "t" if text == Lektor.TARGET else "s",
        paragraph_id + 1,
        sentence_id,
        word_id + 1,
    )
    word_text = data_xml.text

    if data_xml.tag == LEKTOR_TAG_W:
        # Special case for words: skip word elements that carry no attributes
        if not len(data_xml.items()):
            return

        msd_index = _load_msd_index()

        word_lemma = data_xml.get("lemma")
        word_msd_sl = data_xml.get("msd")
        # Rows whose second column equals the slovene msd; the first column of the
        # first match is used as the english msd
        matching_rows = msd_index[msd_index[:, 1] == word_msd_sl]
        # MSD for english does not exist
        word_msd_en = matching_rows[0, 0] if len(matching_rows) else None

        lektor_data_global.append(
            [token_id, word_text, word_lemma, word_msd_sl, word_msd_en, space_flag]
        )
        lektor_counter_global += 1
    elif data_xml.tag == LEKTOR_TAG_C:
        # Special case is punctuation
        word_lemma = data_xml.text
        word_msd_sl = "U"
        word_msd_en = "Z"
        lektor_data_global.append(
            [token_id, word_text, word_lemma, word_msd_sl, word_msd_en, space_flag]
        )
        lektor_counter_global += 1
    elif LEKTOR_TAG_LEKT in data_xml.tag:
        # Child 0 is used for the source text, child 1 for the target text.
        # An invalid text value is reported and then treated like the source.
        index_text = 0
        if text == Lektor.TARGET:
            index_text = 1
        elif not text == Lektor.SOURCE:
            lektor_token_logger.warning("Invalid text parameter")

        children = data_xml[index_text]
        child_count = len(children)

        # Loop through the list of w, c, lekt
        for index, child in enumerate(children):
            followed_by_space = (
                index + 1 < child_count and children[index + 1].tag == LEKTOR_TAG_S
            )
            # Recursively call this function to get the word text (nested lekt).
            # The global counter is read for every child because the recursive call
            # increments it whenever a token is stored.
            get_lektor_word_token(
                child,
                text_id,
                paragraph_id,
                sentence_id,
                lektor_counter_global,
                followed_by_space,
                text,
            )


In [None]:
def get_lektor_token(text=Lektor.COMPLETE):
    """
    Reads the lektor corpus and returns its tokens as a data frame.

    text = COMPLETE: source tokens followed by target tokens
    text = SOURCE: source text only
    text = TARGET: target text only

    Any other value is reported with a warning and handled like COMPLETE.

    @param text: corpus type (complete, source or target)
    @return: data frame of word ids, words, lemmas, msd in sl and en and space indicators
    """
    # Shared with get_lektor_word_token, which appends rows and advances the counter
    global lektor_data_global
    global lektor_counter_global

    # Decide which sides of the corpus are walked
    if text == Lektor.SOURCE or text == Lektor.TARGET:
        corpus_sides = [text]
    else:
        if not text == Lektor.COMPLETE:
            lektor_token_logger.warning("Invalid text parameter")
        corpus_sides = [Lektor.SOURCE, Lektor.TARGET]

    # Read lektor corpus data
    corpus_root = read_lektor()

    # Start with an empty token list and a zeroed word counter
    lektor_data_global = []
    lektor_counter_global = 0

    for side in corpus_sides:
        # Only every second child, starting at index 1, is walked (skip head)
        for text_pos in range(1, len(corpus_root), 2):
            article = corpus_root[text_pos]
            article_id = text_pos // 2 + 1

            for paragraph_pos, paragraph in enumerate(article):
                # Sentences are numbered per paragraph
                sentence_no = 0
                element_count = len(paragraph)

                # Walk the w, c, lekt (and marker) elements of the paragraph
                for element_pos, element in enumerate(paragraph):
                    if element.tag == LEKTOR_TAG_S1:
                        # New sentence: bump the sentence number, restart word count
                        sentence_no += 1
                        lektor_counter_global = 0

                    next_is_space = (
                        element_pos + 1 < element_count
                        and paragraph[element_pos + 1].tag == LEKTOR_TAG_S
                    )
                    get_lektor_word_token(
                        element,
                        article_id,
                        paragraph_pos,
                        sentence_no,
                        lektor_counter_global,
                        next_is_space,
                        side,
                    )

    return pd.DataFrame(
        lektor_data_global,
        columns=["id", "text", "lemma", "msd (sl)", "msd (en)", "space_flag"],
    )


In [None]:
def save_lektor_token():
    """
    Writes the complete, source and target token data frames to csv files
    inside LEKTOR_DIRECTORY.
    """
    outputs = (
        (Lektor.COMPLETE, "lektor_complete_token.csv"),
        (Lektor.SOURCE, "lektor_source_token.csv"),
        (Lektor.TARGET, "lektor_target_token.csv"),
    )

    # Build all three data frames first, so nothing is written if one of them fails
    frames = [
        (get_lektor_token(text=corpus_type), file_name)
        for corpus_type, file_name in outputs
    ]

    for frame, file_name in frames:
        frame.to_csv(LEKTOR_DIRECTORY + file_name, index=False)

    lektor_token_logger.info("Lektor data saved to csv file")
