# Prepare Lektor corpus text data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from utils.logging import get_logger
from utils.lektor_enum import Lektor
from lektor_prepare_data import (
    read_lektor,
    LEKTOR_DIRECTORY,
    LEKTOR_TAG_W,
    LEKTOR_TAG_C,
    LEKTOR_TAG_LEKT,
    LEKTOR_TAG_S,
)


In [None]:
# Module-level logger for this step; the functions in this file use it to
# report invalid `text` arguments and to confirm that the text files were saved
lektor_text_logger = get_logger("Prepare Lektor Corpus Text Data")


In [None]:
def get_lektor_word_text(data_xml, text=Lektor.SOURCE):
    """
    Extracts the plain text represented by a single lektor xml element.

    Word and punctuation elements contribute their own text, space elements
    contribute a single blank, and lekt elements contribute the concatenated
    text of the child selected by `text` (child 0 for source, child 1 for
    target), resolved recursively. Any other element contributes nothing.

    text = SOURCE: source text
    text = TARGET: target text

    @param data_xml: lektor element in xml format
    @param text: corpus type (source or target); any other value is logged
                 as a warning and treated like source
    @return: text of the element, or an empty string if there is none
    """
    tag = data_xml.tag

    # Word or punctuation: the element's own text (which may be missing)
    if tag in (LEKTOR_TAG_W, LEKTOR_TAG_C):
        return data_xml.text or ""

    # Lekt element: pick the requested variant and flatten its children
    if LEKTOR_TAG_LEKT in tag:
        if text == Lektor.TARGET:
            variant = 1
        else:
            variant = 0
            if not text == Lektor.SOURCE:
                lektor_text_logger.warning("Invalid text parameter")

        # Children may themselves be lekt elements, hence the recursion
        return "".join(
            get_lektor_word_text(child, text) for child in data_xml[variant]
        )

    # Space element
    if tag == LEKTOR_TAG_S:
        return " "

    return ""


In [None]:
def get_lektor_text(text=Lektor.COMPLETE):
    """
    Reads the lektor corpus and flattens it into plain text.

    The result holds one string per requested corpus type: two strings
    (source first, then target) for COMPLETE, a single string otherwise.
    Inside each string every p element ends with a newline and every text
    element is followed by an additional newline.

    text = COMPLETE: complete text
    text = SOURCE: source text
    text = TARGET: target text

    @param text: corpus type (complete, source or target); any other value
                 is logged as a warning and treated like complete
    @return: list of corpus texts, one string per corpus type
    """
    if text == Lektor.SOURCE or text == Lektor.TARGET:
        variants = [text]
    else:
        # Complete corpus: source followed by target
        variants = [Lektor.SOURCE, Lektor.TARGET]
        if not text == Lektor.COMPLETE:
            lektor_text_logger.warning("Invalid text parameter")

    # Read lektor corpus data
    corpus = read_lektor()

    # Only every second child, starting at position 1, is a text element
    # (the children in between are heads and are skipped)
    text_nodes = [corpus[position] for position in range(1, len(corpus), 2)]

    result = []
    for variant in variants:
        pieces = []
        for text_node in text_nodes:
            for paragraph in text_node:
                # Children of a paragraph are w, c, lekt (and space) elements
                pieces.extend(
                    get_lektor_word_text(token, variant) for token in paragraph
                )
                pieces.append("\n")
            pieces.append("\n")
        result.append("".join(pieces))

    return result


In [None]:
def save_lektor_text():
    """
    Saves the lektor corpus as txt files.

    Three files are written, each path built by appending the file name
    directly to LEKTOR_DIRECTORY (presumably a path ending in a separator -
    confirm against lektor_prepare_data):

    lektor_source_text.txt: source text
    lektor_target_text.txt: target text
    lektor_complete.txt: source text followed by target text

    All files are written as UTF-8 so that the result does not depend on the
    locale encoding of the machine running the script.
    """
    # Get complete text: data[0] is the source text, data[1] the target text
    data = get_lektor_text(text=Lektor.COMPLETE)

    # Write data in source and target files
    for index, filename in enumerate(
        ["lektor_source_text.txt", "lektor_target_text.txt"]
    ):
        # The with statement closes the file, no explicit close() is needed
        with open(LEKTOR_DIRECTORY + filename, "w", encoding="utf-8") as file:
            file.write(data[index])

    # Write data in lektor file (source and target one after the other)
    with open(
        LEKTOR_DIRECTORY + "lektor_complete.txt", "w", encoding="utf-8"
    ) as file:
        file.writelines(data)

    lektor_text_logger.info("Lektor data saved to txt file")
