# Prepare Lektor corpus sentence data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

from utils.logging import get_logger
from utils.lektor_enum import Lektor
from lektor_prepare_data import (
    read_lektor,
    get_lektor_lekt_data,
    LEKTOR_DIRECTORY,
    LEKTOR_TAG_W,
    LEKTOR_TAG_C,
    LEKTOR_TAG_LEKT,
    LEKTOR_TAG_S1,
    LEKTOR_TAG_S,
)


In [None]:
# Module-level logger used by the sentence extraction and csv export code in this file
lektor_sentence_logger = get_logger("Prepare Lektor Corpus Sentence Data")


In [None]:
def get_lektor_data_sentence(text=Lektor.COMPLETE):
    """
    Builds a data frame with one row per sentence of the lektor corpus.

    The corpus is walked once for every requested variant:
    text = COMPLETE walks it for SOURCE and then for TARGET,
    text = SOURCE or text = TARGET walks it for that variant only.
    Any other value logs a warning and is then handled like COMPLETE.

    Sentence ids have the form "<text>.<paragraph>.<sentence>", all counted
    from 1; the sentence number starts over in every paragraph.

    @param text: corpus type (complete, source or target)
    @return: data frame with the columns "id" and "sentence"
    """
    if text == Lektor.SOURCE or text == Lektor.TARGET:
        variants = [text]
    else:
        variants = [Lektor.SOURCE, Lektor.TARGET]
        if text != Lektor.COMPLETE:
            lektor_sentence_logger.warning("Invalid text parameter")

    corpus = read_lektor()

    rows = []  # [id, sentence] pairs in reading order
    fragments = []  # pieces of the sentence that is currently being built
    current_id = ""

    # Only every second entry starting at position 1 is visited; the entries in
    # between are skipped (the original code describes them as heads).
    text_positions = range(1, len(corpus), 2)

    for variant in variants:
        for text_number, text_position in enumerate(text_positions, start=1):
            paragraphs = corpus[text_position]
            for paragraph_number, paragraph in enumerate(paragraphs, start=1):
                sentences_in_paragraph = 0
                for node in paragraph:
                    tag = node.tag
                    if tag == LEKTOR_TAG_S1:
                        # A sentence-start marker closes the sentence collected so
                        # far; the very first flush stores an empty placeholder row.
                        rows.append([current_id, "".join(fragments)])
                        fragments = []
                        sentences_in_paragraph += 1
                        current_id = "{}.{}.{}".format(
                            text_number, paragraph_number, sentences_in_paragraph
                        )
                        message = "Processing sentence: " + current_id
                        lektor_sentence_logger.info(message)
                    elif tag in (LEKTOR_TAG_W, LEKTOR_TAG_C):
                        # Words and punctuation contribute their text (may be empty)
                        fragments.append(node.text or "")
                    elif tag == LEKTOR_TAG_S:
                        # Space elements become a single blank
                        fragments.append(" ")
                    elif LEKTOR_TAG_LEKT in tag:
                        # Lektor elements are resolved for the current variant
                        lekt_text, _ = get_lektor_lekt_data(node, variant)
                        fragments.append(lekt_text)

    # The sentence still being built has to be stored as well; the first row
    # is the placeholder written by the first flush and is dropped.
    rows.append([current_id, "".join(fragments)])

    return pd.DataFrame(rows[1:], columns=["id", "sentence"])


In [None]:
def save_lektor_sentence():
    """
    Writes the complete, source and target sentence data frames to csv files.

    All three data frames are built first; afterwards each one is written
    without its index into LEKTOR_DIRECTORY and a log message is emitted.
    """
    targets = (
        ("lektor_complete_sentence.csv", Lektor.COMPLETE),
        ("lektor_source_sentence.csv", Lektor.SOURCE),
        ("lektor_target_sentence.csv", Lektor.TARGET),
    )

    # Build every frame before touching the disk so that a failure while
    # reading the corpus leaves no partially written set of files behind.
    frames = [
        (file_name, get_lektor_data_sentence(text=variant))
        for file_name, variant in targets
    ]

    for file_name, frame in frames:
        frame.to_csv(LEKTOR_DIRECTORY + file_name, index=False)

    lektor_sentence_logger.info("Lektor data saved to csv file")
