# Prepare Šolar and Lektor corpus sentence data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

from utils.logging import get_logger
from utils.corpus_enum import Corpus
from utils.error_enum import ErrorType
from solar_prepare_data import SOLAR_DIRECTORY
from lektor_prepare_data import LEKTOR_DIRECTORY


In [None]:
# Module-level logger shared by every function in this notebook; it is
# obtained from the project's get_logger helper under a fixed display name.
prepare_data_sentence_logger = get_logger("Prepare Corpus Error Data")


In [None]:
# Constants
# Inclusive bounds applied by filter_sentence_data to the number of
# whitespace-separated tokens in each sentence.
SENTENCE_MIN_LENGTH = 1
SENTENCE_MAX_LENGTH = 128
# Input csv files read by read_sentence_data (paths are relative to the
# working directory the notebook is run from).
SOLAR_FILE_SINGLE_ERROR = (
    "../../data/solar/solar_data_single_error_all.csv"  # solar data with a single error
)
SOLAR_FILE_MULTIPLE_ERROR = "../../data/solar/solar_data_multiple_error_all.csv"  # solar data with multiple errors
LEKTOR_FILE_SINGLE_ERROR = "../../data/lektor/lektor_data_single_error_all.csv"  # lektor data with a single error
LEKTOR_FILE_MULTIPLE_ERROR = "../../data/lektor/lektor_data_multiple_error_all.csv"  # lektor data with multiple errors


In [None]:
def read_sentence_data(corpus=Corpus.SOLAR, error=ErrorType.SINGLE):
    """
    Loads the sentence csv file of the requested corpus into a data frame.

    Any corpus other than lektor is treated as solar, and any error type
    other than multiple is treated as single.

    @param corpus: corpus type (solar or lektor)
    @param error: error file type (single error or multiple errors)
    @return: data frame with the corpus sentence data
    """
    use_lektor = corpus == Corpus.LEKTOR
    use_multiple = error == ErrorType.MULTIPLE

    # Pick the input file that matches the (corpus, error type) combination
    if use_lektor:
        csv_path = (
            LEKTOR_FILE_MULTIPLE_ERROR if use_multiple else LEKTOR_FILE_SINGLE_ERROR
        )
    else:
        csv_path = (
            SOLAR_FILE_MULTIPLE_ERROR if use_multiple else SOLAR_FILE_SINGLE_ERROR
        )

    # keep_default_na=False keeps empty cells as empty strings instead of NaN
    frame = pd.read_csv(csv_path, keep_default_na=False)
    prepare_data_sentence_logger.info("Corpus sentence data read")

    return frame


In [None]:
def filter_sentence_data(corpus=Corpus.SOLAR, error=ErrorType.SINGLE):
    """
    Reads the solar or lektor sentence data and keeps only the rows whose
    sentence has an allowed number of whitespace-separated tokens.

    @param corpus: corpus type (solar or lektor)
    @param error: error file type (single error or multiple errors)
    @return: data frame with the filtered corpus sentence data
    """
    frame = read_sentence_data(corpus, error)

    # Number of whitespace-separated tokens in every sentence
    token_counts = frame["sentence"].str.split().map(len)

    # Both bounds are inclusive
    within_limits = token_counts.between(SENTENCE_MIN_LENGTH, SENTENCE_MAX_LENGTH)

    return frame.loc[within_limits]


In [None]:
def save_sentence_data():
    """
    Filters the single-error and multiple-error sentence data of both
    corpora and writes each result to its own csv file in the corpus
    directory.
    """
    # Filter all four data sets first, so nothing is written if a read fails
    solar_single, solar_multiple, lektor_single, lektor_multiple = (
        filter_sentence_data(corpus, error)
        for corpus in (Corpus.SOLAR, Corpus.LEKTOR)
        for error in (ErrorType.SINGLE, ErrorType.MULTIPLE)
    )

    solar_outputs = (
        (solar_single, "solar_data_single_error.csv"),
        (solar_multiple, "solar_data_multiple_error.csv"),
    )
    for frame, file_name in solar_outputs:
        frame.to_csv(SOLAR_DIRECTORY + file_name, index=False)
    prepare_data_sentence_logger.info("Solar data saved to csv file")

    lektor_outputs = (
        (lektor_single, "lektor_data_single_error.csv"),
        (lektor_multiple, "lektor_data_multiple_error.csv"),
    )
    for frame, file_name in lektor_outputs:
        frame.to_csv(LEKTOR_DIRECTORY + file_name, index=False)
    prepare_data_sentence_logger.info("Lektor data saved to csv file")
