# Prepare Šolar and Lektor corpus data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from utils.logging import get_logger
from prepare_data_error import save_error_data
from prepare_data_sentence import save_sentence_data

# Import solar functions
from solar_prepare_text import save_solar_text
from solar_prepare_token import save_solar_token
from solar_prepare_sentence import save_solar_sentence
from solar_prepare_link import save_solar_link
from solar_prepare_data import (
    save_solar_data_multiple_error,
    save_solar_data_single_error,
)

# Import lektor functions
from lektor_prepare_text import save_lektor_text
from lektor_prepare_token import save_lektor_token
from lektor_prepare_sentence import save_lektor_sentence
from lektor_prepare_data import (
    save_lektor_data_multiple_error,
    save_lektor_data_single_error,
)

# Import sloleks functions
from sloleks_prepare_data import save_sloleks_data, save_sloleks_words


In [None]:
# Module-level logger (created via utils.logging.get_logger); main() uses it
# to report the start, each stage and the end of the data preparation run.
prepare_data_logger = get_logger("Prepare Solar Corpus Data")


In [None]:
def solar_prepare_data():
    """
    Run every solar corpus generation step, in a fixed order.

    Each step is called without arguments and its return value is ignored;
    the function itself returns None.
    """
    # Ordered pipeline: text -> token -> sentence -> links between source and
    # target -> sentences with multiple errors -> sentences with single error.
    pipeline = (
        save_solar_text,
        save_solar_token,
        save_solar_sentence,
        save_solar_link,
        save_solar_data_multiple_error,
        save_solar_data_single_error,
    )

    for step in pipeline:
        step()


In [None]:
def lektor_prepare_data():
    """
    Run every lektor corpus generation step, in a fixed order.

    Each step is called without arguments and its return value is ignored;
    the function itself returns None.
    """
    # Ordered pipeline: text -> token -> sentence -> sentences with multiple
    # errors -> sentences with single error.
    pipeline = (
        save_lektor_text,
        save_lektor_token,
        save_lektor_sentence,
        save_lektor_data_multiple_error,
        save_lektor_data_single_error,
    )

    for step in pipeline:
        step()


In [None]:
def sloleks_prepare_data():
    """
    Generate all required files for sloleks corpus.

    Calls save_sloleks_data() and then save_sloleks_words(), both without
    arguments. Returns None.
    """
    # Generate data and metadata and save it to files.
    # NOTE(review): this previously called save_error_data(), which main()
    # already invokes as its own "Prepare error data" stage, while the
    # imported save_sloleks_data() was never used — confirm that
    # save_sloleks_data() is the intended sloleks step.
    save_sloleks_data()

    # Generate word list and save it to file
    save_sloleks_words()


In [None]:
def main():
    """
    Entry point: run the whole data preparation pipeline with progress logs.

    Logs "Start", then for each stage logs its message and runs it, and
    finally logs "End". Returns None.
    """
    log = prepare_data_logger.info

    # (log message, stage callable) pairs, executed strictly in this order.
    stages = (
        # Solar corpus data files
        ("Prepare solar data", solar_prepare_data),
        # Lektor corpus data files
        ("Prepare lektor data", lektor_prepare_data),
        # Sloleks data files
        ("Prepare sloleks data", sloleks_prepare_data),
        # Solar and lektor error data
        ("Prepare error data", save_error_data),
        # Filtering of solar and lektor error sentences; the original note
        # gives min length 1 and max length 128 (limits are not set here).
        ("Filter error data", save_sentence_data),
    )

    log("Start")
    for message, run_stage in stages:
        log(message)
        run_stage()
    log("End")


In [None]:
# Run the full data preparation: solar, lektor and sloleks files, then the
# error data and the filtered error sentences (see main()).
main()
