# Helper functions for Šolar and Lektor corpus data analysis


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

from utils.corpus_enum import Corpus
from utils.logging import get_logger
from utils.file_type_enum import FileType


In [None]:
# Module-level logger used by the reader functions in this file;
# "Analyze Corpus Data" is the name handed to the project's get_logger helper.
analyze_data_logger = get_logger("Analyze Corpus Data")


In [None]:
# Constants: locations of the corpus CSV files and the Slovene specification
# indexes. Per-corpus file paths are built from the corpus directory so the
# prefix is written only once.
SOLAR_DIRECTORY = "../../data/solar/"
SOLAR_FILE_TOKEN = SOLAR_DIRECTORY + "solar_complete_token.csv"  # complete solar data
SOLAR_FILE_SOURCE_TOKEN = SOLAR_DIRECTORY + "solar_source_token.csv"  # source solar data
SOLAR_FILE_TARGET_TOKEN = SOLAR_DIRECTORY + "solar_target_token.csv"  # target solar data
SOLAR_FILE_ERROR = SOLAR_DIRECTORY + "solar_data_single_error.csv"  # solar data with errors
SOLAR_FILE_LINK = SOLAR_DIRECTORY + "solar_link.csv"  # link solar data

LEKTOR_DIRECTORY = "../../data/lektor/"
LEKTOR_FILE_TOKEN = LEKTOR_DIRECTORY + "lektor_complete_token.csv"  # complete lektor data
LEKTOR_FILE_SOURCE_TOKEN = LEKTOR_DIRECTORY + "lektor_source_token.csv"  # source lektor data
LEKTOR_FILE_TARGET_TOKEN = LEKTOR_DIRECTORY + "lektor_target_token.csv"  # target lektor data
LEKTOR_FILE_ERROR = LEKTOR_DIRECTORY + "lektor_data_single_error.csv"  # lektor data with errors

MSD_INDEX_FILE = "../../slovene_specification/MSD_index.csv"  # MSD index
CATEGORY_INDEX_FILE = "../../slovene_specification/category_index.csv"  # category index


In [None]:
def read_corpus(corpus=Corpus.SOLAR, text=FileType.COMPLETE):
    """
    Reads the solar or lektor corpus token data and returns a data frame.

    Args:
        corpus: Corpus to load. Corpus.LEKTOR selects the Lektor files; any
            other value is treated as Solar.
        text: Which file of the corpus to load. Lektor supports
            FileType.COMPLETE, FileType.SOURCE and FileType.TARGET. Solar
            supports FileType.SOURCE, FileType.TARGET and FileType.LINK, and
            uses the complete token file for every other value.

    Returns:
        The data frame produced by pd.read_csv for the selected file.

    Raises:
        ValueError: If the Lektor corpus is requested with a file type that
            has no Lektor file (for example FileType.LINK).
    """
    if corpus == Corpus.LEKTOR:
        if text == FileType.COMPLETE:
            data_path = LEKTOR_FILE_TOKEN
        elif text == FileType.SOURCE:
            data_path = LEKTOR_FILE_SOURCE_TOKEN
        elif text == FileType.TARGET:
            data_path = LEKTOR_FILE_TARGET_TOKEN
        else:
            # There is no Lektor file for this type. Failing loudly is safer
            # than silently loading the Solar complete file, which would feed
            # data from the wrong corpus into the analysis.
            raise ValueError(
                "Unsupported file type for the Lektor corpus: {!r}".format(text)
            )
    elif text == FileType.SOURCE:
        data_path = SOLAR_FILE_SOURCE_TOKEN
    elif text == FileType.TARGET:
        data_path = SOLAR_FILE_TARGET_TOKEN
    elif text == FileType.LINK:
        data_path = SOLAR_FILE_LINK
    else:
        # FileType.COMPLETE and any unrecognised type use the complete file.
        data_path = SOLAR_FILE_TOKEN

    # Read the solar or lektor token data from the file
    data = pd.read_csv(data_path)
    analyze_data_logger.info("Corpus data read")

    return data


In [None]:
def read_error_data(corpus=Corpus.SOLAR):
    """
    Loads the error data of the requested corpus into a data frame.

    Args:
        corpus: Corpus.LEKTOR selects the Lektor error file; any other value
            selects the Solar error file.

    Returns:
        The data frame produced by pd.read_csv for the selected error file.
    """
    error_file = LEKTOR_FILE_ERROR if corpus == Corpus.LEKTOR else SOLAR_FILE_ERROR

    # keep_default_na=False turns off pandas' default NaN markers, so cell
    # values such as "NA" or the empty string are kept as text.
    error_frame = pd.read_csv(error_file, keep_default_na=False)
    analyze_data_logger.info("Corpus error data read")
    return error_frame


In [None]:
def get_msd_index():
    """
    Loads the MSD index for the Slovenian language.

    Returns:
        A data frame holding only the "MSD (sl)" and "Features (sl)" columns
        of the index file, in that order.
    """
    wanted_columns = ["MSD (sl)", "Features (sl)"]

    # keep_default_na=False keeps every cell as read instead of turning
    # pandas' default NaN markers into NaN.
    index_frame = pd.read_csv(MSD_INDEX_FILE, keep_default_na=False)
    analyze_data_logger.info("MSD index read")
    return index_frame[wanted_columns]


In [None]:
def get_category_index():
    """
    Loads the category index for the Slovenian language.

    Returns:
        A data frame holding only the "Value (sl)" and "Code (sl)" columns
        of the index file, in that order.
    """
    wanted_columns = ["Value (sl)", "Code (sl)"]

    # keep_default_na=False keeps every cell as read instead of turning
    # pandas' default NaN markers into NaN.
    index_frame = pd.read_csv(CATEGORY_INDEX_FILE, keep_default_na=False)
    analyze_data_logger.info("Category index read")
    return index_frame[wanted_columns]
