# Prepare Sloleks corpus data

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pandas as pd

from utils.reading import read_xml
from utils.logging import get_logger


In [None]:
# Logger used by the save_* functions in this file to report which xml file
# is being processed and when the csv output has been written.
sloleks_data_logger = get_logger("Prepare Sloleks Corpus Data")


In [None]:
# Constants
# Folder that is scanned for the Sloleks xml files and that also receives the
# generated csv files. The path is relative, so it is resolved against the
# current working directory of the running process.
SLOLEKS_FOLDER = "./../../data/sloleks/"


In [None]:
def save_sloleks_data():
    """
    Generates the sloleks metadata and data from xml files and saves it into csv
    files.

    Every file in SLOLEKS_FOLDER whose name ends with ".xml" is read with
    read_xml and produces two csv files in the same folder:

    - "metadata_<name>.csv": one row per entry with the columns id, lemma,
      lexeme, related_entries (texts joined with "|") and measure;
    - "data_<name>.csv": one row per orthography representation with the
      columns id, msd, word and measure.

    The positional indices used below follow the element layout of the tree
    returned by read_xml, so they have to change together with that layout.
    """
    xml_suffix = ".xml"

    for sloleks_file in sorted(os.listdir(SLOLEKS_FOLDER)):
        # Skip everything that does not carry the xml extension. Testing the
        # suffix (instead of the last "."-separated part) also skips a file
        # that is literally named "xml" and has no extension at all.
        if not sloleks_file.endswith(xml_suffix):
            continue

        sloleks_data_logger.info("Processing: " + sloleks_file)

        # Read the xml file
        data_xml = read_xml(os.path.join(SLOLEKS_FOLDER, sloleks_file))

        # Rows collected for the two output tables
        metadata_rows = []
        data_rows = []

        # Loop through the list of entries
        for entry in data_xml:
            # Read the head of the entry
            head = entry[0]
            entry_id = head[2].get("sloleksId")
            lemma = head[1][0].text
            lexeme = head[2][0].text
            entry_measure = int(head[4][0].text)
            # Collapse the related entries into one "|"-separated string
            related_entries = "|".join(related.text for related in head[5])
            metadata_rows.append(
                (entry_id, lemma, lexeme, related_entries, entry_measure)
            )

            # Loop through the body of the entry
            for word_form in entry[1][0]:
                msd = word_form[0].text

                # Loop through the list of orthography representations
                for orthography in word_form[2][0]:
                    word = orthography[0].text
                    # Kept separate from entry_measure: this value belongs to
                    # the single orthography representation, not the entry
                    word_measure = int(orthography[1][0].text)
                    data_rows.append((entry_id, msd, word, word_measure))

        # Create a dataframe for metadata
        metadata_columns = ["id", "lemma", "lexeme", "related_entries", "measure"]
        metadata = pd.DataFrame(metadata_rows, columns=metadata_columns)

        # Create a dataframe for data
        data_columns = ["id", "msd", "word", "measure"]
        data = pd.DataFrame(data_rows, columns=data_columns)

        # Save created dataframes next to the xml file, swapping the extension
        csv_name = sloleks_file[: -len(xml_suffix)] + ".csv"
        metadata.to_csv(
            os.path.join(SLOLEKS_FOLDER, "metadata_" + csv_name), index=False
        )
        data.to_csv(os.path.join(SLOLEKS_FOLDER, "data_" + csv_name), index=False)
        sloleks_data_logger.info("Sloleks data and metadata saved")


In [None]:
def save_sloleks_words(save_all=True):
    """
    Generates the sloleks word data from xml files and saves it into csv
    files.

    Every file in SLOLEKS_FOLDER whose name ends with ".xml" is read with
    read_xml and produces "<name>.csv" in the same folder, holding a single
    "word" column with one row per orthography representation.

    @save_all: if true, save all words into one csv file ("sloleks.csv" in
               SLOLEKS_FOLDER), in the order in which the files were read

    NOTE: with save_all enabled, an input file named "sloleks.xml" would have
    its own "sloleks.csv" overwritten by the combined file.
    """
    xml_suffix = ".xml"
    word_columns = ["word"]

    # Store all words in one file
    all_words = []
    for sloleks_file in sorted(os.listdir(SLOLEKS_FOLDER)):
        # Skip everything that does not carry the xml extension. Testing the
        # suffix (instead of the last "."-separated part) also skips a file
        # that is literally named "xml" and has no extension at all.
        if not sloleks_file.endswith(xml_suffix):
            continue

        sloleks_data_logger.info("Processing: " + sloleks_file)

        # Read the xml file
        data_xml = read_xml(os.path.join(SLOLEKS_FOLDER, sloleks_file))

        # Words found in this file
        file_words = []

        # Loop through the list of entries
        for entry in data_xml:
            # Loop through the body of the entry
            for word_form in entry[1][0]:
                # Loop through the list of orthography representations
                for orthography in word_form[2][0]:
                    file_words.append(orthography[0].text)

        # Extend the combined list once per file instead of testing the flag
        # for every single word
        if save_all:
            all_words.extend(file_words)

        # Create a dataframe for data
        data = pd.DataFrame(file_words, columns=word_columns)

        # Save created dataframes next to the xml file, swapping the extension
        csv_name = sloleks_file[: -len(xml_suffix)] + ".csv"
        data.to_csv(os.path.join(SLOLEKS_FOLDER, csv_name), index=False)
        sloleks_data_logger.info("Sloleks word data saved")

    if save_all:
        # Create a dataframe for data
        df = pd.DataFrame(all_words, columns=word_columns)

        # Save created dataframes
        df.to_csv(os.path.join(SLOLEKS_FOLDER, "sloleks.csv"), index=False)
        sloleks_data_logger.info("Sloleks all words data saved")
