# Slovene Specification

### MULTEXT-East Morphosyntactic Specification for the Slovene language.

_Reference: [http://nl.ijs.si/ME/V6/msd/html/msd-sl.html](http://nl.ijs.si/ME/V6/msd/html/msd-sl.html)_


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

from os.path import exists


In [None]:
# Configure the root logger: INFO level, timestamped records, and every
# message prefixed with "Slovene Specification"
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - Slovene Specification: %(message)s",
    level=logging.INFO,
    datefmt="%d-%b-%y %H:%M:%S",
)


In [None]:
# Constants
# The specification is available as HTML here: http://nl.ijs.si/ME/V6/msd/html/msd-sl.html
#   or as XML here: http://nl.ijs.si/ME/V6/msd/xml/msd-sl.spc.xml
# Path of the specification xml file that is parsed with read_xml
DATA_FILE = "./slovene_specification.xml"
# Path of the MSD index csv file; get_msd_index reads it when it exists and
# save_msd_index_en_sl derives the .npy output path from it
MSD_INDEX_FILE = "./MSD_index.csv"


In [None]:
def read_xml(file_name):
    """
    Parses the xml document at file_name and returns its root element.
    """
    # Parse the whole document and hand back only its top-level element
    return ET.parse(file_name).getroot()


In [None]:
def get_slovene_index(data_xml):
    """
    Converts a table of xml rows into a data frame.

    The cell texts of the first row are used as the column names, every
    following row becomes one row of the data frame.

    Locations of the index tables (root is the parsed specification):
    # root[6][1][1]: the Slovene category index.
    # root[20][2][1]: the Slovene attribute index.
    # root[21][2][1]: the Slovene value index.
    # root[22][2][1:]: the Slovene MSD index.

    Example: get_slovene_index(root[6][1][1])

    data_xml: an iterable of row elements, each iterable over its cells.
    Returns: a pandas data frame; it is empty (no rows, no columns) when
        data_xml contains no rows at all.
    """
    data = [[column.text for column in row] for row in data_xml]

    # Without a header row there are no column names to build from; return an
    # empty data frame instead of failing on data[0]
    if not data:
        return pd.DataFrame()

    df_columns, *df_rows = data
    return pd.DataFrame(df_rows, columns=df_columns)


In [None]:
def get_slovene_pos(data_xml):
    """
    Builds the data frame of one Slovene part of speech from its xml rows.

    Positions of the part-of-speech sections in the xml file:
    # 7: nouns, 8: verbs, 9: adjectives, 10: adverbs, 11: pronouns,
    # 12: numerals, 13: prepositions, 14: conjunctions, 15: particles,
    # 16: interjections, 17: abbreviations, 18: residuals, 19: punctuations.

    Example: get_slovene_pos(root[#][1][1:])

    A row with exactly seven cells is copied as it is. Any other row is
    expanded: every row of the table nested in a cell with role="values"
    yields one output row, interleaved with the plain cells seen before it.
    """
    records = []

    for row in data_xml:
        # Complete rows already carry all seven columns
        if len(row) == 7:
            records.append([cell.text for cell in row])
            continue

        plain_cells = []
        for cell in row:
            if cell.attrib.get("role") != "values":
                plain_cells.append(cell.text)
                continue

            # The first child of a "values" cell holds the nested value rows
            for value_row in cell[0]:
                value_cells = [value_cell.text for value_cell in value_row]
                # Only the plain cells collected so far take part here
                records.append(
                    plain_cells[:2]
                    + value_cells[:2]
                    + plain_cells[2:]
                    + value_cells[2:]
                )

    return pd.DataFrame(
        records,
        columns=[
            "P",
            "Attribute (sl)",
            "Value (sl)",
            "Code (sl)",
            "Attribute (en)",
            "Value (en)",
            "Code (en)",
        ],
    )


In [None]:
def get_slovene_specification():
    """
    Reads the Slovene specification from the xml file.

    Returns a tuple (slovene_index, slovene_pos) of two dictionaries that map
    a name to the data frame built from the matching part of the xml file.
    """
    root = read_xml(DATA_FILE)

    # Index tables, each at a fixed position of the document
    slovene_index = {
        "category": get_slovene_index(root[6][1][1]),
        "attribute": get_slovene_index(root[20][2][1]),
        "value": get_slovene_index(root[21][2][1]),
        "MSD": get_slovene_index(root[22][2][1:]),
    }

    # Part-of-speech tables occupy the consecutive positions 7 to 19
    pos_names = (
        "noun",
        "verb",
        "adjective",
        "adverb",
        "pronoun",
        "numeral",
        "preposition",
        "conjunction",
        "particle",
        "interjection",
        "abbreviation",
        "residual",
        "punctuation",
    )
    slovene_pos = {
        name: get_slovene_pos(root[position][1][1:])
        for position, name in enumerate(pos_names, start=7)
    }

    return slovene_index, slovene_pos


In [None]:
def save_slovene_specification():
    """
    Writes every Slovene index and POS data frame to its own csv file.

    The files are named "<key>_index.csv" and "<key>_pos.csv" after the keys
    of the dictionaries returned by get_slovene_specification.
    """
    index_frames, pos_frames = get_slovene_specification()

    # One csv file per index table, without the data frame row index
    for name, frame in index_frames.items():
        frame.to_csv(f"{name}_index.csv", index=False)
    logging.info("Slovene index saved to csv file")

    # One csv file per part of speech, without the data frame row index
    for name, frame in pos_frames.items():
        frame.to_csv(f"{name}_pos.csv", index=False)
    logging.info("Slovene POS saved to csv file")


In [None]:
def get_msd_index():
    """
    Returns the MSD index as a data frame.

    The csv file is preferred when it exists, otherwise the index is read
    from the xml file.
    """
    if not exists(MSD_INDEX_FILE):
        # No csv file yet: build the index from the xml specification
        root = read_xml(DATA_FILE)
        return get_slovene_index(root[22][2][1:])

    # The csv file exists: load the index from it
    return pd.read_csv(MSD_INDEX_FILE)


In [None]:
def get_msd_index_en_sl():
    """
    Returns the "MSD" and "MSD (sl)" columns of the MSD index as a numpy array.
    """
    msd_index = get_msd_index()
    # Keep only the two MSD columns, in this order
    msd_pairs = msd_index[["MSD", "MSD (sl)"]]
    return msd_pairs.to_numpy()


In [None]:
def save_msd_index_en_sl():
    """
    Stores the English and Slovene MSD index as a numpy file.

    The target path is MSD_INDEX_FILE with ".csv" replaced by ".npy".
    """
    msd_pairs = get_msd_index_en_sl()
    target_file = MSD_INDEX_FILE.replace(".csv", ".npy")
    np.save(target_file, msd_pairs)
    logging.info("MSD index saved to numpy file")


In [None]:
def main():
    """
    Entry point: saves the Slovene specification and the MSD index.
    """
    logging.info("Start")

    # The csv files are written first, then the numpy MSD index
    for step in (save_slovene_specification, save_msd_index_en_sl):
        step()

    logging.info("End")


In [None]:
# Run the Slovene specification export: main() saves the specification csv
# files and the MSD index numpy file
main()
