# Analyze Šolar and Lektor corpus word data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt

from utils.logging import get_logger
from utils.corpus_enum import Corpus
from utils.file_type_enum import FileType
from utils.analysis_type_enum import AnalysisType
from utils.font_types import FONT_TITLE, FONT_LABEL
from utils.colors import COLOR_GRID
from helper_analyze_data import (
    read_corpus,
    get_msd_index,
    get_category_index,
    SOLAR_DIRECTORY,
    LEKTOR_DIRECTORY,
)


In [None]:
# Module-level logger shared by the analysis, saving and plotting functions below
analyze_word_logger = get_logger("Analyze Corpus Word Data")


In [None]:
# Constants
# Subplot grid (rows x columns) used by plot_separate_word_msd_analysis when it
# draws one bar chart per subcategory of a category
NUMBER_OF_PLOT_ROWS = 3
NUMBER_OF_PLOT_COLUMNS = 3


In [None]:
def analyze_category_index(corpus=Corpus.SOLAR, text=FileType.COMPLETE):
    """
    Analyze the category index of the solar or lektor corpus and return a data frame.

    Every value of the corpus column "msd (sl)" is counted under its first
    character, which must be one of the "Code (sl)" values of the category index
    (an unknown first character raises KeyError). If either parameter is not a
    known value, a warning is logged and the defaults (solar, complete) are used.

    @param corpus: corpus type (solar or lektor)
    @param text: corpus text (complete, source or target)
    @return: data frame with the columns "category" and "count"
    """
    # Read the category index and create a dictionary with category index codes
    category_index = get_category_index()
    category = {code: 0 for code in category_index["Code (sl)"]}

    # Both parameters have to be valid; otherwise fall back to the defaults
    valid_corpus = corpus in (Corpus.SOLAR, Corpus.LEKTOR)
    valid_text = text in (FileType.COMPLETE, FileType.SOURCE, FileType.TARGET)
    if not (valid_corpus and valid_text):
        analyze_word_logger.warning("Invalid corpus or text parameter")
        corpus = Corpus.SOLAR
        text = FileType.COMPLETE

    # Read corpus data
    data = read_corpus(corpus, text)

    # Count the number of words in each category; iterating the series directly
    # yields its values and does not rely on Series.iteritems (removed in pandas 2.0)
    for word in data["msd (sl)"]:
        category[word[0]] += 1

    # Replace code with value: the second column is used as the code key and the
    # upper-cased first column becomes the new key
    # NOTE(review): relies on the column order of the category index - confirm
    for value in category_index.itertuples():
        category[value[1].upper()] = category.pop(value[2])

    # Convert dictionary to pandas dataframe
    df_columns = ["category", "count"]
    df_rows = list(category.items())

    return pd.DataFrame(df_rows, columns=df_columns)


In [None]:
def analyze_msd_index(corpus=Corpus.SOLAR, text=FileType.COMPLETE):
    """
    Analyze the msd index of the solar or lektor corpus and return a data frame.

    The counters are keyed by the upper-cased main category (first token of
    "Features (sl)") and by "<CATEGORY> <FEATURE>" for every further token.
    Corpus values of "msd (sl)" that do not occur in the "MSD (sl)" column of the
    msd index are skipped. If either parameter is not a known value, a warning is
    logged and the defaults (solar, complete) are used.

    @param corpus: corpus type (solar or lektor)
    @param text: corpus text (complete, source or target)
    @return: data frame with the columns "category" and "count"
    """
    # Read the msd index and create a dictionary with msd index features
    msd_index = get_msd_index()
    msd = {}
    for feature in msd_index["Features (sl)"]:
        parts = feature.split()
        # Main category is the first feature
        category = parts[0].upper()
        msd[category] = 0
        # Subcategories are the rest of the features
        for part in parts[1:]:
            msd["{} {}".format(category, part.upper())] = 0

    # Both parameters have to be valid; otherwise fall back to the defaults
    valid_corpus = corpus in (Corpus.SOLAR, Corpus.LEKTOR)
    valid_text = text in (FileType.COMPLETE, FileType.SOURCE, FileType.TARGET)
    if not (valid_corpus and valid_text):
        analyze_word_logger.warning("Invalid corpus or text parameter")
        corpus = Corpus.SOLAR
        text = FileType.COMPLETE

    # Read corpus data
    data = read_corpus(corpus, text)

    # Build the msd code -> features lookup once instead of filtering the whole
    # index for every word. setdefault keeps the first row of a duplicated code;
    # missing codes are left out so that a missing word can never match them.
    features_by_code = {}
    for code, feature in zip(msd_index["MSD (sl)"], msd_index["Features (sl)"]):
        if pd.isna(code):
            continue
        features_by_code.setdefault(code, feature)

    # Count the number of words in each category
    for word in data["msd (sl)"]:
        feature = features_by_code.get(word)
        # Skip msd codes that are not part of the index
        if feature is None:
            continue

        parts = feature.split()
        # Main category is the first feature
        category = parts[0].upper()
        msd[category] += 1
        # Subcategories are the rest of the features
        for part in parts[1:]:
            msd["{} {}".format(category, part.upper())] += 1

    # Convert dictionary to dataframe
    df_columns = ["category", "count"]
    df_rows = list(msd.items())

    return pd.DataFrame(df_rows, columns=df_columns)


In [None]:
def save_word_analysis(
    corpus=Corpus.SOLAR, text=FileType.COMPLETE, analysis=AnalysisType.MSD
):
    """
    Run one word analysis and write its result to a csv file.

    The file is written to the "analysis/" folder below the directory of the
    selected corpus and is named
    "<corpus>_<msd|category>_analysis_<source|target|complete>_word.csv".

    @param corpus: corpus type (lektor; anything else is treated as solar)
    @param text: corpus text (source or target; anything else is "complete")
    @param analysis: flag that indicates whether to analyze the category index
        (anything else selects the msd index)
    @return: nothing
    """
    # Resolve the corpus specific name and output directory
    if corpus == Corpus.LEKTOR:
        corpus_name, directory = "lektor", LEKTOR_DIRECTORY
    else:
        corpus_name, directory = "solar", SOLAR_DIRECTORY

    # Resolve the text part of the file name
    if text == FileType.SOURCE:
        text_value = "source"
    elif text == FileType.TARGET:
        text_value = "target"
    else:
        text_value = "complete"

    # Pick the analysis together with its label
    if analysis == AnalysisType.CATEGORY:
        type_value = "category"
        result = analyze_category_index(corpus, text)
    else:
        type_value = "msd"
        result = analyze_msd_index(corpus, text)

    file_name = "{}_{}_analysis_{}_word.csv".format(
        corpus_name, type_value, text_value
    )
    result.to_csv(directory + "analysis/" + file_name, index=False)
    analyze_word_logger.info("{} word analysis saved to csv file".format(corpus_name))


In [None]:
def plot_word_analysis(
    corpus=Corpus.SOLAR, text=FileType.COMPLETE, analysis=AnalysisType.MSD, save=False
):
    """
    Draw one bar chart of the word analysis and either save or show it.

    A saved plot goes to the "analysis/" folder below the directory of the
    selected corpus and is named
    "<corpus>_<msd|category>_analysis_<source|target|complete>_word.png".

    @param corpus: corpus type (lektor; anything else is treated as solar)
    @param text: corpus text (source or target; anything else is "complete")
    @param analysis: flag that indicates whether to analyze the category index
        (anything else selects the msd index)
    @param save: flag that indicates if plot is saved or shown
    @return: nothing
    """
    # Resolve the corpus specific name and output directory
    if corpus == Corpus.LEKTOR:
        corpus_name, directory = "lektor", LEKTOR_DIRECTORY
    else:
        corpus_name, directory = "solar", SOLAR_DIRECTORY

    # Resolve the text part of the file name
    if text == FileType.SOURCE:
        text_value = "source"
    elif text == FileType.TARGET:
        text_value = "target"
    else:
        text_value = "complete"

    # Pick the analysis together with its label
    if analysis == AnalysisType.CATEGORY:
        type_value = "category"
        result = analyze_category_index(corpus, text)
    else:
        type_value = "msd"
        result = analyze_msd_index(corpus, text)

    # Figure size, background color, font and the padding between the plot and
    # its title/labels
    plot_settings = {
        "figure.figsize": (20, 12),
        "figure.facecolor": "white",
        "font.family": "serif",
        "axes.titlepad": 30,
        "axes.labelpad": 20,
    }
    for setting, setting_value in plot_settings.items():
        plt.rcParams[setting] = setting_value

    # Bar chart with title, axis labels and a dashed horizontal grid
    plt.bar(result["category"], result["count"])
    plt.title("ŠTEVILO POJAVITEV KATEGORIJE BESED", fontdict=FONT_TITLE)
    plt.xlabel("KATEGORIJA", fontdict=FONT_LABEL)
    plt.ylabel("ŠTEVILO POJAVITEV", fontdict=FONT_LABEL)
    plt.grid(color=COLOR_GRID, linestyle="--", linewidth=1, axis="y", alpha=0.5)

    # Show the plot unless it should be saved
    if not save:
        plt.show()
    else:
        file_name = "{}_{}_analysis_{}_word.png".format(
            corpus_name, type_value, text_value
        )
        plt.savefig(directory + "analysis/" + file_name)
        analyze_word_logger.info(
            "{} word analysis plot saved to a file".format(corpus_name)
        )

    # Close the plot
    plt.close()


In [None]:
def plot_separate_word_msd_analysis(
    corpus=Corpus.SOLAR, text=FileType.COMPLETE, save=False
):
    """
    Plots the word analysis of every subcategory for the solar and lektor corpus data.

    One figure is created per category of the category index that has at least
    one subcategory in the msd analysis. Every subcategory gets its own bar chart
    inside that figure.

    @param corpus: corpus type (solar or lektor)
    @param text: corpus text (source or target)
    @param save: flag that indicates if plot is saved or shown
    @return: nothing
    """
    corpus_name = "lektor" if corpus == Corpus.LEKTOR else "solar"
    text_value = (
        "source"
        if text == FileType.SOURCE
        else ("target" if text == FileType.TARGET else "complete")
    )

    # Analyze the msd index
    data = analyze_msd_index(corpus, text)
    category_index = get_category_index()

    # Group the counts as {category: {subcategory: [(value, count), ...]}}.
    # The keys are expected to look like "CATEGORY SUBCATEGORY=VALUE"; keys
    # without a space are the main category totals and are not plotted here.
    # Grouping on the exact names (instead of substring matching) keeps a
    # subcategory from collecting the bars of another one that merely contains
    # its name.
    grouped = {}
    for key, count in zip(data["category"], data["count"]):
        main_category, _, feature = key.partition(" ")
        if not feature:
            continue
        name, _, value = feature.partition("=")
        grouped.setdefault(main_category, {}).setdefault(name, []).append(
            (value, count)
        )

    # rcParams are only picked up when a figure/axes is created, so they have to
    # be set before the first plt.figure call
    # Specify the figure size for the plot and its background color
    plt.rcParams["figure.figsize"] = (20, 12)
    plt.rcParams["figure.facecolor"] = "white"
    plt.rcParams["font.family"] = "serif"

    # Specify the padding between the plot and the title/labels
    plt.rcParams["axes.titlepad"] = 5
    plt.rcParams["axes.labelpad"] = 5

    figure_counter = 0  # count the number of figures
    for row in category_index.itertuples():
        figure_counter += 1  # increase the number of figures

        category = row[1].upper()
        subcategories = grouped.get(category)

        # If there are no subcategories, skip the category
        if not subcategories:
            continue

        # Create a figure for each category
        plt.figure(figure_counter)

        # Use the configured grid, but add rows when a category has more
        # subcategories than the grid can hold (ceiling division)
        plot_rows = max(
            NUMBER_OF_PLOT_ROWS,
            -(-len(subcategories) // NUMBER_OF_PLOT_COLUMNS),
        )

        subcategory = None
        for plot_counter, (subcategory, bars) in enumerate(
            subcategories.items(), start=1
        ):
            plt.subplot(plot_rows, NUMBER_OF_PLOT_COLUMNS, plot_counter)
            plt.bar(
                [value for value, _ in bars],
                [count for _, count in bars],
            )  # plot the data as a bar chart
            plt.title(
                "{} - {}".format(category, subcategory), fontdict={"family": "serif"}
            )  # add a title to the plot
            plt.xlabel(
                "KATEGORIJA", fontdict={"family": "serif"}
            )  # add an x-label to the plot
            plt.ylabel(
                "ŠTEVILO POJAVITEV", fontdict={"family": "serif"}
            )  # add a y-label to the plot
            plt.xticks(rotation=30)  # rotate the x-axis labels

        # Set the spacing between subplots
        plt.subplots_adjust(wspace=0.5, hspace=0.5)

        # Show or save the plot
        if save:
            # NOTE(review): the file name carries the LAST subcategory of the
            # category although the figure contains all of them; kept unchanged
            # so that existing file names stay the same
            last_subcategory = subcategory
            plt.savefig(
                (LEKTOR_DIRECTORY if corpus == Corpus.LEKTOR else SOLAR_DIRECTORY)
                + "analysis/"
                + "{}_msd_analysis_{}_{}_word.png".format(
                    corpus_name,
                    "{}_{}".format(category.lower(), last_subcategory.lower()),
                    text_value,
                ),
            )
            analyze_word_logger.info(
                "{} word analysis plot saved to a file".format(corpus_name)
            )
        else:
            plt.show()

        # Close the plot
        plt.close()

    return


In [None]:
def analyze_data_word():
    """
    Run the complete word analysis of the solar and lektor corpus.

    For both corpora and each of the complete, source and target texts the
    category and msd analyses are saved to csv, then the category plots and
    finally the per-subcategory msd plots are saved.
    """
    corpora = (Corpus.SOLAR, Corpus.LEKTOR)
    texts = (FileType.COMPLETE, FileType.SOURCE, FileType.TARGET)

    # Save the word analysis: first the category index, then the msd index
    for analysis in (AnalysisType.CATEGORY, AnalysisType.MSD):
        for corpus in corpora:
            for text in texts:
                save_word_analysis(corpus, text, analysis)

    # Plot the category index
    for corpus in corpora:
        for text in texts:
            plot_word_analysis(corpus, text, AnalysisType.CATEGORY, save=True)

    # Plot the msd index, one figure per category
    for corpus in corpora:
        for text in texts:
            plot_separate_word_msd_analysis(corpus, text, save=True)
