# Analyze Šolar and Lektor corpus error data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt

from utils.colors import COLOR_GRID
from utils.logging import get_logger
from utils.corpus_enum import Corpus
from utils.analysis_type_enum import AnalysisType
from utils.font_types import FONT_TITLE, FONT_LABEL
from helper_analyze_data import read_error_data, SOLAR_DIRECTORY, LEKTOR_DIRECTORY


In [None]:
# Module-level logger; the functions below use it to report saved csv files and plots
analyze_error_logger = get_logger("Analyze Corpus Error Data")


In [None]:
# Constants
# Category labels of the error-rate counts: ERROR_KEY labels rows with a
# non-empty error value, NO_ERROR_KEY labels rows whose error value is empty
ERROR_KEY = "JE NAPAKA"
NO_ERROR_KEY = "NI NAPAKE"
# Subplot grid dimensions used by plot_separate_error_analysis
NUMBER_OF_PLOT_ROWS = 2
NUMBER_OF_PLOT_COLUMNS = 3


In [None]:
def analyze_error_rate(corpus=Corpus.SOLAR):
    """
    Count the rows with and without an error annotation in the corpus.

    A row counts as error-free when its "error" value equals the empty
    string; every other value counts as an error.

    @param corpus: corpus type (solar or lektor)
    @return: data frame with the columns "category" and "count", holding one
        row for ERROR_KEY followed by one row for NO_ERROR_KEY
    """
    # Read the error data
    errors = read_error_data(corpus)["error"]

    # Compare the whole column at once instead of looping with
    # Series.iteritems(), which was removed in pandas 2.0
    no_error_count = int((errors == "").sum())
    error_count = len(errors) - no_error_count

    # Build the result in the fixed order: errors first, error-free second
    return pd.DataFrame(
        [(ERROR_KEY, error_count), (NO_ERROR_KEY, no_error_count)],
        columns=["category", "count"],
    )


In [None]:
def analyze_error_type(corpus=Corpus.SOLAR, analysis=AnalysisType.ERROR_TYPE):
    """
    Count how often each error type (or subtype) occurs in the corpus.

    Rows whose "error" value is the empty string are ignored. Unless the
    analysis is AnalysisType.ERROR_SUBTYPE, only the part of the error value
    before the first ":" is used as the category.

    @param corpus: corpus type (solar or lektor)
    @param analysis: analysis type; ERROR_SUBTYPE keeps the full error value,
        any other value reduces it to the main error type
    @return: data frame with the columns "category" and "count", one row per
        category in order of first appearance
    """
    # Read the error data
    data = read_error_data(corpus)

    keep_subtype = analysis == AnalysisType.ERROR_SUBTYPE
    error_types = {}

    # Iterate over the column values directly; Series.iteritems() was
    # removed in pandas 2.0
    for temp_error in data["error"]:
        if temp_error == "":
            continue

        if not keep_subtype:
            # Extract the main error type
            temp_error = temp_error.split(":")[0]

        error_types[temp_error] = error_types.get(temp_error, 0) + 1

    # Convert dictionary to pandas data frame
    return pd.DataFrame(list(error_types.items()), columns=["category", "count"])


In [None]:
def save_error_analysis(corpus=Corpus.SOLAR, analysis=AnalysisType.ERROR_TYPE):
    """
    Run the requested error analysis and write its result to a csv file.

    The file is written to the "analysis/" folder of the corpus directory and
    is named "<corpus>_analysis_<rate|subtype|type>_error.csv".

    @param corpus: corpus type (solar or lektor)
    @param analysis: analysis type (rate, subtype, or anything else for type)
    @return: nothing
    """
    is_lektor = corpus == Corpus.LEKTOR
    corpus_name = "lektor" if is_lektor else "solar"

    # Pick the file name suffix and compute the matching analysis
    if analysis == AnalysisType.ERROR_RATE:
        type_value, result = "rate", analyze_error_rate(corpus)
    elif analysis == AnalysisType.ERROR_SUBTYPE:
        type_value, result = (
            "subtype",
            analyze_error_type(corpus, AnalysisType.ERROR_SUBTYPE),
        )
    else:
        type_value, result = (
            "type",
            analyze_error_type(corpus, AnalysisType.ERROR_TYPE),
        )

    # Assemble the target path inside the corpus directory
    base_directory = LEKTOR_DIRECTORY if is_lektor else SOLAR_DIRECTORY
    file_name = "{}_analysis_{}_error.csv".format(corpus_name, type_value)
    result.to_csv(base_directory + "analysis/" + file_name, index=False)

    analyze_error_logger.info("{} error analysis saved to csv file".format(corpus_name))


In [None]:
def plot_error_analysis(
    corpus=Corpus.SOLAR, analysis=AnalysisType.ERROR_TYPE, save=False
):
    """
    Draw a bar chart of the error rate or the main error types of a corpus.

    Any analysis other than AnalysisType.ERROR_RATE is plotted as the main
    error types. The chart is either shown or written to the "analysis/"
    folder of the corpus directory as "<corpus>_analysis_<rate|type>_error.png".

    @param corpus: corpus type (solar or lektor)
    @param analysis: analysis type (type or rate)
    @param save: flag that indicates if plot is saved or shown
    @return: nothing
    """
    is_lektor = corpus == Corpus.LEKTOR
    is_rate = analysis == AnalysisType.ERROR_RATE

    corpus_name = "lektor" if is_lektor else "solar"
    type_value = "rate" if is_rate else "type"

    # Compute the counts to plot
    if is_rate:
        counts = analyze_error_rate(corpus)
    else:
        counts = analyze_error_type(corpus)

    # Figure size, background, font and title/label padding; these are set
    # before the first plotting call so the new figure picks them up
    plt.rcParams.update(
        {
            "figure.figsize": (20, 12),
            "figure.facecolor": "white",
            "font.family": "serif",
            "font.size": 11,
            "axes.titlepad": 30,
            "axes.labelpad": 20,
        }
    )

    # Bar chart with title, axis labels and a dashed horizontal grid
    plt.bar(counts["category"], counts["count"])
    plt.title("ŠTEVILO POJAVITEV KATEGORIJE NAPAK", fontdict=FONT_TITLE)
    plt.xlabel("KATEGORIJA", fontdict=FONT_LABEL)
    plt.ylabel("ŠTEVILO POJAVITEV", fontdict=FONT_LABEL)
    plt.grid(color=COLOR_GRID, linestyle="--", linewidth=1, axis="y", alpha=0.5)

    # Show the plot unless it should be written to a file
    if not save:
        plt.show()
    else:
        base_directory = LEKTOR_DIRECTORY if is_lektor else SOLAR_DIRECTORY
        file_name = "{}_analysis_{}_error.png".format(corpus_name, type_value)
        plt.savefig(base_directory + "analysis/" + file_name)
        analyze_error_logger.info(
            "{} error analysis plot saved to a file".format(corpus_name)
        )

    # Close the plot
    plt.close()


In [None]:
def plot_separate_error_analysis(corpus=Corpus.SOLAR, save=False):
    """
    Plot one bar chart of subtype counts per main error category of a corpus.

    Categories without any "<category>: <subtype>" entry are left out. The
    charts are arranged in a grid with NUMBER_OF_PLOT_COLUMNS columns and at
    least NUMBER_OF_PLOT_ROWS rows; further rows are added when there are more
    charts than grid cells. The figure is either shown or written to the
    "analysis/" folder of the corpus directory as
    "<corpus>_analysis_subtype_error.png".

    @param corpus: corpus type (solar or lektor)
    @param save: flag that indicates if plot is saved or shown
    @return: nothing
    """
    corpus_name = "lektor" if corpus == Corpus.LEKTOR else "solar"
    data = analyze_error_type(corpus, analysis=AnalysisType.ERROR_SUBTYPE)

    # Specify the figure size for the plot and its background color
    plt.rcParams["figure.figsize"] = (20, 12)
    plt.rcParams["figure.facecolor"] = "white"
    plt.rcParams["font.family"] = "serif"
    plt.rcParams["font.size"] = 7

    # Specify the padding between the plot and the title/labels
    plt.rcParams["axes.titlepad"] = 5
    plt.rcParams["axes.labelpad"] = 5

    # Main error types in order of first appearance
    error_type = data["category"].str.split(":").str[0].unique()

    # Collect the categories that actually have subcategories. A literal
    # prefix test is used so that regex characters in a category name are not
    # interpreted and a category cannot match in the middle of another entry.
    plots = []
    for category in error_type:
        category_data = data[data["category"].str.startswith(category + ": ")]
        if len(category_data):
            plots.append((category, category_data))

    # Keep the configured grid, but add rows (ceiling division) when there are
    # more charts than cells so plt.subplot never gets an out-of-range index
    rows = max(NUMBER_OF_PLOT_ROWS, -(-len(plots) // NUMBER_OF_PLOT_COLUMNS))

    # Number only the charts that are drawn, so skipped categories leave no gaps
    for plot_number, (category, category_data) in enumerate(plots, start=1):
        plt.subplot(rows, NUMBER_OF_PLOT_COLUMNS, plot_number)
        plt.bar(
            (category_data["category"].str.split(": ", expand=True)[1]),
            category_data["count"],
        )  # plot the subtype counts as a bar chart
        plt.title(
            "{}".format(category), fontdict={"family": "serif"}
        )  # add a title to the plot
        plt.xlabel(
            "KATEGORIJA", fontdict={"family": "serif"}
        )  # add an x-label to the plot
        plt.ylabel(
            "ŠTEVILO POJAVITEV", fontdict={"family": "serif"}
        )  # add a y-label to the plot
        plt.xticks(rotation=30)  # rotate the x-axis labels

    # Set the spacing between subplots
    plt.subplots_adjust(wspace=0.5, hspace=0.5)

    # Show or save the plot
    if save:
        plt.savefig(
            (LEKTOR_DIRECTORY if corpus == Corpus.LEKTOR else SOLAR_DIRECTORY)
            + "analysis/"
            + "{}_analysis_subtype_error.png".format(
                corpus_name,
            ),
        )
        analyze_error_logger.info(
            "{} error analysis plot saved to a file".format(corpus_name)
        )
    else:
        plt.show()

    # Close the plot
    plt.close()

    return


In [None]:
def analyze_data_error():
    """
    Save and plot every error analysis of the solar and lektor corpus.

    The error rate and error type are processed for both corpora; the error
    subtype is processed for the solar corpus only.
    """
    corpora = (Corpus.SOLAR, Corpus.LEKTOR)
    analyses = (AnalysisType.ERROR_RATE, AnalysisType.ERROR_TYPE)

    # Save the error rate and error type analysis of both corpora to csv
    for analysis in analyses:
        for corpus in corpora:
            save_error_analysis(corpus, analysis)

    # Save the error subtype analysis (not run for the lektor corpus)
    save_error_analysis(Corpus.SOLAR, AnalysisType.ERROR_SUBTYPE)

    # Plot the error rate and error type of both corpora
    for analysis in analyses:
        for corpus in corpora:
            plot_error_analysis(corpus, analysis, save=True)

    # Plot the error subtypes (not run for the lektor corpus)
    plot_separate_error_analysis(Corpus.SOLAR, save=True)
