# Analyze Šolar and Lektor corpus sentence data


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import collections
import pandas as pd
import matplotlib.pyplot as plt

from utils.logging import get_logger
from utils.corpus_enum import Corpus
from utils.colors import COLOR_GRID, COLOR_LINE
from utils.font_types import FONT_TITLE, FONT_LABEL
from helper_analyze_data import read_error_data, SOLAR_DIRECTORY, LEKTOR_DIRECTORY


In [None]:
# Module-level logger used by the analysis functions in this file to report
# when a plot or csv file has been written
analyze_sentence_logger = get_logger("Analyze Corpus Sentence Data")


In [None]:
def plot_sentence_analysis(corpus=Corpus.SOLAR, save=False):
    """
    Draws a bar chart of how often each sentence length (in whitespace
    separated tokens) occurs in the solar or lektor corpus data.

    Note that the matplotlib rcParams set here are global and stay in effect
    after the function returns.

    @param corpus: corpus type (solar or lektor)
    @param save: if True the plot is written to the corpus "analysis/"
                 directory, otherwise it is shown
    @return: nothing
    """
    is_lektor = corpus == Corpus.LEKTOR
    corpus_name = "lektor" if is_lektor else "solar"

    # Count how many sentences there are for every sentence length
    data = read_error_data(corpus)
    words_per_sentence = data.sentence.str.split().apply(len)
    sentence_count = collections.Counter(words_per_sentence)

    # Figure size, background color, font and the padding between the plot
    # and the title/labels
    plt.rcParams.update(
        {
            "figure.figsize": (20, 12),
            "figure.facecolor": "white",
            "font.family": "serif",
            "font.size": 11,
            "axes.titlepad": 30,
            "axes.labelpad": 20,
        }
    )

    # Bar chart with the title, axis labels and a horizontal grid
    plt.bar(sentence_count.keys(), sentence_count.values())
    plt.title("ŠTEVILO BESED V STAVKU", fontdict=FONT_TITLE)
    plt.xlabel("DOLŽINA STAVKA", fontdict=FONT_LABEL)
    plt.ylabel("ŠTEVILO POJAVITEV", fontdict=FONT_LABEL)
    plt.grid(color=COLOR_GRID, linestyle="--", linewidth=1, axis="y", alpha=0.5)

    # Dashed vertical lines between the lengths 3|4 and 64|65
    for boundary in (3.5, 64.5):
        plt.axvline(x=boundary, color=COLOR_LINE, linestyle="--", linewidth=2)

    if not save:
        plt.show()
    else:
        base_directory = LEKTOR_DIRECTORY if is_lektor else SOLAR_DIRECTORY
        file_name = "{}_analysis_sentence.png".format(corpus_name)
        plt.savefig(base_directory + "analysis/" + file_name)
        analyze_sentence_logger.info(
            "{} error analysis plot saved to a file".format(corpus_name)
        )

    # Release the figure in both cases
    plt.close()


In [None]:
def save_sentence_error_ratio_analysis(corpus=Corpus.SOLAR):
    """
    Calculates the sentence error ratio and saves it to a csv file.

    The ratio is the number of rows whose ``error`` value is present and not
    an empty string, divided by the number of unique sentence ids. The result
    is written as a single row csv file (columns sentence_number,
    error_number, ratio) to the "analysis/" directory of the corpus.

    @param corpus: corpus type (solar or lektor)
    @return: nothing
    @raise ZeroDivisionError: if the corpus data contains no sentence ids
    """
    corpus_name = "lektor" if corpus == Corpus.LEKTOR else "solar"

    # Read the corpus sentence data
    data = read_error_data(corpus)
    sentence_number = len(data.id.unique())

    # Count only the rows that really carry an error. Taking len() of the
    # comparison result would return the length of the boolean mask, i.e. the
    # total number of rows. Missing values are excluded explicitly because
    # they compare unequal to "" and would otherwise be counted as errors.
    has_error = data.error.notna() & (data.error != "")
    error_number = int(has_error.sum())
    sentence_error_ratio = error_number / sentence_number

    # Create a data frame and save it to the csv file
    df_columns = ["sentence_number", "error_number", "ratio"]
    df_rows = [[sentence_number, error_number, sentence_error_ratio]]

    df = pd.DataFrame(df_rows, columns=df_columns)
    df.to_csv(
        (LEKTOR_DIRECTORY if corpus == Corpus.LEKTOR else SOLAR_DIRECTORY)
        + "analysis/"
        + "{}_analysis_sentence_error_ratio.csv".format(corpus_name),
        index=False,
    )
    analyze_sentence_logger.info(
        "{} sentence analysis saved to csv file".format(corpus_name)
    )

    return


In [None]:
def analyze_data_sentence():
    """
    Run the complete sentence analysis for the solar and lektor corpus.

    The sentence length plots are saved first (solar, then lektor), followed
    by the sentence error ratio csv files in the same corpus order.
    """
    corpora = (Corpus.SOLAR, Corpus.LEKTOR)

    # Sentence length plots are written to disk instead of being displayed
    for corpus in corpora:
        plot_sentence_analysis(corpus, save=True)

    # Sentence error ratio csv files
    for corpus in corpora:
        save_sentence_error_ratio_analysis(corpus)
