# Prepare grammar error recognition datasets

> **_NOTE:_** We are creating a dataset whose structure is similar to CoNLL-2003 (a list of tokens and a list of per-token tags for each sentence).


In [133]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

from utils.corpus_enum import Corpus
from datasets import (
    load_dataset,
    load_from_disk,
    Dataset,
    DatasetDict,
    Features,
    ClassLabel,
    Value,
    Sequence,
)


In [134]:
# Constants
DATASET_NAME = "cjvt/solar3"  # dataset identifier passed to load_dataset
SEED = 42  # seed passed to Dataset.shuffle
# Share of the data used for the validation set and for the test set (each);
# the resulting split is 80% - 10% - 10%.
TRAIN_VALIDATION_TEST_RATIO = 0.1
# Mapping from grammar-error category name to its integer tag id.
# NOTE: "Č/PRED" and "Č/PREDL" share id 5, so the dict has 34 keys but only
# 33 distinct ids (0-32); the position of a key in the dict therefore does not
# equal its id for every key that follows "Č/PREDL".
GER_TAGS = {
    "0": 0,  # NO ERROR
    # 1. SPELLING ERROR:
    "Č/VOK": 1,
    "Č/KONZ": 2,
    "Č/W": 3,
    "Č/SKLOP": 4,
    "Č/PRED": 5,
    "Č/PREDL": 5,  # alias of "Č/PRED", required due to an inaccuracy in the Solar corpus
    # 2. FORM ERROR:
    "O/KAT": 6,
    "O/PAR": 7,
    "O/DOD": 8,
    # 3. VOCABULARY ERROR:
    "B/SAM": 9,
    "B/GLAG": 10,
    "B/ZAIM": 11,
    "B/PRED": 12,
    "B/VEZ": 13,
    "B/PRID": 14,
    "B/PRISL": 15,
    "B/OST": 16,
    "B/MEN": 17,
    "B/DOD": 18,
    # 4. SYNTAX ERROR:
    "S/BR": 19,
    "S/IZPUST": 20,
    "S/ODVEČ": 21,
    "S/STR": 22,
    "S/DOD": 23,
    # 5. ORTHOGRAPHY (WRITING) ERROR:
    "Z/MV": 24,
    "Z/SN": 25,
    "Z/KR": 26,
    "Z/ŠTEV": 27,
    "Z/LOČ": 28,
    # 6. LINKED ERROR:
    "P/OBL": 29,
    "P/SKLA": 30,
    "P/ZAP": 31,
    # 7. UNSPECIFIED ERROR:
    "N/": 32,
}


In [135]:
def reset_sentence_id(dataset):
    """
    Rebuild a dataset so that its sentence ids run consecutively from 0.

    The "tokens" and "ger_tags" columns are copied unchanged and the original
    feature definitions are reused for the new dataset.

    @param dataset: dataset with "id", "tokens" and "ger_tags" columns
    @return: new dataset whose "id" column is 0, 1, ..., len(dataset) - 1
    """
    row_count = len(dataset)
    columns = {
        "id": list(range(row_count)),
        "tokens": dataset["tokens"],
        "ger_tags": dataset["ger_tags"],
    }
    return Dataset.from_dict(columns, features=dataset.features)


In [136]:
def _ger_label_names():
    """
    Build the list of tag names in which a name's position equals its tag id.

    GER_TAGS may contain several spellings for one id (aliases). Only the first
    spelling of every id is kept; listing all keys would shift the position of
    every name after an alias away from its id.

    @return: list of tag names ordered by tag id
    """
    names_by_id = {}
    for name, tag_id in GER_TAGS.items():
        names_by_id.setdefault(tag_id, name)

    # ClassLabel decodes an integer by list position, so ids must be 0..N-1
    if sorted(names_by_id) != list(range(len(names_by_id))):
        raise ValueError("GER_TAGS ids must be contiguous and start at 0")

    return [names_by_id[tag_id] for tag_id in range(len(names_by_id))]


def _encode_ger_tags(sentence, errors):
    """
    Convert the corrections of one sentence into a list of per-token tag ids.

    @param sentence: list of source tokens
    @param errors: list of corrections, each with "idx_src" (indices of the
        affected source tokens) and "corr_types" (error type names)
    @return: list with one tag id per token (0 where no correction applies)
    """
    # The default (non error) tag is always 0
    tags = [0] * len(sentence)

    for error in errors:
        # Corrections without source tokens cannot be attached to any token
        if not error["idx_src"]:
            continue

        # Only the first correction type is used; its last "/"-separated
        # segment is dropped to obtain the category name (key of GER_TAGS)
        category = error["corr_types"][0].rpartition("/")[0]
        tag_id = GER_TAGS[category]

        # When corrections overlap, the later one overwrites the earlier one
        for token_index in error["idx_src"]:
            tags[token_index] = tag_id

    return tags


def prepare_corpus_dataset():
    """
    Create a dataset for grammar error recognition. Every example consists of
    a sentence id, the tokens of the sentence and one tag per token, which
    indicates whether (and with which error category) the token is marked as
    erroneous. Duplicate (tokens, tags) pairs are removed and the result is
    split 80% - 10% - 10% into train, validation and test sets using SEED.

    @return: dataset dictionary of train, validation and test set
    """
    source = load_dataset(
        DATASET_NAME, "sentence_level"
    )  # download the source dataset (sentence level)

    # Remove all unused columns and rename the used ones
    source = (
        source["train"]
        .remove_columns(
            [
                "id_doc",
                "doc_title",
                "is_manually_validated",
                "src_ling_annotations",
                "tgt_tokens",
                "tgt_ling_annotations",
            ]
        )
        .rename_columns({"src_tokens": "sentence", "corrections": "error"})
    )

    # Collect all rows first and build the dataframe once (appending to a
    # dataframe inside the loop copies it on every iteration). Tuples are used
    # because they are hashable, which drop_duplicates requires.
    records = [
        {
            "tokens": tuple(row["sentence"]),
            "ger_tags": tuple(_encode_ger_tags(row["sentence"], row["error"])),
        }
        for row in source
    ]
    data = pd.DataFrame(records, columns=["tokens", "ger_tags"])

    # Remove duplicates (keep the first occurrence) and renumber rows from 0
    data = data.drop_duplicates(
        subset=["tokens", "ger_tags"], keep="first"
    ).reset_index(drop=True)

    # Create features for the dataset; label names are ordered by tag id so
    # that decoding an id yields the matching category name
    label_names = _ger_label_names()
    features = Features(
        {
            "id": Value(dtype="int32"),
            "tokens": Sequence(
                feature=Value(dtype="string", id=None), length=-1, id=None
            ),
            "ger_tags": Sequence(
                feature=ClassLabel(
                    num_classes=len(label_names),
                    names=label_names,
                    id=None,
                ),
                length=-1,
                id=None,
            ),
        }
    )

    # Create a dataset and specify its format
    dataset = Dataset.from_dict(
        {
            "id": data.index.values,
            "tokens": data["tokens"].values,
            "ger_tags": data["ger_tags"].values,
        },
        features=features,
    )

    # Shuffle data and split it into train - validation - test set.
    # train_test_split shuffles again by default, so it has to receive the
    # seed as well, otherwise every run produces a different split.
    dataset = dataset.shuffle(seed=SEED)
    train_holdout = dataset.train_test_split(
        test_size=2 * TRAIN_VALIDATION_TEST_RATIO, seed=SEED
    )
    validation_test = train_holdout["test"].train_test_split(
        test_size=0.5, seed=SEED
    )

    return DatasetDict(
        {
            "train": train_holdout["train"],
            "validation": validation_test["train"],
            "test": validation_test["test"],
        }
    )


In [137]:
def prepare_ger_dataset():
    """
    Build the corpus dataset and renumber the sentence ids of every split.

    @return: dataset dictionary of train, validation and test set, each with
        sentence ids starting at 0
    """
    splits = prepare_corpus_dataset()

    # Ids come from the unsplit dataset, so each split is renumbered separately
    for split_name in ("train", "validation", "test"):
        splits[split_name] = reset_sentence_id(splits[split_name])

    return splits


In [138]:
def save_ger_dataset(directory_name):
    """
    Prepare the grammar error recognition dataset and write it to a disk.

    @param directory_name: directory name on a disk
    @return: nothing
    """
    # Build the train / validation / test splits and store them in one step
    prepare_ger_dataset().save_to_disk(directory_name)


In [139]:
def load_ger_dataset(directory_name):
    """
    Read a previously saved dataset back from a disk.

    @param directory_name: directory name on a disk
    @return: whatever load_from_disk returns for the given directory
    """
    return load_from_disk(directory_name)
