# Prepare grammar error correction datasets

> **_NOTE:_** We are creating a dataset similar to the WMT (Workshop on Machine Translation) datasets.


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

from utils.corpus_enum import Corpus
from helper_model import (
    SOLAR_FILE_MULTIPLE_ERROR,
    SOLAR_FILE_SINGLE_ERROR,
    LEKTOR_FILE_MULTIPLE_ERROR,
    LEKTOR_FILE_SINGLE_ERROR,
)
from datasets import (
    load_from_disk,
    concatenate_datasets,
    Dataset,
    DatasetDict,
    Features,
    Value,
)


In [None]:
# Constants
SEED = 42  # seed used when shuffling the datasets
# Fraction of the data used for the validation set and, separately, for the
# test set; the remainder is the train set (80% - 10% - 10%).
TRAIN_VALIDATION_TEST_RATIO = 0.1  # split dataset: 80% - 10% - 10%


In [None]:
def reset_sentence_id(dataset):
    """
    Renumber the sentences of a dataset so that the ids run from 0 to N - 1.

    The "source" and "target" columns are copied over unchanged and the
    features (column types) of the given dataset are reused for the result.

    @param dataset: a dataset providing "source" and "target" columns
    @return: a new dataset with consecutive ids starting at 0
    """
    row_count = len(dataset)

    # Start with the fresh ids, then carry over the sentence columns as they are
    columns = {"id": list(range(row_count))}
    for column_name in ("source", "target"):
        columns[column_name] = dataset[column_name]

    return Dataset.from_dict(columns, features=dataset.features)


In [None]:
def _pair_source_target(data_error, data_no_error):
    """
    Pair every erroneous sentence with the first correct sentence that shares
    its id.

    @param data_error: dataframe with "id" and "sentence" columns (rows with an error)
    @param data_no_error: dataframe with "id" and "sentence" columns (rows without an error)
    @return: dataframe with "source" and "target" columns, one row per row of data_error
    """
    # Build the id -> correct sentence lookup once instead of filtering the
    # dataframe for every row; setdefault keeps the first sentence of each id.
    correct_by_id = {}
    for sentence_id, sentence in zip(data_no_error["id"], data_no_error["sentence"]):
        correct_by_id.setdefault(sentence_id, sentence)

    # A sentence without a correct counterpart is paired with itself
    # (source == target).
    pairs = [
        (sentence, correct_by_id.get(sentence_id, sentence))
        for sentence_id, sentence in zip(data_error["id"], data_error["sentence"])
    ]
    return pd.DataFrame(pairs, columns=["source", "target"])


def prepare_corpus_dataset(corpus=Corpus.SOLAR):
    """
    Create a dataset from the multiple and single error files of a corpus.
    The dataset consists of source and target sentences. Source sentences are
    the sentences marked with an error, while target sentences are the
    sentences with the same id that are not marked with an error.

    The csv files are expected to provide the "id", "sentence" and "error"
    columns; an empty "error" value marks a sentence without an error.

    The shuffle and both splits are seeded with SEED, so repeated calls on the
    same files return the same train, validation and test sets.

    @param corpus: corpus type (Corpus.LEKTOR selects the Lektor files, any
        other value selects the Solar files)
    @return: dataset dictionary of train, validation and test set
    """
    # Set the corpus multiple and single error file paths
    if corpus == Corpus.LEKTOR:
        multiple_error_file = LEKTOR_FILE_MULTIPLE_ERROR
        single_error_file = LEKTOR_FILE_SINGLE_ERROR
    else:
        multiple_error_file = SOLAR_FILE_MULTIPLE_ERROR
        single_error_file = SOLAR_FILE_SINGLE_ERROR

    # Read the error data from the csv files and join them. keep_default_na
    # is disabled so that empty "error" cells stay empty strings.
    raw_data = pd.concat(
        [
            pd.read_csv(multiple_error_file, keep_default_na=False),
            pd.read_csv(single_error_file, keep_default_na=False),
        ],
        ignore_index=True,
    )

    # Filter data - keep only sentences with more than 3 words
    raw_data = raw_data[raw_data["sentence"].str.split().str.len() > 3]

    # Filter data - remove sentences that are purely alphanumeric, title
    # cased, lower cased or upper cased
    sentences = raw_data["sentence"]
    is_incorrect = (
        sentences.str.isalnum()
        | sentences.str.istitle()
        | sentences.str.islower()
        | sentences.str.isupper()
    )
    # Copy the filtered slice so the assignment below modifies an independent
    # dataframe and not a view of the unfiltered data
    raw_data = raw_data[~is_incorrect].copy()

    # Remove extra spaces
    raw_data["sentence"] = raw_data["sentence"].str.strip()

    # Separate the error and no error data and pair them by sentence id
    has_error = raw_data["error"] != ""
    data = _pair_source_target(raw_data[has_error], raw_data[~has_error])

    # Remove duplicated pairs and renumber the remaining rows from 0
    data = data.drop_duplicates(
        subset=["source", "target"],
        keep="first",
        ignore_index=True,
    )

    # Create a features for dataset
    features = Features(
        {
            "id": Value(dtype="int32"),
            "source": Value(dtype="string"),
            "target": Value(dtype="string"),
        }
    )

    # Create a dataset and specify its format
    dataset = Dataset.from_dict(
        {
            "id": list(range(len(data))),
            "source": data["source"].tolist(),
            "target": data["target"].tolist(),
        },
        features=features,
    )

    # Shuffle data and split it into train - validation - test set. The
    # splits shuffle again by default, so they are seeded as well to keep
    # the set membership reproducible.
    dataset = dataset.shuffle(SEED)
    dataset_train = dataset.train_test_split(
        test_size=2 * TRAIN_VALIDATION_TEST_RATIO, seed=SEED
    )
    dataset_validation_test = dataset_train["test"].train_test_split(
        test_size=0.5, seed=SEED
    )

    return DatasetDict(
        {
            "train": dataset_train["train"],
            "validation": dataset_validation_test["train"],
            "test": dataset_validation_test["test"],
        }
    )


In [None]:
def prepare_gec_dataset():
    """
    Build the grammar error correction dataset from the Solar and Lektor
    corpora. For each of the train, validation and test sets the two corpora
    are concatenated, shuffled with SEED and given fresh sentence ids.

    @return: dataset dictionary of train, validation and test set
    """
    corpus_datasets = [
        prepare_corpus_dataset(Corpus.SOLAR),
        prepare_corpus_dataset(Corpus.LEKTOR),
    ]

    # Combine both corpora set by set, then reset the sentence ids of each set
    return DatasetDict(
        {
            split_name: reset_sentence_id(
                concatenate_datasets(
                    [corpus_dataset[split_name] for corpus_dataset in corpus_datasets]
                ).shuffle(SEED)
            )
            for split_name in ("train", "validation", "test")
        }
    )


In [None]:
def save_gec_dataset(directory_name):
    """
    Prepare the grammar error correction dataset (train, validation and test
    set) and save it to a directory on a disk.

    @param directory_name: directory name on a disk
    @return: nothing
    """
    # Prepare the model data and write it straight to the disk
    prepare_gec_dataset().save_to_disk(directory_name)


In [None]:
def load_gec_dataset(directory_name):
    """
    Load a previously saved dataset from a directory on a disk.

    @param directory_name: directory name on a disk
    @return: whatever load_from_disk returns for the directory (the train,
        validation and test sets when it was written by save_gec_dataset)
    """
    return load_from_disk(directory_name)
