# Train grammar error detection model


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*

import torch
import numpy as np
from sklearn.utils import class_weight

from utils.logging import get_logger
from utils.metrics import (
    metric_seqeval,
    metric_matthews_correlation,
    metric_exact_match,
)
from helper_model import GER_MODEL, GER_DIRECTORY
from prepare_ger_dataset import load_ger_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)


In [None]:
# Get the logger that the cells below use to report progress.
# NOTE: despite its name, `train_ger_model` is the object returned by
# get_logger (used via `.info(...)`), not a training function.
train_ger_model = get_logger("Train GER model")


In [None]:
# Constants
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer
# and the token classification model.
MODEL_CHECKPOINT = "EMBEDDIA/sloberta"
# Name used in the log messages, as the training output directory and as the
# path the final model is saved to.
MODEL_NAME = GER_MODEL
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 16  # 32 (presumably a previously tried value -- TODO confirm)


In [None]:
# Load the GER dataset
dataset = load_ger_dataset(GER_DIRECTORY)
# Names of the GER tags; the position of a name in this list is its label id
label_list = dataset["train"].features["ger_tags"].feature.names
train_ger_model.info("{} dataset read".format(MODEL_NAME))

# Compute class weights
# Flatten the per-sentence tag lists of all three splits into one flat list of
# tag ids. A nested comprehension is linear in the number of tags, whereas
# `sum(list_of_lists, [])` copies the accumulated list on every addition and
# becomes quadratic on a large dataset.
outputs = [
    tag
    for split in ("train", "test", "validation")
    for sentence_tags in dataset[split]["ger_tags"]
    for tag in sentence_tags
]
# "balanced" weights are inversely proportional to the class frequencies:
# n_samples / (n_classes * count(class)).
# NOTE(review): only classes that actually occur in `outputs` get a weight, so
# the list is shorter than `label_list` if some tag never appears -- confirm.
CLASS_WEIGHTS = list(
    class_weight.compute_class_weight(
        "balanced",
        classes=np.unique(outputs),
        y=outputs,
    )
)

# Create the tokenizer and the model for our model (SloBERTa)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
# NOTE(review): the trainer further down is built with `model_init`, so this
# instance looks like it is only used for the device transfer in the next
# cell -- confirm whether it is still needed.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=len(label_list)
)
train_ger_model.info("{} model and tokenizer initialized".format(MODEL_NAME))


In [None]:
# Select the compute device once: CUDA when it is available, the CPU otherwise
DEVICE = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# Move the model onto the selected device (a GPU only when CUDA was found)
model = model.to(DEVICE)


In [None]:
def tokenize_function(data, label_all_tokens=True):
    """
    Tokenize a batch of sentences with the tokenizer that suits our model and
    align the GER tags with the produced (sub-word) tokens. The tokenizer
    tokenizes the text inputs and puts them in the format the model expects.

    NB: we use truncation to ensure that the input longer than what the model
    can handle will be truncated to the maximum length accepted by the model.
    NB: our inputs have already been split into words, that is why we use
    the is_split_into_words flag -> a word may still be split into several
    sub-word tokens, so the word-level tags are mapped onto tokens through
    word_ids.
    NB: the function works on a whole batch so it can be used with batched
    processing, which leverages the full benefit of the fast tokenizer.

    @param data: batch with "tokens" (one list of words per sentence) and
        "ger_tags" (one list of tag ids per sentence, one id per word)
    @param label_all_tokens: when True (default, the behaviour this function
        always had) every sub-word token receives the tag of its word; when
        False only the first token of each word is labelled and the remaining
        tokens of that word get -100
    @return: the tokenizer output extended with a "labels" entry holding one
        list of label ids per sentence
    """
    tokenized_inputs = tokenizer(
        data["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, word_labels in enumerate(data["ger_tags"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            # Special tokens have a word id that is None. We set the label to
            # -100 so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # The first token of a word always gets the word's label; the
            # following tokens of the same word get it only when
            # label_all_tokens is set.
            elif label_all_tokens or word_idx != previous_word_idx:
                label_ids.append(word_labels[word_idx])
            # Continuation token of a word that must not be labelled.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:
# Apply tokenize function on all the sentences in our dataset
encoded_dataset = dataset.map(tokenize_function, batched=True)

# Setup the training arguments
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    output_dir=MODEL_NAME,
    # Evaluate and checkpoint once per epoch; both strategies have to match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Mixed precision is only requested when CUDA is available, so that the
    # device-agnostic setup of this notebook does not fail on a CPU-only
    # machine; on a GPU this is the same as fp16=True.
    fp16=torch.cuda.is_available(),
    num_train_epochs=5,
    weight_decay=0.01,
    # Keep the checkpoint with the highest "f1" reported by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    auto_find_batch_size=True,
    report_to="all",
    deepspeed="./deepspeed_config.json",
)

# Data collator, which will pad the tokens and labels to make them all the same size
data_collator = DataCollatorForTokenClassification(tokenizer)


In [None]:
def compute_metrics(eval_pred):
    """
    Evaluate the predictions of the model with the seqeval, Matthews
    correlation and exact match metrics.

    Positions whose label is -100 (ignored positions such as special tokens)
    are removed before ANY of the metrics is computed, so all scores are based
    on the same set of tokens.

    @param eval_pred: pair (predictions, labels); the predictions are per-token
        class scores which are reduced with argmax over axis 2, the labels are
        label ids where -100 marks positions to ignore
    @return: dict with "precision", "recall", "f1" and "accuracy" (overall
        seqeval scores) plus "matthews_correlation" and "exact_match", both
        averaged over the sentences
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens) in a single pass and keep the
    # surviving label ids per sentence; plain ints are used so the ids can be
    # handed to the metrics as ordinary Python lists.
    kept_prediction_ids = []
    kept_label_ids = []
    for prediction, label in zip(predictions, labels):
        kept = [
            (int(p), int(t)) for p, t in zip(prediction, label) if t != -100
        ]
        kept_prediction_ids.append([p for p, _ in kept])
        kept_label_ids.append([t for _, t in kept])

    # seqeval and exact match work on the tag names rather than on the ids
    true_predictions = [
        [label_list[p] for p in sentence] for sentence in kept_prediction_ids
    ]
    true_labels = [
        [label_list[t] for t in sentence] for sentence in kept_label_ids
    ]

    seqeval = metric_seqeval.compute(
        predictions=true_predictions, references=true_labels
    )

    # The Matthews correlation is computed per sentence on the filtered ids.
    # Using the raw arrays here would count every -100 position as an extra
    # (never predicted) class and distort the score. Sentences without any
    # real label are skipped because there is nothing to correlate.
    matthews_correlation = np.mean(
        [
            metric_matthews_correlation.compute(
                predictions=prediction, references=label
            )["matthews_correlation"]
            for prediction, label in zip(kept_prediction_ids, kept_label_ids)
            if label
        ]
    )

    exact_match = np.mean(
        [
            metric_exact_match.compute(predictions=prediction, references=label)[
                "exact_match"
            ]
            for prediction, label in zip(true_predictions, true_labels)
        ]
    )

    return {
        "precision": seqeval["overall_precision"],
        "recall": seqeval["overall_recall"],
        "f1": seqeval["overall_f1"],
        "accuracy": seqeval["overall_accuracy"],
        "matthews_correlation": matthews_correlation,
        "exact_match": exact_match,
    }


In [None]:
def model_init():
    """
    Build a fresh token classification model from the pretrained checkpoint,
    with one output label per entry of label_list, and move it to DEVICE.

    @return: a model, which we will fine tune
    """
    fresh_model = AutoModelForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=len(label_list)
    )
    return fresh_model.to(DEVICE)


In [None]:
# Create a custom GER trainer, which will use class weights
class GERTrainer(Trainer):
    """
    Trainer whose loss is a class-weighted cross entropy over all tokens, so
    that the rare GER tags are not drowned out by the dominant tag.
    """

    # Per-class loss weights, indexed by label id.
    # This needs to be hardcoded (CLASS_WEIGHTS) -- presumably these are the
    # values produced by the CLASS_WEIGHTS computation; TODO confirm they are
    # still in sync with the dataset.
    _CLASS_WEIGHTS = (
        0.030780315367896285,
        87.74089259625806,
        70.87612303943962,
        164.9631401736665,
        123.9942723942724,
        143.19135517612676,
        143.19135517612676,
        18.73709529704216,
        180.24726498208926,
        424.1909318751424,
        94.81914947797301,
        88.2901313605539,
        88.98642577191472,
        125.37198653198654,
        158.47582567245487,
        339.8638189120117,
        462.4376552409339,
        1175.3623737373737,
        42.38722309496164,
        276.55585264408796,
        23.73470506495328,
        2089.533108866442,
        11.105786208542114,
        27.400385594654658,
        63.177372832468016,
        50.59855958690039,
        42.45101124107896,
        2089.533108866442,
        633.9033026898195,
        17.153357841104878,
        94.34346812607683,
        122.64650856389987,
        64.92220246190327,
        2256.6957575757574,
    )

    def compute_loss(
        self, model, inputs, return_outputs=False, num_items_in_batch=None
    ):
        """
        Compute the class-weighted cross entropy loss for one batch.

        @param model: the model to run the forward pass with
        @param inputs: the batch; must contain "labels" (-100 entries are
            ignored by the loss) next to the model inputs
        @param return_outputs: when True the model outputs are returned
            together with the loss
        @param num_items_in_batch: accepted because newer transformers
            versions pass it to compute_loss; it is not used, the loss is the
            weighted mean over the labelled tokens of the batch
        @return: the loss, or the pair (loss, outputs) if return_outputs
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the device of the logits instead of a global
        # device, so the loss also works when the model lives elsewhere.
        class_weights = torch.tensor(
            self._CLASS_WEIGHTS, dtype=torch.float32, device=logits.device
        )
        loss_function = torch.nn.CrossEntropyLoss(weight=class_weights)
        # Flatten to (tokens, num_labels) and upcast, so the weighted loss is
        # always computed in float32 regardless of mixed precision.
        loss = loss_function(
            logits.view(-1, self.model.config.num_labels).float(),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss
    

In [None]:
# Hyperparameter search
# The trainer receives `model_init` instead of a model instance, so it can
# build a fresh model whenever it needs one.
trainer = GERTrainer(
    args=args,
    model_init=model_init,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
)
train_ger_model.info("{} trainer initialized".format(MODEL_NAME))

# Search for the best performing hyperparameters (higher objective is better).
# NOTE(review): no compute_objective is passed, so the library's default
# objective is used -- confirm it matches metric_for_best_model ("f1").
train_ger_model.info("{} GER hyperparameter search started".format(MODEL_NAME))
hyperparameters = trainer.hyperparameter_search(direction="maximize")
train_ger_model.info("{} GER hyperparameter search ended".format(MODEL_NAME))


In [None]:
# Use the best hyperparameters: copy every value found by the search onto
# the training arguments of the existing trainer
for name, value in hyperparameters.hyperparameters.items():
    setattr(trainer.args, name, value)
train_ger_model.info("Hyperparameters: {}".format(hyperparameters.hyperparameters))

# Fine tune the model for the GER task with the updated arguments
train_ger_model.info("{} model training started".format(MODEL_NAME))
trainer.train()
train_ger_model.info("{} model training ended".format(MODEL_NAME))

# Log the evaluation metrics after training, to check that the trainer did
# reload the best model and not the last one
train_ger_model.info(trainer.evaluate())

# Save the model so it can be reloaded with from_pretrained()
trainer.save_model(MODEL_NAME)
train_ger_model.info("{} model saved".format(MODEL_NAME))
