# Train grammar error detection model


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import torch
import numpy as np
from sklearn.utils import class_weight

from utils.logging import get_logger
from utils.metrics import (
    metric_accuracy,
    metric_precision,
    metric_recall,
    metric_f1,
    metric_matthews_correlation,
)
from helper_model import GED_MODEL, GED_DIRECTORY
from prepare_ged_dataset import load_ged_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)


In [None]:
# Logger shared by the whole script; every training stage reports its
# progress through it
train_ged_model = get_logger("Train GED model")


In [None]:
# Constants
# Pretrained checkpoint the tokenizer and the classifier are loaded from
MODEL_CHECKPOINT = "EMBEDDIA/sloberta"
# Name used in the log messages, as the training output directory and as the
# location the fine-tuned model is saved to
MODEL_NAME = GED_MODEL
# Per-device batch size used for both training and evaluation
BATCH_SIZE = 16  # NOTE(review): 32 looks like an alternative value that was tried - confirm


In [None]:
# Load the GED dataset
dataset = load_ged_dataset(GED_DIRECTORY)
train_ged_model.info("{} dataset read".format(MODEL_NAME))

# Collect the labels of all three splits (train, test, validation - in that
# order) and derive one "balanced" weight per class from them
outputs = [
    label
    for split in ("train", "test", "validation")
    for label in dataset[split]["label"]
]
CLASS_WEIGHTS = list(
    class_weight.compute_class_weight(
        "balanced", classes=np.unique(outputs), y=outputs
    )
)

# Instantiate the fast tokenizer and a two-label sequence classifier from the
# SloBERTa checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=2
)
train_ged_model.info("{} model and tokenizer initialized".format(MODEL_NAME))


In [None]:
# Keep the code device-agnostic: use CUDA when it is available and fall back
# to the CPU otherwise
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Move the model onto the selected device
model = model.to(DEVICE)


In [None]:
def tokenize_function(data):
    """
    Run the model-specific tokenizer over the sentences of a dataset batch.

    The tokenizer turns the raw text into the format the model expects and
    also produces the additional inputs the model requires.

    NB: truncation is enabled, so any input longer than what the model can
    handle is cut down to the maximum length the model accepts.
    NB: the function is meant to be applied in batched mode, which lets the
    fast tokenizer work at full speed.

    @param data: the data to tokenize; its "sentence" entry holds the text
    @return: the tokenizer output for the given sentences
    """
    sentences = data["sentence"]
    return tokenizer(sentences, truncation=True)


In [None]:
# Tokenize every sentence of the dataset, processing the examples in batches
encoded_dataset = dataset.map(tokenize_function, batched=True)

# Training configuration, grouped by concern
args = TrainingArguments(
    # Output location, metric reporting and the DeepSpeed configuration file
    output_dir=MODEL_NAME,
    report_to="all",
    deepspeed="./deepspeed_config.json",
    # Evaluate and checkpoint once per epoch; the best model is picked by F1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    # Batch sizing: BATCH_SIZE examples per device, accumulated over 4 steps
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    auto_find_batch_size=True,
    gradient_accumulation_steps=4,
    # Memory and speed options
    gradient_checkpointing=True,
    fp16=True,
)


In [None]:
def compute_metrics(eval_pred):
    """
    Score a set of predictions with every metric tracked during training.

    The per-class scores are first reduced to a predicted class with an
    argmax over axis 1; each metric is then computed against the labels.

    @param eval_pred: a (predictions, labels) pair to be evaluated
    @return: a dictionary with the accuracy, precision, recall, f1 and
    matthews_correlation scores
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)

    # (result key, metric object) pairs, evaluated in this order
    metrics = (
        ("accuracy", metric_accuracy),
        ("precision", metric_precision),
        ("recall", metric_recall),
        ("f1", metric_f1),
        ("matthews_correlation", metric_matthews_correlation),
    )

    return {
        key: metric.compute(predictions=predicted_classes, references=labels)[key]
        for key, metric in metrics
    }


In [None]:
def model_init():
    """
    Build a fresh two-label sequence classification model from the
    pretrained checkpoint and place it on the selected device.
    @return: the model that is going to be fine-tuned
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=2
    )
    return classifier.to(DEVICE)


In [None]:
# Create a custom GED trainer, which will use class weights
class GEDTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_function = torch.nn.CrossEntropyLoss(
            # This needs to be hardcoded (CLASS_WEIGHTS)
            weight=torch.tensor([1.514000662150461, 0.7465481624733931]).to(DEVICE)
        )
        loss = loss_function(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    

In [None]:
# Hyperparameter search: the trainer receives model_init instead of a model
# instance, together with the training arguments, data, tokenizer and metrics
trainer = GEDTrainer(
    args=args,
    model_init=model_init,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
)
train_ged_model.info("{} trainer initialized".format(MODEL_NAME))

# Search for the hyperparameters that maximize the objective and log both
# ends of the search
train_ged_model.info("{} GED hyperparameter search started".format(MODEL_NAME))
hyperparameters = trainer.hyperparameter_search(direction="maximize")
train_ged_model.info("{} GED hyperparameter search ended".format(MODEL_NAME))


In [None]:
# Copy every hyperparameter selected by the search onto the trainer arguments,
# so that the final training run uses them
for name, value in hyperparameters.hyperparameters.items():
    setattr(trainer.args, name, value)
train_ged_model.info("Hyperparameters: {}".format(hyperparameters.hyperparameters))

# Fine-tune the model for the GED task
train_ged_model.info("{} model training started".format(MODEL_NAME))
trainer.train()
train_ged_model.info("{} model training ended".format(MODEL_NAME))

# Evaluate once more and log the metrics; this is meant to check that the
# trainer reloaded the best model and not the last one
train_ged_model.info(trainer.evaluate())

# Save the model under MODEL_NAME so it can be reloaded with from_pretrained()
trainer.save_model(MODEL_NAME)
train_ged_model.info("{} model saved".format(MODEL_NAME))
