Hugging Face fine-tuning tutorial (translation): https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb

GLEU score implementation: https://github.com/cnap/gec-ranking/tree/master


mT5: https://github.com/google-research/multilingual-t5

In [None]:
# Mount Google Drive at /content/drive (Colab asks for permission); later cells write CSV files there
from google.colab import drive
drive.mount('/content/drive')
# Read the value stored under 'HF_TOKEN' via google.colab.userdata; later cells pass it as the Hugging Face `token`
from google.colab import userdata
my_secret_key = userdata.get('HF_TOKEN')
# Install the packages used by the rest of the notebook (versions are not pinned)
!pip install huggingface_hub
!pip install sentencepiece
!pip install transformers[ja]
!pip install torch
!pip install accelerate
!pip install datasets
!pip install evaluate
!pip install sacrebleu

In [None]:
# Load the pretrained tokenizer and seq2seq model from the same checkpoint
import torch
import accelerate


from transformers import MT5Tokenizer, MT5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments

# Checkpoint name and authentication shared by both downloads
checkpoint = "google/mt5-small"
hub_auth = dict(token=my_secret_key)

tokenizer = MT5Tokenizer.from_pretrained(checkpoint, **hub_auth)
model = MT5ForConditionalGeneration.from_pretrained(checkpoint, **hub_auth)


In [None]:
# Load the error-correction corpus and the evaluation metric
import evaluate
from datasets import load_dataset

dataset = load_dataset("HorikawaMegu/JPNErrorCorpus", token=my_secret_key)

# `datasets.load_metric` is deprecated and has been removed from recent
# `datasets` releases; the `evaluate` package (installed in the setup cell)
# provides the same sacreBLEU metric with the same `compute()` interface.
metric = evaluate.load('sacrebleu')

# Maximum number of tokens kept for the source (error) and target (correction) texts
max_input_length = 128
max_target_length = 128

# tokenize dataset

def tokenize_data(dataset):
  """Tokenize erroneous sentences as model inputs and their corrections as labels.

  Args:
    dataset: A single example or a batch (as passed by ``Dataset.map``) that
      exposes the ``'error'`` and ``'correction'`` text fields.

  Returns:
    A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``. Padding
    positions in ``'labels'`` are set to -100 so that the loss ignores them.
  """
  source_text = dataset['error']
  target_text = dataset['correction']
  tokenized_inputs = tokenizer(source_text, max_length=max_input_length, padding='max_length', truncation=True)
  # NOTE(review): as_target_tokenizer() is deprecated in newer transformers
  # releases in favour of the `text_target=` argument - consider migrating.
  with tokenizer.as_target_tokenizer():
    target_tokenized = tokenizer(target_text, max_length=max_target_length, padding='max_length', truncation=True)

  pad_id = tokenizer.pad_token_id

  def mask_padding(token_ids):
    # Labels are padded to max_length with the pad id; left as-is, the model
    # would be trained to predict padding. -100 excludes those positions.
    return [token if token != pad_id else -100 for token in token_ids]

  target_ids = target_tokenized['input_ids']
  if target_ids and isinstance(target_ids[0], (list, tuple)):
    # Batched call: one list of token ids per example
    labels = [mask_padding(token_ids) for token_ids in target_ids]
  else:
    # Single example: a flat list of token ids
    labels = mask_padding(target_ids)

  return {
      'input_ids': tokenized_inputs['input_ids'],
      'attention_mask': tokenized_inputs['attention_mask'],
      'labels': labels,  # target text is considered as labels
  }

In [None]:
# Sanity check: encode two sample sentences inside the target-tokenizer context and print the encoding
sample_first = 'こんにちは、ドイツはいかがでしょうか。'
sample_second = 'これももう一つの分である。'
with tokenizer.as_target_tokenizer():
  sample_encoding = tokenizer(sample_first, sample_second)
  print(sample_encoding)

In [None]:
# Print the size of each split
train_size = len(dataset['train'])
test_size = len(dataset['test'])
print(train_size)
print(test_size)

# Use a fixed-seed random subset of each split. The requested sizes are capped
# at the split size so select() does not fail when a split is smaller.
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(min(25000, train_size)))
small_eval_dataset = dataset['test'].shuffle(seed=42).select(range(min(2000, test_size)))

# Tokenize both subsets in batches
tokenized_train_dataset = small_train_dataset.map(tokenize_data, batched=True)
tokenized_eval_dataset = small_eval_dataset.map(tokenize_data, batched=True)


In [None]:
# Display the first tokenized training example as a sample of the tokenizer output
tokenized_train_dataset[0]

In [None]:
# create a callback to retrieve the epoch number
from transformers import TrainerCallback

class epoch_callback(TrainerCallback):
    """Trainer callback that announces the end of every training epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the epoch counter read from ``state.epoch``."""
        epoch_number = state.epoch
        message = f"Epoch {epoch_number} completed."
        print(message)

# we need 3 different items to calculate GLEU score:
# source sentence (error sentence)
# model prediction
# target

# Module-level accumulators filled during evaluation: one entry per decoded sample
prediction_list=[]
target_list=[]
epoch_list=[]

def save_prediction(prediction, target, epoch_num):
  """Record one prediction/target pair together with the epoch it belongs to.

  Each value is appended to its own module-level list, so the three lists
  stay aligned index by index.
  """
  records = (
      (prediction_list, prediction),
      (target_list, target),
      (epoch_list, epoch_num),
  )
  for store, value in records:
    store.append(value)

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training args
batch_size = 8

training_args = Seq2SeqTrainingArguments(
    "google/mt5-small",  # local output directory for checkpoints
    evaluation_strategy = "epoch",  # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=1,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints
    num_train_epochs=7,
    # NOTE(review): predict_with_generate is only honoured by Seq2SeqTrainer;
    # the notebook builds a plain Trainer - confirm this is intended.
    predict_with_generate=True,
    push_to_hub=True,
    # Name the destination repository explicitly. Without it the Trainer
    # derives the repo name from the output directory, not from the id the
    # notebook later passes to trainer.push_to_hub().
    hub_model_id="HorikawaMegu/JEC-mt5-small",
)

# Pads inputs and labels per batch for seq2seq training
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)



In [None]:
# next step is to compute metrics from the predictions
import numpy as np

def postprocess_text(predictions, label):
  """Strip surrounding whitespace from decoded predictions and references.

  Each reference is wrapped in its own single-element list.

  Returns:
    A tuple ``(predictions, labels)`` of the cleaned predictions and the
    wrapped references.
  """
  stripped_predictions = [text.strip() for text in predictions]

  references = []
  for reference in label:
    references.append([reference.strip()])

  return stripped_predictions, references

def compute_metrics(eval_predictions, epoch_num):
  """Decode predictions and labels, log them, and compute BLEU and mean length.

  Args:
    eval_predictions: Pair ``(preds, labels)`` of token-id arrays; ``preds``
      may itself be a tuple whose first element holds the predicted ids.
    epoch_num: Epoch value stored alongside every saved prediction.

  Returns:
    A dict with ``'bleu'`` (sacreBLEU score) and ``'gen_len'`` (mean number
    of non-padding predicted tokens), both rounded to 4 decimals.
  """
  preds, labels = eval_predictions
  if isinstance(preds, tuple):
    preds = preds[0]

  pad_id = tokenizer.pad_token_id
  # -100 marks ignored positions and cannot be decoded; it may appear in the
  # predictions as well as in the labels, so both are mapped back to padding.
  preds = np.where(preds != -100, preds, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # post processing for the decoded texts
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  # log every prediction/target pair for later evaluation
  for pred, label in zip(decoded_preds, decoded_labels):
    save_prediction(pred, label, epoch_num)

  result = metric.compute(predictions=decoded_preds, references=decoded_labels)
  result = {"bleu": result["score"]}

  # length of each prediction, not counting padding
  prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)
  return {k: round(v, 4) for k, v in result.items()}



def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model logits to predicted token ids before metrics are computed.

    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.

    Args:
        logits: Either a tuple/list whose first element is the logits tensor,
            or the logits tensor itself.
        labels: Label ids, returned unchanged.

    Returns:
        A tuple ``(pred_ids, labels)`` where ``pred_ids`` is the argmax over
        the last dimension of the logits.
    """
    # Only unwrap when the logits really are a tuple; indexing a bare tensor
    # with [0] would silently select the first item of the batch instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
def _compute_metrics_for_current_epoch(eval_preds):
    """Forward the predictions to compute_metrics with the trainer's current epoch."""
    return compute_metrics(eval_preds, trainer.state.epoch)


# Assemble the trainer: model, data, metric hooks and the epoch-logging callback
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=_compute_metrics_for_current_epoch,
    callbacks=[epoch_callback()],  # use callback to log epoch num
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell
trainer.train()

In [None]:
# save the log history for further evaluation
import os

import pandas as pd

df = pd.DataFrame(trainer.state.log_history)

log_history_path = '/content/drive/My Drive/Transformers/log_historySMALL.csv'
# Create the target folder first: a missing folder would otherwise make
# to_csv fail after training has already finished.
os.makedirs(os.path.dirname(log_history_path), exist_ok=True)
df.to_csv(log_history_path, index=False)

In [None]:
# Preview the first rows of the training log DataFrame
df.head()

In [None]:
# save predictions in csv format: one row per logged sample (epoch, prediction, target)
import os

predictions_path = '/content/drive/My Drive/Transformers/mT5small_predictions.csv'
predictions_df = pd.DataFrame(list(zip(epoch_list,prediction_list,target_list)), columns=['epoch','prediction','target'])
# Create the target folder first so a missing folder does not make to_csv fail
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
predictions_df.to_csv(predictions_path, index=False)

In [None]:
# Trainer.push_to_hub() takes the commit message as its first positional
# argument, not a repository id. The destination repository comes from the
# training arguments; to publish to 'HorikawaMegu/JEC-mt5-small', set
# hub_model_id to that value there.
trainer.push_to_hub(commit_message="End of training")

In [None]:
# Unassign (release) the Colab runtime
from google.colab import runtime
runtime.unassign()

mT5-small GLEU scores:

* **Epoch 1:**   0.00
* **Epoch 2:**	  44.69
* **Epoch 3:**	  63.83
* **Epoch 4:**	  65.05
* **Epoch 5:**	  66.56
* **Epoch 6:**	  66.18
* **Epoch 7:**	  66.13





mT5-base GLEU scores:


* **Epoch 1:**	  21.44
* **Epoch 2:**	  63.54
* **Epoch 3:**	  62.69
* **Epoch 4:**	  64.53
* **Epoch 5:**	  65.69
* **Epoch 6:**	  67.94
* **Epoch 7:**	  67.99

