Install Dependencies

In [None]:
!pip install datasets
!pip install evaluate
!pip install torch
!pip install accelerate -U
!pip install transformers[torch]
!pip install sacrebleu

In [None]:
import evaluate
import torch
import numpy as np

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)

In [None]:
# Upload the deck file from the local machine into the Colab session
# (the next cell reads "korean_anki_deck.txt").
# Note: connect to a T4 runtime so training can use the GPU and run faster.
# Workflow: start the notebook, install the dependencies, then choose
# Runtime -> Restart runtime in the Colab menu.
# After the restart, run the rest of the code *without* reinstalling the dependencies to avoid errors.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the tab-separated deck export: column 0 is Korean, column 1 is English.
# Each kept line becomes {'id': n, 'translation': {'en': ..., 'ko': ...}}.
index = 0
rows = []
with open("korean_anki_deck.txt", 'r', encoding="utf-8") as f:
    for line in f:
        # Strip the line terminator before splitting: otherwise an English field
        # that is the last column arrives as '\n', slips past the empty check
        # below, and every kept target carries a trailing newline.
        fields = line.rstrip('\r\n').split('\t')
        # Lines without a second column (e.g. blank lines) carry no translation
        # pair; skip them instead of raising IndexError.
        if len(fields) < 2:
            continue
        ko, en = fields[0], fields[1]
        # a few examples have no English translation, so filter those out
        if not en.strip():
            continue
        rows.append({'id': index, 'translation': {'en': en, 'ko': ko}})
        index += 1

In [None]:
# Wrap the parsed rows in a DatasetDict that holds a single 'train' split.
dataset_anki = DatasetDict({'train': Dataset.from_list(rows)})
# Bare expression so the notebook displays the dataset summary.
dataset_anki

In [None]:
# Hold out 10% of the deck; the fixed seed keeps the split the same across runs.
split_datasets_anki = dataset_anki["train"].train_test_split(train_size = .9, seed = 20)
# train_test_split names the held-out part 'test'; store it under 'validation' instead,
# which is the key the trainer cell reads.
split_datasets_anki['validation'] = split_datasets_anki.pop('test')
# Bare expression so the notebook displays the resulting splits.
split_datasets_anki

In [None]:
# Choose model for korean to english translation
model_checkpoint = 'Helsinki-NLP/opus-mt-ko-en'

# Load the pretrained seq2seq weights for that checkpoint; this is the model that gets fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# NOTE(review): AutoTokenizer is already imported in the notebook's import cell,
# so this repeated import is redundant.
from transformers import AutoTokenizer

# Load the tokenizer that belongs to model_checkpoint.
# NOTE(review): return_tensors is normally an argument of the tokenization call,
# not of from_pretrained; passing it here presumably has no effect — confirm and drop.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors = 'pt')


In [None]:

def preprocess_function(examples):
    """Tokenize one batch of Korean/English translation pairs.

    Args:
        examples: batch mapping whose "translation" entry is a sequence of
            dicts, each with a "ko" (source) and an "en" (target) string.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with the
        Korean sentences as inputs and the English sentences as ``text_target``,
        with truncation enabled.
    """
    source_texts = []
    target_texts = []
    for pair in examples["translation"]:
        source_texts.append(pair["ko"])
        target_texts.append(pair["en"])

    return tokenizer(source_texts, text_target=target_texts, truncation=True)

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of examples per call.
tokenized_datasets_anki = split_datasets_anki.map(
    preprocess_function,
    batched=True,
    # drop the original columns (taken from the train split's column names)
    # so only the tokenizer outputs remain in the dataset
    remove_columns=split_datasets_anki["train"].column_names
)

In [None]:
# Display the tokenized splits to check which columns remain after preprocessing.
tokenized_datasets_anki

In [None]:
# Collator used by the trainer to assemble batches; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

Use BLEU score as evaluation metric

In [None]:

# Load the sacrebleu metric; compute_metrics reads its "score" field and reports it as "bleu".
metric = evaluate.load('sacrebleu')

In [None]:
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may instead be a tuple whose first element is the
            token-id array.

    Returns:
        dict with a single key ``"bleu"`` holding the metric's ``"score"``.
    """
    preds, labels = eval_preds
    # in case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. It can show up in the predictions as well as in the labels,
    # so replace it with the regular pad_token_id in both before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # remove leading/trailing whitespace; each reference is wrapped in its own
    # list because the metric is given a list of references per prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}



In [None]:
from transformers import Seq2SeqTrainingArguments

# Fine-tuning configuration, grouped by concern.
args = Seq2SeqTrainingArguments(
    # where checkpoints are written
    output_dir="finetuned-kde4-ko-to-en",
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    fp16=True,
    # no automatic evaluation during training; the notebook calls evaluate() explicitly
    eval_strategy="no",
    predict_with_generate=True,
    # checkpoint once per epoch, keeping at most three
    save_strategy="epoch",
    save_total_limit=3,
)

In [None]:
from transformers import Seq2SeqTrainer

# Bundle the model, training configuration, tokenized splits, collator and
# metric function into one trainer used for both evaluation and fine-tuning.
# NOTE(review): newer transformers releases may prefer a different keyword than
# `tokenizer` here — confirm against the installed version.
trainer_anki = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_anki["train"],
    eval_dataset=tokenized_datasets_anki["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Length limit passed to trainer_anki.evaluate() in the evaluation cells.
max_length = 128

In [None]:
# Baseline: score the checkpoint on the validation split before any fine-tuning.
trainer_anki.evaluate(max_length = max_length)
# Result: default starting BLEU score is 26.8 (decent translation with significant grammatical errors)
# (50-60 is considered very high quality translation, similar to human translations)

In [None]:
# Fine-tune the model on the train split using the settings in `args`.
trainer_anki.train()

In [None]:
# Re-run the same validation evaluation after training to compare with the earlier score.
trainer_anki.evaluate(max_length = max_length)
# Result: Final BLEU score after fine-tuning is 35.6 (considered to be a good translation)

In [None]:
# saving fine-tuned model
# NOTE(review): this is a relative path and no Google Drive mount is visible in
# this notebook, so this presumably writes to a local "gdrive/My Drive/..." folder
# inside the session rather than to Drive — confirm.
# NOTE(review): the directory name says "en-to-ko" although model_checkpoint is a ko-en model.
trainer_anki.save_model("gdrive/My Drive/finetuned-kde4-en-to-ko2")

In [None]:
# Reload the saved model from disk only (local_files_only=True), which checks that the save is loadable.
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("gdrive/My Drive/finetuned-kde4-en-to-ko2", local_files_only=True)

Use new model for translations:

In [None]:
from transformers import pipeline
# Translation pipeline built from the directory the fine-tuned model was saved to.
p = pipeline('translation', model = "gdrive/My Drive/finetuned-kde4-en-to-ko2")

In [None]:
# Pipeline for the original, un-tuned checkpoint, kept for side-by-side comparison.
# It gets its own name so it does not overwrite `p` (the fine-tuned pipeline);
# otherwise running the notebook top to bottom would make the sample
# translation that follows use the base model instead of the fine-tuned one.
p_baseline = pipeline('translation', model = model_checkpoint)

In [None]:
# Testing out the translation model: translate a sample Korean sentence
# with the pipeline currently bound to `p`.
p("무거워서 떨어뜨릴 뻔 했어요")
