Install Dependencies

In [None]:
# Install the packages used by this notebook (sacrebleu backs the BLEU metric loaded later).
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases
# than the ones that produced the saved outputs — consider pinning.
!pip install datasets
!pip install evaluate
!pip install torch
!pip install accelerate -U
!pip install transformers[torch]
!pip install sacrebleu

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)


In [None]:
import evaluate
import torch
import numpy as np

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)

In [None]:
# Upload the Anki export through the Google Colab file picker.
# Note: connect to a T4 runtime so training can use the GPU and finish faster.
# Setup order: start the notebook, install dependencies, then use Runtime -> Restart runtime
# in the Colab menu, and run the remaining cells *without* reinstalling dependencies to avoid errors.
# NOTE(review): the saved output shows a re-upload being stored as "korean_anki_deck (1).txt",
# while the loading cell reads "korean_anki_deck.txt" — confirm the intended copy is the one read.
from google.colab import files
uploaded = files.upload()

Saving korean_anki_deck.txt to korean_anki_deck (1).txt


In [None]:
# Read the tab-separated Anki export: column 0 holds Korean, column 1 holds English.
# Each kept line becomes {'id': <running index>, 'translation': {'en': ..., 'ko': ...}}.
index = 0
rows = []
with open("korean_anki_deck.txt", 'r', encoding="utf-8") as f:
    for line in f:
        # Drop the line terminator first so it cannot leak into the last field,
        # then split only once.
        fields = line.rstrip('\r\n').split('\t')
        # Lines without a second column cannot provide a translation pair.
        if len(fields) < 2:
            continue
        ko = fields[0].strip()
        en = fields[1].strip()
        # a few examples have no English translation, so filter those out
        if not en:
            continue
        rows.append({'id': index, 'translation': {'en': en, 'ko': ko}})
        index += 1

In [None]:
# Wrap the parsed rows in a DatasetDict with a single 'train' split and display it.
train_split = Dataset.from_list(rows)
dataset_anki = DatasetDict({'train': train_split})
dataset_anki

In [None]:
# Keep 90% of the pairs for training (fixed seed) and expose the remainder as 'validation'.
split_datasets_anki = dataset_anki["train"].train_test_split(train_size=0.9, seed=20)
held_out = split_datasets_anki.pop('test')
split_datasets_anki['validation'] = held_out
split_datasets_anki

In [None]:
# Choose model for Korean-to-English translation
model_checkpoint = 'Helsinki-NLP/opus-mt-ko-en'

# Load the pretrained sequence-to-sequence model for that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`,
# so it is not passed here. AutoTokenizer is already imported in the imports cell.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]



In [None]:

def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: A batch whose "translation" entry is a list of dicts with
            "ko" (source text) and "en" (target text) keys.

    Returns:
        The output of the module-level `tokenizer` called on the Korean
        sources with the English texts as `text_target`, truncation enabled.
    """
    sources = []
    references = []
    for pair in examples["translation"]:
        sources.append(pair["ko"])
        references.append(pair["en"])

    return tokenizer(sources, text_target=references, truncation=True)

In [None]:
# Tokenize every split in batches; the original columns are removed from the result.
raw_columns = split_datasets_anki["train"].column_names
tokenized_datasets_anki = split_datasets_anki.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized splits (features and row counts).
tokenized_datasets_anki

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2170
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 242
    })
})

In [None]:
# Collator that assembles tokenized examples into batches for the seq2seq model.
# NOTE(review): the saved output under this cell (a dict_keys listing) is not produced by
# this assignment — presumably left over from a removed sanity check; confirm on a fresh run.
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

Use BLEU score as evaluation metric

In [None]:

# Load the sacreBLEU metric; compute_metrics calls metric.compute() on decoded text.
metric = evaluate.load('sacrebleu')

In [None]:
def compute_metrics(eval_preds):
    """Compute the sacreBLEU score for generated translations.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also be a tuple whose first element holds the ids.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` field
        returned by the module-level ``metric``.
    """
    preds, labels = eval_preds
    # in case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded, so swap it for
    # the pad token id. Predictions can carry -100 padding as well as labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # remove leading/trailing whitespace; each prediction gets a list of references
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}



In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration: checkpoints are written to "finetuned-kde4-ko-to-en" once per
# epoch (at most 3 kept), no evaluation runs during training, and evaluation uses
# generate() so BLEU is computed on generated text.
args = Seq2SeqTrainingArguments(
    "finetuned-kde4-ko-to-en",
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; fall back to full precision when none is
    # available instead of failing on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

In [None]:
from transformers import Seq2SeqTrainer

# Bundle the model, training arguments, tokenized splits, collator and BLEU callback.
trainer_anki = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_anki["train"],
    eval_dataset=tokenized_datasets_anki["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Maximum length passed to trainer_anki.evaluate() for generation during evaluation.
max_length = 128

In [None]:
# Evaluate on the validation split before fine-tuning to get a baseline BLEU score.
trainer_anki.evaluate(max_length = max_length)
# Result: default starting BLEU score is 26.8 (decent translation with significant grammatical errors)
# (50-60 is considered very high quality translation, similar to human translations)
# NOTE(review): the saved output below reports eval_bleu 35.59 at epoch 2.0, so it appears to
# come from a run after training rather than the 26.8 baseline — re-run on a fresh kernel to confirm.

{'eval_loss': 1.0479423999786377,
 'eval_bleu': 35.58610248687253,
 'eval_runtime': 11.0983,
 'eval_samples_per_second': 21.805,
 'eval_steps_per_second': 1.442,
 'epoch': 2.0}

In [None]:
# Fine-tune the model on the tokenized training split using the arguments configured above.
trainer_anki.train()

Step,Training Loss
500,1.0886


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


TrainOutput(global_step=544, training_loss=1.0743724808973425, metrics={'train_runtime': 79.2763, 'train_samples_per_second': 54.745, 'train_steps_per_second': 6.862, 'total_flos': 16185409929216.0, 'train_loss': 1.0743724808973425, 'epoch': 2.0})

In [None]:
# Evaluate again on the validation split after fine-tuning.
trainer_anki.evaluate(max_length = max_length)
# Result: Final BLEU score after fine-tuning is 35.6 (considered to be a good translation)
# NOTE(review): the saved output below reports eval_bleu 47.75, which does not match the 35.6
# quoted here — presumably the model was trained more than once in the same session; confirm.

{'eval_loss': 1.6353167295455933,
 'eval_bleu': 47.749822262505624,
 'eval_runtime': 50.0356,
 'eval_samples_per_second': 0.799,
 'eval_steps_per_second': 0.06,
 'epoch': 2.0}

In [None]:
# saving fine-tuned model
# NOTE(review): the directory name says "en-to-ko" although the checkpoint translates ko -> en,
# and no Google Drive mount is visible in this notebook, so this relative path may just be a
# local folder — confirm before relying on it for persistence.
trainer_anki.save_model("gdrive/My Drive/finetuned-kde4-en-to-ko2")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


In [None]:
# Reload the saved model from the local directory only (local_files_only=True).
# NOTE(review): fine_tuned_model is not referenced by the later cells shown here.
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("gdrive/My Drive/finetuned-kde4-en-to-ko2", local_files_only=True)

Use new model for translations:

In [None]:
from transformers import pipeline
# Build a translation pipeline from the directory the fine-tuned model was saved to.
p = pipeline('translation', model = "gdrive/My Drive/finetuned-kde4-en-to-ko2")

In [None]:
# Pipeline for the original, un-fine-tuned checkpoint, kept under its own name for
# side-by-side comparison. Binding it to `p` would overwrite the fine-tuned pipeline,
# so a top-to-bottom run would translate with the base model.
baseline_pipeline = pipeline('translation', model = model_checkpoint)



In [None]:
# Translate a sample Korean sentence with the pipeline currently bound to `p`.
p("무거워서 떨어뜨릴 뻔 했어요")


[{'translation_text': "I was so heavy I'd have to drop it."}]