In [1]:
import os
import torch
import random
import datasets

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from datetime import datetime
from transformers import AdamW
from transformers import get_scheduler
from torch.utils.data import DataLoader
from IPython.display import display, HTML
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [18]:
def process_data(data_path='../../data/en_GB-da_DK/en_GB-da_DK_batch_0.csv',
                 test_size=0.02, seed=7):
    """
    Output: DatasetDict with 'train', 'validation' and 'test' splits; every row
            holds one 'translation' record of the form {'source': ..., 'target': ...}
    :param data_path: CSV file that contains a 'source' and a 'target' column
    :param test_size: share of rows held out from training; the held-out rows
                      are divided equally between validation and test
    :param seed: seed used for both shuffled splits
    """

    # Load data (needs to be changed to loading from BQ or bucket)
    df = pd.read_csv(data_path, engine='python')

    # Keep only the sentence pair columns (will be redundant when setup).
    # Rows with a missing side are dropped before the string cast: str() on a
    # missing value would otherwise produce literal "nan" sentences.
    df = df[['source', 'target']].dropna()
    df = df.assign(source=df['source'].apply(str),
                   target=df['target'].apply(str))

    # Transform to HuggingFace Dataset
    data = Dataset.from_pandas(pd.DataFrame({'translation': df.to_dict('records')}))

    # Split off the held-out rows, then halve them
    train_test_valid = data.train_test_split(shuffle=True, seed=seed, test_size=test_size)
    test_valid = train_test_valid['test'].train_test_split(shuffle=True, seed=seed, test_size=0.5)

    # Convert to train/validate/test
    dataset = DatasetDict({
        'train': train_test_valid['train'],
        'validation': test_valid['test'],
        'test': test_valid['train']})

    return dataset

In [19]:
def tokenization_processing(dataset, source_len=128, target_len=128,
                            source="source", target="target"):
    """
    Output: tokenized source texts with the tokenized target ids stored under "labels"
    :param dataset: raw examples exposing a "translation" column of records
    :param source_len: max_length passed to the tokenizer for source strings
    :param target_len: max_length passed to the tokenizer for target strings
    :param source: key of the source text inside each translation record
    :param target: key of the target text inside each translation record

    Uses the notebook-level `tokenizer`, which must exist before this is called.
    """

    pairs = dataset["translation"]
    src_texts = [pair[source] for pair in pairs]
    tgt_texts = [pair[target] for pair in pairs]

    encoded = tokenizer(src_texts, truncation=True, max_length=source_len)

    # Targets are tokenized inside the tokenizer's target context
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(tgt_texts, truncation=True, max_length=target_len)

    encoded["labels"] = target_encoding["input_ids"]

    return encoded

In [20]:
def compute_metrics(predictions): # n.b. can add more metrics later
    """
    Output: evaluation metrics to track model performance in training,
            returned as {"bleu": <sacrebleu score rounded to 6 decimals>}
    :param predictions: (predicted token ids, label token ids) pair to decode

    Uses the notebook-level `tokenizer` for decoding.
    """

    metric = load_metric("sacrebleu")

    def process_text(pred_texts, label_texts):
        # sacrebleu expects a list of references per prediction
        cleaned_preds = [text.strip() for text in pred_texts]
        cleaned_labels = [[text.strip()] for text in label_texts]
        return cleaned_preds, cleaned_labels

    preds, labels = predictions

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id and cannot be decoded. Swap it for the pad
    # token in the predictions as well as the labels; the pad tokens are then
    # dropped by skip_special_tokens.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = process_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    result = {k: round(v, 6) for k, v in result.items()} #round results

    return result

In [21]:
# Pretrained checkpoint that the tokenizer and model are loaded from
model_name = "Helsinki-NLP/opus-mt-en-da"

# Raw train/validation/test splits
dataset = process_data()

print('load models and data collator')
tokenizer = AutoTokenizer.from_pretrained(model_name)
trained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Collator built from the tokenizer and model loaded above
data_collator = DataCollatorForSeq2Seq(tokenizer, model=trained_model)

load models and data collator


In [22]:
# Display the splits with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 155967
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1592
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1592
    })
})

In [23]:
# Apply tokenization_processing to every split; batched=True passes batches of rows per call
tokenized_dataset = dataset.map(tokenization_processing, batched=True)

  0%|          | 0/156 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [24]:
# Run timestamp used in output paths. Formatted without spaces or colons so the
# directory names are valid on every filesystem and need no shell quoting.
time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")

# Training hyperparameters
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
save_limit = 10
epochs = 20

# n.b. need to add early stopping with longer training times
args = Seq2SeqTrainingArguments(
    f"../../models/checkpoints/IKEA_MT_en_GB-da_DK_{time}",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    save_total_limit=save_limit,
    num_train_epochs=epochs,
    predict_with_generate=True,  # evaluation generates text, so compute_metrics can decode it
)

# Trains on the 'train' split and evaluates on the 'validation' split
trainer = Seq2SeqTrainer(
    trained_model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Run training with the trainer configured above
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 155967
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 194960


Epoch,Training Loss,Validation Loss,Bleu
1,0.8771,0.807595,52.817803
2,0.7491,0.766236,54.134318
3,0.6593,0.750477,54.496762
4,0.5885,0.746495,55.12512
5,0.5329,0.748914,55.496022
6,0.4801,0.750624,55.953125
7,0.4532,0.752074,56.264667
8,0.4196,0.760601,56.315842


Saving model checkpoint to ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-500
Configuration saved in ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-500/config.json
Model weights saved in ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-1000
Configuration saved in ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-1000/config.json
Model weights saved in ../../models/checkpoints/IKEA_MT_en_GB-da_DK_2022-12-28 17:21:10.616551/checkpoint-1000/pytorch

In [None]:
# Save the model under a directory name that ends with the run timestamp
trainer.save_model(f'../../models/en_GB-da_DK/IKEA_MT-en_da_DK_{time}')

In [None]:
# Call evaluate(); without the parentheses the cell only displays the bound
# method and no evaluation is run.
trainer.evaluate()