In [2]:
import os
import torch
import random
import datasets

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from datetime import datetime
from transformers import AdamW
from transformers import get_scheduler
from torch.utils.data import DataLoader
from IPython.display import display, HTML
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [3]:
def process_data():
    """
    Output: Translated text of the source text
    :param src_text: source string
    """ 
    
    #Load data (needs to be changed to loading from BQ or bucket)
    df = pd.read_csv('../../data/en_GB-es_ES/en_GB-es_ES_batch_0.csv', engine='python')

    df = df[['source', 'target']] # will be redundent when setup 
    df['source'] = df['source'].apply(lambda s: str(s))
    df['target'] = df['target'].apply(lambda s: str(s))

    # Transform to HuggingFace Dataset
    data = Dataset.from_pandas(pd.DataFrame({'translation': df.to_dict('records')})) 

    # Split data into training sets
    train_test_valid = data.train_test_split(shuffle=True, seed=7, test_size=0.02)
    test_valid = train_test_valid['test'].train_test_split(shuffle=True, seed=7, test_size=0.5)

    # Convert to train/validate/test
    dataset = DatasetDict({
        'train': train_test_valid['train'],
        'validation': test_valid['test'],
        'test': test_valid['train']})
     
    return dataset

In [4]:
def tokenization_processing(dataset, source_len=128, target_len=128,
                            source="source", target="target"):
    """
    Output: Generates tokenized data using the attributes of the base model
    :param dataset: raw dataset to transform
    :param source_len: maximum sentence length for source string
    :param target_len: maximum sentence length for target string
    :param source: source language
    :param target: target language
    """
    
    inputs = [s[source] for s in dataset["translation"]]
    targets = [s[target] for s in dataset["translation"]]
    
    model_inputs = tokenizer(inputs, max_length=source_len, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=target_len, truncation=True)
        
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [5]:
def compute_metrics(predictions): # n.b. can add more metrics later
    """
    Output: evaluation metrics to track model performance in training
    :param predictions: output of predictions to decode
    """
    
    metric = load_metric("sacrebleu")
    
    def process_text(predictions, labels):
        preds = [pred.strip() for pred in predictions]
        labels = [[label.strip()] for label in labels]
        return preds, labels
    
    preds, labels = predictions
    
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = process_text(decoded_preds, decoded_labels)
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    
    result = {k: round(v, 6) for k, v in result.items()} #round results
    
    return result

In [6]:
model_name="Helsinki-NLP/opus-mt-en-es"

dataset = process_data()

print('load models and data collator')
trained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=trained_model)

load models and data collator


In [7]:
tokenized_dataset = dataset.map(tokenization_processing, batched=True)

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [10]:
tokenized_dataset['validation']['translation']

[{'source': 'Non-binary', 'target': 'No binario'},
 {'source': '“I saw us writing a PowerPoint presentation, and I had the fear that it would be a PowerPoint slide battle.',
  'target': '"Pensé en la idea de crear un PowerPoint y me daba miedo que la presentación consistiese únicamente en pasar diapositivas.'},
 {'source': 'Jin', 'target': 'Jin'},
 {'source': 'Who designed the range?', 'target': '¿Quién diseñó la serie?'},
 {'source': 'Display techniques linked to range presentation media',
  'target': 'Técnicas de exposición relacionadas con los medios de presentación del surtido'},
 {'source': 'or that could be given a new life or function.',
  'target': 'o que puedan tener una nueva vida o función.'},
 {'source': 'Actually, no one is right.',
  'target': 'En realidad, nadie tiene razón.'},
 {'source': 'Which is why you play a vital role in helping your store keep its ASC/MSC certification.',
  'target': 'Por esta razón, desempeñas un papel fundamental para ayudar a tu tienda a mante

In [11]:
time = str(datetime.now())

batch_size=16
learning_rate=2e-5 
weight_decay=0.01
save_limit=10
epochs=20

# n.b. need to add early stopping with longer training times
args = Seq2SeqTrainingArguments(
    f"../../models/Checkpoints/IKEA_MT_en_GB-es_ES_{time}",
    evaluation_strategy = "epoch",
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay = weight_decay,
    save_total_limit = save_limit,
    num_train_epochs = epochs,
    predict_with_generate=True    
)


trainer = Seq2SeqTrainer(
    trained_model,
    args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["validation"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 210502
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 263140


Epoch,Training Loss,Validation Loss,Bleu
1,0.9464,0.851741,50.672347
2,0.8287,0.82413,52.075698
3,0.7315,0.82114,52.460513
4,0.6831,0.821804,52.666289
5,0.6201,0.824958,53.431321
6,0.565,0.828585,53.987921
7,0.5339,0.842678,54.200737
8,0.4914,0.849684,54.473092
9,0.4691,0.864588,54.468591
10,0.446,0.874038,54.932497


Saving model checkpoint to ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-500
Configuration saved in ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-500/config.json
Model weights saved in ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-1000
Configuration saved in ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-1000/config.json
Model weights saved in ../../models/Checkpoints/IKEA_MT_en_GB-es_ES_2023-01-11 07:51:23.419419/checkpoint-1000/pytorch

In [13]:
trainer.save_model('../../models/en_GB-es_ES/IKEA_MT-en_GB-es_ES_' + time)

Saving model checkpoint to ../../models/en_GB-es_ES/IKEA_MT-en_GB-es_ES_2023-01-11 07:51:23.419419
Configuration saved in ../../models/en_GB-es_ES/IKEA_MT-en_GB-es_ES_2023-01-11 07:51:23.419419/config.json
Model weights saved in ../../models/en_GB-es_ES/IKEA_MT-en_GB-es_ES_2023-01-11 07:51:23.419419/pytorch_model.bin
tokenizer config file saved in ../../models/en_GB-es_ES/IKEA_MT-en_GB-es_ES_2023-01-11 07:51:23.419419/tokenizer_config.json
Special tokens file saved in ../../models/en_GB-es_ES/IKEA_MT-en_GB-es_ES_2023-01-11 07:51:23.419419/special_tokens_map.json


In [14]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2148
  Batch size = 16


{'eval_loss': 0.9461814761161804,
 'eval_bleu': 55.268718,
 'eval_runtime': 123.5558,
 'eval_samples_per_second': 17.385,
 'eval_steps_per_second': 1.093,
 'epoch': 20.0}