In [1]:
MODEL_NAME = "Helsinki-NLP/opus-mt-en-ro"
MAX_INPUT_LENGTH = 128
SOURCE_LANG = "en"
TARGET_LANG = "ro"
MODEL_OUTPUT_DIR = "trained_model/"
MODEL_EVALUATION_STRATEGY = "epoch"
MODEL_LEARNING_RATE = 2e-5
MODEL_BATCH_SIZE = 16
MODEL_WEIGHT_DECAY = 0.01
MODEL_SAVE_TOTAL_LIMIT = 3
MODEL_EPOCHS_NUM = 1
MODEL_PREDICT_WITH_GENERATE = True
MODEL_FP16 = True

In [2]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
torch.cuda.is_available()

True

In [3]:
from datasets import load_dataset

raw_datasets = load_dataset("wmt16", "ro-en")
raw_datasets

Found cached dataset wmt16 (/home/bill/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 610320
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1999
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1999
    })
})

In [4]:
from datasets import load_metric

metric = load_metric("sacrebleu")
metric

  metric = load_metric("sacrebleu")


Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are:
        - `'none'`: no smoothing
        - `'floor'`: increment zero counts
        - `'add-k'`: increment num/deno

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if "mbart" in MODEL_NAME:
  tokenizer.src_lang = "en-XX"
  tokenizer.tgt_lang = "ro-RO"
tokenizer("Hello, this one sentence!")



{'input_ids': [125, 778, 3, 63, 141, 9191, 23, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
if MODEL_NAME in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
  prefix = "translate English to Romanian: "
else:
  prefix = ""
prefix

''

In [7]:
def process(examples):
  inputs = [prefix + example[SOURCE_LANG] for example in examples["translation"]]
  targets = [example[TARGET_LANG] for example in examples["translation"]]
  model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=MAX_INPUT_LENGTH, truncation=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

process(raw_datasets["train"][:2])



{'input_ids': [[393, 4462, 14, 1137, 53, 216, 28636, 0], [24385, 14, 28636, 14, 4646, 4622, 53, 216, 28636, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[42140, 494, 1750, 53, 8, 59, 903, 3543, 9, 15202, 0], [36199, 6612, 9, 15202, 122, 568, 35788, 21549, 53, 8, 59, 903, 3543, 9, 15202, 0]]}

In [8]:
tokenized_datasets = raw_datasets.map(process, batched=True)
tokenized_datasets

Loading cached processed dataset at /home/bill/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-89702f57dec378f3.arrow
Loading cached processed dataset at /home/bill/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-af72b3f5a0423bd8.arrow
Loading cached processed dataset at /home/bill/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-820db1c98dd6aaf0.arrow


DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 610320
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1999
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1999
    })
})

In [9]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [10]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
  output_dir=MODEL_OUTPUT_DIR,
  evaluation_strategy=MODEL_EVALUATION_STRATEGY,
  learning_rate=MODEL_LEARNING_RATE,
  per_device_train_batch_size=MODEL_BATCH_SIZE,
  per_device_eval_batch_size=MODEL_BATCH_SIZE,
  weight_decay=MODEL_WEIGHT_DECAY,
  save_total_limit=MODEL_SAVE_TOTAL_LIMIT,
  num_train_epochs=MODEL_EPOCHS_NUM,
  predict_with_generate=MODEL_PREDICT_WITH_GENERATE,
  fp16=MODEL_FP16
)

In [11]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
data_collator

DataCollatorForSeq2Seq(tokenizer=PreTrainedTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-ro', vocab_size=59543, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}), model=MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59543, 512, padding_idx=59542)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59543, 512, padding_idx=59542)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm

In [12]:
def postprocess(predictions, labels):
  predictions = [prediction.strip() for prediction in predictions]
  labels = [[label.strip()] for label in labels]
  return predictions, labels

In [13]:
import numpy as np

def compute_metrics(evaluation_predictions):
  predictions, labels = evaluation_predictions
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  decoded_predictions, decoded_labels = postprocess(decoded_predictions, decoded_labels)
  result = metric.compute(predictions=decoded_predictions, references=decoded_labels)
  result = {"bleu": result["score"]}
  prediction_lengths = [np.count_nonzero(prediction != tokenizer.pad_token_id) for prediction in predictions]
  result["get_len"] = np.mean(prediction_lengths)
  result = {key: round(value, 4) for key, value in result.items()}
  return result

In [14]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
  model,
  training_args,
  train_dataset=tokenized_datasets["train"],
  eval_dataset=tokenized_datasets["validation"],
  data_collator=data_collator,
  tokenizer=tokenizer,
  compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [15]:
train_model = True
if train_model:
  trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation. If translation are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 610320
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 38145
  Number of trainable parameters = 74624512


Epoch,Training Loss,Validation Loss,Bleu,Get Len
1,0.7441,1.288951,28.1621,34.084


Saving model checkpoint to trained_model/checkpoint-500
Configuration saved in trained_model/checkpoint-500/config.json
Model weights saved in trained_model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-500/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-37000] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-1000
Configuration saved in trained_model/checkpoint-1000/config.json
Model weights saved in trained_model/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-1000/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-37500] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-1500
Configuration saved in trained_model/checkpoint-1500/c

tokenizer config file saved in trained_model/checkpoint-9500/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-9500/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-8000] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-10000
Configuration saved in trained_model/checkpoint-10000/config.json
Model weights saved in trained_model/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-10000/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-10000/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-8500] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-10500
Configuration saved in trained_model/checkpoint-10500/config.json
Model weights saved in trained_model/checkpoint-10500/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-10500/tokenizer_config.json
Special tokens f

Special tokens file saved in trained_model/checkpoint-18500/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-17000] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-19000
Configuration saved in trained_model/checkpoint-19000/config.json
Model weights saved in trained_model/checkpoint-19000/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-19000/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-19000/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-17500] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-19500
Configuration saved in trained_model/checkpoint-19500/config.json
Model weights saved in trained_model/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-19500/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-19500/special_tokens_map.json
Deleting old

Deleting older checkpoint [trained_model/checkpoint-26000] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-28000
Configuration saved in trained_model/checkpoint-28000/config.json
Model weights saved in trained_model/checkpoint-28000/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-28000/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-28000/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-26500] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-28500
Configuration saved in trained_model/checkpoint-28500/config.json
Model weights saved in trained_model/checkpoint-28500/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-28500/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-28500/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-27000] due to args.save_total_limit
Saving m

Saving model checkpoint to trained_model/checkpoint-37000
Configuration saved in trained_model/checkpoint-37000/config.json
Model weights saved in trained_model/checkpoint-37000/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-37000/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-37000/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-35500] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-37500
Configuration saved in trained_model/checkpoint-37500/config.json
Model weights saved in trained_model/checkpoint-37500/pytorch_model.bin
tokenizer config file saved in trained_model/checkpoint-37500/tokenizer_config.json
Special tokens file saved in trained_model/checkpoint-37500/special_tokens_map.json
Deleting older checkpoint [trained_model/checkpoint-36000] due to args.save_total_limit
Saving model checkpoint to trained_model/checkpoint-38000
Configuration saved in trained_model/c

In [16]:
save_model = True
if save_model:
  model.save_pretrained(MODEL_OUTPUT_DIR)

Configuration saved in trained_model/config.json
Model weights saved in trained_model/pytorch_model.bin


In [17]:
trained_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_OUTPUT_DIR)

loading configuration file trained_model/config.json
Model config MarianConfig {
  "_name_or_path": "trained_model/",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59542
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59542,
  "decoder_vocab_size": 59543,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
   

In [18]:
test_texts = ["I am a French colonist.", "I am an American asshole."]
test_texts

['I am a French colonist.', 'I am an American asshole.']

In [19]:
model_input = tokenizer(test_texts, padding=True, return_tensors="pt")
model_output = trained_model.generate(**model_input, num_beams=5)
model_output



tensor([[59542,   276, 16018,   869, 11479,     2,     0, 59542, 59542],
        [59542,   276,    42,    88,  3795,  2491,  6437,     2,     0]])

In [20]:
tokenizer.batch_decode(model_output, skip_special_tokens=True)

['Sunt colonist francez.', 'Sunt un dobitoc american.']