In [None]:
# Install essential libraries for NLP tasks
!pip install transformers sentencepiece datasets

In [None]:
# Import necessary libraries for data loading, visualization, deep learning, and
# NLP model handling
from datasets import load_dataset
from google.colab import drive
from IPython.display import display
from IPython.html import widgets
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook

# Call seaborn's set() with no arguments so its default styling is used for
# plots drawn later in the notebook
sns.set()

In [None]:
# Define the model repository
model_repo = 'google/mt5-small'
# Read the sequence length from the checkpoint's configuration instead of from
# `model`: the model object is only created in a later cell, so referencing it
# here raises NameError on a fresh kernel (Restart & Run All).
max_seq_len = AutoConfig.from_pretrained(model_repo).max_length

In [None]:
# Load the tokenizer that belongs to the checkpoint named by `model_repo`
tokenizer = AutoTokenizer.from_pretrained(model_repo)

In [None]:
# Load the pre-trained sequence-to-sequence checkpoint and place it on the GPU
# in one expression (`.cuda()` returns the module it was called on)
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo).cuda()

In [None]:
input_sent = 'Here is our test sentence!'
# Encode the input sentence into token IDs and move them to the GPU
token_ids = tokenizer.encode(input_sent, return_tensors='pt').cuda()
# A bare expression is only echoed when it is the last line of a cell, so the
# IDs are printed explicitly to make this inspection step visible
print(token_ids)

# Generate model output
model_out = model.generate(token_ids)
print(model_out)

# Convert the generated token IDs back to text
output_text = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(model_out[0]))
print(output_text)

In [None]:
# Encode an example input string (prefixed with a '<sl>' tag) into token IDs,
# then map the IDs of the first (only) row back to tokens to inspect how the
# string was split
example_input_str = '<sl>This is a test nbuig.'
input_ids = tokenizer.encode(example_input_str, return_tensors='pt')
print('Input IDs: ', input_ids)

tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print('Tokens: ', tokens)

In [None]:
# Show the start of the tokenizer's vocabulary ordered by token ID.
# get_vocab() is the accessor shared by slow and fast tokenizers, and the slice
# keeps the cell from dumping the whole vocabulary into the notebook output.
sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])[:100]

In [None]:
# Load the 'en-sl' (English-Slovenian) configuration of the
# 'Helsinki-NLP/opus-100' dataset
dataset = load_dataset('Helsinki-NLP/opus-100', 'en-sl')

In [None]:
# Pull the 'train' and 'test' splits out of the loaded dataset
train_dataset, test_dataset = dataset['train'], dataset['test']

In [None]:
# Evaluate the training split as the cell's last expression so the notebook
# displays its repr
train_dataset

In [None]:
# Evaluate the test split as the cell's last expression so the notebook
# displays its repr
test_dataset

In [None]:
# Display the first record (index 0) of the training split to inspect the
# structure of a single data entry
train_dataset[0]

In [None]:
# Language code -> special tag token that represents that language
LANG_TOKEN_MAPPING = dict(en='<en>', sl='<sl>')

In [None]:
# Register the language tags as additional special tokens, then resize the
# model's token embeddings to the tokenizer's new length
special_tokens_dict = dict(
    additional_special_tokens=[*LANG_TOKEN_MAPPING.values()])
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Encode the example string again, this time padded/truncated to exactly
# `max_seq_len` positions and returned as a PyTorch tensor
token_ids = tokenizer.encode(
    example_input_str,
    max_length=max_seq_len,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
print(token_ids)

In [None]:
# Encode source text for the model: the target-language tag is prepended and
# the result is padded/truncated to a fixed length
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  """Encode `text`, prefixed with the tag of `target_lang`, into token IDs.

  Args:
    text: Source string to encode.
    target_lang: Key into `lang_token_map` selecting the tag to prepend.
    tokenizer: Object whose `encode` method performs the tokenization.
    seq_len: Forwarded as `max_length`, together with `padding='max_length'`
      and `truncation=True`.
    lang_token_map: Mapping from language code to tag string.

  Returns:
    The first row of the tensor returned by `tokenizer.encode`.
  """
  prefixed_text = lang_token_map[target_lang] + text

  encoded = tokenizer.encode(
      text=prefixed_text,
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=seq_len)
  return encoded[0]

# Encode target text into token IDs, padded/truncated to a fixed length
def encode_target_str(text, tokenizer, seq_len,
                      lang_token_map=LANG_TOKEN_MAPPING):
  """Encode the target string `text` into token IDs.

  Args:
    text: Target string to encode (no language tag is added).
    tokenizer: Object whose `encode` method performs the tokenization.
    seq_len: Forwarded as `max_length`, together with `padding='max_length'`
      and `truncation=True`.
    lang_token_map: Accepted for signature parity with `encode_input_str`;
      not used by this function.

  Returns:
    The first row of the tensor returned by `tokenizer.encode`.
  """
  return tokenizer.encode(
      text=text,
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=seq_len)[0]

# Turn one translation record into an (input IDs, target IDs) pair, picking
# the translation direction at random
def format_translation_data(translations, lang_token_map,
                            tokenizer, seq_len=128):
  """Encode one translation record as a randomly directed training pair.

  Two distinct language codes are drawn from `lang_token_map` with
  `np.random.choice`; the first is the input language, the second the target.

  Args:
    translations: Mapping from language code to text for one example.
    lang_token_map: Mapping from language code to tag string.
    tokenizer: Tokenizer handed to the encode helpers.
    seq_len: Fixed sequence length handed to the encode helpers.

  Returns:
    `(input_token_ids, target_token_ids)`, or `None` when either of the two
    selected texts is `None`.
  """
  lang_codes = list(lang_token_map)
  src_lang, tgt_lang = np.random.choice(lang_codes, size=2, replace=False)

  # Look up both sides of the pair before validating them
  src_text, tgt_text = translations[src_lang], translations[tgt_lang]

  if src_text is not None and tgt_text is not None:
    return (
        encode_input_str(src_text, tgt_lang, tokenizer, seq_len, lang_token_map),
        encode_target_str(tgt_text, tokenizer, seq_len, lang_token_map),
    )

  return None

# Process a batch of translation data by formatting and encoding each
# translation set, stacking the encoded inputs and targets into tensors, and
# moving them to the GPU
def transform_batch(batch, lang_token_map, tokenizer):
  """Encode a raw dataset batch into model-ready input and label tensors.

  Every entry of `batch['translation']` is passed to
  `format_translation_data` using the module-level `max_seq_len`; entries for
  which it returns `None` are skipped.

  Args:
    batch: Mapping with a 'translation' entry holding the translation records.
    lang_token_map: Mapping from language code to tag string.
    tokenizer: Tokenizer used for encoding; its `pad_token_id` identifies the
      padded label positions.

  Returns:
    `(batch_input_ids, batch_target_ids)`, both stacked along a new leading
    dimension and moved to the GPU. Padded positions in the targets are set
    to -100.
  """
  inputs = []
  targets = []
  for translation_set in batch['translation']:
    formatted_data = format_translation_data(
        translation_set, lang_token_map, tokenizer, max_seq_len)

    if formatted_data is None:
      continue

    input_ids, target_ids = formatted_data
    inputs.append(input_ids)
    targets.append(target_ids)

  # torch.stack adds the leading batch dimension (same result as
  # unsqueeze(0) on every row followed by torch.cat)
  batch_input_ids = torch.stack(inputs).cuda()
  batch_target_ids = torch.stack(targets).cuda()

  # The targets are only consumed as `labels`. Label positions equal to -100
  # are ignored by the model's loss, so padding no longer contributes to (and
  # deflates) the reported loss.
  batch_target_ids = batch_target_ids.masked_fill(
      batch_target_ids == tokenizer.pad_token_id, -100)

  return batch_input_ids, batch_target_ids

# Lazily yield encoded batches from a freshly shuffled copy of the dataset;
# every slice of `batch_size` rows goes through `transform_batch`
def get_data_generator(dataset, lang_token_map, tokenizer, batch_size=32):
  """Yield the result of `transform_batch` for consecutive dataset slices.

  Args:
    dataset: Object providing `shuffle()`, `len()` and slice indexing.
    lang_token_map: Mapping from language code to tag string.
    tokenizer: Tokenizer forwarded to `transform_batch`.
    batch_size: Number of rows per slice; the last slice may be shorter.

  Yields:
    Whatever `transform_batch` returns for each slice.
  """
  shuffled = dataset.shuffle()
  n_rows = len(shuffled)
  yield from (
      transform_batch(
          shuffled[start:start + batch_size], lang_token_map, tokenizer)
      for start in range(0, n_rows, batch_size))

In [None]:
# Encode a single training record and show both token sequences, one per line
in_ids, out_ids = format_translation_data(
    train_dataset[1]['translation'], LANG_TOKEN_MAPPING, tokenizer)

print(
    ' '.join(tokenizer.convert_ids_to_tokens(in_ids)),
    ' '.join(tokenizer.convert_ids_to_tokens(out_ids)),
    sep='\n')

# Draw one batch of 8 from the generator and check the tensor shapes
data_gen = get_data_generator(
    train_dataset, LANG_TOKEN_MAPPING, tokenizer, batch_size=8)
data_batch = next(data_gen)

print('Input shape:', data_batch[0].shape)
print('Output shape:', data_batch[1].shape)

In [None]:
# Training hyperparameters
n_epochs, batch_size, print_freq = 5, 16, 50
lr = 5e-4

# Batches per epoch: ceiling division, so a final partial batch is counted
n_batches = -(-len(train_dataset) // batch_size)
total_steps = n_epochs * n_batches
# Warm up over the first 1% of all optimisation steps
n_warmup_steps = int(total_steps * 0.01)

In [None]:
# Use PyTorch's AdamW (`optim` is torch.optim): the AdamW class shipped with
# transformers is deprecated in favour of it. eps and weight_decay are passed
# explicitly to keep the default values of the transformers implementation
# (torch's own defaults differ).
optimizer = optim.AdamW(
    model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# Linear warmup for `n_warmup_steps` steps, then linear decay over the
# remaining steps up to `total_steps`
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=n_warmup_steps,
    num_training_steps=total_steps)

In [None]:
# Running record of per-batch training losses; the training loop appends one
# value per optimisation step
losses = []

In [None]:
def eval_model(model, gdataset, max_iters=8):
  """Return the mean loss of `model` over at most `max_iters` batches.

  Batches come from `get_data_generator`, using the module-level
  `LANG_TOKEN_MAPPING`, `tokenizer` and `batch_size`.

  Args:
    model: Model called with `input_ids` and `labels`; its output must expose
      a `.loss` tensor.
    gdataset: Dataset to draw the evaluation batches from.
    max_iters: Upper bound on the number of batches evaluated.

  Returns:
    `np.mean` of the collected per-batch losses.
  """
  test_generator = get_data_generator(gdataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)
  eval_losses = []

  # Evaluate in eval mode and without building autograd graphs, then restore
  # whichever mode the caller had the model in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      # zip() with the range first stops after `max_iters` batches without
      # encoding an extra batch that would only be thrown away
      for _, (input_batch, label_batch) in zip(range(max_iters),
                                               test_generator):
        # Call the module itself rather than .forward() so hooks are honoured
        model_out = model(
            input_ids=input_batch,
            labels=label_batch)
        eval_losses.append(model_out.loss.item())
  finally:
    model.train(was_training)

  return np.mean(eval_losses)

In [None]:
# Measure the loss on the test split before the training loop below is run
test_loss = eval_model(model, test_dataset)

In [None]:
# Display the loss value computed in the previous cell
test_loss

In [None]:
# Fine-tuning loop: one pass over the reshuffled training data per epoch,
# followed by a loss estimate on the test split
for epoch_idx in range(n_epochs):

  # from_pretrained() hands the model back in evaluation mode, so training
  # mode (dropout active) has to be enabled explicitly. Doing it per epoch
  # also covers the evaluation call at the end of the previous epoch.
  model.train()

  # Randomize data order
  data_generator = get_data_generator(train_dataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)

  for batch_idx, (input_batch, label_batch) \
      in tqdm_notebook(enumerate(data_generator), total=n_batches):

    optimizer.zero_grad()

    # Forward pass (call the module, not .forward(), so hooks are honoured)
    model_out = model(
        input_ids=input_batch,
        labels=label_batch)

    # Backward pass, then one optimizer step and one scheduler step
    loss = model_out.loss
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Print training update info, averaged over the last `print_freq` steps
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {}'.format(
          epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]))

  test_loss = eval_model(model, test_dataset)
  print('Test loss of {:.3f}'.format(test_loss))