<a href="https://colab.research.google.com/github/niklucky/ml-sandbox/blob/main/nmt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model: mt5
Attention based model

# Translation models

This is a playground where I'm playing around with with language models

## Materials

* Transformer paper: https://arxiv.org/abs/1706.03762
* mT5 paper: https://arxiv.org/abs/2010.11934
* Article on how transformers work: http://towardsdatascience.com/transformers-141e32e69591


In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

!pip install transformers sentencepiece datasets

In [55]:
from datasets import load_dataset
from google.colab import drive
from IPython.display import display
from IPython.html import widgets
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook

sns.set()

In [56]:
# Constants
model_repo = "google/mt5-small"
max_seq_len = 20

In [57]:
tokenizer = AutoTokenizer.from_pretrained(model_repo)

In [58]:
# Model description https://huggingface.co/google/mt5-small
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)
model = model.cuda()

In [59]:
input_sentence = "Here is our test sentence!"
token_ids = tokenizer.encode(input_sentence, return_tensors='pt').cuda()

token_ids

model_out = model.generate(token_ids)
print(model_out)

output_text = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(model_out[0])
)

print(output_text)

tensor([[     0, 250099,      1]], device='cuda:0')
<pad> <extra_id_0></s>


# Steps
1. Load pretrained model and tokenizer
2. Load in the dataset
3. Transform dataset into input
4. Train/finetune the model on our dataset
5. Test the model

In [60]:
example_input_str = '<jp>This is a test nguig.'
input_ids = tokenizer.encode(example_input_str, return_tensors='pt')
print('Input IDs:', input_ids)

tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(tokens)

Input IDs: tensor([[ 1042,  3889,   669, 13673,   339,   259,   262,  2978,   259,  1180,
          1315,   260,     1]])
['▁<', 'jp', '>', 'This', '▁is', '▁', 'a', '▁test', '▁', 'ngu', 'ig', '.', '</s>']


In [61]:
# sorted(tokenizer.vocab.items(), key=lambda x: x[1])

dataset = load_dataset('alt')

In [62]:
train_dataset = dataset['train']
test_dataset = dataset['test']

# To see model mapping
# train_dataset[0]

In [69]:
LANG_TOKEN_MAPPING = {
    'en': '<en>',
    'ru': '<ru>',
    'ja': '<jp>',
}

In [70]:
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 250104. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(250104, 512)

In [71]:
token_ids = tokenizer.encode(
    example_input_str,
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=max_seq_len
  )
print(token_ids)


tensor([[ 1042,  3889,   669, 13673,   339,   259,   262,  2978,   259,  1180,
          1315,   260,     1,     0,     0,     0,     0,     0,     0,     0]])


In [72]:
def encode_input_str(
    text, target_lang, tokenizer, seq_len,
    lang_token_map=LANG_TOKEN_MAPPING):
  target_lang_token = lang_token_map[target_lang]

  # Tokenize and add special tokens
  input_ids = tokenizer.encode(
    text = target_lang_token + text,
    return_tensors = 'pt',
    padding='max_length',
    truncation=True,
    max_length=seq_len
  )

  return input_ids[0]


def encode_target_str(
    text, tokenizer, seq_len,
    lang_token_map=LANG_TOKEN_MAPPING):

  token_ids = tokenizer.encode(
    text = text,
    return_tensors = 'pt',
    padding='max_length',
    truncation=True,
    max_length=seq_len)

  return token_ids[0]

def format_translation_data(translations, lang_token_map, tokenizer, seq_len=128):
  langs = list(lang_token_map.keys())

  input_lang, target_lang = np.random.choice(langs, size=2, replace=False)

  # Get the translations from the batch
  input_text = translations[input_lang]
  target_text = translations[target_lang]

  if input_text is None or target_text is None:
    return None

  input_token_ids = encode_input_str(
      input_text, target_lang, tokenizer, seq_len, lang_token_map
  )

  target_token_ids = encode_target_str(
      target_text, tokenizer, seq_len, lang_token_map
  )

  return input_token_ids, target_token_ids

def transform_batch(batch, lang_token_map, tokenizer):
  inputs = []
  targets = []

  for translation_set in batch['translation']:
    formatted_data = format_translation_data(translation_set, lang_token_map, tokenizer, max_seq_len)

    if formatted_data is None:
      continue

    input_ids, target_ids = formatted_data
    inputs.append(input_ids.unsqueeze(0))
    targets.append(target_ids.unsqueeze(0))

  batch_input_ids = torch.cat(inputs).cuda()
  batch_target_ids = torch.cat(targets).cuda()

  return batch_input_ids, batch_target_ids

def get_data_generator(dataset, lang_token_map, tokenizer, batch_size=32):
  dataset = dataset.shuffle()
  for i in range(0, len(dataset), batch_size):
    raw_batch = dataset[i:i+batch_size]
    yield transform_batch(raw_batch, lang_token_map, tokenizer)

In [73]:
in_ids, out_ids = format_translation_data(train_dataset[0]['translation'], LANG_TOKEN_MAPPING, tokenizer)

print(' '.join(tokenizer.convert_ids_to_tokens(in_ids)))
print(' '.join(tokenizer.convert_ids_to_tokens(out_ids)))

data_gen = get_data_generator(train_dataset, LANG_TOKEN_MAPPING, tokenizer, 8)
data_batch = next(data_gen)

print("Input shape: ", data_batch[0].shape)
print("Output shape: ", data_batch[1].shape)

KeyError: ignored

In [None]:
n_epochs = 5
batch_size = 16
print_freq = 50
lr = 5e-4
n_batches = int(np.ceil(len(train_dataset) / batch_size))
total_steps = n_epochs * n_batches

n_warmup_steps = int(total_steps * 0.01)

In [None]:
# Optimizer mt5

optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, n_warmup_steps, total_steps)


In [None]:
losses = []

In [None]:
def eval_model(model, gdataset, max_iters=8):
  test_generator = get_data_generator(gdataset, LANG_TOKEN_MAPPING, tokenizer, batch_size)

  eval_losses = []
  for i, (input_batch, label_batch) in enumerate(test_generator):
    if i >= max_iters:
      break

    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch
    )
    eval_losses.append(model_out.loss.item())

  return np.mean(eval_losses)

In [None]:
test_loss = eval_model(model, test_dataset)

In [None]:
test_loss

In [None]:
for epoch_idx in range(n_epochs):
  # Randomize data order
  data_generator = get_data_generator(train_dataset, LANG_TOKEN_MAPPING, tokenizer, batch_size)

  for batch_idx, (input_batch, label_batch) in tqdm_notebook(enumerate(data_generator), total=n_batches):

    optimizer.zero_grad()

    # Forward pass
    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)

    loss = model_out.loss
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Print training update info
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {}'.format(
          epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]
      ))

In [78]:
# @title Translation form
input_text = "This is a test" # @param {type:"string"}
output_language = "ja" # @param ["en", "uk", "ja"]

input_ids = encode_input_str(
    text = input_text,
    target_lang = output_language,
    tokenizer = tokenizer,
    seq_len = model.config.max_length,
    lang_token_map = LANG_TOKEN_MAPPING)

input_ids = input_ids.unsqueeze(0).cuda()

output_tokens = model.generate(input_ids, num_beams=20, length_penalty=0.2)

print(input_text + ' -> ' + tokenizer.decode(output_tokens[0], skip_special_tokens=True))





KeyError: ignored