APIs Needed

In [1]:
# Install the third-party packages this notebook imports (shell escape, runs pip)
!pip install transformers sentencepiece datasets

[0m

Imports

In [21]:
# Hugging face transformers to download pretrained model and tokenizer
import transformers
# Hugging face datasets to download the dataset
import datasets
# Pytorch for tensor
import torch
# For plotting the graph
import matplotlib.pyplot as plt
# Basic arithmetic operations
import numpy as np
# To show the progress bar
import tqdm
# For data handling
import itertools
# For turning parallelism on and off
import os
# Specific functions from the libraries
from datasets import load_dataset
from transformers import AdamW, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import get_linear_schedule_with_warmup

# Model
Here we download the pre-trained model and tokenizer

In [2]:
# Identifier of the pretrained mT5 checkpoint to start from
model_repo = 'google/mt5-base'
# Fetch the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_repo)
# Fetch the seq2seq model weights and place the model on the GPU in one go
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo).cuda()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Dataset
Here we will be defining the dataset and downloading it

In [3]:
# Load the 'alt' dataset through Hugging Face datasets
dataset = load_dataset('alt')
# Pull the train / validation / test splits out of the returned mapping
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

Adding the special tokens

In [4]:
# Map every supported language code to its '<code>' control token
LANG_TOKEN_MAPPING = {
    code: f'<{code}>' for code in ('en', 'fil', 'hi', 'id', 'ja')
}
# Wrap the tokens in the dict layout expected by tokenizer.add_special_tokens
special_tokens = {'additional_special_tokens': [*LANG_TOKEN_MAPPING.values()]}
# Register the language tokens with the tokenizer
tokenizer.add_special_tokens(special_tokens)
# Resize the embedding layer so it matches the enlarged tokenizer vocabulary
model.resize_token_embeddings(len(tokenizer))

Embedding(250105, 768)

# Data Handling
Functions to handle the data

In [5]:
# turns a source sentence into fixed-length token ids, tagged with the target language
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  """Encode `text`, prefixed with the token of `target_lang`, as token ids.

  Args:
    text: source sentence to encode.
    target_lang: key of `lang_token_map` naming the language to translate into.
    tokenizer: object whose `encode` method performs the tokenization.
    seq_len: value passed as `max_length`; the sequence is padded to it
      (padding='max_length') and truncated to it (truncation=True).
    lang_token_map: mapping from language code to its special token string.

  Returns:
    The first row (`[0]`) of what `tokenizer.encode` returns when asked for
    PyTorch tensors (return_tensors='pt').
  """
  # The language tag goes directly in front of the sentence, no separator
  tagged_text = lang_token_map[target_lang] + text

  encode_options = dict(
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  encoded = tokenizer.encode(text = tagged_text, **encode_options)

  return encoded[0]

# turns a target sentence into fixed-length token ids
def encode_target_str(text, tokenizer, seq_len):
  """Encode `text` as token ids padded/truncated to `seq_len`.

  Args:
    text: target sentence to encode.
    tokenizer: object whose `encode` method performs the tokenization.
    seq_len: value passed as `max_length` (padding='max_length', truncation=True).

  Returns:
    The first row (`[0]`) of what `tokenizer.encode` returns when asked for
    PyTorch tensors (return_tensors='pt').
  """
  encode_options = dict(
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  encoded = tokenizer.encode(text = text, **encode_options)

  return encoded[0]

# get all translations between all permutations of pairs of languages
def get_all_translation_data(translations, lang_token_map,
                            tokenizer, seq_length=20):
  """Encode one parallel example for every ordered (source, target) language pair.

  Args:
    translations: mapping from language code to the sentence in that language;
      a value of None marks a missing sentence.
    lang_token_map: mapping from language code to its special token; its keys
      decide which languages are paired up.
    tokenizer: tokenizer forwarded to encode_input_str / encode_target_str.
    seq_length: fixed length every encoded sequence is padded/truncated to.

  Returns:
    (input_ids, target_ids): two parallel lists with one entry per ordered
    language pair, or (None, None) as soon as a pair has a missing sentence.
  """
  input_ids = []
  target_ids = []
  # The encoded target depends only on the target sentence, not on the source
  # language, so each target is encoded once and reused across its pairs.
  encoded_targets = {}

  langs = list(lang_token_map.keys())
  for input_lang, target_lang in itertools.permutations(langs, 2):
    input_text = translations[input_lang]
    target_text = translations[target_lang]

    # One missing sentence invalidates the whole example
    if input_text is None or target_text is None:
      return None, None

    input_ids.append(encode_input_str(input_text, target_lang, tokenizer,
                                      seq_length, lang_token_map))

    if target_lang not in encoded_targets:
      encoded_targets[target_lang] = encode_target_str(target_text, tokenizer,
                                                       seq_length)
    target_ids.append(encoded_targets[target_lang])

  return input_ids, target_ids

# builds a shuffling DataLoader over all language-pair encodings of a dataset
def get_full_dataloader(dataset, lang_token_map, tokenizer, batch_size=32, num_workers=8):
    """Encode every example of `dataset` for all language pairs and wrap it in a DataLoader.

    Args:
        dataset: dataset whose 'translation' column holds, per example, a
            mapping from language code to sentence.
        lang_token_map: mapping from language code to its special token.
        tokenizer: tokenizer forwarded to get_all_translation_data.
        batch_size: number of samples per batch.
        num_workers: number of DataLoader worker processes.

    Returns:
        A torch DataLoader (shuffle=True) over dicts with the keys
        'input_ids' and 'target_ids'.
    """
    data = []
    # Read from the dataset that was passed in, not from a notebook global
    for example in dataset['translation']:
        # get translations for all permutations of languages
        input_ids, target_ids = get_all_translation_data(example, lang_token_map, tokenizer)
        # examples with a missing translation come back as (None, None)
        if input_ids is None or target_ids is None:
            continue
        # extend in place: rebuilding the list on every example is quadratic
        data.extend(
            {'input_ids': input_id, 'target_ids': target_id}
            for input_id, target_id in zip(input_ids, target_ids))
    # load dataset into a dataloader
    return torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True,
                                       num_workers=num_workers)

In [6]:
# turns a source sentence into fixed-length token ids, tagged with the target language
# NOTE(review): re-declares the function of the same name from the previous cell
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  """Encode `text`, prefixed with the token of `target_lang`, as token ids.

  Args:
    text: source sentence to encode.
    target_lang: key of `lang_token_map` naming the language to translate into.
    tokenizer: object whose `encode` method performs the tokenization.
    seq_len: value passed as `max_length`; the sequence is padded to it
      (padding='max_length') and truncated to it (truncation=True).
    lang_token_map: mapping from language code to its special token string.

  Returns:
    The first row (`[0]`) of what `tokenizer.encode` returns when asked for
    PyTorch tensors (return_tensors='pt').
  """
  # The language tag goes directly in front of the sentence, no separator
  tagged_text = lang_token_map[target_lang] + text

  encode_options = dict(
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  encoded = tokenizer.encode(text = tagged_text, **encode_options)

  return encoded[0]

# turns a target sentence into fixed-length token ids
# NOTE(review): re-declares the function of the same name from the previous cell
def encode_target_str(text, tokenizer, seq_len):
  """Encode `text` as token ids padded/truncated to `seq_len`.

  Args:
    text: target sentence to encode.
    tokenizer: object whose `encode` method performs the tokenization.
    seq_len: value passed as `max_length` (padding='max_length', truncation=True).

  Returns:
    The first row (`[0]`) of what `tokenizer.encode` returns when asked for
    PyTorch tensors (return_tensors='pt').
  """
  encode_options = dict(
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  encoded = tokenizer.encode(text = text, **encode_options)

  return encoded[0]

# encodes ONE randomly drawn language pair per example
# (get_all_translation_data is the variant that covers every ordered pair)
def format_translation_data(translations, lang_token_map,
                            tokenizer, seq_length=20):
  """Encode a single example for one random (source, target) language pair.

  Args:
    translations: mapping from language code to the sentence in that language;
      a value of None marks a missing sentence.
    lang_token_map: mapping from language code to its special token; its keys
      are the languages the pair is drawn from.
    tokenizer: tokenizer forwarded to encode_input_str / encode_target_str.
    seq_length: fixed length both encoded sequences are padded/truncated to.

  Returns:
    (input_ids, target_ids) for the drawn pair, or (None, None) when either
    sentence of the pair is missing.
  """
  candidate_langs = list(lang_token_map.keys())
  # Two distinct languages drawn without replacement: source first, target second
  input_lang, target_lang = np.random.choice(candidate_langs, size=2, replace=False)
  source_text, reference_text = translations[input_lang], translations[target_lang]

  if source_text is not None and reference_text is not None:
    encoded_source = encode_input_str(source_text, target_lang, tokenizer,
                                      seq_length, lang_token_map)
    encoded_reference = encode_target_str(reference_text, tokenizer, seq_length)
    return encoded_source, encoded_reference

  return None, None

# gets a random batch of translations b/w two languages
def transform_batch(batch, lang_token_map, tokenizer, seq_length=20):
  """Encode a batch of examples, one random language pair each, onto the GPU.

  Args:
    batch: mapping whose 'translation' entry is an iterable of per-example
      translation mappings (language code -> sentence).
    lang_token_map: mapping from language code to its special token.
    tokenizer: tokenizer forwarded to format_translation_data.
    seq_length: fixed length every encoded sequence is padded/truncated to.

  Returns:
    (input_ids, target_ids): two stacked tensors moved to the GPU with
    .cuda(), one row per example that had both sentences available.
  """
  input_ids = []
  target_ids = []

  for example in batch['translation']:
    # Forward seq_length so the parameter actually controls the encoded length
    input_id, target_id = format_translation_data(
        example, lang_token_map, tokenizer, seq_length)

    # (None, None) marks an example with a missing sentence: leave it out
    if input_id is not None:
      input_ids.append(input_id)
      target_ids.append(target_id)

  input_ids = torch.stack(input_ids).cuda()
  target_ids = torch.stack(target_ids).cuda()

  return input_ids, target_ids

# lazily yields encoded batches from a shuffled copy of the dataset
def get_data_generator(dataset, lang_token_map, tokenizer, batch_size=32):
  """Yield transform_batch(...) results for consecutive slices of the shuffled dataset.

  Args:
    dataset: object providing shuffle(), len() and slicing.
    lang_token_map: mapping from language code to its special token.
    tokenizer: tokenizer forwarded to transform_batch.
    batch_size: number of examples per slice (the last slice may be shorter).

  Yields:
    Whatever transform_batch returns for each slice.
  """
  shuffled = dataset.shuffle()
  batch_starts = range(0, len(shuffled), batch_size)

  yield from (
      transform_batch(shuffled[start:start + batch_size], lang_token_map, tokenizer)
      for start in batch_starts)

def get_dataloader(dataset, lang_token_map, tokenizer, batch_size=32):
  """Encode `dataset` with random language pairs and wrap it in a DataLoader.

  Args:
    dataset: Hugging Face dataset with a 'translation' column.
    lang_token_map: mapping from language code to its special token.
    tokenizer: tokenizer forwarded to transform_batch.
    batch_size: number of samples per batch.

  Returns:
    A torch DataLoader over the columns 'input_ids' and 'target_ids'.
  """
  def _encode_columns(batch):
    # Dataset.map needs a dict of columns back, not a tuple of tensors
    input_ids, target_ids = transform_batch(batch, lang_token_map, tokenizer)
    # transform_batch returns GPU tensors; hand plain CPU lists to the dataset
    return {'input_ids': input_ids.cpu().tolist(),
            'target_ids': target_ids.cpu().tolist()}

  dataset = dataset.shuffle()
  # transform_batch can drop examples, so the original columns are removed to
  # let the new columns have a different number of rows
  dataset = dataset.map(_encode_columns, batched=True,
                        remove_columns=dataset.column_names)
  dataset.set_format(type='torch', columns=['input_ids', 'target_ids'])
  data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

  return data_loader



# Training

Evaluation function for validation

In [7]:
def eval_model(model, gdataset, max_iters=8):
  """Average the model loss over at most `max_iters` random batches of `gdataset`.

  Uses the notebook-level names `tokenizer`, `batch_size` and
  LANG_TOKEN_MAPPING, which must be defined before this is called.

  Args:
    model: seq2seq model whose output exposes `.loss` when given labels.
    gdataset: dataset handed to get_data_generator.
    max_iters: maximum number of batches to evaluate.

  Returns:
    Mean of the per-batch losses, or NaN when no batch was evaluated.
  """
  test_generator = get_data_generator(gdataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)
  pad_id = tokenizer.pad_token_id
  eval_losses = []

  # Evaluate in eval mode and without autograd bookkeeping, then put the
  # model back into whatever mode the caller had it in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      # islice stops after max_iters batches without building an unused one
      for input_batch, label_batch in itertools.islice(test_generator,
                                                       max(0, max_iters)):
        # Keep padding out of both attention and loss; -100 is the label
        # value the Hugging Face loss ignores
        attention_mask = input_batch.ne(pad_id).long()
        labels = label_batch.masked_fill(label_batch == pad_id, -100)

        model_out = model(
            input_ids = input_batch,
            attention_mask = attention_mask,
            labels = labels)
        eval_losses.append(model_out.loss.item())
  finally:
    model.train(was_training)

  # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead
  return np.mean(eval_losses) if eval_losses else float('nan')

In [17]:
# Hyperparameters
EPOCHS = 5
batch_size = 64
learning_rate = 5e-3
# Each example is expanded into one sample per ordered (source, target)
# language pair, i.e. n * (n - 1) samples for n languages
n_lang_pairs = len(LANG_TOKEN_MAPPING) * (len(LANG_TOKEN_MAPPING) - 1)
# Batches per epoch as an int (np.ceil alone returns a float). Examples with a
# missing translation are skipped when loading, so this is an upper bound.
n_batches = int(np.ceil(len(train_dataset) * n_lang_pairs / batch_size))
total_steps = n_batches * EPOCHS
# Both frequencies are used as modulo divisors, so never let them reach 0
print_freq = max(1, total_steps // 100)
checkpoint_freq = max(1, total_steps // 33)
n_warmup_steps = int(0.01 * total_steps)

In [18]:
# Show the derived logging interval, checkpoint interval and warm-up step count
print(print_freq, checkpoint_freq, n_warmup_steps)

282 856 282


In [19]:
# Optimizer and scheduler
# AdamW over every model parameter, using the configured learning rate
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Linear warm-up over n_warmup_steps, scheduled against total_steps overall
schedular = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=n_warmup_steps,
    num_training_steps=total_steps)

Training the model using transfer learning

In [22]:
loss_i = []

best_loss = float("inf")

# Number of batches between loss recordings / scheduler updates
log_and_schedule_freq = 250

# Tokenize the training set once instead of once per epoch: the DataLoader is
# created with shuffle=True, so every pass over it already draws a new order.
# The variable the tokenizers library reads is TOKENIZERS_PARALLELISM (plural);
# the singular spelling is ignored. Parallelism is switched off again before
# the DataLoader worker processes are started.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
loader = get_full_dataloader(train_dataset, LANG_TOKEN_MAPPING, tokenizer, batch_size, num_workers=8)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

batches_per_epoch = len(loader)
pad_id = tokenizer.pad_token_id

for epoch in range(EPOCHS):
    # from_pretrained returns the model in eval mode; enable training mode
    # (dropout etc.) explicitly at the start of every epoch
    model.train()

    for i, batch in tqdm.tqdm(enumerate(loader), total=batches_per_epoch):
        inputs, targets = batch['input_ids'].cuda(), batch['target_ids'].cuda()
        # Keep padding out of attention and out of the loss; -100 is the label
        # value the Hugging Face loss ignores
        attention_mask = inputs.ne(pad_id).long()
        labels = targets.masked_fill(targets == pad_id, -100)

        # Zero gradients
        optimizer.zero_grad()
        # Forward pass (computes outputs and loss)
        output = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        # Back propagation (computes gradients)
        loss.backward()
        # Optimization
        optimizer.step()
        # Record the loss and advance the schedule every log_and_schedule_freq batches.
        # NOTE(review): the schedule was built from per-batch step counts
        # (n_warmup_steps / total_steps) but only advances once per
        # log_and_schedule_freq batches, so it progresses that many times
        # slower than configured. Confirm this is intended, or call
        # schedular.step() on every batch.
        if (i + 1) % log_and_schedule_freq == 0:
            loss_i.append(loss.item())
            schedular.step()
        # prints training updates
        if (i + 1) % print_freq == 0:
            print(f'Epoch: {epoch + 1}, Batch: {i+1}/{batches_per_epoch}, Loss: {loss.item()}, LR: {schedular.get_last_lr()[0]}')

        if (i + 1) % checkpoint_freq == 0:
            # Pick the best checkpoint on the validation split so the test
            # split stays untouched for the final evaluation
            model.eval()
            with torch.no_grad():
                val_loss = eval_model(model, validation_dataset)
            model.train()
            if val_loss < best_loss:
                print('Saving model with validation loss of {:.3f}'.format(val_loss))
                torch.save(model.state_dict(), 'mt5_translator_best.pt')
                best_loss = val_loss
# Save the final model
torch.save(model.state_dict(), 'mt5_translator_final2.pt')
plt.plot(loss_i)