In [2]:
import transformers
import sentencepiece
import datasets
import torch
import matplotlib.pyplot as plt
import numpy as np
import tqdm
import itertools

from datasets import load_dataset
from transformers import AdamW, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import get_linear_schedule_with_warmup

Load the model and tokenizer

In [3]:
# Hugging Face Hub id of the pretrained checkpoint used throughout the notebook.
model_repo = 'google/mt5-small'
# download mt5 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_repo)
# download model
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)
# Put the model on the GPU when one is available; otherwise stay on the CPU so
# the notebook still runs on machines without CUDA (a bare `model.cuda()`
# raises there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Load the dataset

In [4]:
# Load the 'alt' dataset and unpack its train / validation / test splits.
dataset = load_dataset('alt')
train_dataset, validation_dataset, test_dataset = (
    dataset[split] for split in ('train', 'validation', 'test')
)

In [9]:
# Batch the raw (untokenized) translation records in shuffled groups of 8.
loader = torch.utils.data.DataLoader(
    train_dataset['translation'],
    batch_size=8,
    shuffle=True,
)

In [121]:
# Peek at the first training record: a dict keyed by language code.
train_dataset['translation'][0]

{'bg': 'ফ্রান্সের প্যারিসের পার্ক দি প্রিন্সেস-এ হওয়া ২০০৭-এর রাগবি বিশ্বকাপের পুল সি-তে ইটালি পর্তুগালকে ৩১-৫ গোলে হারিয়েছে।',
 'en': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes, Paris, France.',
 'en_tok': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes , Paris , France .',
 'fil': 'Natalo ng Italya ang Portugal sa puntos na 31-5 sa Grupong C noong 2007 sa Pandaigdigang laro ng Ragbi sa Parc des Princes, Paris, France.',
 'hi': '2007 में फ़्रांस, पेरिस के पार्क डेस प्रिंसेस में हुए रग्बी विश्व कप के पूल C में इटली ने पुर्तगाल को 31-5 से हराया।',
 'id': 'Italia berhasil mengalahkan Portugal 31-5 di grup C dalam Piala Dunia Rugby 2007 di Parc des Princes, Paris, Perancis.',
 'ja': 'フランスのパリ、パルク・デ・プランスで行われた2007年ラグビーワールドカップのプールCで、イタリアは31対5でポルトガルを下した。',
 'khm': 'អ៊ីតាលីបានឈ្នះលើព័រទុយហ្គាល់ 31-5 ក្នុងប៉ូលCនៃពីធីប្រកួតពានរង្វាន់ពិភពលោកនៃកីឡាបាល់ឱបឆ្នាំ2007ដែលប្រព្រឹត្តនៅប៉ាសឌេសប្រីន ក្រុងប៉ារីស បារ

In [11]:
# Number of batches the loader yields per pass.
len(loader)

2261

In [17]:
# Keep a handle on the loader's batch sampler for inspection.
a = loader.batch_sampler

In [26]:
# Pull a single batch and print the second English sentence in it.
for batch in loader:
    print(batch['en'][1])
    break

He has rowed across the Atlantic Ocean and the North Sea, as well as cycled across Russia and the icy terrain of Greenland.


In [23]:
# NOTE: this is the number of fields in the collated batch (14 in the recorded
# output), not the batch size of 8.
len(batch)

14

In [30]:
# Languages used in this notebook, each mapped to the control token that is
# prepended to an input to name the language to translate into.
LANG_TOKEN_MAPPING = {code: f'<{code}>' for code in ('en', 'fil', 'hi', 'id', 'ja')}

# Wrap the language tokens in the keyword structure `add_special_tokens` expects.
special_tokens = dict(additional_special_tokens=list(LANG_TOKEN_MAPPING.values()))
# Register the language tokens with the tokenizer.
tokenizer.add_special_tokens(special_tokens)
# Resize the model's token embeddings to match the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

Embedding(250105, 512)

In [29]:
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map):
  """Encode a source sentence, prefixed with its target-language token.

  Args:
    text: Source sentence.
    target_lang: Key into ``lang_token_map`` naming the output language.
    tokenizer: Object whose ``encode`` accepts the keyword arguments used here.
    seq_len: Length each sequence is padded or truncated to.
    lang_token_map: Maps language codes to their control tokens.

  Returns:
    The first row of the batch returned by ``tokenizer.encode``.

  Raises:
    KeyError: If ``target_lang`` is not a key of ``lang_token_map``.
  """
  # The control token goes in front so the model sees the requested language first.
  prefixed_text = lang_token_map[target_lang] + text
  encode_kwargs = dict(
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=seq_len)
  # `encode` returns a batch of one sequence here; hand back that single row.
  return tokenizer.encode(text=prefixed_text, **encode_kwargs)[0]

def encode_target_str(text, tokenizer, seq_len):
  """Encode a target sentence to a fixed-length sequence of token ids.

  Args:
    text: Target sentence.
    tokenizer: Object whose ``encode`` accepts the keyword arguments used here.
    seq_len: Length each sequence is padded or truncated to.

  Returns:
    The first row of the batch returned by ``tokenizer.encode``.
  """
  encoded_batch = tokenizer.encode(
      text=text, return_tensors='pt', padding='max_length',
      truncation=True, max_length=seq_len)
  # Only one sentence was encoded, so the batch has a single row of interest.
  first_row = encoded_batch[0]
  return first_row

def get_all_translation_data(translations, lang_token_map,
                            tokenizer, seq_length=20):
  """Encode one example for every ordered pair of distinct languages.

  Args:
    translations: Mapping from language code to that language's sentence.
    lang_token_map: Maps language codes to control tokens; its keys decide
      which languages are paired.
    tokenizer: Tokenizer handed through to the encode helpers.
    seq_length: Length each encoded sequence is padded or truncated to.

  Returns:
    ``(input_ids, target_ids)``, two parallel lists with one entry per ordered
    language pair, or ``(None, None)`` as soon as a sentence needed for a pair
    is ``None``.
  """
  input_ids, target_ids = [], []

  for src_lang, tgt_lang in itertools.permutations(list(lang_token_map), 2):
    src_text, tgt_text = translations[src_lang], translations[tgt_lang]

    # One missing side invalidates the whole example, not just this pair.
    if src_text is None or tgt_text is None:
      return None, None

    input_ids.append(
        encode_input_str(src_text, tgt_lang, tokenizer, seq_length, lang_token_map))
    target_ids.append(encode_target_str(tgt_text, tokenizer, seq_length))

  return input_ids, target_ids

In [31]:
# Smoke-test the encoder on the first training record: `a` receives the encoded
# inputs and `b` the encoded targets.
a, b = get_all_translation_data(train_dataset['translation'][0], LANG_TOKEN_MAPPING, tokenizer)

In [37]:
# Number of fields in a single translation record.
print(len(train_dataset['translation'][0]))

14


In [35]:
# Check the type of one encoded input.
type(a[2])

torch.Tensor

In [None]:
def get_dataloader(batch_size=8, num_workers=0):
    """Build a shuffled DataLoader over every encoded translation pair.

    Walks the training split, encodes each example for every ordered language
    pair in ``LANG_TOKEN_MAPPING``, and collects one
    ``{'input_ids': ..., 'target_ids': ...}`` record per pair.

    Args:
        batch_size: Number of records per batch.
        num_workers: Worker processes used by the DataLoader; 0 loads in the
            main process.

    Returns:
        A ``torch.utils.data.DataLoader`` over the collected records.
    """
    data = []
    for example in train_dataset['translation']:
        input_ids, target_ids = get_all_translation_data(
            example, LANG_TOKEN_MAPPING, tokenizer)
        # Examples with a missing translation come back as (None, None); skip them.
        if input_ids is None or target_ids is None:
            continue
        # extend() grows the list in place; rebuilding it with `data + ...`
        # copies every record collected so far on each example.
        data.extend(
            {'input_ids': src, 'target_ids': tgt}
            for src, tgt in zip(input_ids, target_ids))

    return torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=True, num_workers=num_workers)

In [119]:
# Flatten the training split into one {'input_ids', 'target_ids'} record per
# ordered language pair, then wrap the records in a DataLoader.
dataset = train_dataset['translation']

data = []

for example in dataset:
    input_id, target_id = get_all_translation_data(example, LANG_TOKEN_MAPPING, tokenizer)
    # Examples with a missing translation come back as (None, None); skip them.
    if input_id is None or target_id is None:
        continue
    list_of_dicts = [{'input_ids': x, 'target_ids': y} for x, y in zip(input_id, target_id)]
    # Grow the list in place; `data = data + list_of_dicts` copies every record
    # collected so far on each iteration, which is quadratic over the dataset.
    data.extend(list_of_dicts)

# Build the loader once, after all records are collected, rather than
# re-creating it on every example.
loader = torch.utils.data.DataLoader(data, batch_size=8, shuffle=True)
    

KeyboardInterrupt: 

In [120]:
# Show only the first few records; displaying the whole list floods the output.
data[:3]

[{'input_ids': tensor([250100,  20161,    783,    269,  62956,    345,  15772,    381,  62482,
             281,  23577,    371,    304,    287,   1848,    259,  93887,   4025,
           10291,      1]),
  'target_ids': tensor([33756,   268,   594, 20161,   262,   740, 15772,   327,  8218,   263,
            294,   381, 62482,   327,   259, 37503,   370,   371,   375,     1])},
 {'input_ids': tensor([250102,  20161,    783,    269,  62956,    345,  15772,    381,  62482,
             281,  23577,    371,    304,    287,   1848,    259,  93887,   4025,
           10291,      1]),
  'target_ids': tensor([  1848,    844,  46739,  81794,  25488,    261,  21799, 102577,    641,
           78910,   1650,  31417,   2312,    259, 132205,  25488,  48242,    844,
            2948,      1])},
 {'input_ids': tensor([250103,  20161,    783,    269,  62956,    345,  15772,    381,  62482,
             281,  23577,    371,    304,    287,   1848,    259,  93887,   4025,
           10291,      1]),
 

In [122]:
# Every example yields one record per ordered pair of distinct languages.
# Examples with a missing translation are skipped entirely, so `actual` can
# fall short of the expected figure (and far short of it if the loop that
# builds `data` was interrupted).
n_langs = len(LANG_TOKEN_MAPPING)
pairs_per_example = n_langs * (n_langs - 1)
actual = len(data)
print(f"Actual Size: {actual} | Expected Size: {len(train_dataset) * pairs_per_example}")

Actual Size: 38100 | Expected Size: 361760


In [104]:
# Scratch: a single dict holding the whole input/target lists, rather than one
# dict per pair. Relies on `input_id` / `target_id` left over from an earlier cell.
list_of_dicts = [{"input_ids": input_id, "target_ids": target_id}]

In [66]:
# Compare the number of encoded inputs with the number of encoded targets.
len(input_id), len(target_id)

(20, 20)

In [75]:
# Pair each encoded input with its encoded target (lazy iterator).
a = zip(input_id, target_id)

In [77]:
# Print every (input, target) pair; this exhausts the zip iterator `a`.
for i in a:
    print(i)

(tensor([250100,   1669,    339,   1607,  37979,    345,    304,    259, 118703,
           347,    259,    262,  20974,    351,   8257,   9713,    261,    702,
          6174,      1]), tensor([ 47174,   1099,    281, 196962,   2192,    779,    594,   1961,  83660,
         62176,    594,    259,   4435,   7334,    444,    375,   2836, 146179,
         36294,      1]))
(tensor([250102,   1669,    339,   1607,  37979,    345,    304,    259, 118703,
           347,    259,    262,  20974,    351,   8257,   9713,    261,    702,
          6174,      1]), tensor([  259,  7813,  1587,  4736,  9670,  6606,   259, 57921, 74078, 62621,
        16599, 80224,  1162, 15478, 32065,  8674,   975,   259, 30830,     1]))
(tensor([250103,   1669,    339,   1607,  37979,    345,    304,    259, 118703,
           347,    259,    262,  20974,    351,   8257,   9713,    261,    702,
          6174,      1]), tensor([  6563,   1639,    301,  75111,   3809,   3617,  18968,    321,  91526,
          1404,

In [131]:
# Scratch arithmetic: presumably the number of batches for 361760 records at
# batch size 32 — TODO confirm the intended batch size.
361760/32

11305.0

In [123]:
# data loader test: batch the encoded records in shuffled groups of 8
loader = torch.utils.data.DataLoader(data, batch_size=8, shuffle=True)

In [107]:
# Number of batches in the loader over the encoded records.
print(len(loader))

45205


In [128]:
# Take the first example of one batch for a decoding sanity check.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the kernel session.
for batch in loader:
    input = batch['input_ids'][0]
    target = batch['target_ids'][0]
    break

In [130]:
# Map the ids back to token strings to eyeball the language tag and the text.
print(' '.join(tokenizer.convert_ids_to_tokens(input)))
print(' '.join(tokenizer.convert_ids_to_tokens(target)))

<en> ▁खो ज ▁क्षेत्र ▁का ▁ विस् तार ▁राज्य ▁सी मा ▁पर ▁पा स्क ाग ौ ला , ▁मि </s>
▁The ▁search ▁area ▁has ▁been ▁expand ed ▁ across ▁the ▁state ▁ border ▁to wards ▁Pas ca go ula </s>


In [117]:
# Check which container type indexing the split by 'translation' returns.
type(train_dataset['translation'])

list

4