In [1]:
import torch
from transformers import MBartTokenizer, MBartConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024 ** 2
    return size_all_mb

In [3]:
torch.backends.cudnn.benchmark = True

In [9]:
from MBart import MBart

print(torch.cuda.is_available())

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")
mbart_config = MBartConfig(encoder_layers=6, decoder_layers=6,
                               encoder_ffn_dim=256, decoder_ffn_dim=256,
                               encoder_attention_heads=8, decoder_attention_heads=8,
                               d_model=256, max_length=128, vocab_size=tokenizer.vocab_size)

model: MBart = MBart(mbart_config)
print(model_size(model))

True
271.2848320007324


In [5]:
from MBartDataset import MBartDataset
import datasets
from torch.utils.data import DataLoader

dataset_loaded = datasets.load_from_disk("europarl_eng_tokenized")
my_ds = MBartDataset(dataset_loaded, tokenizer, 0.001)
ds_en_loader = DataLoader(my_ds, batch_size=6, drop_last=True, shuffle=True, pin_memory=True, pin_memory_device='cuda', num_workers=8)

In [10]:
from torch.optim import *
model.fit(ds_en_loader, Adam(model.parameters()), epochs=5)

Epoch 1 of 5


  0%|          | 0/284 [00:18<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 368.00 MiB (GPU 0; 6.00 GiB total capacity; 5.02 GiB already allocated; 0 bytes free; 5.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from datasets import load_from_disk
from CustomDataset import CustomDataset
from torch.utils.data import DataLoader

dataset_loaded = load_from_disk("europarl_eng_tokenized_and_masked_128")
dataset_loaded.set_format(type='pt', columns=['input_ids', 'attention_mask', 'masked_ids'])

dataset_loaded = dataset_loaded[0:2 ** 10]

input_ids = dataset_loaded['input_ids']
attention_mask = dataset_loaded['attention_mask']
masked_ids = dataset_loaded['masked_ids']

ds_en_loader = DataLoader(CustomDataset(masked_ids, input_ids, attention_mask),
                          batch_size=8, drop_last=True, shuffle=True,
                          pin_memory=True, pin_memory_device='cuda', num_workers=2)

In [None]:
from torch.optim import *
model.fit(ds_en_loader, AdamW(model.parameters()), epochs=5)

In [13]:
sentence = "C'è stato un <mask> venerdì."
test_ids = tokenizer([sentence], add_special_tokens=True, return_tensors="pt")["input_ids"]
logits = model.model(test_ids.to('cuda')).logits
masked_index = (test_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()

["'", 'Unione', 'minutoistruzione']

In [None]:
outputs = model.model.generate(test_ids.to('cuda'), decoder_start_token_id=tokenizer.lang_code_to_id['en_XX'],num_beams=2)
print(tokenizer.batch_decode(outputs))