In [1]:
import datasets
import torch
from torch import nn
from transformers import MBartModel, MBartTokenizer, MBartConfig
from datasets import concatenate_datasets, load_from_disk
from torch.optim import *
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset
from CustomDataset import CustomDataset
from MBart import MBart
from OriginalDataset import OriginalDataset

In [2]:
def model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024 ** 2
    return size_all_mb

In [3]:
print(torch.cuda.is_available())

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")

mbart_config = MBartConfig(encoder_layers=6, decoder_layers=6,
                           encoder_ffn_dim=128, decoder_ffn_dim=128,
                           encoder_attention_heads=4, decoder_attention_heads=4,
                           d_model=256, max_length=128, vocab_size=tokenizer.vocab_size)

model: MBart = MBart(mbart_config)
print(model_size(model))

dataset_loaded = load_from_disk("europarl_eng_tokenized")

my_ds = OriginalDataset(dataset_loaded, tokenizer, 1e-2)
ds_en_loader = DataLoader(my_ds, batch_size=32, drop_last=True, shuffle=True, pin_memory=True, num_workers=32)

True
268.2789726257324


In [8]:
model.fit(ds_en_loader, Adam(model.parameters()), epochs=4)

Epoch 1 of 4


100%|██████████| 2137/2137 [07:42<00:00,  4.62it/s, loss=4.3118]


Epoch 2 of 4


100%|██████████| 2137/2137 [07:41<00:00,  4.63it/s, loss=4.1915]


Epoch 3 of 4


100%|██████████| 2137/2137 [07:40<00:00,  4.64it/s, loss=4.1503]


Epoch 4 of 4


100%|██████████| 2137/2137 [07:42<00:00,  4.62it/s, loss=4.1146]


In [11]:
torch.save(model.state_dict(), "weights_100.pt")

In [None]:
model.load_state_dict(torch.load("weights.pt"))

In [15]:
sentence = "We do not allow people to <mask> and drive because their coordination is gone."
test_ids = tokenizer([sentence], add_special_tokens=True, return_tensors="pt")["input_ids"]
logits = model.model(test_ids.to('cuda')).logits
masked_index = (test_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()

['to', 'on', 'then', 'and']