In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"

import datasets
import torch
from torch import nn
from transformers import MBartModel, MBartTokenizer, MBartConfig, MBartForConditionalGeneration
from datasets import concatenate_datasets, load_from_disk
from torch.optim import *
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset
from MBartPreTrainingDataset import MBartPreTrainingDataset
from datasets import load_dataset
from MBart import MBart
from accelerate import Accelerator
from MBartDataset import MBartDataset

In [2]:
def model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024 ** 2
    return size_all_mb

In [3]:
#pre_train_ds = load_dataset("text", data_files={"train": ["/data/n.dallanoce/cc100/en.txt"]},
#                            cache_dir="/data/n.dallanoce/cc100/hugg_en", split='train', ignore_verifications=True)

tok_en = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")

#pre_train_ds = MBartPreTrainingDataset(pre_train_ds, tok_en, "en")

mbart_config = MBartConfig(encoder_layers=6, decoder_layers=6,
                           encoder_ffn_dim=2048, decoder_ffn_dim=2048,
                           encoder_attention_heads=8, decoder_attention_heads=8,
                           d_model=512, max_length=128, vocab_size=tok_en.vocab_size)

#accelerator = Accelerator(mixed_precision='fp16', gradient_accumulation_steps=1)
#model: MBart = MBart(mbart_config, device_ids=[3])
cuda_dev = "cpu"
#model = model.to(cuda_dev)
model = MBartForConditionalGeneration(mbart_config).to(cuda_dev)
model.train(False)
print(model_size(model))

#dataset_loaded = load_from_disk("europarl_eng_tokenized")
#pre_train_load = DataLoader(pre_train_ds, batch_size=8, drop_last=True, shuffle=False, pin_memory=True, num_workers=4)

661.6861991882324


In [4]:
param_tot = 0
for param in model.parameters():
    param_tot += param.nelement()
print(param_tot)

173207040


In [5]:
model.load_state_dict(
    torch.load("/home/n.dallanoce/PyCharm/pretraining/hugg_trainer/mbart_cc100_en_2/checkpoint-499800/pytorch_model.bin",
               map_location=cuda_dev))

<All keys matched successfully>

In [25]:
sentence = "Loving my pregnancy almost due can't wait to see my baby boy."
test_ids = tok_en([sentence], add_special_tokens=True, return_tensors="pt")["input_ids"]
logits = model(test_ids.to(cuda_dev)).logits
masked_index = (test_ids[0] == tok_en.mask_token_id)#.nonzero().item()
masked_index = torch.nonzero(masked_index).item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tok_en.decode(predictions).split()

['I</s>', 'The', 'It', 'If']

In [26]:
example_english_phrase = "Loving my <mask> almost due can't wait to see my baby boy."
batch = tok_en(example_english_phrase, return_tensors="pt").to(cuda_dev)
generated_ids = model.generate(batch["input_ids"], max_new_tokens=128,
                               pad_token_id=tok_en.pad_token_id,
                               eos_token_id=tok_en.eos_token_id)
print(tok_en.batch_decode(generated_ids, skip_special_tokens=True))

["Loving my baby baby girl. I almost due can't wait to see my baby boy."]


In [1]:
from transformers import MBartForConditionalGeneration, MBartTokenizer

tok_en = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

#cuda_dev = "cuda:1"
#model = model.to(cuda_dev)

In [7]:
tok_en("hello my friend. How are you?").input_ids

[33600, 31, 759, 34391, 5, 11249, 621, 398, 32, 2, 250004]

In [4]:
tok_en.batch_decode(torch.tensor([   442,      5,   1913,   2499,  34515,      4,     70,  14597,  38352,
        250026,   2673,   2198,      2, 250004,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1]), skip_special_tokens=True)

['it',
 '.',
 'At',
 'any',
 'rate',
 ',',
 'the',
 'superior',
 'speed',
 '<mask>',
 'Hi',
 'ram',
 '</s>',
 'en_XX',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 

In [7]:
tok_en = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")
test_ids = tok_en(["Hello how are you? Fine thanks.", "Thank you."], add_special_tokens=True)["input_ids"]
test_ids

[[35378, 3642, 621, 398, 32, 67455, 45458, 5, 2, 250004],
 [25689, 398, 5, 2, 250004]]