In [1]:
import os

from transformers.utils import is_torch_fx_proxy

# GPU selection for this process: enumerate devices in PCI bus order and expose
# only device index 2 (see issue #152).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

import datasets
import torch
from torch import nn
from transformers import MBartModel, MBartTokenizer, MBartConfig, MBartForConditionalGeneration
from datasets import concatenate_datasets, load_from_disk
from torch.optim import *
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset

In [2]:
def model_size(model):
    """Return the combined size of a module's parameters and buffers in MiB.

    Each tensor contributes ``nelement() * element_size()`` bytes; the byte
    total over ``model.parameters()`` and ``model.buffers()`` is divided by
    ``1024 ** 2``.
    """
    bytes_in_params = sum(p.nelement() * p.element_size() for p in model.parameters())
    bytes_in_buffers = sum(b.nelement() * b.element_size() for b in model.buffers())
    return (bytes_in_params + bytes_in_buffers) / 1024 ** 2

In [6]:
# Device string used by the cells below when moving tensors and the model.
cuda_dev = "cuda:0"

# English-source tokenizer built from the "facebook/mbart-large-cc25" vocabulary.
tok_en = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="en_XX")

# Restore the locally saved checkpoint, move it to the GPU, switch off training
# mode, and report the size of its parameters and buffers in MiB.
model = MBartForConditionalGeneration.from_pretrained(
    "/home/n.dallanoce/PyCharm/pretraining/weights/mbart_cc100/checkpoint-15000"
).to(cuda_dev)
model.eval()
print(model_size(model))

# Disabled data-loading code, kept for reference:
#pre_train_ds = load_dataset("text", data_files={"train": ["/data/n.dallanoce/cc100/en.txt"]},
#                            cache_dir="/data/n.dallanoce/cc100/hugg_en", split='train', ignore_verifications=True)
#dataset_loaded = load_from_disk("europarl_eng_tokenized")
#pre_train_load = DataLoader(pre_train_ds, batch_size=8, drop_last=True, shuffle=False, pin_memory=True, num_workers=4)

661.6861991882324


In [7]:
# Total number of scalar parameters in the loaded model.
param_tot = sum(p.nelement() for p in model.parameters())
print(param_tot)

173207040


In [27]:
# Single-token mask filling: show the five most likely tokens at the <mask> position.
# The sentence must contain exactly one mask token; with none, the position lookup
# below is empty and `.item()` would fail with an unhelpful error.
# NOTE(review): placing the mask on "neighbour" is a guess — move it to the word
# you actually want the model to predict.
sentence = "Travelling alone can be a daunting prospect, not least to attend a wedding alone. The original plan had been to go with another <mask>, but she slipped a disc in her back and was unable to fly."
test_ids = tok_en([sentence], add_special_tokens=True, return_tensors="pt")["input_ids"]

# Locate the mask before running the model so a malformed sentence fails fast.
mask_positions = torch.nonzero(test_ids[0] == tok_en.mask_token_id).flatten()
if mask_positions.numel() != 1:
    raise ValueError(
        f"expected exactly one {tok_en.mask_token} token in the sentence, "
        f"found {mask_positions.numel()}"
    )
masked_index = mask_positions.item()

# Inference only: skip building the autograd graph.
with torch.no_grad():
    logits = model(test_ids.to(cuda_dev)).logits

# Softmax over the vocabulary axis at the masked position, then keep the top 5.
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tok_en.decode(predictions).split()

['was', 'had', 'would', 'could', 'said']

In [25]:
# Encode one English sentence, print its token ids, then print the text the
# model generates from those ids (at most 128 new tokens).
example_english_phrase = "Travelling alone can be a daunting prospect, not least to attend a wedding alone. The original plan had been to go with another neighbour, but she slipped a disc in her back and was unable to fly."
batch = tok_en(example_english_phrase, return_tensors="pt").to(cuda_dev)
print(batch["input_ids"])

generated_ids = model.generate(
    batch["input_ids"],
    max_new_tokens=128,
    pad_token_id=tok_en.pad_token_id,
    eos_token_id=tok_en.eos_token_id,
)
decoded_texts = tok_en.batch_decode(generated_ids, skip_special_tokens=True)
print(decoded_texts)

tensor([[ 30720,   2069,  75447,    831,    186,     10,     48, 128396, 109736,
              4,    959,  19713,     47,  29966,     10,  81141,  75447,      5,
            581,   7311,   1774,   1902,   2809,     47,    738,    678,  15700,
         250026,      6,      4,   1284,   2412, 146614,     71,     10,  17116,
             23,    604,   4420,    136,    509,     51,   2886,     47,  12403,
              5,      2, 250004]], device='cuda:0')
['Travelling alone can a daunting prospect, not least to attend a wedding alone. The original plan had been to go with another, but she slipped a disc in her back and was unable to be awful of the. She was amusant to fly.']


In [None]:
# Same generation check on a sentence that contains a <mask> placeholder:
# print the encoded input ids, then the decoded generation.
example_english_phrase = "Loving my <mask> almost due can't wait to see my baby boy."
batch = tok_en(example_english_phrase, return_tensors="pt").to(cuda_dev)
print(batch["input_ids"])

generated_ids = model.generate(
    batch["input_ids"],
    max_new_tokens=128,
    pad_token_id=tok_en.pad_token_id,
    eos_token_id=tok_en.eos_token_id,
)
decoded_texts = tok_en.batch_decode(generated_ids, skip_special_tokens=True)
print(decoded_texts)

In [14]:
# Load the first 1024 rows of the French cc100 training split from the local
# cache (verification checks disabled) and display one example record.
cc100_fr = load_dataset(
    "cc100",
    lang="fr",
    split="train[:1024]",
    cache_dir="/data/n.dallanoce/cc100/huggingface",
    verification_mode='no_checks',
)
cc100_fr[5]

Found cached dataset cc100 (/data/n.dallanoce/cc100/huggingface/cc100/fr-lang=fr/0.0.0/8159941b93eb06d0288bb80be26ddfe8213c0c5e33286619c85ad8e1ee0eb91c)


{'id': '5',
 'text': "- Je penses et réfléchit tout seul sur des sujets n'ayant rien à voir avec une situation. Exemple, quand j'attends mes soeurs et ma mère devant un magasin, je réfléchit à la création du monde.\n"}

In [16]:
# Run the model on a French sentence: print the encoded input ids, then the
# decoded generation (at most 128 new tokens).
tok_fr = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="fr_XX")
sent = "Je penses et réfléchit tout seul sur des sujets n'ayant rien à voir avec une situation. Exemple, quand j'attends mes soeurs et ma mère devant un magasin, je réfléchit à la création du monde."
batch = tok_fr(sent, return_tensors="pt").to(cuda_dev)
print(batch["input_ids"])

# Use the French tokenizer for the special-token ids and for decoding as well,
# so the cell is self-consistent and does not depend on the English tokenizer
# created in an earlier cell.
generated_ids = model.generate(batch["input_ids"], max_new_tokens=128,
                               pad_token_id=tok_fr.pad_token_id,
                               eos_token_id=tok_fr.eos_token_id)
print(tok_fr.batch_decode(generated_ids, skip_special_tokens=True))

tensor([[   845,  19067,      7,     82, 159096,     18,   2533,  35372,    613,
            224,  24693,      7,    653,     25,   5822,    660,  15324,    253,
          13283,   1609,    773,  16648,      5,   5443,     13,  33209,      4,
          12723, 250026,     82,    291,  94683,  51841,     51,  82193,      4,
             55, 159096,     18,    253,     21,  50976,    115,  11146,      5,
              2, 250008]], device='cuda:0')
["Je penses et réfléchit tout seul sur des sujets n'ayant rien à voir avec une situation. Exemple, quand je suis un peu plus loin, et ma mère devant un magasin, je réfléchit à la création du monde."]
