In [1]:
import os

from transformers.utils import is_torch_fx_proxy

# GPU selection via CUDA environment variables: order devices by PCI bus id and
# expose only device index "2" to this process.
# NOTE(review): `transformers.utils` is imported above this point; confirm that nothing
# initialises CUDA before these variables are set, otherwise they will not take effect.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import datasets
import torch
from torch import nn
from transformers import MBartModel, MBartTokenizer, MBartConfig, MBartForConditionalGeneration
from datasets import concatenate_datasets, load_from_disk
from torch.optim import *
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset

In [2]:
def model_size(model):
    """Return the memory footprint of ``model`` in mebibytes (MiB).

    The byte total is ``nelement() * element_size()`` summed over every tensor
    yielded by ``model.parameters()`` and ``model.buffers()``; the result is
    that total divided by ``1024 ** 2``.

    Args:
        model: object exposing ``parameters()`` and ``buffers()`` iterables of
            tensors (e.g. a ``torch.nn.Module``).

    Returns:
        float: size of parameters plus buffers, in MiB.
    """

    def _nbytes(tensors):
        # Bytes occupied by a collection of tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _nbytes(model.parameters()) + _nbytes(model.buffers())
    return total_bytes / 1024 ** 2

In [9]:
from transformers import AutoTokenizer

#pre_train_ds = load_dataset("text", data_files={"train": ["/data/n.dallanoce/cc100/en.txt"]},
#                            cache_dir="/data/n.dallanoce/cc100/hugg_en", split='train', ignore_verifications=True)

# English-source tokenizer; `tok_name` is reused later to build the French one.
tok_name = "nikodallanoce/mbart-cc4-vanilla"
tok_en = AutoTokenizer.from_pretrained(tok_name, src_lang="en_XX")

# Device every later cell moves its tensors to.
cuda_dev = "cpu"
#model = model.to(cuda_dev)

# Load the checkpoint, place it on the chosen device and switch to inference mode.
checkpoint_path = "/home/n.dallanoce/PyCharm/pretraining/weights/mbart_pre_de_ft_en-fr(Mf1-2)/checkpoint-300000"
model: MBartForConditionalGeneration = MBartForConditionalGeneration.from_pretrained(checkpoint_path)
model = model.to(cuda_dev)
model.eval()

# Footprint of parameters + buffers in MiB.
size_mb = model_size(model)
print(size_mb)

#dataset_loaded = load_from_disk("europarl_eng_tokenized")
#pre_train_load = DataLoader(pre_train_ds, batch_size=8, drop_last=True, shuffle=False, pin_memory=True, num_workers=4)

235.07921600341797


In [10]:
# Display the weight of the `shared` module of the inner `model.model`.
model.model.shared.weight

Parameter containing:
tensor([[ 0.1152, -0.1643,  0.1987,  ...,  0.1372, -0.0425, -0.2086],
        [ 0.1267, -0.1585,  0.1778,  ...,  0.1125, -0.0720, -0.2186],
        [-0.1357, -0.1363,  0.0593,  ...,  0.0152, -0.0454, -0.2686],
        ...,
        [ 0.1230, -0.1556,  0.1748,  ...,  0.1078, -0.0689, -0.2171],
        [ 0.1261, -0.1566,  0.1794,  ...,  0.1139, -0.0656, -0.2184],
        [ 0.1245, -0.1576,  0.1732,  ...,  0.1074, -0.0787, -0.2140]],
       requires_grad=True)

In [11]:
# Display the LM head weight. The values printed below match the printed
# `model.model.shared.weight` values, which is consistent with the two being
# tied - TODO confirm (e.g. by comparing `data_ptr()`).
model.lm_head.weight

Parameter containing:
tensor([[ 0.1152, -0.1643,  0.1987,  ...,  0.1372, -0.0425, -0.2086],
        [ 0.1267, -0.1585,  0.1778,  ...,  0.1125, -0.0720, -0.2186],
        [-0.1357, -0.1363,  0.0593,  ...,  0.0152, -0.0454, -0.2686],
        ...,
        [ 0.1230, -0.1556,  0.1748,  ...,  0.1078, -0.0689, -0.2171],
        [ 0.1261, -0.1566,  0.1794,  ...,  0.1139, -0.0656, -0.2184],
        [ 0.1245, -0.1576,  0.1732,  ...,  0.1074, -0.0787, -0.2140]],
       requires_grad=True)

In [12]:
# Parameter count as reported by the model's own `num_parameters()` helper.
model.num_parameters()

61592576

In [None]:
# Count elements over `model.parameters()` by hand
# (presumably a cross-check against `model.num_parameters()`).
param_tot = sum(weights.nelement() for weights in model.parameters())
print(param_tot)

In [None]:
# Masked-token probe: run one forward pass and list the five most likely
# vocabulary entries at every mask position of the encoded sentence.
sentence = "Travelling alone can be a daunting prospect, not least to attend a wedding alone. The original plan had been to go with another neighbour, but she slipped a disc in her back and was unable to fly."
test_ids = tok_en([sentence], add_special_tokens=True, return_tensors="pt")["input_ids"]

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    logits = model(test_ids.to(cuda_dev)).logits

# Collect *all* mask positions. The previous `torch.nonzero(...).item()` raised
# an opaque RuntimeError whenever the sentence held zero masks (as this one
# does) or more than one.
mask_positions = torch.nonzero(test_ids[0] == tok_en.mask_token_id).flatten().tolist()
if not mask_positions:
    print(f"No {tok_en.mask_token} token in the sentence - nothing to predict.")

# One list of top-5 decoded candidates per masked position (empty if no mask).
top5_per_mask = []
for masked_index in mask_positions:
    probs = logits[0, masked_index].softmax(dim=0)
    values, predictions = probs.topk(5)
    top5_per_mask.append(tok_en.decode(predictions).split())
top5_per_mask

In [None]:
# Generate from a plain English sentence and print the decoded result.
# NOTE(review): unlike the other generation cells, no `decoder_start_token_id`
# is passed here, so `generate` uses whatever default it resolves - confirm
# this is intended.
example_english_phrase = "Travelling alone can be a daunting prospect, not least to attend a wedding alone. The original plan had been to go with another neighbour, but she slipped a disc in her back and was unable to fly."
batch = tok_en(example_english_phrase, return_tensors="pt").to(cuda_dev)
print(batch["input_ids"])

generated_ids = model.generate(
    batch["input_ids"],
    max_new_tokens=128,
    pad_token_id=tok_en.pad_token_id,
    eos_token_id=tok_en.eos_token_id,
)
decoded = tok_en.batch_decode(generated_ids, skip_special_tokens=True)
print(decoded)

In [None]:
# Generate from an English sentence containing a `<mask>` placeholder and
# print the decoded result.
example_english_phrase = "Loving my <mask> almost due can't wait to see my baby boy."
batch = tok_en(example_english_phrase, return_tensors="pt").to(cuda_dev)
print(batch["input_ids"])

# Decoding starts from the id of the tokenizer's source-language code.
en_lang_id = tok_en.lang_code_to_id[tok_en.src_lang]
generated_ids = model.generate(
    batch["input_ids"],
    max_new_tokens=128,
    pad_token_id=tok_en.pad_token_id,
    eos_token_id=tok_en.eos_token_id,
    decoder_start_token_id=en_lang_id,
)
decoded = tok_en.batch_decode(generated_ids, skip_special_tokens=True)
print(decoded)

In [None]:
# Load the first 1024 rows of the French CC-100 train split (verification
# checks disabled via `verification_mode`) and show one sample row.
cc100_fr = load_dataset(
    "cc100",
    lang="fr",
    split="train[:1024]",
    cache_dir="/data/n.dallanoce/cc100/huggingface",
    verification_mode="no_checks",
)
cc100_fr[5]

In [None]:
# Generate from a French sentence with a French-source tokenizer built from
# the same tokenizer checkpoint (`tok_name`) as the English one.
tok_fr = AutoTokenizer.from_pretrained(tok_name, src_lang="fr_XX")
sent = "Je penses et réfléchit tout seul sur des sujets n'ayant rien à voir avec une situation. Exemple, quand j'attends mes soeurs et ma mère devant un magasin, je réfléchit à la création du monde."
batch = tok_fr(sent, return_tensors="pt").to(cuda_dev)
print(batch["input_ids"])
generated_ids = model.generate(batch["input_ids"], max_new_tokens=128,
                               pad_token_id=tok_fr.pad_token_id,
                               eos_token_id=tok_fr.eos_token_id,
                               decoder_start_token_id=tok_fr.lang_code_to_id[tok_fr.src_lang])
# Decode with the tokenizer that encoded the input. The cell previously used
# `tok_en` here, a copy-paste leftover that also made this cell depend on the
# English tokenizer cell having been run.
print(tok_fr.batch_decode(generated_ids, skip_special_tokens=True))

In [None]:
# Show the token id that `lang_code_to_id` maps the French tokenizer's
# `src_lang` to (the value used as `decoder_start_token_id` in the French
# generation cell).
tok_fr.lang_code_to_id[tok_fr.src_lang]