In [1]:
from datasets import load_dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, MT5Config
import torch
from transformers.utils import is_torch_fx_proxy

from noise_functions.MT6NoiseFunction import MT6NoiseFunction



In [2]:
def model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024 ** 2
    return size_all_mb, param_size + buffer_size

In [4]:
from MT6 import MT6
from transformers import MT5TokenizerFast, T5ForConditionalGeneration

tok_en = MT5TokenizerFast.from_pretrained("nikodallanoce/mt5-cc4-vanilla-32k-5")

cuda_dev = "cpu"

model = MT5ForConditionalGeneration.from_pretrained(
        "/home/n.dallanoce/PyCharm/pretraining/weights/mt6_pre_en-fr(M1)_twe/checkpoint-100000", decoder_start_token_id=None)
model = model.to(cuda_dev)
model.train(False)
model_dim, n_param = model_size(model)
print({'dim': model_dim, 'params': n_param / 1e6})

{'dim': 230.78466796875, 'params': 241.995264}


In [5]:
dataset = load_dataset("cc100", lang="en",
                       cache_dir="/data/n.dallanoce/cc100/huggingface",
                       split=f"train[{4096}:{4096 * 2}]",
                       verification_mode='no_checks')

Found cached dataset cc100 (/data/n.dallanoce/cc100/huggingface/cc100/en-lang=en/0.0.0/8159941b93eb06d0288bb80be26ddfe8213c0c5e33286619c85ad8e1ee0eb91c)


In [4]:
index = 43
sent = dataset[index]['text']
src_sent, tgt_sent = MT6NoiseFunction().compute(sent, seed=index)
print(f"original: {sent} \n \nsource: {src_sent}")

NameError: name 'dataset' is not defined

In [32]:
from noise_functions.MT5NoiseFunction import MT5NoiseFunction

index = 2
#sent = dataset[index]['text']
sent = "Deals and offers on Tripoli Upholstered Wood Frame Platform Bed Size: King, Color: Beige are only unbelievable here. You will love to look for your favourite product which can be obtained at such an amazing price. If you might be thinking concerning the high shipping prices then search for the reviews."

src_sent, tgt_sent = MT6NoiseFunction(n_groups=1, noise_density=0.3).compute(sent, seed=index)
#src_sent = sent + "</s> "+ src_sent
print(f"original: {sent} \n \nsource: {src_sent}")

original: Deals and offers on Tripoli Upholstered Wood Frame Platform Bed Size: King, Color: Beige are only unbelievable here. You will love to look for your favourite product which can be obtained at such an amazing price. If you might be thinking concerning the high shipping prices then search for the reviews. 
 
source: Deals and offers on Tripoli Upholstered Wood Frame Platform Bed Size: King, Color: Beige are only <extra_id_0> will <extra_id_1> favourite product which can <extra_id_2> at such an amazing price. <extra_id_3> be thinking <extra_id_4> high shipping prices then search for the reviews.


In [33]:
input_ids = tok_en(src_sent, return_tensors="pt").input_ids
sequence_ids = model.generate(input_ids.to(cuda_dev), max_length=128)
sequences = tok_en.batch_decode(sequence_ids, skip_special_tokens=False)
print(f" prediction: {sequences} \n \n source: {src_sent} \n \n label: {tgt_sent}")

 prediction: ['<pad><extra_id_0> Deals and offers on Tripoli Upholstered Wood Frame<extra_id_1> Color: Beige are only<extra_id_2></s>'] 
 
 source: Deals and offers on Tripoli Upholstered Wood Frame Platform Bed Size: King, Color: Beige are only <extra_id_0> will <extra_id_1> favourite product which can <extra_id_2> at such an amazing price. <extra_id_3> be thinking <extra_id_4> high shipping prices then search for the reviews. 
 
 label: ['<extra_id_0> unbelievable here. You <extra_id_1> love to look for your <extra_id_2> be obtained <extra_id_3> If you might <extra_id_4> concerning the <extra_id_5>']
