In [1]:
from datasets import load_dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, MT5Config
import torch
from transformers.utils import is_torch_fx_proxy

from noise_functions.MT6NoiseFunction import MT6NoiseFunction

In [2]:
def model_size(model):
    """Return the memory footprint of ``model`` in mebibytes.

    The byte count is ``nelement() * element_size()`` summed over every
    tensor yielded by ``model.parameters()`` and ``model.buffers()``; the
    total is then divided by ``1024 ** 2``.

    Args:
        model: object exposing ``parameters()`` and ``buffers()`` iterables
            whose items provide ``nelement()`` and ``element_size()``.

    Returns:
        float: combined size of parameters and buffers, in MiB.
    """
    def _byte_count(tensors):
        # Bytes occupied by a group of tensors: element count x bytes per element.
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _byte_count(model.parameters()) + _byte_count(model.buffers())
    return total_bytes / 1024 ** 2

In [3]:
from transformers import MT5TokenizerFast

# Target device for the model and its inputs. Despite the name, this notebook
# runs on the CPU.
cuda_dev = "cpu"

# Tokenizer used by the later cells to encode inputs and decode predictions.
tok_en = MT5TokenizerFast.from_pretrained("nikodallanoce/mt5-cc4-vanilla-32k-5")

# Restore the checkpoint weights and move them to the chosen device in one step.
model = MT5ForConditionalGeneration.from_pretrained(
    "/home/n.dallanoce/PyCharm/pretraining/weights/mt5_tests/checkpoint-150000"
).to(cuda_dev)

# Inference only: switch the module out of training mode.
model.eval()

# Report the parameter + buffer footprint in MiB.
print(model_size(model))

269.466796875


In [4]:
# Load a slice of the English ("en") configuration of the "cc100" dataset.
# The split expression evaluates to "train[4096:8192]", i.e. 4096 consecutive
# training records starting at offset 4096.
# NOTE(review): cache_dir is an absolute, machine-specific path; the cell only
# works where that directory exists or can be created.
# NOTE(review): verification_mode='no_checks' presumably skips the download
# integrity checks -- confirm against the `datasets` documentation.
dataset = load_dataset("cc100", lang="en",
                                cache_dir="/data/n.dallanoce/cc100/huggingface",
                                split=f"train[{4096}:{4096*2}]",
                                verification_mode='no_checks')

Found cached dataset cc100 (/data/n.dallanoce/cc100/huggingface/cc100/en-lang=en/0.0.0/8159941b93eb06d0288bb80be26ddfe8213c0c5e33286619c85ad8e1ee0eb91c)


In [5]:
# Demo of the MT6 noise function on a single record of the loaded slice.
# `index` selects the record and is also passed as the seed.
index = 43
sent = dataset[index]['text']
# compute() returns a (source, target) pair; in the recorded output the source
# has spans of the sentence replaced by <extra_id_N> sentinel tokens.
# NOTE(review): seeding with the record index presumably makes the corruption
# reproducible per record -- confirm in MT6NoiseFunction.
src_sent, tgt_sent = MT6NoiseFunction().compute(sent, seed=index)
# Only the original and the corrupted source are shown; tgt_sent is not printed here.
print(f"original: {sent} \n \nsource: {src_sent}")

original: over the bread and butter so that it made a pudding, and you have been
 
 
source: over <extra_id_0> so <extra_id_1> it <extra_id_2> a <extra_id_3> and you <extra_id_4> been



In [16]:
from noise_functions.MT5NoiseFunction import MT5NoiseFunction

# Demo of the MT5 noise function on a hand-written paragraph.
# This cell rebinds `index`, `sent`, `src_sent` and `tgt_sent`, replacing the
# values produced by the MT6 demo cell; the generation cell below uses these.
# Since `sent` is hard-coded here, `index` only serves as the seed.
index = 1
# Alternative input: uncomment to corrupt a dataset record instead of the
# fixed paragraph below.
#sent = dataset[index]['text']
sent = "We introduce how to convert the following three types of the language understanding task into the text-to-text format. Under this setting, the models should be fine-tuned only on English training data but evaluated on all target languages. Moreover, for each pretrained model, only one model is used for all languages rather than selecting fine-tuned models separately."

# compute() returns a (source, target) pair: the corrupted input and the label
# that is later printed next to the model prediction.
src_sent, tgt_sent = MT5NoiseFunction().compute(sent, seed=index)
print(f"original: {sent} \n \nsource: {src_sent}")

original: We introduce how to convert the following three types of the language understanding task into the text-to-text format. Under this setting, the models should be fine-tuned only on English training data but evaluated on all target languages. Moreover, for each pretrained model, only one model is used for all languages rather than selecting fine-tuned models separately. 
 
source: We introduce how to convert the following <extra_id_0> language understanding task into the text-to-text format. <extra_id_1> models <extra_id_2> only on English training <extra_id_3> evaluated <extra_id_4> for each pretrained model, <extra_id_5> model <extra_id_6> languages rather <extra_id_7> separately.


In [17]:
# Encode the corrupted sentence as PyTorch tensors and keep only the token ids.
input_ids = tok_en(src_sent, return_tensors="pt")["input_ids"]

# Generate on the same device as the model, with max_length=64 and the decoder
# started from the pad token.
sequence_ids = model.generate(
    input_ids.to(cuda_dev),
    max_length=64,
    decoder_start_token_id=model.config.pad_token_id,
)

# Special tokens are kept so the <extra_id_N> sentinels stay visible and the
# prediction can be compared with the label span by span.
sequences = tok_en.batch_decode(sequence_ids, skip_special_tokens=False)
print(f" prediction: {sequences} \n \n source: {src_sent} \n \n label: {tgt_sent}")

 prediction: ['<pad><extra_id_0> as a<extra_id_1> The<extra_id_2> are based<extra_id_3> and are<extra_id_4> by the synthesis<extra_id_5> and the<extra_id_6> is as follows: as a result of the<extra_id_7> than as a synthetic as a whole'] 
 
 source: We introduce how to convert the following <extra_id_0> language understanding task into the text-to-text format. <extra_id_1> models <extra_id_2> only on English training <extra_id_3> evaluated <extra_id_4> for each pretrained model, <extra_id_5> model <extra_id_6> languages rather <extra_id_7> separately. 
 
 label: <extra_id_0> three types of the <extra_id_1> Under this setting, the <extra_id_2> should be fine-tuned <extra_id_3> data but <extra_id_4> on all target languages. Moreover, <extra_id_5> only one <extra_id_6> is used for all <extra_id_7> than selecting fine-tuned models <extra_id_8>
