In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (this notebook later imports the local `utils` and `modeling_ct5`) are
# reloaded before each cell execution and source edits are picked up.
%load_ext autoreload
%autoreload 2
# Restrict the CUDA devices visible to this kernel to device index 1.
%env CUDA_VISIBLE_DEVICES=1
# Sets TOKENIZERS_PARALLELISM=false — presumably to turn off the Hugging Face
# tokenizers' internal parallelism and its fork warning; confirm if needed.
%env TOKENIZERS_PARALLELISM=false

env: CUDA_VISIBLE_DEVICES=1
env: TOKENIZERS_PARALLELISM=false


In [1]:
import torch
from tqdm import tqdm
from utils import merge_input_and_gen_ids

In [2]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
from modeling_ct5 import CT5ForConditionalGeneration

# Tokenizer and both model classes are restored from the same local
# checkpoint directory; the stock T5 class is loaded from the identical
# weights so the two generation procedures can be compared further down.
dirname = 'ct5-small-en-wiki-pytorch'
tokenizer = AutoTokenizer.from_pretrained(dirname)
model_ct5, model_t5 = (
    model_cls.from_pretrained(dirname)
    for model_cls in (CT5ForConditionalGeneration, T5ForConditionalGeneration)
)

2022-07-24 17:40:09.178007: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [3]:
# Switch both models to evaluation mode before generating; eval() hands back
# the module itself, so each name stays bound to the same model object.
model_ct5, model_t5 = model_ct5.eval(), model_t5.eval()

In [4]:
# Prompts containing <extra_id_N> sentinel tokens for the model to fill in.
texts = [
    "The <extra_id_0> walks in <extra_id_1> park",
    "UN Chief says there is no way to <extra_id_0> in Syria",
]
# padding=True pads the batch to a common length so it can be returned as a
# single PyTorch tensor; only the token ids are kept.
input_ids = tokenizer(texts, padding=True, return_tensors="pt")["input_ids"]

In [5]:
# Sample completions from the CT5 model for the padded prompt batch.
# Sampling is stochastic (do_sample=True), so seed torch's global RNG first
# to make this cell produce the same output on every run.
torch.manual_seed(0)
generated_ids = model_ct5.generate(
    input_ids,
    # Mask padding using the tokenizer's real pad id instead of a literal 0;
    # this is the id that padding=True inserted when the batch was built.
    attention_mask=input_ids != tokenizer.pad_token_id,
    use_cache=False,
    do_sample=True,
    top_p=0.95,
    top_k=30,
    num_beams=1,
    # Id of the '</c>' token; presumably the "end of chunk" marker used by
    # CT5's generate — confirm against modeling_ct5.
    eoc_token_id=tokenizer.vocab['</c>'],
    # NOTE(review): looks like this bounds the size of each generated chunk —
    # confirm against modeling_ct5.
    max_chunk_size=1,
)
generated_ids

  batched_outputs = func(*batched_inputs, **kwargs)


tensor([[    0, 32099,  1061,     0, 32098,     8,     0, 32097,     1],
        [    0, 32099,   129,     0, 32098,     1,     0,     0,     0]])

In [6]:
# Combine prompt ids and generated ids with the project helper (presumably it
# places the generated spans at the sentinel positions — see utils), then
# print one decoded sentence per prompt.
merged_ids = merge_input_and_gen_ids(input_ids, generated_ids)
for sentence_ids in merged_ids:
    print(tokenizer.decode(sentence_ids, skip_special_tokens=True))

The Park walks in the park
UN Chief says there is no way to get in Syria


### Outputting scores

In [7]:
# Same CT5 sampling as before, but with a larger max_chunk_size and with the
# per-step scores returned alongside the generated sequences.
# Seed torch's global RNG so the sampled text and scores are reproducible.
torch.manual_seed(0)
out = model_ct5.generate(
    input_ids,
    # Mask padding using the tokenizer's real pad id instead of a literal 0.
    attention_mask=input_ids != tokenizer.pad_token_id,
    use_cache=False,
    do_sample=True,
    top_p=0.95,
    top_k=30,
    # Id of the '</c>' token; presumably the "end of chunk" marker used by
    # CT5's generate — confirm against modeling_ct5.
    eoc_token_id=tokenizer.vocab['</c>'],
    max_chunk_size=5,
    output_scores=True,
    return_dict_in_generate=True,
)
generated_ids = out['sequences']
scores = out['scores']

# Merge prompt and generated ids, then print one decoded sentence per prompt.
merged_ids = merge_input_and_gen_ids(input_ids, generated_ids)
for sentence_ids in merged_ids:
    print(tokenizer.decode(sentence_ids, skip_special_tokens=True))
# Number of score entries returned, followed by the index tensors that come
# with them under 'inverse_indices'.
print(len(scores))
print(out['inverse_indices'])

The New Zealand Park is  walks in the park
UN Chief says there is no way to see anything. References in Syria
5
(tensor([0, 0, 1]), tensor([0, 0, 1]), tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))


In [8]:
# Regroup the generation scores by sample using out['inverse_indices'].
# Assumes row j of out['scores'][t] belongs to sample out['inverse_indices'][t][j]
# — TODO confirm against modeling_ct5.
inverse_idxs = torch.cat(out['inverse_indices'])
# A stable sort is required here: rows that share a sample index must keep
# their original (generation-step) order, which plain argsort() does not
# guarantee.
rev_inverse_idxs = torch.sort(inverse_idxs, stable=True).indices
# Cumulative row counts per sample give the split points between samples.
slices = torch.unique(inverse_idxs, return_counts=True)[1].cumsum(dim=-1)
# Read from out['scores'] rather than the `scores` variable so re-running this
# cell does not re-permute already regrouped data. The final split point equals
# the total row count, so the trailing empty piece is dropped with [:-1].
scores = torch.cat(out['scores'])[rev_inverse_idxs].tensor_split(slices)[:-1]
len(scores)

2

## Comparing with the original T5

In [9]:
# Baseline: sample from the stock T5 implementation with the same prompts and
# sampling settings, for comparison with the CT5 output.
# Seed torch's global RNG so the sampled output is reproducible.
torch.manual_seed(0)
generated_ids = model_t5.generate(
    input_ids,
    # Mask padding using the tokenizer's real pad id instead of a literal 0.
    attention_mask=input_ids != tokenizer.pad_token_id,
    use_cache=True,
    do_sample=True,
    top_p=0.95,
    top_k=30,
    num_beams=1,
)
# Merge prompt and generated ids, then print one decoded sentence per prompt.
merged_ids = merge_input_and_gen_ids(input_ids, generated_ids)
for sentence_ids in merged_ids:
    print(tokenizer.decode(sentence_ids, skip_special_tokens=True))

The Ling is A park (. References References External The Reference walks in park
UN Chief says there is no way to use. References Damulel History peopleles in Syria
