In [12]:
import os
import torch
import sentencepiece
from transformers import T5TokenizerFast, T5EncoderModel

from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Checkpoint name and cache directory are defined once so the tokenizer and the
# model are always loaded from the same place.
# The cache location can be overridden with the T5_CACHE_DIR environment variable;
# when it is unset, the original scratch path is used.
MODEL_NAME = "t5-large"
CACHE_DIR = os.environ.get("T5_CACHE_DIR", "/scratch/mbarlow6/.cache/")

tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
# Encoder-only model class: the checkpoint's decoder weights are reported as
# unused in the load warning, which is expected here.
model = T5EncoderModel.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.22.layer.1.EncDecAttention.k.weight', 'decoder.block.11.layer.0.SelfAttention.o.weight', 'decoder.block.19.layer.0.SelfAttention.q.weight', 'decoder.block.8.layer.2.layer_norm.weight', 'decoder.block.8.layer.1.layer_norm.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.21.layer.2.DenseReluDense.wi.weight', 'decoder.block.23.layer.1.layer_norm.weight', 'decoder.block.8.layer.1.EncDecAttention.k.weight'

In [27]:
def squeeze(word, tokenizer=tokenizer, model=model, debug=True):
    """Encode `word` with the model and average the resulting hidden states.

    The text is tokenized (PyTorch tensors), passed through `model` without
    gradient tracking, and the last hidden state is mean-reduced over its
    first dimension after a leading size-1 dimension is squeezed away.
    Whatever tokens the tokenizer emits are included in the average (the
    notebook's recorded run shows a trailing '</s>' among them).

    Args:
        word: Text to embed. The notebook calls this with a single string;
            batched input is not handled specially.
        tokenizer: Tokenizer used to encode `word` (defaults to the
            notebook-level `tokenizer`).
        model: Model whose `last_hidden_state` is pooled (defaults to the
            notebook-level `model`).
        debug: When true, print the decoded tokens and their word ids.

    Returns:
        The mean of the hidden states as a tensor; for a single input string
        this is presumably a 1-D vector of the model's hidden size.
    """
    encoded = tokenizer(word, return_tensors="pt")

    if debug:
        # Show how the text was split and which word each piece maps to.
        print('Tokens Requested:')
        print(tokenizer.batch_decode(encoded.input_ids[0]))
        print('Word ids:')
        print(encoded.word_ids())

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

    # Drop the leading dimension if it has size 1, then average over the
    # (new) first dimension.
    return hidden.squeeze(dim=0).mean(dim=0)


In [30]:
def token_get(word, tokenizer=tokenizer, model=model, debug=True):
    """Look up the input-embedding vectors for the tokens of `word`.

    Unlike `squeeze`, this does not run the encoder stack: it only indexes
    the encoder's token-embedding table (`model.encoder.embed_tokens`) with
    the token ids. Special tokens are not added and no attention mask is
    requested from the tokenizer.

    Args:
        word: Text to tokenize. The notebook calls this with a single string.
        tokenizer: Tokenizer used to encode `word` (defaults to the
            notebook-level `tokenizer`).
        model: Model providing `encoder.embed_tokens` (defaults to the
            notebook-level `model`).
        debug: When true (the default, matching the previous unconditional
            behaviour), print the embedding shape, the embedding tensor, the
            decoded tokens and their word ids. Pass False to silence output.

    Returns:
        The embedding tensor returned by `embed_tokens` for the token ids
        (the recorded run shows shape [1, 7, 1024] for a 7-token input).
    """
    inputs = tokenizer(word, return_tensors="pt", return_attention_mask=False, add_special_tokens=False)
    # Plain table lookup; no gradients are needed.
    with torch.no_grad():
        output = model.encoder.embed_tokens(inputs.input_ids)
    if debug:
        # Same diagnostics, in the same order, as before the flag existed.
        print(output.shape)
        print(output)
        print(tokenizer.batch_decode(inputs.input_ids[0]))
        print(inputs.word_ids())
    return output

In [31]:
# Average the raw token embeddings (embedding-table lookup only) into a single vector.
token_get('All hail to the king!').squeeze(dim=0).mean(dim=0)

torch.Size([1, 7, 1024])
tensor([[[-11.7500,   4.2188,  -5.6562,  ...,   3.2344,   7.8125,   2.5156],
         [ -7.3438,  17.6250, -13.5625,  ...,   3.0156,   9.7500,   2.3438],
         [  5.0312,   8.3750,   5.3125,  ...,   6.8750,  -6.9062,   2.7188],
         ...,
         [  7.4375,   0.1089,  -6.0938,  ...,   4.4062,  -3.0000,   1.7812],
         [-16.8750,   1.1562,  -6.6250,  ...,   1.3516,  16.7500,  29.1250],
         [  4.2500,   7.6250,  -7.1250,  ...,   7.0000,  -8.1875,   3.5000]]])
['All', 'hail', 'to', 'the', '', 'king', '!']
[0, 1, 2, 3, 4, 4, 4]


tensor([-1.7902,  6.1428, -5.6830,  ...,  4.6842,  1.3661,  6.3482])

In [32]:
# Add a leading size-1 dimension to the pooled vector (2-D row form,
# presumably for the imported cosine_similarity -- confirm).
squeeze('All hail to the king!').unsqueeze(0)

Tokens Requested:
['All', 'hail', 'to', 'the', '', 'king', '!', '</s>']
Word ids:
[0, 1, 2, 3, 4, 4, 4, None]


tensor([[-0.1766, -0.0676, -0.0396,  ..., -0.0131,  0.0658,  0.0263]])