## Checking required tokens for restricted vocab selector

In [45]:
from transformers import T5Tokenizer

# Representative target strings: pairs of "sentN" / "intN" identifiers joined by "&".
sample_outputs = [
    "sent1 & sent20",
    "int12 & int4",
    "sent3 & int8",
    "int7 & sent19",
]

tokenizer = T5Tokenizer.from_pretrained('t5-large')

# Tokenize the whole batch once and keep a printable form of each id sequence.
res = list(map(str, tokenizer(sample_outputs)['input_ids']))

# Show each sample next to the token ids it maps to.
for text, ids_repr in zip(sample_outputs, res):
    print(f"{text} -> {ids_repr}")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


sent1 & sent20 -> [1622, 536, 3, 184, 1622, 1755, 1]
int12 & int4 -> [16, 17, 2122, 3, 184, 16, 17, 591, 1]
sent3 & int8 -> [1622, 519, 3, 184, 16, 17, 927, 1]
int7 & sent19 -> [16, 17, 940, 3, 184, 1622, 2294, 1]


In [47]:
# Decode a few id sequences back into sentencepiece tokens to see how the
# identifiers are split; the cell displays a tuple with one token list per input.
tuple(
    tokenizer.convert_ids_to_tokens(ids)
    for ids in (
        [16, 17, 927, 3, 184, 1622, 1298, 1],
        [1622, 2469, 3, 184, 16, 17, 4177, 1],
        [3072],
    )
)


(['▁in', 't', '8', '▁', '&', '▁sent', '9', '</s>'],
 ['▁sent', '35', '▁', '&', '▁in', 't', '47', '</s>'],
 ['75'])

In [32]:
# Print the vocabulary id of every number string "0".."99".
# get_vocab() is fetched once up front instead of once per iteration;
# a missing number token still raises KeyError exactly as before.
vocab = tokenizer.get_vocab()
for token in map(str, range(100)):
    print(f"token: {token} -> id: {vocab[token]}")

token: 0 -> id: 632
token: 1 -> id: 536
token: 2 -> id: 357
token: 3 -> id: 519
token: 4 -> id: 591
token: 5 -> id: 755
token: 6 -> id: 948
token: 7 -> id: 940
token: 8 -> id: 927
token: 9 -> id: 1298
token: 10 -> id: 1714
token: 11 -> id: 2596
token: 12 -> id: 2122
token: 13 -> id: 2368
token: 14 -> id: 2534
token: 15 -> id: 1808
token: 16 -> id: 2938
token: 17 -> id: 2517
token: 18 -> id: 2606
token: 19 -> id: 2294
token: 20 -> id: 1755
token: 21 -> id: 2658
token: 22 -> id: 2884
token: 23 -> id: 2773
token: 24 -> id: 2266
token: 25 -> id: 1828
token: 26 -> id: 2688
token: 27 -> id: 2555
token: 28 -> id: 2577
token: 29 -> id: 3166
token: 30 -> id: 1458
token: 31 -> id: 3341
token: 32 -> id: 2668
token: 33 -> id: 4201
token: 34 -> id: 3710
token: 35 -> id: 2469
token: 36 -> id: 3420
token: 37 -> id: 4118
token: 38 -> id: 3747
token: 39 -> id: 3288
token: 40 -> id: 2445
token: 41 -> id: 4853
token: 42 -> id: 4165
token: 43 -> id: 4906
token: 44 -> id: 3628
token: 45 -> id: 2128
token: 

In [40]:
# Sanity check: every number string "0".."99" must be a single vocabulary token.
# The range is 100 (not 99) so that it covers the same numbers that are looked
# up when the restricted vocab is built; get_vocab() is fetched once.
vocab = tokenizer.get_vocab()
for token in map(str, range(100)):
    assert token in vocab, f"number token {token!r} is missing from the tokenizer vocab"

## Building the vocab we need

In [41]:
# Display the tokenizer's special tokens (eos / unk / pad and the <extra_id_N> sentinels).
tokenizer.special_tokens_map

{'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<extra_id_0>',
  '<extra_id_1>',
  '<extra_id_2>',
  '<extra_id_3>',
  '<extra_id_4>',
  '<extra_id_5>',
  '<extra_id_6>',
  '<extra_id_7>',
  '<extra_id_8>',
  '<extra_id_9>',
  '<extra_id_10>',
  '<extra_id_11>',
  '<extra_id_12>',
  '<extra_id_13>',
  '<extra_id_14>',
  '<extra_id_15>',
  '<extra_id_16>',
  '<extra_id_17>',
  '<extra_id_18>',
  '<extra_id_19>',
  '<extra_id_20>',
  '<extra_id_21>',
  '<extra_id_22>',
  '<extra_id_23>',
  '<extra_id_24>',
  '<extra_id_25>',
  '<extra_id_26>',
  '<extra_id_27>',
  '<extra_id_28>',
  '<extra_id_29>',
  '<extra_id_30>',
  '<extra_id_31>',
  '<extra_id_32>',
  '<extra_id_33>',
  '<extra_id_34>',
  '<extra_id_35>',
  '<extra_id_36>',
  '<extra_id_37>',
  '<extra_id_38>',
  '<extra_id_39>',
  '<extra_id_40>',
  '<extra_id_41>',
  '<extra_id_42>',
  '<extra_id_43>',
  '<extra_id_44>',
  '<extra_id_45>',
  '<extra_id_46>',
  '<extra_id_47>',
 

In [50]:
# Collect every original T5 token id that the restricted vocabulary must contain.
restricted_vocab = set()

# 1) All ids that appear when tokenizing the sample outputs.
for tokens in tokenizer(sample_outputs)['input_ids']:
    restricted_vocab.update(tokens)

# 2) The ids of the number tokens "0".."99".
#    get_vocab() is fetched once instead of once per number; a missing number
#    token still raises KeyError.
vocab = tokenizer.get_vocab()
restricted_vocab.update([vocab[str(x)] for x in range(100)])

# 3) The end-of-sequence id.
restricted_vocab.update([tokenizer.eos_token_id])

# NOTE(review): the list order is the set's iteration order, and the restricted
# ids assigned later follow this order. Switching to sorted() would give an
# explicit order but would change ids already written to disk.
restricted_vocab_list = list(restricted_vocab)

In [51]:
# Map each original token id to a compact id. Numbering starts at 1 because
# id 0 is assigned to the padding token below. restricted_vocab_list comes
# from a set, so every entry is unique and receives its own consecutive id.
orig_to_restricted_mapping = {
    orig_id: new_id
    for new_id, orig_id in enumerate(restricted_vocab_list, start=1)
}
orig_to_restricted_mapping[tokenizer.pad_token_id] = 0

print("vocab size: ", len(orig_to_restricted_mapping))

vocab size:  107


In [52]:
# Inspect the original-id -> restricted-id mapping.
orig_to_restricted_mapping

{3072: 1,
 1: 2,
 4608: 3,
 3: 4,
 4613: 5,
 519: 6,
 16: 7,
 17: 8,
 2577: 9,
 4118: 10,
 536: 11,
 4122: 12,
 2079: 13,
 2596: 14,
 3628: 15,
 2606: 16,
 3647: 17,
 3651: 18,
 4165: 19,
 2122: 20,
 591: 21,
 2128: 22,
 4177: 23,
 1622: 24,
 3166: 25,
 2658: 26,
 4201: 27,
 2668: 28,
 632: 29,
 4729: 30,
 3707: 31,
 3708: 32,
 3710: 33,
 2688: 34,
 4225: 35,
 4240: 36,
 4241: 37,
 3747: 38,
 4271: 39,
 1714: 40,
 184: 41,
 3769: 42,
 3264: 43,
 2773: 44,
 3288: 45,
 1752: 46,
 2266: 47,
 1755: 48,
 4314: 49,
 4834: 50,
 3301: 51,
 4327: 52,
 5865: 53,
 755: 54,
 4853: 55,
 2294: 56,
 5373: 57,
 3328: 58,
 3840: 59,
 4867: 60,
 3341: 61,
 1808: 62,
 1298: 63,
 1828: 64,
 4389: 65,
 4906: 66,
 3891: 67,
 3390: 68,
 2368: 69,
 2884: 70,
 3914: 71,
 3916: 72,
 4433: 73,
 4440: 74,
 2394: 75,
 3420: 76,
 4959: 77,
 4448: 78,
 4450: 79,
 3940: 80,
 357: 81,
 3436: 82,
 3951: 83,
 3959: 84,
 3449: 85,
 2938: 86,
 2445: 87,
 4508: 88,
 927: 89,
 2469: 90,
 940: 91,
 4013: 92,
 5553: 93,
 1458

In [53]:
import json
from pathlib import Path

# Persist the original-id -> restricted-id mapping as JSON.
# NOTE: JSON object keys are always strings, so the integer token ids used as
# keys are written as strings; convert them back with int() after loading.
vocab_path = Path('modeling/restricted_t5_vocab.json')

# Create the target directory if it is missing so the write does not fail
# when the notebook runs in a fresh working directory.
vocab_path.parent.mkdir(parents=True, exist_ok=True)

with vocab_path.open('w') as f:
    json.dump(orig_to_restricted_mapping, f)