### Word-based

In [1]:
tokenized_text = "Jim Henson was a puppeteer".split()
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


### Character-based

### Subword-based

### Loading and saving

In [2]:
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)/main/tokenizer.json: 100%|███| 436k/436k [00:00<00:00, 21.5MB/s]


In [6]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
tokenizer.save_pretrained("tokenizers/")

('tokenizers/tokenizer_config.json',
 'tokenizers/special_tokens_map.json',
 'tokenizers/vocab.txt',
 'tokenizers/added_tokens.json',
 'tokenizers/tokenizer.json')

In [18]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)

Downloading (…)lve/main/config.json: 100%|█████| 481/481 [00:00<00:00, 1.59MB/s]
Downloading (…)olve/main/vocab.json: 100%|███| 899k/899k [00:00<00:00, 46.6MB/s]
Downloading (…)olve/main/merges.txt: 100%|███| 456k/456k [00:00<00:00, 37.3MB/s]
Downloading (…)/main/tokenizer.json: 100%|█| 1.36M/1.36M [00:00<00:00, 48.5MB/s]


['Using', 'Ġa', 'ĠTrans', 'former', 'Ġnetwork', 'Ġis', 'Ġsimple']


In [19]:
## from tokens to input ids
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[36949, 10, 5428, 22098, 1546, 16, 2007]


In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [21]:
checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [22]:
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [23]:
query = "What is the best source for learning Dynamic Programming?"

In [24]:
tokens = tokenizer.tokenize(query)

In [26]:
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)

In [27]:
input_ids

tensor([ 2264,    16,     5,   275,  1300,    13,  2239, 29614, 39538,   116])

In [29]:
tokenized_inputs = tokenizer(query, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tensor([[    0,  2264,    16,     5,   275,  1300,    13,  2239, 29614, 39538,
           116,     2]])


In [33]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

query = "What is the best source for learning Dynamic Programming?"

tokens = tokenizer.tokenize(query)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)

Input IDs: tensor([[ 2264,    16,     5,   275,  1300,    13,  2239, 29614, 39538,   116]])


In [34]:
print(output)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-2.5217, -8.2146, -8.2114, -7.8768, -7.6339, -8.3176, -8.1205, -8.2073,
         -8.5701, -8.3297]], grad_fn=<CloneBackward0>), end_logits=tensor([[-1.5421, -7.0788, -7.2135, -6.8254, -5.6122, -6.7535, -6.7843, -6.9499,
         -6.2422, -6.3289]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)


In [38]:
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)))
print(model(torch.tensor(sequence2_ids)))
print(model(torch.tensor(batched_ids)))

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.9507, -6.2646, -6.0937]], grad_fn=<CloneBackward0>), end_logits=tensor([[-0.2857, -4.8904, -4.6521]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)
QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-2.2159, -6.9054]], grad_fn=<CloneBackward0>), end_logits=tensor([[-1.3976, -5.6106]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)
QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.9507, -6.2646, -6.0937],
        [-1.8093, -6.3697, -5.8797]], grad_fn=<CloneBackward0>), end_logits=tensor([[-0.2857, -4.8905, -4.6521],
        [-0.8618, -4.7892, -4.4480]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)


In [39]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.9507, -6.2646, -6.0937],
        [-2.2159, -6.9054, -6.8160]], grad_fn=<CloneBackward0>), end_logits=tensor([[-0.2857, -4.8905, -4.6521],
        [-1.3976, -5.6106, -5.5207]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)


In [40]:
print(model(torch.tensor(sequence1_ids)))

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.9507, -6.2646, -6.0937]], grad_fn=<CloneBackward0>), end_logits=tensor([[-0.2857, -4.8904, -4.6521]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)


In [41]:
print(model(torch.tensor(sequence2_ids)))

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-2.2159, -6.9054]], grad_fn=<CloneBackward0>), end_logits=tensor([[-1.3976, -5.6106]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)
