## Tokenizers - https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt

In [1]:
# Tokenization
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple with Transformers and PyTorch"
tokens = tokenizer.tokenize(sequence)

print(tokens)

  from .autonotebook import tqdm as notebook_tqdm


['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple', 'with', 'Transformers', 'and', 'P', '##y', '##T', '##or', '##ch']


In [2]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014, 1114, 25267, 1105, 153, 1183, 1942, 1766, 1732]


In [3]:
sentences= ["I’ve been waiting for a HuggingFace course my whole life." , "I hate this so much!"]

tokens = tokenizer.tokenize(sentences[0])
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

['I', '’', 've', 'been', 'waiting', 'for', 'a', 'Hu', '##gging', '##F', '##ace', 'course', 'my', 'whole', 'life', '.']
[146, 787, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119]


In [6]:
# Decoding
decoded_string = tokenizer.decode(ids)
decoded_string

'I ’ ve been waiting for a HuggingFace course my whole life.'

### Multiple sentences

In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids,ids])
print(input_ids)
# This line will fail.
model(input_ids)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])


SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [20]:
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [29]:
output = model(**tokenized_inputs)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


'[unused0]'

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [26]:
sentences= ["I’ve been waiting for a HuggingFace course my whole life." , "I hate this so much!"]

sentences_1_tokens = tokenizer.tokenize(sentences[0])
sentences_2_tokens = tokenizer.tokenize(sentences[1])

sentences_1_ids = tokenizer.convert_tokens_to_ids(sentences_1_tokens)
sentences_2_ids = tokenizer.convert_tokens_to_ids(sentences_2_tokens)

batched_ids = [
    sentences_1_ids,
    sentences_2_ids,
]

# add padding for batched input
batched_ids = [
    sentences_1_ids + [tokenizer.pad_token_id] * (len(sentences_2_ids) - len(sentences_1_ids)),
    sentences_2_ids + [tokenizer.pad_token_id] * (len(sentences_1_ids) - len(sentences_2_ids)),
]

attention_mask = [
    [1 if token_id != tokenizer.pad_token_id else 0 for token_id in sequence_ids]
    for sequence_ids in batched_ids
]

print('Model without attention mask',model(torch.tensor(batched_ids)).logits)
print('Model with attention mask',model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask)).logits)

# convert to labels


Model without attention mask tensor([[-2.5720,  2.6852],
        [ 2.5423, -2.1265]], grad_fn=<AddmmBackward0>)
Model with attention mask tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)


In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
print(output)
print(output.logits.argmax(dim=-1))

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([1, 1])
