In [1]:
# multiple sequences
# based on https://huggingface.co/learn/nlp-course/chapter2/5

In [2]:
# handling batch inputs

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the sentiment-classification checkpoint and the tokenizer that belongs to it.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "Ford and Zaphod were cousins"

# Tokenize by hand: split the sentence into tokens, then map each token to its id.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# A flat, 1-D tensor of ids is not a valid input: the model expects a batch
# of sequences, so the call below would fail.
input_ids = torch.tensor(ids)
# model(input_ids)  # IndexError: too many indices for tensor of dimension 1

# Add a leading batch dimension (a batch containing one sequence) and the
# forward pass works.
input_ids = input_ids.unsqueeze(0)
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 4811,  1998, 23564,  8458,  7716,  2020, 12334]])
Logits: tensor([[-0.9555,  1.0167]], grad_fn=<AddmmBackward0>)


In [11]:
# Batching means sending several sequences through the model in one call.
# A batch is simply a list of id lists; here the same `ids` list is used twice,
# which gives a batch of two identical sequences.
batched_ids = [ids] * 2

print(batched_ids)

[[4811, 1998, 23564, 8458, 7716, 2020, 12334], [4811, 1998, 23564, 8458, 7716, 2020, 12334]]


In [15]:
# Padding is required because a tensor must be rectangular: every row of a
# batch needs the same length. A ragged batch like this one cannot be turned
# into a tensor:
#
#     batched_ids = [
#         [200, 200, 200],
#         [200, 200],
#     ]
#
# The shorter row is therefore filled up with the tokenizer's padding id.
#
# `model` is the instance already loaded from `checkpoint` in the first code
# cell; it is reused here rather than being loaded from disk a second time.

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# No attention mask is passed here on purpose. Compare the second row of the
# batched logits with the logits of `sequence2_ids` run on its own: they
# differ, because the padding position is treated like a real token.
print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s:

- 1s indicate that the corresponding tokens should be attended to.
- 0s indicate that the corresponding tokens should not be attended to.

In [16]:
# The same padded batch as in the previous cell, now paired with an attention
# mask so that the padding position is ignored.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]
# One flag per id, same layout as `batched_ids`: 1 = attend, 0 = ignore.
# Only the trailing pad slot of the second row is switched off.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

batch_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)

# With the mask applied, the second row's logits match those obtained when the
# two-token sequence was run on its own.
outputs = model(batch_tensor, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
