# Handling multiple sequences (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Reproduce by hand what the tokenizer does: split into tokens, then map to ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# A flat list of ids becomes a 1-D tensor, i.e. there is no batch dimension.
input_ids = torch.tensor(ids)

# This call is expected to fail because the model is fed a single un-batched
# sequence. The error is caught and printed so that the demonstration stays
# visible without aborting a "Restart & Run All".
# NOTE(review): the exact exception type depends on the installed
# transformers/torch versions; anything outside this tuple still propagates.
try:
    model(input_ids)
except (IndexError, ValueError, RuntimeError) as err:
    print(f"{type(err).__name__}: {err}")

In [4]:
#Here we tried to do everything the tokenizer did behind the scenes when we applied it to one sequence.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Self-contained setup: checkpoint, example sentence, tokenizer and model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
sequence = "I've been waiting for a HuggingFace course my whole life."
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# --- First method: tokenize and convert to ids by hand -----------------------
# tokenize() is handed one plain string here (the author notes that passing a
# list of strings instead leads to an error).
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# torch.tensor(ids) on a flat list is a 1-D tensor of shape [n], where n is the
# number of tokens (torch.Size([14]) for this sentence). Transformers models
# generally expect a batch of inputs, even for a single sentence: the extra
# leading dimension is the batch size, even when the batch holds one element.
input_ids = torch.tensor(ids)
print("First method:", input_ids)
print("Shape of first method:", input_ids.size())

# --- Second method: let the tokenizer do everything --------------------------
# tokenizer(sequence, return_tensors="pt") does more than turn the id list into
# a tensor: it also wraps it in an extra leading dimension, giving shape [1, m].
# In the recorded run the shape is torch.Size([1, 16]), two ids longer than the
# manual version, because the tokenizer additionally puts id 101 in front and
# id 102 at the end (presumably the [CLS]/[SEP] special tokens - TODO confirm).
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
encoded_ids = tokenized_inputs["input_ids"]
print("Second method:", encoded_ids)
print("Shape of second method:", encoded_ids.size())

print("------------------------------------------------------------------------------------------------")
# Wrapping the manual ids in one more list adds the batch dimension by hand,
# which is enough for the model to accept them.
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)
output = model(input_ids=input_ids)
print("Logits:", output.logits)

First method: tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
         2026,  2878,  2166,  1012])
Shape of first method: torch.Size([14])
Second method: tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])
Shape of second method: torch.Size([1, 16])
------------------------------------------------------------------------------------------------
Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Self-contained setup: checkpoint, example sentence, tokenizer and model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
sequence = "I've been waiting for a HuggingFace course my whole life."
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Manual encoding of the sentence: tokens first, then their vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Starting from one single sequence, build a batch that contains that sequence
# twice.
# In deep learning, models usually process inputs in batches rather than one
# sample at a time; turning a single sequence into a batch simulates feeding
# several samples at once. A Python list can hold several elements, so putting
# the same id list into an outer list twice forms a batch of two identical rows.
batched_ids = [ids, ids]

input_ids = torch.tensor(batched_ids)
output = model(input_ids=input_ids)
# Both rows are the same sentence, so the two rows of logits are identical.
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
# Ragged batch: the first row has three ids and the second only two, so this
# nested list is not rectangular and cannot be converted to a tensor directly.
batched_ids = [[200] * 3, [200] * 2]

In [None]:
# The padding principle: append a dedicated padding token to every shorter row
# until all rows are as long as the longest one, which makes the batch
# rectangular. The value 100 is only a placeholder id used to illustrate the
# idea; the later cells of this notebook use tokenizer.pad_token_id instead.
padding_id = 100

unpadded_ids = [[200, 200, 200], [200, 200]]
longest = max(len(row) for row in unpadded_ids)
batched_ids = [row + [padding_id] * (longest - len(row)) for row in unpadded_ids]

In [None]:
# Load the classifier again from the checkpoint chosen in the earlier cells.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two single-sequence batches, plus one batch holding both sequences with the
# shorter one padded using the tokenizer's own padding id.
pad_id = tokenizer.pad_token_id
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [[200, 200, 200], [200, 200, pad_id]]

# Print the logits for each input in turn. No attention mask is passed here, so
# in the recorded run the padded second row of the batch does not reproduce the
# logits the short sequence gets on its own.
for candidate in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(candidate)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward>)
tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward>)

In [None]:
# The last row of the previous output (the padded sequence inside the batch) is
# wrong, because the attention layers contextualize every token, including the
# padding token. An attention mask tells the attention layers which positions
# to ignore: 1 marks a real token, 0 marks padding.
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
attention_mask = [[1, 1, 1], [1, 1, 0]]

# Like the ids, the attention mask has to be converted to a tensor as well.
ids_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)

outputs = model(input_ids=ids_tensor, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward>)

In [None]:
# Truncate the sequence to the maximum length the model supports.
# `max_sequence_length` was never defined anywhere in this notebook, so the
# cell raised NameError on a fresh kernel; it is now taken from the tokenizer
# loaded in the earlier cells.
# NOTE(review): `sequence` is the raw sentence string set in the earlier cells,
# so this slice counts characters, not tokens. To cap the number of tokens,
# slice the id list instead or call the tokenizer with truncation=True and
# max_length=max_sequence_length - confirm which one is intended.
max_sequence_length = tokenizer.model_max_length
sequence = sequence[:max_sequence_length]