<a href="https://colab.research.google.com/github/loganathanspr/nlp_course/blob/main/multiple_sequences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Handling multiple sequences (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sentiment-classification checkpoint used throughout the notebook.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize by hand: text -> tokens -> vocabulary ids (no special tokens added).
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)  # a single sequence, without a batch dimension

# This call is expected to fail, because the model requires a batch (a list of
# sequences) rather than one un-batched sequence. The error is caught and
# printed so the demonstration does not abort a "Run All".
try:
    model(input_ids)
except IndexError as error:
    print(f"{type(error).__name__}: {error}")

IndexError: ignored

In [4]:
# Let the tokenizer handle the whole preprocessing and return PyTorch tensors.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")

# The result already carries a batch dimension (note the nested brackets).
ids_from_tokenizer = tokenized_inputs["input_ids"]
print(ids_from_tokenizer)

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [5]:
# Display the complete tokenizer output (input IDs together with the attention mask).
tokenized_inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
# `torch`, `tokenizer`, `model` and `sequence` were already set up in the first
# code cell, so they are reused here instead of being re-imported and re-loaded.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Wrapping `ids` in a list adds the batch dimension the model expects.
input_ids = torch.tensor([ids])
print(f"Input IDs: {input_ids}")

output = model(input_ids)
print(f"Logits: {output.logits}")

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [9]:
# Batch inputs: the same sequence repeated twice, stacked into one tensor.
batched_ids = [ids] * 2
input_ids = torch.tensor(batched_ids)
output = model(input_ids)

print(f"Input IDs: {input_ids}")
batch_logits = output.logits  # one row of logits per sequence in the batch
print(f"Output: {batch_logits}")

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Output: tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


# Padding the inputs

In [12]:
# Two sequences of different lengths (3 and 2 ids): a ragged nested list.
batched_ids = [[200] * 3, [200] * 2]

In [13]:
# Pad the shorter sequence with `padding_id` so that every row has the same length.
padding_id = 100
unpadded_rows = [[200, 200, 200], [200, 200]]
row_width = max(len(row) for row in unpadded_rows)
batched_ids = [row + [padding_id] * (row_width - len(row)) for row in unpadded_rows]

In [14]:
# `model` and `tokenizer` are reused from the earlier cells; reloading the
# checkpoint here would only repeat work without changing the results.
sequence_ids1 = [[200, 200, 200]]
sequence_ids2 = [[200, 200]]

# Same two sequences as one batch, with the shorter one padded by the
# tokenizer's real pad token id.
batched_ids = [
    sequence_ids1[0],
    sequence_ids2[0] + [tokenizer.pad_token_id],
]

# No attention mask is passed, so compare the logits of the padded row in the
# batch with the logits of the same sequence run on its own.
print(model(torch.tensor(sequence_ids1)).logits)
print(model(torch.tensor(sequence_ids2)).logits)
print(model(torch.tensor(batched_ids)).logits)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


# Attention Masks

In [15]:
pad_id = tokenizer.pad_token_id
batched_ids = [[200, 200, 200], [200, 200, pad_id]]

# Same layout as `batched_ids`: 1 for a real token, 0 for the padded position.
attention_mask = [[1, 1, 1], [1, 1, 0]]

ids_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)
output = model(ids_tensor, attention_mask=mask_tensor)
print(f"Logits: {output.logits}")

Logits: tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


## Test 1

In [39]:
def tokenize_and_classify(text):
    """Tokenize `text` by hand and run it through the model as a batch of one.

    Returns a `(tokens, ids, model_output)` tuple; no special tokens are added.
    """
    text_tokens = tokenizer.tokenize(text)
    text_ids = tokenizer.convert_tokens_to_ids(text_tokens)
    return text_tokens, text_ids, model(torch.tensor([text_ids]))


# NOTE: seq1 uses a typographic apostrophe (’), unlike the ASCII one in
# `sequence` earlier in the notebook, so its ids and logits differ slightly.
seq1 = "I’ve been waiting for a HuggingFace course my whole life."
seq2 = "I hate this so much!"
batched_seq = [seq1, seq2]

seq1_tokens, seq1_ids, seq1_output = tokenize_and_classify(seq1)
seq2_tokens, seq2_ids, seq2_output = tokenize_and_classify(seq2)

print(f"Seq1 ids: {seq1_ids}")
print(f"Seq2 ids: {seq2_ids}")
print(f"Shape (seq1, seq2): ({len(seq1_ids)}, {len(seq2_ids)})")
print(f"Seq1 logits: {seq1_output.logits}")
print(f"Seq2 logits: {seq2_output.logits}")

print(f"\nBatched seq: {batched_seq}")

Seq1 ids: [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
Seq2 ids: [1045, 5223, 2023, 2061, 2172, 999]
Shape (seq1, seq2): (14, 6)
Seq1 logits: tensor([[-2.5720,  2.6852]], grad_fn=<AddmmBackward0>)
Seq2 logits: tensor([[ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)

Batched seq: ['I’ve been waiting for a HuggingFace course my whole life.', 'I hate this so much!']


In [31]:
# Tokenize both sentences in one call. No padding or tensor conversion is
# requested, so the result holds plain Python lists, one per sentence.
batched_inputs = tokenizer(batched_seq)
# Last expression of the cell: shows the input IDs and attention masks.
batched_inputs

{'input_ids': [[101, 1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [24]:
# Token count of each of the two tokenized sentences, as a tuple.
tuple(map(len, batched_inputs["input_ids"]))

(16, 8)

In [42]:
# Rebuild the batch by hand from the ids produced without special tokens:
# pad every row to the longest sequence and mask out the padded positions.
# The lengths are computed instead of hard-coded, so the cell keeps producing
# a rectangular batch if the sentences change.
pad_id = tokenizer.pad_token_id
unpadded_rows = [seq1_ids, seq2_ids]
target_len = max(len(row) for row in unpadded_rows)

for row_index, row in enumerate(unpadded_rows):
    n_pad = target_len - len(row)
    # `row + [...]` builds a new list, so seq1_ids / seq2_ids are not aliased.
    batched_inputs["input_ids"][row_index] = row + [pad_id] * n_pad
    batched_inputs["attention_mask"][row_index] = [1] * len(row) + [0] * n_pad

print(f"Shape input_ids: {len(batched_inputs['input_ids'][0])}, {len(batched_inputs['input_ids'][1])}")
print(f"Shape attention mask: {len(batched_inputs['attention_mask'][0])}, {len(batched_inputs['attention_mask'][1])}")

Shape input_ids: 14, 14
Shape attention mask: 14, 14


In [43]:
# Convert the hand-padded ids and mask to tensors and run them as one batch.
model_kwargs = {
    field: torch.tensor(batched_inputs[field])
    for field in ("input_ids", "attention_mask")
}
batched_output = model(**model_kwargs)
print(f"Batched output logits: {batched_output.logits}")

Batched output logits: tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)
