In [23]:

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding, AutoTokenizer


In [24]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [27]:
# Sample sentences of deliberately different lengths, used to demonstrate padding.
sequences = [
    "Hello, how are you?",
    "I'm fine, thank you.",
    "This is an example of different length sentences.",
    "NewYork, what the hell?",
    "How is everybody doing?",
    "I wanna dance with somebody",
]

In [28]:
# Encode each sentence on its own, without padding, so every encoding keeps its
# natural length and padding can be applied later, per batch, by the collator.
# The tokenizer is called directly: `encode_plus` is deprecated in favour of
# `__call__`, which returns the same input_ids / token_type_ids / attention_mask.
encoded_sequences = [tokenizer(sequence, truncation=True, padding=False) for sequence in sequences]
encoded_sequences  # cell output: one encoding per input sentence

[{'input_ids': [101, 7592, 1010, 2129, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [101, 1045, 1005, 1049, 2986, 1010, 4067, 2017, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [101, 2023, 2003, 2019, 2742, 1997, 2367, 3091, 11746, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [101, 2047, 7677, 8024, 1010, 2054, 1996, 3109, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [101, 2129, 2003, 7955, 2725, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [101, 1045, 10587, 3153, 2007, 8307, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}]

In [29]:
# Build a collator that pads every batch to its longest sequence and returns
# PyTorch tensors. Both keyword values below are the collator's defaults,
# written out here so the behaviour is visible at the call site.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt",
)
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [30]:
# Pad all encodings to the longest sequence and stack them into a single batch.
collated_batch = data_collator(encoded_sequences)

print(collated_batch)
# Every field has the same (number of sequences, longest sequence length) shape.
print(
    *(collated_batch[field].shape for field in ("input_ids", "token_type_ids", "attention_mask")),
    sep="\n",
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  7592,  1010,  2129,  2024,  2017,  1029,   102,     0,     0,
             0],
        [  101,  1045,  1005,  1049,  2986,  1010,  4067,  2017,  1012,   102,
             0],
        [  101,  2023,  2003,  2019,  2742,  1997,  2367,  3091, 11746,  1012,
           102],
        [  101,  2047,  7677,  8024,  1010,  2054,  1996,  3109,  1029,   102,
             0],
        [  101,  2129,  2003,  7955,  2725,  1029,   102,     0,     0,     0,
             0],
        [  101,  1045, 10587,  3153,  2007,  8307,   102,     0,     0,     0,
             0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1,

In [33]:
# Number of encoded sequences grouped into each mini-batch.
batch_size = 2
# Wrap the encodings in a DataLoader; the collator pads each mini-batch as it is built.
dataloader = DataLoader(
    dataset=encoded_sequences,
    batch_size=batch_size,
    collate_fn=data_collator,
)
dataloader

<torch.utils.data.dataloader.DataLoader at 0x2840280dc50>

In [34]:
# A DataLoader is an iterable, not a generator: iter() creates a fresh iterator,
# and next() pulls its first batch (no shuffling is configured on this loader).
first_batch = next(iter(dataloader))
print("first_batch", first_batch)
# The batch is dict-like; show which fields it carries.
print([*first_batch.keys()])

first_batch {'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0,    0],
        [ 101, 1045, 1005, 1049, 2986, 1010, 4067, 2017, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['input_ids', 'token_type_ids', 'attention_mask']


In [35]:
# Walk through every mini-batch and report the padded tensor shapes.
for batch in dataloader:
    # First dimension is the batch size (2). The second is the longest sequence
    # in that particular batch, because padding is applied per batch.
    print("Input IDs:", batch['input_ids'].shape)
    print("Attention mask:", batch['attention_mask'].shape)
    print("---")

Input IDs: torch.Size([2, 10])
Attention mask: torch.Size([2, 10])
---
Input IDs: torch.Size([2, 11])
Attention mask: torch.Size([2, 11])
---
Input IDs: torch.Size([2, 7])
Attention mask: torch.Size([2, 7])
---
