In [1]:
# preprocessing on the GLUE SST-2 dataset
# composed of single sentences, not pairs

In [2]:
# Load the GLUE SST-2 dataset (single-sentence sentiment task, not MRPC).
# The trailing expression displays the resulting DatasetDict with its
# train / validation / test splits and their row counts.
print("get a DatasetDict object which contains the training set, the validation set, and the test set")
from datasets import load_dataset

raw_datasets = load_dataset("glue", "sst2")
raw_datasets

get a DatasetDict object which contains the training set, the validation set, and the test set


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [3]:
# Select the training split by key (the DatasetDict is indexed like a
# dictionary), then index into the split to display its first example.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [4]:
# Show the column types of the training split. `label` is a ClassLabel whose
# `names` list gives the meaning of each integer label by position.
print("the names=['negative', 'positive'] indicates the values: 0 is negative, 1 is positive")
raw_train_dataset.features

the names=['negative', 'positive'] indicates the values: 0 is negative, 1 is positive


{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [5]:
# Re-bind the training split (same assignment as the earlier cell, so this
# cell also works when run on its own) and display the example at index 15.
raw_train_dataset = raw_datasets["train"]
print("print element 15")
raw_train_dataset[15]

print element 15


{'sentence': 'the greatest musicians ', 'label': 1, 'idx': 15}

In [6]:
# Display the training example at index 87 (a longer sentence than index 15).
print("print element 87")
raw_train_dataset[87]

print element 87


{'sentence': 'khouri manages , with terrific flair , to keep the extremes of screwball farce and blood-curdling family intensity on one continuum . ',
 'label': 1,
 'idx': 87}

In [8]:
# Load the BERT tokenizer and pass the whole "sentence" column of the
# training split to it in one direct call. The author flags this as
# "not a good approach"; `tokenized_sentences_1` is not used by any later
# cell visible in this notebook.
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence"])
print("token_type_ids indicate which sentence")
# Tokenize a single sentence to inspect the fields the tokenizer returns
# (input_ids, token_type_ids, attention_mask). With only one sentence every
# token_type_id in the displayed output is 0.
inputs = tokenizer("This is the first sentence.")
inputs

token_type_ids indicate which sentence


{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Map the input IDs back to their token strings. The displayed result shows
# the [CLS] and [SEP] markers the tokenizer placed around the lower-cased
# sentence.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]']

In [10]:
# Tokenize the entire training split in a single call, with padding and
# truncation enabled.
# NOTE(review): padding=True presumably pads every sequence to the longest one
# in this call, i.e. across the whole split -- confirm against the tokenizer
# documentation.
# NOTE: `tokenized_dataset` (singular) is a separate variable from the
# `tokenized_datasets` (plural) built with map() further down; it is not used
# by any later cell visible in this notebook.
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence"],
    padding=True,
    truncation=True,
)

In [14]:
def tokenize_function(example):
    """Tokenize the ``"sentence"`` entry of ``example`` with the notebook's tokenizer.

    Args:
        example: Mapping that has a ``"sentence"`` key. This notebook passes
            the function to ``raw_datasets.map(..., batched=True)``, so the
            value is presumably a list of sentences rather than one string.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input, with
        truncation enabled and no padding requested.
    """
    sentences = example["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [17]:
# Apply tokenize_function to every split of the DatasetDict in batched mode.
# The displayed result shows input_ids, token_type_ids and attention_mask
# added as new columns to train, validation and test.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [19]:
# Dynamic padding: build a collator that pads the examples of each batch to
# the length of the longest element in that batch, rather than padding the
# whole dataset up front.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
# Take the first 8 training examples and list their token counts to see how
# much the lengths vary. Only the "idx" and "sentence" columns are dropped,
# so the label and the tokenizer outputs remain for the collator call in the
# next cell.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence")
}
list(map(len, samples["input_ids"]))

[10, 11, 15, 10, 22, 13, 29, 6]

In [26]:
# Collate the 8 samples into a single batch and display each field's shape.
# The recorded output shows every sequence padded to 29 tokens (the longest
# length listed by the previous cell) and the label column returned under
# the key "labels".
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 29]),
 'token_type_ids': torch.Size([8, 29]),
 'attention_mask': torch.Size([8, 29]),
 'labels': torch.Size([8])}