<a href="https://colab.research.google.com/github/mydreamisto/notebooks/blob/main/datasets_processing_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
# Load the "gimmaru/glue-sst2" dataset by name; the result is indexed by
# split name below (e.g. raw_dataset["validation"]).
raw_dataset = load_dataset("gimmaru/glue-sst2")

In [17]:
# Keep a handle on the validation split for the inspection cells that follow.
raw_validation_dataset = raw_dataset["validation"]

In [19]:
# Display the first validation example (fields: sentence, label, idx).
raw_validation_dataset[0]

{'sentence': 'it gets onto the screen just about as much of the novella as one could reasonably expect , and is engrossing and moving in its own right . ',
 'label': 1,
 'idx': 726}

In [20]:
# Display the column schema: the type of each field and the label names.
raw_validation_dataset.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [21]:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-uncased"
# Load the tokenizer associated with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [24]:
def tokenize_function(example):
    """Tokenize the ``"sentence"`` field of ``example`` with the notebook's tokenizer.

    Args:
        example: A mapping with a ``"sentence"`` entry. Whatever is stored
            under that key is handed to the tokenizer unchanged.

    Returns:
        The tokenizer's output for that entry, produced with truncation
        enabled.
    """
    sentences = example["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded
# Apply tokenize_function across raw_dataset. With batched=True the function
# is called on batches of examples rather than one example at a time.
tokenized_dataset = raw_dataset.map(tokenize_function, batched = True)

In [23]:
from transformers import DataCollatorWithPadding
# Build a collator around the tokenizer; it is used further down to turn a
# group of variable-length samples into a single padded batch.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [28]:
# Take the first eight tokenized validation examples and keep every column
# except "sentence" and "idx".
#  - "sentence" holds the raw text (a string column), and strings cannot be
#    turned into tensors for the model.
#  - "idx" is only the example's index in the dataset; presumably it is not a
#    model input, so it is dropped as well.
samples = {
    column: values
    for column, values in tokenized_dataset["validation"][:8].items()
    if column not in ("sentence", "idx")
}
# Token count of each of the eight samples (they differ before padding).
[len(token_ids) for token_ids in samples["input_ids"]]

[32, 24, 30, 40, 42, 16, 16, 15]

In [30]:
# Collate the samples into one batch; the displayed shapes show every
# sequence field padded to a common length.
batch = data_collator(samples)
# Map each field of the batch to the shape of its tensor.
{field: tensor.shape for field, tensor in batch.items()}

{'input_ids': torch.Size([8, 42]),
 'token_type_ids': torch.Size([8, 42]),
 'attention_mask': torch.Size([8, 42]),
 'labels': torch.Size([8])}