<a href="https://colab.research.google.com/github/mydreamisto/notebooks/blob/main/section2_pt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before: load the tokenizer and the sequence-classification model from
# one checkpoint, then tokenize two sentences into a padded tensor batch.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: add one label per sequence so the model output carries a loss.
batch["labels"] = torch.tensor([1, 1])

# AdamW comes from torch.optim: the copy that used to live in `transformers`
# was deprecated and is no longer importable from recent releases.
optimizer = AdamW(model.parameters())

# One training step: forward pass, backward pass, parameter update.
loss = model(**batch).loss
loss.backward()
optimizer.step()

In [None]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset, then display the
# returned object as the cell output.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

In [None]:
# Pick the "train" split, then look at the "sentence1" field of its first row.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]["sentence1"] # drill down one level at a time: index each level in order of containment (split -> row -> column)

In [None]:
# Display the `features` attribute of the train split
# (presumably the column names and their types -- see the datasets docs).
raw_train_dataset.features

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize all first sentences and all second sentences of the train split,
# each column on its own.
# list(...) materialises the column: newer `datasets` releases may return a
# lazy column object instead of a plain list, and the tokenizer expects
# a list of strings. On older releases this is a harmless copy.
tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))

# Show only the first two encodings of each result; printing the encodings of
# the whole split would flood the notebook output.
print({key: values[:2] for key, values in tokenized_sentences_1.items()})
print({key: values[:2] for key, values in tokenized_sentences_2.items()})

In [None]:
# tokenize the 15th row of the train dataset (index 14, zero-based)
row_15 = raw_datasets["train"][14]

# Each sentence encoded on its own.
tokenized_sentences_15_1 = tokenizer(row_15["sentence1"])
tokenized_sentences_15_2 = tokenizer(row_15["sentence2"])
print(tokenized_sentences_15_1)
print(tokenized_sentences_15_2)
print("----------------------------------------------------------------------------------------------------------------------------------------------")

# Both sentences encoded together as one pair: the second argument has to be
# "sentence2" (passing "sentence1" twice would pair the sentence with itself).
tokenized_sentences_15_both = tokenizer(row_15["sentence1"], row_15["sentence2"])
print(tokenized_sentences_15_both)

In [None]:
# decode the 15th element: convert the input IDs stored in
# `tokenized_sentences_15_both` back into their token strings and print them
print(tokenizer.convert_ids_to_tokens(tokenized_sentences_15_both["input_ids"]))

In [None]:
# Pass two sentences to the tokenizer in one call, then convert the resulting
# input IDs back to tokens to see how the two sentences were combined.
inputs = tokenizer("This is the first sentence.", "This is the second one.")
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

In [None]:
# Tokenize every sentence pair of the train split in one call, with padding
# and truncation enabled.
# list(...) materialises each column: newer `datasets` releases may return a
# lazy column object instead of a plain list, and the tokenizer expects
# lists of strings. On older releases this is a harmless copy.
tokenized_dataset = tokenizer(
    list(raw_datasets["train"]["sentence1"]),
    list(raw_datasets["train"]["sentence2"]),
    padding=True,
    truncation=True,
)

In [9]:
# NOTE: this cell repeats the tokenization done in the previous cell and
# rebinds `tokenized_dataset` to an equivalent result.
# list(...) materialises each column: newer `datasets` releases may return a
# lazy column object instead of a plain list, and the tokenizer expects
# lists of strings. On older releases this is a harmless copy.
tokenized_dataset = tokenizer(
    list(raw_datasets["train"]["sentence1"]),
    list(raw_datasets["train"]["sentence2"]),
    padding=True,
    truncation=True,
)

# 以下需要重点理解：

In [None]:
def tokenize_function(example):
    """Tokenize the "sentence1" and "sentence2" fields of `example` together.

    Uses the notebook-level `tokenizer` with truncation enabled; no padding is
    requested here.

    Args:
        example: a mapping that provides "sentence1" and "sentence2".

    Returns:
        Whatever the tokenizer returns for the two inputs.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)
# Dataset.map() applies a function to every element of a dataset.
# Here tokenize_function is applied to all of the datasets in raw_datasets;
# batched = True is passed, so the function presumably receives several
# examples per call (see the datasets documentation for map).
tokenized_datasets = raw_datasets.map(tokenize_function, batched = True)
tokenized_datasets

# Dynamic Padding

**DataCollatorWithPadding类：用于数据填充**

`DataCollatorWithPadding` 类（通过 `tokenizer` 参数获知填充标记和填充位置）的实例是一个 *collate function*（将多个样本组合成一个 batch 批次的函数），能够将样本转换为 PyTorch tensors 并将它们连接起来。

In [None]:
# In practice, when a dataset is split into batches, every element inside a batch
# must have the same length so that the batch can be fed to a deep-learning model.
# Different text inputs may have different lengths, so they have to be padded to a
# common length before they can be combined into one batch.

# DataCollatorWithPadding is the class used for this padding step.
# tokenizer: an already-instantiated tokenizer object. It is handed to
# DataCollatorWithPadding so that the collator knows how to pad. Specifically, the
# tokenizer can tell DataCollatorWithPadding:
# (1) which padding token to use: different tokenizers may use different padding
#     tokens; for some NLP tasks it may be [PAD], for example.
# (2) where to pad: some models expect padding on the left of the input, others on
#     the right, and the tokenizer can provide this information.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Take the first eight tokenized training examples and keep every column except
# "idx", "sentence1" and "sentence2". The dict comprehension walks over the
# key/value pairs and keeps a pair only when its key is not in that list.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ["idx", "sentence1", "sentence2"]
}
# Length of the input_ids of each kept example.
list(map(len, samples["input_ids"]))

In [None]:
# Run the collator on the selected samples, then show the shape of every entry
# of the resulting batch.
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}