### 2.1.3 划分数据集

In [1]:
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence

bert_tokenizer = BertTokenizer.from_pretrained("./BERT中文词典")

# 输入文本
text = "乘风破浪会有时，直挂云帆济沧海。"

# 分词
tokens = bert_tokenizer.tokenize(text)

# 限制最大序列长度
max_len = 8
min_len = 1

# 生成训练数据
input_ids = []
labels = []

# 遍历不同的序列长度
for seq_len in range(min_len, max_len + 1):
    for i in range(len(tokens) - seq_len):
        # 获取当前的输入序列
        input_seq = tokens[i:i+seq_len]
        # 获取下一个token作为标签
        label_seq = tokens[i+seq_len] if i+seq_len < len(tokens) else None
        
        # 将token转化为id
        input_ids_seq = bert_tokenizer.convert_tokens_to_ids(input_seq)
        label_id = bert_tokenizer.convert_tokens_to_ids([label_seq])[0] if label_seq else -1  # 如果没有标签则使用-1作为填充值
        
        # 保存输入和标签
        input_ids.append(input_ids_seq)
        labels.append(label_id)
        
print(input_ids)
print(labels)

# 填充输入序列，使其长度一致
# 使用 pad_token_id 填充
input_ids_padded = pad_sequence([torch.tensor(seq) for seq in input_ids], 
                                batch_first=True, 
                                padding_value=bert_tokenizer.pad_token_id)

# 转换为PyTorch tensor
labels_tensor = torch.tensor(labels)

# 打印tensor数据
print(input_ids_padded.shape, labels_tensor.shape)
print(input_ids_padded)
print(labels_tensor)

# 打印前几个样本
for i in range(len(input_ids_padded)): 
    # 解码输入序列，过滤掉 pad_token_id
    input_ids_filtered = [id for id in input_ids_padded[i].tolist() if id != bert_tokenizer.pad_token_id]
    input_seq_decoded = bert_tokenizer.decode(input_ids_filtered, skip_special_tokens=True)
    
    # 解码标签序列，如果标签是-1则跳过
    if labels_tensor[i].item() != -1:
        label_decoded = bert_tokenizer.decode([labels_tensor[i].item()], skip_special_tokens=True)
    else:
        label_decoded = "N/A"  # 如果标签是无效的，标记为 N/A
    
    print(f"输入序列: {input_seq_decoded.replace(' ','')} --> 标签: {label_decoded}")


[[733], [7599], [4788], [3857], [833], [3300], [3198], [8024], [4684], [2899], [756], [2359], [3845], [3771], [3862], [733, 7599], [7599, 4788], [4788, 3857], [3857, 833], [833, 3300], [3300, 3198], [3198, 8024], [8024, 4684], [4684, 2899], [2899, 756], [756, 2359], [2359, 3845], [3845, 3771], [3771, 3862], [733, 7599, 4788], [7599, 4788, 3857], [4788, 3857, 833], [3857, 833, 3300], [833, 3300, 3198], [3300, 3198, 8024], [3198, 8024, 4684], [8024, 4684, 2899], [4684, 2899, 756], [2899, 756, 2359], [756, 2359, 3845], [2359, 3845, 3771], [3845, 3771, 3862], [733, 7599, 4788, 3857], [7599, 4788, 3857, 833], [4788, 3857, 833, 3300], [3857, 833, 3300, 3198], [833, 3300, 3198, 8024], [3300, 3198, 8024, 4684], [3198, 8024, 4684, 2899], [8024, 4684, 2899, 756], [4684, 2899, 756, 2359], [2899, 756, 2359, 3845], [756, 2359, 3845, 3771], [2359, 3845, 3771, 3862], [733, 7599, 4788, 3857, 833], [7599, 4788, 3857, 833, 3300], [4788, 3857, 833, 3300, 3198], [3857, 833, 3300, 3198, 8024], [833, 3300, 

In [2]:
%%time
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
from tqdm import tqdm

# 读取语料库
with open('./data/people.cn/news.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()
train_data = corpus[:1000000]
val_data = corpus[-50000:]

# 假设这里的 bert_tokenizer 已经初始化并可用
bert_tokenizer = BertTokenizer.from_pretrained('./BERT中文词典')

# 整个语料库分词
bert_tokens = bert_tokenizer.tokenize(corpus)

# 限制最大序列长度
max_len = 8
min_len = 1

# 生成训练数据
input_ids = []
labels = []

# 遍历不同的序列长度
for seq_len in range(min_len, max_len + 1):
    for i in tqdm(range(len(bert_tokens) - seq_len), desc=f'Processing seq_len {seq_len}', unit='token'):
        # 获取当前的输入序列
        input_seq = bert_tokens[i:i + seq_len]
        # 获取下一个token作为标签
        label_seq = bert_tokens[i + seq_len] if i + seq_len < len(bert_tokens) else None
        
        # 将token转化为id
        input_ids_seq = bert_tokenizer.convert_tokens_to_ids(input_seq)
        # 如果没有标签则使用-1作为填充值
        label_id = bert_tokenizer.convert_tokens_to_ids([label_seq])[0] if label_seq else -1
        
        # 保存输入和标签
        input_ids.append(input_ids_seq)
        labels.append(label_id)

# 填充输入序列，使其长度一致，使用 pad_token_id 填充
input_ids_padded = pad_sequence([torch.tensor(seq) for seq in 
                                tqdm(input_ids, desc="Padding sequences", unit="sequence")], 
                                batch_first=True, 
                                padding_value=bert_tokenizer.pad_token_id)

# 转换为 PyTorch tensor
labels_tensor = torch.tensor(labels)

# 创建自定义 Dataset 类
class TextDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]

# 创建 TextDataset 实例
dataset = TextDataset(input_ids_padded, labels_tensor)

# 保存数据集到磁盘
torch.save(dataset, 'data/text_dataset.pt')


Processing seq_len 1: 100%|███████████████████████████████████████████| 5271128/5271128 [00:22<00:00, 237381.88token/s]
Processing seq_len 2: 100%|███████████████████████████████████████████| 5271127/5271127 [00:29<00:00, 181250.14token/s]
Processing seq_len 3: 100%|███████████████████████████████████████████| 5271126/5271126 [00:36<00:00, 145816.78token/s]
Processing seq_len 4: 100%|███████████████████████████████████████████| 5271125/5271125 [00:42<00:00, 123710.35token/s]
Processing seq_len 5: 100%|███████████████████████████████████████████| 5271124/5271124 [00:50<00:00, 103940.46token/s]
Processing seq_len 6: 100%|████████████████████████████████████████████| 5271123/5271123 [00:57<00:00, 91305.72token/s]
Processing seq_len 7: 100%|████████████████████████████████████████████| 5271122/5271122 [01:05<00:00, 80879.33token/s]
Processing seq_len 8: 100%|████████████████████████████████████████████| 5271121/5271121 [01:08<00:00, 76446.81token/s]
Padding sequences: 100%|████████████████

CPU times: total: 12min 59s
Wall time: 13min 22s


In [3]:
from torch.utils.data import Dataset

# 重新定义 TextDataset 类
class TextDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]

# 加载保存的 Dataset 数据集
loaded_dataset = torch.load('data/text_dataset.pt', weights_only=False)

# 使用 DataLoader 进行批量加载
from torch.utils.data import DataLoader

dataloader = DataLoader(loaded_dataset, batch_size=4, shuffle=True)

# 测试加载的 DataLoader
for batch_idx, (batchX, batchY) in enumerate(dataloader):
    print(f"批次 {batch_idx + 1}:")
    print("输入序列：", batchX)
    print("标签：", batchY)
    print()
    if batch_idx + 1 >= 5:
        break

批次 1:
输入序列： tensor([[4385,    0,    0,    0,    0,    0,    0,    0],
        [ 852, 6632, 3221, 6821,    0,    0,    0,    0],
        [2137, 6206, 5050, 1920, 6572,    0,    0,    0],
        [4638, 5295, 1737, 2669,  821,  100,    0,    0]])
标签： tensor([3291,  702,  510, 3124])

批次 2:
输入序列： tensor([[ 872,  812, 4638,    0,    0,    0,    0,    0],
        [2398, 2190, 1912, 2458, 3123, 4638, 1104, 2552],
        [5276, 8612,    0,    0,    0,    0,    0,    0],
        [ 868, 4500,  511, 1126,    0,    0,    0,    0]])
标签： tensor([2968, 1469, 1399, 1079])

批次 3:
输入序列： tensor([[5273, 5682, 3736,    0,    0,    0,    0,    0],
        [4415, 6389,    0,    0,    0,    0,    0,    0],
        [ 772, 6631, 8623,  674,    0,    0,    0,    0],
        [2521, 2130, 1587, 8024,    0,    0,    0,    0]])
标签： tensor([2255, 1158, 1039, 2213])

批次 4:
输入序列： tensor([[1277, 6816, 3341,  749,    0,    0,    0,    0],
        [1301, 3813,    0,    0,    0,    0,    0,    0],
        [4050,  691, 12

In [4]:
%%time
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
from tqdm import tqdm

# 读取语料库
with open('./data/people.cn/news.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()
# train_data = corpus[:1000000]
val_data = corpus[-50000:]

# 假设这里的 bert_tokenizer 已经初始化并可用
bert_tokenizer = BertTokenizer.from_pretrained('./BERT中文词典')

# 整个语料库分词
bert_tokens = bert_tokenizer.tokenize(val_data)

# 限制最大序列长度
max_len = 8
min_len = 1

# 生成训练数据
input_ids = []
labels = []

# 遍历不同的序列长度
for seq_len in range(min_len, max_len + 1):
    for i in tqdm(range(len(bert_tokens) - seq_len), desc=f'Processing seq_len {seq_len}', unit='token'):
        # 获取当前的输入序列
        input_seq = bert_tokens[i:i + seq_len]
        # 获取下一个token作为标签
        label_seq = bert_tokens[i + seq_len] if i + seq_len < len(bert_tokens) else None
        
        # 将token转化为id
        input_ids_seq = bert_tokenizer.convert_tokens_to_ids(input_seq)
        # 如果没有标签则使用-1作为填充值
        label_id = bert_tokenizer.convert_tokens_to_ids([label_seq])[0] if label_seq else -1
        
        # 保存输入和标签
        input_ids.append(input_ids_seq)
        labels.append(label_id)

# 填充输入序列，使其长度一致，使用 pad_token_id 填充
input_ids_padded = pad_sequence([torch.tensor(seq) for seq in 
                                tqdm(input_ids, desc="Padding sequences", unit="sequence")], 
                                batch_first=True, 
                                padding_value=bert_tokenizer.pad_token_id)

# 转换为 PyTorch tensor
labels_tensor = torch.tensor(labels)

# 创建自定义 Dataset 类
class TextDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]

# 创建 TextDataset 实例
dataset = TextDataset(input_ids_padded, labels_tensor)

# 保存数据集到磁盘
torch.save(dataset, 'data/val_dataset.pt')


Processing seq_len 1: 100%|███████████████████████████████████████████████| 47767/47767 [00:00<00:00, 176564.03token/s]
Processing seq_len 2: 100%|███████████████████████████████████████████████| 47766/47766 [00:00<00:00, 146713.12token/s]
Processing seq_len 3: 100%|███████████████████████████████████████████████| 47765/47765 [00:00<00:00, 133926.60token/s]
Processing seq_len 4: 100%|███████████████████████████████████████████████| 47764/47764 [00:00<00:00, 108760.92token/s]
Processing seq_len 5: 100%|████████████████████████████████████████████████| 47763/47763 [00:00<00:00, 94799.03token/s]
Processing seq_len 6: 100%|████████████████████████████████████████████████| 47762/47762 [00:00<00:00, 80296.69token/s]
Processing seq_len 7: 100%|████████████████████████████████████████████████| 47761/47761 [00:00<00:00, 74775.97token/s]
Processing seq_len 8: 100%|████████████████████████████████████████████████| 47760/47760 [00:00<00:00, 70775.08token/s]
Padding sequences: 100%|████████████████

CPU times: total: 13.5 s
Wall time: 17 s
