In [None]:
from transformers import BertTokenizer, BertForMaskedLM, BertConfig
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Line-by-line text dataset for language-model training.

    Each non-blank line of ``file_path`` is tokenized on its own and stored
    as a list of token ids in ``self.examples``.

    Args:
        tokenizer: Callable used to encode a line. It is called with
            ``add_special_tokens=True``, ``truncation=True``,
            ``max_length=block_size`` and ``return_token_type_ids=False``,
            and its result must expose an ``input_ids`` attribute.
        file_path: Path of the UTF-8 text file to read.
        block_size: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, file_path, block_size=512):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            for line in f:
                # Skip blank / whitespace-only lines: they carry no text, so
                # they would only add examples with nothing to learn from
                # (presumably just special tokens with a BERT tokenizer).
                if not line.strip():
                    continue
                batch_encoding = tokenizer(
                    line,
                    add_special_tokens=True,
                    truncation=True,
                    max_length=self.block_size,
                    return_token_type_ids=False,
                )
                self.examples.append(batch_encoding.input_ids)

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example ``i`` as a 1-D integer tensor."""
        # Explicit dtype so an empty id list does not become a float tensor.
        return torch.tensor(self.examples[i], dtype=torch.long)

# Tokenizer.
# NOTE(review): the original comment said "if your data is Chinese", but the
# checkpoint name below is "bert-base-uncased" -- confirm the intended one.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Training data: one example per line of the text file.
file_path = 'harry_potter_1.txt'  # path to your data file
dataset = TextDataset(tokenizer, file_path)

# Collator that builds masked-LM batches (original note: used for dynamic
# padding); mlm_probability is the value handed to the collator for masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Model: only the configuration is loaded from the checkpoint name; the model
# itself is constructed from that config object (no from_pretrained call for
# the model here). Pick the checkpoint that fits your needs.
config = BertConfig.from_pretrained(BERT_CHECKPOINT)
model = BertForMaskedLM(config)

# Training hyper-parameters.
training_args = TrainingArguments(
    output_dir="./bert_output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire model, arguments, collator and dataset into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Run training.
trainer.train()