In [13]:
import warnings
warnings.filterwarnings("ignore")

# 加载模型权重
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM

# Local directory holding the pretrained distilbert-base-uncased checkpoint
model_checkpoint = r"F:\pythonProject\distilbert\distilbert_base_uncased"

# Load the tokenizer and the masked-language-model weights from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Report the model size in millions of parameters (previously computed but never shown)
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f">>> DistilBERT number of parameters: {distilbert_num_parameters:.1f}M")

# Mask-filling quality of the pretrained Hugging Face DistilBERT checkpoint (before fine-tuning)
import torch
text = "This is a great [MASK]."
inputs = tokenizer(text, return_tensors="pt")
# `**inputs` unpacks the tokenizer output into individual keyword arguments.
# The tokenizer output holds the keys `input_ids` and `attention_mask`, e.g.
#   {'input_ids': tensor([[ 101, 2023, 2003, 1037, 2307,  103, 1012,  102]]),
#    'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
# so `model(**inputs)` is equivalent to `model(input_ids=..., attention_mask=...)`.
# Only predictions are needed here, so skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    token_logits = model(**inputs).logits

# 1. Find the sequence position of the [MASK] token
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
# 2. Pick the logits at that position: 0 selects the first (and only) sentence,
#    mask_token_index selects the masked position, `:` keeps every vocabulary score
mask_token_logits = token_logits[0, mask_token_index, :]
# 3. Keep the ids of the five highest-scoring candidates
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
# 4. Print the sentence once per candidate; decode([token]) turns a predicted id back into text
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 2307,  103, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [2]:
"""使用imdb影评数据集微调distilbert模型"""
# 1. Load the IMDb dataset from a copy previously saved on local disk
from datasets import load_from_disk
path = r'F:\pythonProject\datasets\stanfordnlp\imdb'  
imdb_dataset = load_from_disk(path)

In [3]:
# 2. Load the tokenizer from the local DistilBERT checkpoint directory
model_checkpoint = r"F:\pythonProject\distilbert\distilbert_base_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides the raw strings under ``"text"``.

    Returns:
        The tokenizer output. When the tokenizer is a fast tokenizer, a
        ``"word_ids"`` entry is added holding, for every row of the batch,
        the value of ``word_ids(row)`` reported by the tokenizer output.
    """
    encoded = tokenizer(examples["text"])
    # Slow tokenizers expose no word ids, so return the plain encoding.
    if not tokenizer.is_fast:
        return encoded
    # One word_ids list per encoded row (these are word indices, not text lengths).
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded

# `map` runs tokenize_function over the samples of the dataset; batched=True
# processes them in batches. The original "text" and "label" columns are
# removed from the result.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# 对整个数据集进行分块操作
def group_texts(examples, chunk_size=128):
    """Concatenate every column of a tokenized batch and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            It must contain an ``"input_ids"`` column.
        chunk_size: Number of items in each output chunk. Defaults to 128.

    Returns:
        A dict with the same keys as ``examples`` where each value is a list of
        chunks of exactly ``chunk_size`` items, plus a ``"labels"`` entry that
        is a copy of the chunked ``"input_ids"`` list. Items left over after
        the last full chunk are discarded.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    from itertools import chain

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Flatten each column in linear time; repeated list addition via sum(..., [])
    # copies the growing result on every step and is quadratic in batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Length of the flattened batch, measured on the first column
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Floor to a multiple of chunk_size so the trailing partial chunk is dropped
    total_length = (total_length // chunk_size) * chunk_size
    # Slice every column into consecutive chunks
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Masked-LM training needs the unmasked tokens as targets, so the labels
    # start out as a copy of the input ids.
    result["labels"] = result["input_ids"].copy()
    return result

In [5]:
# Apply group_texts() to the tokenized dataset through `map`, in batched mode
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# 3. Batch assembly: build the collator that packs examples into a batch
from transformers import DataCollatorForLanguageModeling
# mlm_probability is the probability with which each token of the input
# sequence is selected for masking (default value: 0.15)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [7]:
# Down-sample the chunked training split and divide it into train/test subsets
train_size = 10000
# The test subset is 10% of the training subset size
test_size = int(0.1 * train_size)
# A fixed seed keeps the sampled split the same across runs
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)

In [8]:
# Configure the training hyperparameters
from transformers import TrainingArguments
batch_size = 64
# Number of training rows divided by the batch size, i.e. roughly one log
# entry per epoch (assumes a single device — TODO confirm)
logging_steps = len(downsampled_dataset["train"]) // batch_size
training_args = TrainingArguments(
    output_dir="./FinetuneModel/distilbert_finetuned_imdb",
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases appear to rename this keyword
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    num_train_epochs=1,
    # Checkpoints are written at the end of every epoch
    save_strategy='epoch',
)

In [9]:
# Wire the model, training arguments, sampled train/test subsets and the
# masking collator into a Trainer, then start fine-tuning.
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)
# Last expression of the cell, so the returned training summary is displayed
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.6907,2.531834


TrainOutput(global_step=157, training_loss=2.689167912598628, metrics={'train_runtime': 3666.3116, 'train_samples_per_second': 2.728, 'train_steps_per_second': 0.043, 'total_flos': 331402890240000.0, 'train_loss': 2.689167912598628, 'epoch': 1.0})

In [10]:
# Evaluation metric: perplexity
import math
# NOTE(review): the collator picks masked tokens with a probability, so this
# value may differ slightly between evaluation runs — confirm.
eval_results = trainer.evaluate()
# Perplexity is the exponential of the cross-entropy loss; informally, the
# average number of candidate words the model has to choose among before it
# picks the right one for a masked position.
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 12.68


In [12]:
# Mask-filling inference with the DistilBERT model fine-tuned on the IMDb review dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM

model_checkpoint = r"F:\pythonProject\distilbert\distilbert_base_uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): this directory differs from the `output_dir` given to
# TrainingArguments ("./FinetuneModel/distilbert_finetuned_imdb") — confirm it
# holds the checkpoint produced by the training run in this notebook.
model = AutoModelForMaskedLM.from_pretrained(r"F:\pythonProject\imdb_distilbert_finetuned\checkpoint-157")

# Predictions of the fine-tuned model
import torch
# Define the prompt in this cell so it does not depend on a variable left over
# from another cell's execution.
text = "This is a great [MASK]."
inputs = tokenizer(text, return_tensors="pt")
# Only predictions are needed here, so skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Position of the [MASK] token, then the vocabulary logits at that position
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Ids of the five highest-scoring candidates
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Print the sentence once per candidate, with [MASK] replaced by the decoded token
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great idea.'
'>>> This is a great deal.'
'>>> This is a great adventure.'
'>>> This is a great job.'
'>>> This is a great one.'
