# Hugging face eval cnn-dm
+ 使用 CNN/DailyMail 数据集评估 Hugging Face 上各个模型的摘要效果
+ https://github.com/hellotransformers/Natural_Language_Processing_with_Transformers
+ https://github.com/hellotransformers/Natural_Language_Processing_with_Transformers/blob/main/chapter6.md
+ https://xiaosheng.run/2022/03/29/transformers-note-8.html
+ https://github.com/datawhalechina/learn-nlp-with-transformers/blob/main/docs/%E7%AF%87%E7%AB%A04-%E4%BD%BF%E7%94%A8Transformers%E8%A7%A3%E5%86%B3NLP%E4%BB%BB%E5%8A%A1/4.7-%E7%94%9F%E6%88%90%E4%BB%BB%E5%8A%A1-%E6%91%98%E8%A6%81%E7%94%9F%E6%88%90.md

In [None]:
# !conda info --envs
# !conda init bash
# !conda activate bot-mvp


# CNN DM 数据介绍
+ CNN/DailyMail数据集由大约300,000对新闻文章及其相应的摘要组成，这些摘要由CNN和DailyMail在其文章中附加的要点组成
+ 该数据集的一个重要方面是，摘要是生成式（abstractive）的，而不是抽取式（extractive）的，这意味着它们由新的句子而不是简单的摘录组成
+ 该数据集可在Hub上找到；我们将使用3.0.0版本，这是一个为摘要而设置的非匿名版本
+ 训练集大小： 286817
+ 验证集大小： 13368
+ 测试集大小： 11487
+ 训练集中平均摘要句子数： 3.72

In [None]:
# !pip install datasets==2.5.2
# !pip uninstall transformers
# !pip install transformers # 需要 3.1.0, 4.x 会报错
# !export http_proxy='http://172.19.57.45:3128/'
# !export http_proxy='http://172.19.57.45:3128/'
# !export http_proxy=''
# !export http_proxy=''

In [None]:
!which python3
!which python

In [None]:
!pip list

In [None]:
# import pytorch_pretrained_bert as ppb
# assert 'bert-large-cased' in ppb.modeling.PRETRAINED_MODEL_ARCHIVE_MAP

In [None]:
# !pip install transformers==4.18.0

In [None]:
# !pip list

# 数据准备
+ 因为网络或者代理的问题，数据从云端直接下载有问题，解决方案如下
+ 远程加载 可以参考 https://github.com/huggingface/datasets/issues/996
+ 本地加载 可以参考 https://blog.csdn.net/PolarisRisingWar/article/details/124042709

In [None]:
# The proxy must be turned off before loading the data
# (this also applies when running on the server).
# hide_output
import datasets
from datasets import load_dataset

# Remote loading (disabled):
# dataset = load_dataset("cnn_dailymail",  version="3.0.0") # has a bug
# dataset = load_dataset("ccdv/cnn_dailymail",  version="3.0.0")
# Local loading: read the dataset from the local 'hf_cnn-dm' directory.
dataset = datasets.load_from_disk('hf_cnn-dm')


该数据集有三列：article（新闻文章正文）、highlights（文章对应的摘要）以及 id（唯一标识每篇文章的ID）

In [None]:
print(f"Features: {dataset['train'].column_names}")

In [None]:
sample = dataset["train"][1]
print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])


In [None]:
sample_text = dataset["train"][1]["article"][:2000]
print(sample_text)
# We'll collect the generated summaries of each model in a dictionary
summaries = {}

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

In [None]:
# # use bart in pytorch
# from transformers import  pipeline
# summarizer = pipeline("summarization")
# summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)

# # use t5 in tf
# summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
# summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)

# baseline

In [None]:
def three_sentence_summary(text):
    """Build the baseline summary: the leading sentences of ``text``.

    The text is split into sentences with ``sent_tokenize`` and at most the
    first three of them are kept, joined with newline characters.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)
summaries["baseline"] = three_sentence_summary(sample_text)

# gpt2

In [None]:
# Summarize the sample article with GPT-2 by appending a "TL;DR" prompt and
# letting the text-generation pipeline continue the text.
from transformers import set_seed 
from transformers import pipeline
import torch
# Fix the random seed before generating.
set_seed(42) 
# NOTE(review): from_tf=True presumably loads the TensorFlow checkpoint of gpt2-xl — confirm this is intended.
pipe = pipeline("text-generation", model="gpt2-xl", from_tf=True) 
gpt2_query = sample_text + "\nTL;DR:\n" 
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)

# Drop the first len(gpt2_query) characters of the generated text (assumes the
# output starts with the prompt — TODO confirm) and put one sentence per line.
summaries["gpt2"] = "\n".join( sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

# t5

In [None]:
pipe = pipeline("summarization", model="t5-large") 
pipe_out = pipe(sample_text) 
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# bart

In [None]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn") 
pipe_out = pipe(sample_text) 
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# pegasus

In [None]:
import torch
with torch.no_grad():
    pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail") 
    pipe_out = pipe(sample_text) 
    summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

# 不同模型效果对比

In [None]:
print("GROUND TRUTH") 
print(dataset["train"][1]["highlights"]) 
print("") 
for model_name in summaries: 
	print(model_name.upper()) 
	print(summaries[model_name]) 
	print("")

# 评估指标

#!pip install sacrebleu==2.3.1

In [None]:
from datasets import load_metric 
bleu_metric = load_metric("sacrebleu")

In [None]:
import pandas as pd 
import numpy as np 
bleu_metric.add( prediction="the the the the the the", reference=["the cat is on the mat"]) 
results = bleu_metric.compute(smooth_method="floor", smooth_value=0) 
results["precisions"] = [np.round(p, 2) for p in results["precisions"]] 
pd.DataFrame.from_dict(results, orient="index", columns=["Value"])

In [None]:
bleu_metric.add( prediction="the cat is on mat", reference=["the cat is on the mat"])
results = bleu_metric.compute(smooth_method="floor", smooth_value=0) 
results["precisions"] = [np.round(p, 2)for p in results["precisions"]] 
pd.DataFrame.from_dict(results, orient="index", columns=["Value"])

In [None]:
# rouge_score==0.0.4 work well
rouge_metric = load_metric("rouge")

In [None]:
reference = dataset["train"][1]["highlights"] 
records = [] 
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"] 
for model_name in summaries: 
	rouge_metric.add(prediction=summaries[model_name], reference=reference) 
	score = rouge_metric.compute() 
	rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
	records.append(rouge_dict) 
pd.DataFrame.from_records(records, index=summaries.keys())

# 使用 PEGASUS 抽样评估测试集


In [None]:
def evaluate_summaries_baseline(dataset, metric, column_text="article", column_summary="highlights"):
    """Score the three-sentence baseline on ``dataset`` with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` is
            iterated to obtain the texts to summarize.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Name of the column holding the input texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns.
    """
    predictions = list(map(three_sentence_summary, dataset[column_text]))
    references = dataset[column_summary]
    metric.add_batch(predictions=predictions, references=references)
    return metric.compute()

In [None]:
test_sampled = dataset["test"].shuffle(seed=42).select(range(1000)) 
score = evaluate_summaries_baseline(test_sampled, rouge_metric) 
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names) 
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T

In [None]:
from tqdm import tqdm
import torch 
device = "cuda" if torch.cuda.is_available() else "cpu" 
def chunks(list_of_elements, batch_size):
    """Generate consecutive slices of ``list_of_elements``.

    Each yielded slice holds up to ``batch_size`` items; the last one may be
    shorter when the length is not a multiple of ``batch_size``.
    """
    batch_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start : start + batch_size] for start in batch_starts
    )
def evaluate_summaries_pegasus(
    dataset,
    metric,
    model,
    tokenizer,
    batch_size=16,
    device=device,
    column_text="article",
    column_summary="highlights",
):
    """Generate summaries batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate``; its inputs are moved to ``device``.
        tokenizer: Tokenizer used both to encode the inputs and to decode the
            generated sequences.
        batch_size: Number of examples per generation batch.
        device: Device the input tensors are moved to.
        column_text: Name of the column holding the texts to summarize.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    source_chunks = list(chunks(dataset[column_text], batch_size))
    reference_chunks = list(chunks(dataset[column_summary], batch_size))
    paired_chunks = zip(source_chunks, reference_chunks)

    for texts, references in tqdm(paired_chunks, total=len(source_chunks)):
        # Inputs are truncated/padded to a fixed length of 1024 tokens.
        encoded = tokenizer(
            texts,
            max_length=1024,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        generated = model.generate(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
            length_penalty=0.8,
            num_beams=8,
            max_length=128,
        )
        predictions = []
        for token_ids in generated:
            decoded = tokenizer.decode(
                token_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            # Replace the "<n>" marker in the decoded text with a space.
            predictions.append(decoded.replace("<n>", " "))
        metric.add_batch(predictions=predictions, references=references)

    return metric.compute()

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the PEGASUS checkpoint. Later cells (SAMSum evaluation, token-length
# histograms, data collator, Trainer) use `model` and `tokenizer`, so these
# assignments must be active for the notebook to run from top to bottom.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

# Scoring PEGASUS on the sampled CNN/DailyMail test set is left disabled;
# uncomment the lines below to run it.
# score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer, batch_size=8)
# rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
# pd.DataFrame(rouge_dict, index=["pegasus"])

# print(rouge_dict)

In [None]:
print(rouge_dict)

# 训练一个摘要模型
+ 使用 SAMSum
+ SAMSum 数据介绍如下

In [None]:
# dataset_samsum = load_dataset("samsum") 
dataset_samsum = datasets.load_from_disk('hf_samsum')
split_lengths = [len(dataset_samsum[split])for split in dataset_samsum] 
print(f"Split lengths: {split_lengths}") 
print(f"Features: {dataset_samsum['train'].column_names}") 
print("\nDialogue:") 
print(dataset_samsum["test"][0]["dialogue"]) 
print("\nSummary:") 
print(dataset_samsum["test"][0]["summary"])

# 流水线评估

In [None]:
pipe_out = pipe(dataset_samsum["test"][0]["dialogue"]) 
print("Summary:") 
print(pipe_out[0]["summary_text"].replace(" .<n>", ".\n")) 

我们可以看到，该模型大多试图通过提取对话中的关键句子来进行文本摘要。这在CNN/DailyMail数据集上可能效果相对较好，但SAMSum中的文本摘要更加抽象。让我们通过在测试集上运行完整的ROUGE评估来确认这一点:

In [None]:
score = evaluate_summaries_pegasus(dataset_samsum["test"], rouge_metric, model, tokenizer, column_text="dialogue", column_summary="summary", batch_size=8) 

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names) 
pd.DataFrame(rouge_dict, index=["pegasus"])

结果不是很好，但这并不意外，因为我们已经远离了CNN/DailyMail的数据分布。尽管如此，在训练前设置评估流水线有两个好处：我们可以直接用指标来衡量训练的成功与否，而且我们有一个好的基线。在我们的数据集上对模型进行微调，应该会使ROUGE指标立即得到改善，如果不是这样，我们就知道我们的训练循环出了问题。

# 微调 PEGASUS


In [None]:
import matplotlib.pyplot as plt
d_len = [len(tokenizer.encode(s)) for s in dataset_samsum["train"] ["dialogue"]] 
s_len = [len(tokenizer.encode(s)) for s in dataset_samsum["train"]["summary"]] 
fig, axes = plt.subplots(1, 2, figsize=(10, 3.5), sharey=True)
axes[0].hist(d_len, bins=20, color="C0", edgecolor="C0") 
axes[0].set_title("Dialogue Token Length") 
axes[0].set_xlabel("Length") 
axes[0].set_ylabel("Count") 
axes[1].hist(s_len, bins=20, color="C0", edgecolor="C0") 
axes[1].set_title("Summary Token Length") 
axes[1].set_xlabel("Length") 
plt.tight_layout() 

plt.show()


我们看到，大多数对话比CNN/DailyMail的文章短得多，每个对话有100-200个标记。同样，摘要也短得多，大约有20-40个标记（相当于一条推文的平均长度）。
让我们在为 Trainer 构建数据整理器（data collator）时牢记这些观察结果。首先，我们需要对数据集进行标记化。现在，我们将对话和摘要的最大长度分别设置为1024和128:

In [None]:
# def convert_examples_to_features(example_batch): 
# 	input_encodings = tokenizer(example_batch["dialogue"], max_length=1024, truncation=True) 
# 	with tokenizer.as_target_tokenizer(): 
# 		target_encodings = tokenizer(example_batch["summary"], max_length=128, truncation=True) 
# 	return {"input_ids": input_encodings["input_ids"], "attention_mask": input_encodings["attention_mask"], "labels": target_encodings["input_ids"]} 
# dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True) 
# columns = ["input_ids", "labels", "attention_mask"] 
# dataset_samsum_pt.set_format(type="torch", columns=columns)

def convert_examples_to_features(example_batch):
    """Tokenize a batch of examples into encoder inputs and labels.

    Reads the "dialogue" and "summary" entries of ``example_batch`` and
    returns a dict with "input_ids" and "attention_mask" taken from the
    dialogue encoding and "labels" taken from the summary encoding.
    """
    dialogue_encoding = tokenizer(
        example_batch["dialogue"], max_length=1024, truncation=True
    )
    # NOTE: summaries are tokenized with a plain tokenizer call, i.e. without
    # entering `tokenizer.as_target_tokenizer()`.
    summary_encoding = tokenizer(
        example_batch["summary"], max_length=128, truncation=True
    )

    features = {}
    features["input_ids"] = dialogue_encoding["input_ids"]
    features["attention_mask"] = dialogue_encoding["attention_mask"]
    features["labels"] = summary_encoding["input_ids"]
    return features
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True) 
columns = ["input_ids", "labels", "attention_mask"] 
dataset_samsum_pt.set_format(type="torch", columns=columns)

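标记化步骤中的一个新东西是 tokenizer.as_target_tokenizer() 上下文。有些模型在解码器输入中需要特殊的标记，所以区分编码器和解码器输入的标记化很重要。在 with 语句（称为上下文管理器）中，标记器知道它正在为解码器进行标记化，并可以相应地处理序列。注意：上面实际生效的 convert_examples_to_features 并没有进入该上下文（with 语句只出现在被注释掉的版本中），摘要是用普通的 tokenizer 调用编码的。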

In [None]:
# Build the data collator that will be passed to the Trainer.
from transformers import DataCollatorForSeq2Seq 
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) 

# Then, as usual, we set up a TrainingArguments for training:

from transformers import TrainingArguments, Trainer 
# Batch size is 1 per device, combined with 16 gradient-accumulation steps.
# NOTE(review): push_to_hub=True presumably requires Hugging Face Hub credentials — confirm before running.
training_args = TrainingArguments( output_dir='pegasus-samsum', num_train_epochs=1, warmup_steps=500, per_device_train_batch_size=1, per_device_eval_batch_size=1, weight_decay=0.01, logging_steps=10, push_to_hub=True,
evaluation_strategy='steps', eval_steps=500, save_steps=1e6, gradient_accumulation_steps=16)

In [None]:
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, data_collator=seq2seq_data_collator, train_dataset=dataset_samsum_pt["train"], eval_dataset=dataset_samsum_pt["validation"])


In [None]:
# Fine-tune the model, then score the fine-tuned model on the SAMSum test split.
trainer.train()
score = evaluate_summaries_pegasus(
    dataset_samsum["test"],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary",
)
# Keep the `mid.fmeasure` value of each ROUGE variant listed in `rouge_names`.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
# Last expression of the cell: the ROUGE scores as a one-row table.
pd.DataFrame(rouge_dict, index=["pegasus"])
