# Hugging face eval cnn-dm
+ 使用 cnn dailymail 通过 Hugging Face 的各个模型效果评估
+ https://github.com/hellotransformers/Natural_Language_Processing_with_Transformers
+ https://github.com/hellotransformers/Natural_Language_Processing_with_Transformers/blob/main/chapter6.md
+ https://xiaosheng.run/2022/03/29/transformers-note-8.html
+ https://github.com/datawhalechina/learn-nlp-with-transformers/blob/main/docs/%E7%AF%87%E7%AB%A04-%E4%BD%BF%E7%94%A8Transformers%E8%A7%A3%E5%86%B3NLP%E4%BB%BB%E5%8A%A1/4.7-%E7%94%9F%E6%88%90%E4%BB%BB%E5%8A%A1-%E6%91%98%E8%A6%81%E7%94%9F%E6%88%90.md

In [1]:
# !pip install datasets==2.5.2
# pip install transformers==4.18.0
# !export http_proxy='http://172.19.57.45:3128/'
# !export http_proxy='http://172.19.57.45:3128/'
# !export http_proxy=''
# !export http_proxy=''

In [2]:
import datasets
from datasets import load_dataset
# 本地加载
dataset = datasets.load_from_disk('../../../resources/dataset/hf_cnn-dm/')

sample = dataset["train"][1]
print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])

sample_text = dataset["train"][1]["article"][:2000]
print(sample_text)
# We'll collect the generated summaries of each model in a dictionary
summaries = {}

  from .autonotebook import tqdm as notebook_tqdm



Article (excerpt of 500 characters, total length: 3192):

(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay. The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds. The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover. The 26-year-old Bolt has n

Summary (length: 180):
Usain Bolt wins third gold of world championship .
Anchors Jamaica to 4x100m relay victory .
Eighth gold at the championships for Bolt .
Jamaica double up in women's 4x100m relay .
(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay. The fastest man in the world charged clear of United States rival Justin G

# 使用nltk 对英文的句子进行处理

In [3]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

[nltk_data] Error loading punkt: <urlopen error [Errno 101] Network is
[nltk_data]     unreachable>


False

# baseline

In [4]:
from transformers import set_seed 
from transformers import pipeline
import torch
set_seed(42) 

def three_sentence_summary(text): 
	return "\n".join(sent_tokenize(text)[:3]) 
	
summaries["baseline"] = three_sentence_summary(sample_text)

# gpt2

In [5]:
pipe = pipeline("text-generation", model="../../../resources/embedding/gpt2-xl/") 
gpt2_query = sample_text + "\nTL;DR:\n" 
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)

summaries["gpt2"] = "\n".join( sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# t5

In [6]:
pipe = pipeline("summarization", model="../../../resources/embedding/t5-large/") 
pipe_out = pipe(sample_text) 

summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# bart

In [7]:
pipe = pipeline("summarization", model="../../../resources/embedding/facebook_bart-large-cnn/") 
pipe_out = pipe(sample_text) 

summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# pegasus

In [8]:
# with torch.no_grad():
#     pipe = pipeline("summarization", model="../../../resources/embedding/google_pegasus-cnn-dm") 
#     pipe_out = pipe(sample_text) 

#     summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

pipe = pipeline("summarization", model="../../../resources/embedding/google_pegasus-cnn-dm") 
pipe_out = pipe(sample_text) 

summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

# 不同模型效果对比

In [9]:
print("GROUND TRUTH") 
print(dataset["train"][1]["highlights"]) 
print("") 
for model_name in summaries: 
	print(model_name.upper()) 
	print(summaries[model_name]) 
	print("")

GROUND TRUTH
Usain Bolt wins third gold of world championship .
Anchors Jamaica to 4x100m relay victory .
Eighth gold at the championships for Bolt .
Jamaica double up in women's 4x100m relay .

BASELINE
(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.
The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.
The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover.

GPT2
Nesta, the fastest man in the world.
Gatlin, the most successful Olympian ever.
Kemar, a Jamaican legend.
Shelly-Ann, the fastest woman ever.
Bolt, the world's greatest athlete.
The team sport of pole vaulting

T5
usain bolt wins his third gold medal of the world championships in the men's 4x100m relay .
the 26-yea

# 评估指标

In [10]:
from datasets import load_metric 
import pandas as pd 
bleu_metric = load_metric("sacrebleu")
rouge_metric = load_metric("rouge") # rouge_score==0.0.4 work well

Using the latest cached version of the module from /home/users/sunhongchao/.cache/huggingface/modules/datasets_modules/metrics/sacrebleu/31e1673407d8789b8f5ddfd979948f6a1de0a6d691426d55fa74a35ffb0c1bdf (last modified on Sat Oct 29 23:06:04 2022) since it couldn't be found locally at sacrebleu, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/users/sunhongchao/.cache/huggingface/modules/datasets_modules/metrics/rouge/0ffdb60f436bdb8884d5e4d608d53dbe108e82dac4f494a66f80ef3f647c104f (last modified on Sat Oct 29 23:10:57 2022) since it couldn't be found locally at rouge, or remotely on the Hugging Face Hub.


## 抽样评估1条

In [11]:
reference = dataset["train"][1]["highlights"] 
records = [] 
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"] 
for model_name in summaries: 
	rouge_metric.add(prediction=summaries[model_name], reference=reference) 
	score = rouge_metric.compute() 
	rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
	records.append(rouge_dict) 
pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.303571,0.090909,0.214286,0.232143
gpt2,0.1875,0.0,0.125,0.1875
t5,0.486486,0.222222,0.378378,0.486486
bart,0.582278,0.207792,0.455696,0.506329
pegasus,0.866667,0.655172,0.8,0.833333


## 抽样 1000条 

In [12]:
test_sampled = dataset["test"].shuffle(seed=42).select(range(1000)) 

Loading cached shuffled indices for dataset at ../../../resources/dataset/hf_cnn-dm/test/cache-f551f6b1b06308aa.arrow


## 抽样评估 1000 条 baseline

In [13]:
def evaluate_summaries_baseline(dataset, metric, column_text="article", column_summary="highlights"): 
    summaries = [three_sentence_summary(text) for text in dataset[column_text]] 
    metric.add_batch(predictions=summaries, references=dataset[column_summary]) 
    score = metric.compute() 
    return score


score = evaluate_summaries_baseline(test_sampled, rouge_metric) 
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names) 
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.388071,0.170554,0.247146,0.354972


## 抽样评估 1000条 pegasus


In [16]:
import torch
torch.cuda.device_count()

4

In [18]:
from tqdm import tqdm
import torch 
device = "cuda" if torch.cuda.is_available() else "cpu" 
def chunks(list_of_elements, batch_size): 
	"""Yield successive batch-sized chunks from list_of_elements.""" 
	for i in range(0, len(list_of_elements), batch_size): 
	    yield list_of_elements[i : i + batch_size] 
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=device, column_text="article", column_summary="highlights"): 
	article_batches = list(chunks(dataset[column_text], batch_size)) 
	target_batches = list(chunks(dataset[column_summary], batch_size)) 
	for article_batch, target_batch in tqdm( zip(article_batches, target_batches), total=len(article_batches)): 
		inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt") 
		summaries = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), length_penalty=0.8, num_beams=8, max_length=128) 
		decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries] 
		decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries] 
		metric.add_batch(predictions=decoded_summaries, references=target_batch) 
	score = metric.compute() 
	return score

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 

model = None
tokenizer = None

with torch.no_grad():
    model_ckpt = "../../../resources/embedding/google_pegasus-cnn-dm" 
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt) 
    model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
    #model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt) 
    score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer, batch_size=8) 
    rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names) 
    pd.DataFrame(rouge_dict, index=["pegasus"])

print(rouge_dict)

100%|██████████| 125/125 [12:58<00:00,  6.23s/it]


{'rouge1': 0.427098293379068, 'rouge2': 0.2072630667891052, 'rougeL': 0.30511255367162227, 'rougeLsum': 0.3691882873159909}
