In [1]:
import torch
import datasets
from transformers import pipeline

# Locations of the locally stored SAMSum dataset and the PEGASUS checkpoint
# (fine-tuned on CNN/DailyMail, going by the directory name).
samsum_dir = '../../../resources/dataset/hf_samsum/'
pegasus_dir = "../../../resources/embedding/google_pegasus-cnn-dm"

# Load the dataset from disk and build a summarization pipeline on top of
# the local checkpoint.
dataset_samsum = datasets.load_from_disk(samsum_dir)
pipe = pipeline("summarization", model=pegasus_dir)

  from .autonotebook import tqdm as notebook_tqdm


+ 我们可以看到，该模型大多试图通过提取对话中的关键句子来进行文本摘要。这在CNN/DailyMail数据集上可能效果相对较好，但SAMSum中的文本摘要更加抽象。让我们通过在测试集上运行完整的ROUGE评估来确认这一点：

In [2]:
# Summarize the first dialogue of the test split and show it next to the
# model output.
first_dialogue = dataset_samsum["test"][0]["dialogue"]
pipe_out = pipe(first_dialogue)

# The pipeline output separates sentences with " .<n>"; turn each separator
# into a full stop followed by a line break for readability.
summary_text = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

print("Dialogue:", first_dialogue, "Summary:", summary_text, sep="\n")

Your max_length is set to 128, but you input_length is only 122. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
Summary:
Amanda: Ask Larry Amanda: He called her last time we were at the park together.
Hannah: I'd rather you texted him.
Amanda: Just text him .


In [3]:
from datasets import load_metric 
import pandas as pd 
# Load the SacreBLEU and ROUGE metric implementations, in that order.
# Note carried over from the original cell: rouge_score==0.0.4 works well
# with the ROUGE metric.
bleu_metric, rouge_metric = map(load_metric, ("sacrebleu", "rouge"))

Using the latest cached version of the module from /home/users/sunhongchao/.cache/huggingface/modules/datasets_modules/metrics/sacrebleu/31e1673407d8789b8f5ddfd979948f6a1de0a6d691426d55fa74a35ffb0c1bdf (last modified on Sat Oct 29 23:06:04 2022) since it couldn't be found locally at sacrebleu, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/users/sunhongchao/.cache/huggingface/modules/datasets_modules/metrics/rouge/0ffdb60f436bdb8884d5e4d608d53dbe108e82dac4f494a66f80ef3f647c104f (last modified on Sat Oct 29 23:10:57 2022) since it couldn't be found locally at rouge, or remotely on the Hugging Face Hub.


In [4]:
# Show one raw test record, then draw a fixed 10-example subset of the test
# split (shuffled with seed 42) to keep the evaluation below quick.
test_split = dataset_samsum["test"]
print(test_split[0])

sample_indices = range(10)
test_sampled = test_split.shuffle(seed=42).select(sample_indices)

Loading cached shuffled indices for dataset at ../../../resources/dataset/hf_samsum/test/cache-52b2c51951d462ff.arrow


{'id': '13862856', 'dialogue': "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye", 'summary': "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."}


In [11]:
from tqdm import tqdm

# Choose the device used for evaluation.
# The second GPU ("cuda:1") is preferred, as in the original setup, but that
# index only exists when at least two GPUs are visible; on a single-GPU
# machine fall back to "cuda:0" instead of failing later in `.to(device)`.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"

print(device)

def chunks(list_of_elements, batch_size):
    """Lazily split a sliceable sequence into consecutive batches.

    Each yielded item is the slice ``list_of_elements[start:start + batch_size]``
    for ``start`` stepping through the sequence in increments of
    ``batch_size``; the final batch may be shorter than ``batch_size``.
    """
    batch_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start : start + batch_size] for start in batch_starts
    )
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=device, column_text="dialogue", column_summary="summary"):
    """Generate summaries batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be sliceable sequences of equal
            length.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Object exposing ``generate(...)``; expected to already live on
            ``device``.
        tokenizer: Callable that encodes a batch of texts into ``input_ids``
            and ``attention_mask`` tensors, and that offers ``decode``.
        batch_size: Number of examples per generation batch.
        device: Device the encoded inputs are moved to before generation.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        # Pad only up to the longest sequence in the batch. Padding every
        # batch to the full 1024-token limit makes the encoder process mostly
        # padding for short dialogues; padded positions are masked out via
        # the attention mask either way. Inputs are still truncated at 1024.
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding=True, return_tensors="pt")

        summaries = model.generate(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            length_penalty=0.8,
            num_beams=8,
            max_length=128,
        )

        decoded_summaries = [
            tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for s in summaries
        ]
        # "<n>" appears in the decoded text as a line-break marker (presumably
        # the model's newline token); replace it with a space before scoring.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

cuda:1


In [12]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 

# Drop references left over from a previous run of this cell before loading
# again (presumably so the old model can be released first).
model = None
tokenizer = None

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Load the checkpoint and run the baseline evaluation with gradient tracking
# disabled.
with torch.no_grad():
    model_ckpt = "../../../resources/embedding/google_pegasus-cnn-dm"
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
    score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer, batch_size=2)

# Keep the mid F-measure of every ROUGE variant.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
print(rouge_dict)

# Must be the last expression of the cell so the notebook renders the table;
# previously the DataFrame was created inside the `with` block and discarded.
pd.DataFrame(rouge_dict, index=["pegasus"])


100%|██████████| 5/5 [00:09<00:00,  1.87s/it]

{'rouge1': 0.27189863876138387, 'rouge2': 0.05117642754979565, 'rougeL': 0.1987865541474696, 'rougeLsum': 0.197517473378596}





+ 结果不是很好，但这并不意外，因为我们已经远离了CNN/DailyMail的数据分布。尽管如此，在训练前设置评估流水线有两个好处：我们可以直接用指标来衡量训练的成功与否，而且我们有一个好的基线。在我们的数据集上对模型进行微调，应该会使ROUGE指标立即得到改善，如果不是这样，我们就知道我们的训练循环出了问题。