# LLM Summarization Evals
### Comparing the performance of Claude 3 Opus and GPT-4 on a summarization task

## Set up and data exploration

In [None]:
!pip install openai
!pip install datasets
!pip install pandas
import pandas as pd
import os

In [None]:
from datasets import load_dataset
billsum = load_dataset("billsum", split="train")

In [3]:
# Preview data
billsum[0]["summary"]

"Shields a business entity from civil liability relating to any injury or death occurring at a facility of that entity in connection with a use of such facility by a nonprofit organization if: (1) the use occurs outside the scope of business of the business entity; (2) such injury or death occurs during a period that such facility is used by such organization; and (3) the business entity authorized the use of such facility by the organization. \nMakes this Act inapplicable to an injury or death that results from an act or omission of a business entity that constitutes gross negligence or intentional misconduct, including misconduct that: (1) constitutes a hate crime or a crime of violence or act of international terrorism for which the defendant has been convicted in any court; or (2) involves a sexual offense for which the defendant has been convicted in any court or misconduct for which the defendant has been found to have violated a Federal or State civil rights law. \nPreempts Stat

In [4]:
# Test set: first 5 bills with at least 15,000 characters
i = 0
large_docs_idxs = []
while len(large_docs_idxs) < 5:
    if len(billsum[i]["text"]) > 15000:
        large_docs_idxs.append(i)
        print(f"Index: {i}, Text Length: {len(billsum[i]['text'])}, Summary Length: {len(billsum[i]['summary'])}")
    i+=1

Index: 1, Text Length: 19094, Summary Length: 1317
Index: 4, Text Length: 18047, Summary Length: 4462
Index: 6, Text Length: 17714, Summary Length: 1022
Index: 7, Text Length: 19915, Summary Length: 716
Index: 13, Text Length: 16483, Summary Length: 1749


## Set up eval prompt

In [6]:
# Evaluation prompt template based on G-Eval
EVALUATION_PROMPT_TEMPLATE = """
You will be given one summary written for an article. Your task is to rate the summary on one metric.
Please make sure you read and understand these instructions very carefully.
Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Source Text:

{document}

Summary:

{summary}

Evaluation Form (return scores ONLY):

- {metric_name}
"""

# Metric 1: Relevance

RELEVANCY_SCORE_CRITERIA = """
Relevance(1-10) - selection of important content from the source. \
The summary should include only important information from the source document. \
Annotators were instructed to penalize summaries which contained redundancies and excess information.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the summary and the source document carefully.
2. Compare the summary to the source document and identify the main points of the article.
3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 10. ONLY return the number of the score, no additional text.
"""

# Metric 2: Coherence

COHERENCE_SCORE_CRITERIA = """
Coherence(1-10) - the collective quality of all sentences. \
We align this dimension with the DUC quality question of structure and coherence \
whereby "the summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to a\
coherent body of information about a topic."
"""

COHERENCE_SCORE_STEPS = """
1. Read the article carefully and identify the main topic and key points.
2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 10, where 1 is the lowest and 10 is the highest based on the Evaluation Criteria. ONLY return the number of the score, no additional text.
"""

# Metric 3: Consistency

CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-10) - the factual alignment between the summary and the summarized source. \
A factually consistent summary contains only statements that are entailed by the source document. \
Annotators were also asked to penalize summaries that contained hallucinated facts.
"""

CONSISTENCY_SCORE_STEPS = """
1. Read the article carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
3. Assign a score for consistency based on the Evaluation Criteria. ONLY return the number of the score, no additional text.
"""

# Metric 4: Fluency

FLUENCY_SCORE_CRITERIA = """
Fluency(1-5): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
3: Good. The summary has few or no errors and is easy to read and follow.
"""

FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 5. ONLY return the number of the score, no additional text.
"""

## Anthropic Evaluation

In [7]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.19.2-py3-none-any.whl (850 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m850.2/850.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: anthropic
Successfully installed anthropic-0.19.2


In [8]:
import anthropic
# add API key to os.environ
anthropic_client = anthropic.Anthropic()

In [9]:
import math
import time

class ClaudeSummarizer:
  """
  A class to summarize text using a specified language model.

  Attributes:
      system (str): A system message to initialize the summarizer.
      model (str): The language model to use for summarization. Defaults to "claude-3-opus-20240229".
  """
  def __init__(self, system="", model="claude-3-opus-20240229"):
    """
    Initializes the Summarizer with a system message and a model choice.
    """
    self.system = system
    self.model = model

  def __call__(self, message):
    result = self.execute(message)
    return result

  def execute(self, message):
    start_time = time.time()
    message = anthropic_client.messages.create(
      model=self.model,
      temperature=0,
      max_tokens=500,
      system=self.system,
      messages=[
          {
              "role": "user",
              "content": [
                  {
                      "type": "text",
                      "text": message
                  }
              ]
          }
      ]
    )
    elapsed_time = time.time() - start_time
    total_tokens = message.usage.input_tokens + message.usage.output_tokens

    return {
        "content": message.content[0].text,
        "duration": elapsed_time,
        "tokens": total_tokens
    }



In [10]:
claude_summarizer = ClaudeSummarizer(
    """
    You are a helpful assistant that summarizes text.
    """
)

In [11]:
# Test Claude Summarizer
bill = billsum[large_docs_idxs[0]]
response = claude_summarizer(bill["text"])
print(response["content"])

The Human Rights Information Act is a proposed U.S. legislation that aims to declassify and publicly disclose human rights records related to Guatemala and Honduras after 1944. The key points of the act are:

1. Federal agencies must identify, review, and organize all relevant human rights records within 120 days of the act's enactment, and make them public within 30 days after review.

2. Records can only be withheld if there is clear and convincing evidence that disclosure would gravely threaten U.S. national security, intelligence operations, or foreign relations. 

3. Requests for records from official entities like the UN, OAS, national truth commissions, or justice/human rights officials investigating human rights violations must be reviewed and disclosed per the act's standards.

4. The Interagency Security Classification Appeals Panel will review any agency decisions to withhold records. The President has final authority over the Panel's determinations.

5. After completing the

In [12]:
def get_geval_score_claude(
    criteria: str, steps: str, document: str, summary: str, metric_name: str
):
    claude_eval_client = anthropic.Anthropic()
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        document=document,
        summary=summary,
    )

    response = claude_eval_client.messages.create(
      model="claude-3-opus-20240229",
      max_tokens=7,
      temperature=0,
      messages=[
          {
              "role": "user",
              "content": [
                  {
                      "type": "text",
                      "text": prompt
                  }
              ]
          }
      ]
    )
    return response.content[0].text


evaluation_metrics = {
    "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    "Consistency": (CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
    "Fluency": (FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
}

In [13]:
import re

In [14]:
bill = billsum[6]
base_summary = bill["summary"]
generated_summary = claude_summarizer(bill["text"])
generated_summary_text = generated_summary["content"]

scores = {}

for eval_type, (criteria, steps) in evaluation_metrics.items():
    result = get_geval_score_claude(
        criteria,
        steps,
        base_summary,
        generated_summary_text,
        eval_type,
    )
    score_num = float(re.findall(r'\d+', result)[0])
    scores[eval_type] = score_num

for criteria, score in scores.items():
    print(f"{criteria}: {score}")

Relevance: 8.0
Coherence: 9.0
Consistency: 9.0
Fluency: 4.0


**Note:** regex required. Having some issues with Claude 3 Opus returning just the score. Not seeing same issue with GPT-4.

In [None]:
# generate and display scores for test set
all_scores = []
num_summaries = len(large_docs_idxs)
for i in large_docs_idxs:
    print(f"Generating summary for index: {i}")
    base_summary = billsum[i]["summary"]
    summary_result = claude_summarizer(billsum[i]["text"])
    generated_summary = summary_result["content"]
    scores = {}

    for eval_type, (criteria, steps) in evaluation_metrics.items():
        result = get_geval_score_claude(criteria, steps, base_summary, generated_summary, eval_type)
        score_num = float(result.strip())
        scores[eval_type] = score_num

    scores['Duration'] = summary_result['duration']
    scores['Total Tokens'] = summary_result['tokens']
    scores['Content'] = summary_result['content']

    all_scores.append(scores)
    time.sleep(65)  # avoid rate limit

# Create the DataFrame
df = pd.DataFrame(all_scores)

# Label summaries
df.index = [f"Summary {i+1}" for i in range(num_summaries)]

# Display of df
display(df)

Generating summary for index: 1
Generating summary for index: 4
Generating summary for index: 6
Generating summary for index: 7
Generating summary for index: 13


Unnamed: 0,Relevance,Coherence,Consistency,Fluency,Duration,Total Tokens,Content
Summary 1,8.0,9.0,9.0,5.0,18.746376,4318,The Human Rights Information Act is a proposed...
Summary 2,9.0,8.0,9.0,4.0,19.642315,4661,The Native American Energy Act is a proposed l...
Summary 3,8.0,9.0,9.0,4.0,19.759788,4301,This section amends the Elementary and Seconda...
Summary 4,8.0,8.0,8.0,4.0,20.831105,4915,The Gallatin Land Consolidation Act of 1998 is...
Summary 5,8.0,9.0,9.0,5.0,17.111541,3787,The Federal Agency Protection of Privacy Act i...


In [None]:
df.to_csv('summary_eval_results_claude3_v1.csv', index=True)

## OpenAI Evaluation

In [None]:
from openai import OpenAI
# Add API key to os.environ here
openai_client = OpenAI()

In [None]:
import math
import time

class OpenAISummarizer:
  """
  A class to summarize text using a specified language model.

  Attributes:
      system (str): A system message to initialize the summarizer.
      model (str): The language model to use for summarization. Defaults to "gpt-4".
  """
  def __init__(self, system="", model="gpt-4"):
    """
    Initializes the Summarizer with a system message and a model choice.
    """
    self.system = system
    self.messages = []
    self.model = model
    if self.system:
      self.messages.append({"role": "system", "content": system})

  def __call__(self, message):
    result = self.execute(message)
    return result

  def execute(self, message):
    start_time = time.time()
    completion = openai_client.chat.completions.create(
        model=self.model,
        messages = self.messages + [{"role": "user", "content": message}]
    )
    elapsed_time = time.time() - start_time # am I sure this won't run till completion done?
    total_tokens = completion.usage.total_tokens

    return {
        "content": completion.choices[0].message.content,
        "duration": elapsed_time,
        "tokens": total_tokens
    }

In [None]:
openai_summarizer = OpenAISummarizer(
    """
    You are a helpful assistant that summarizes text.
    """
)

In [None]:
# Test summarizer on one doc
bill = billsum[large_docs_idxs[0]]
response = openai_summarizer(bill["text"])
print(response["content"])

The "Human Rights Information Act" emphasizes on the national and international protection of human rights as a crucial component of democracy, and emphasizes on increased transparency by strengthening existing declassification procedures for human rights violation documents, held by Federal agencies. This law applies to gross human rights violations abroad that may involve American citizens, requiring agencies to expedite the declassification of relevant documents, subject to U.S. security interests. 

It also addresses the necessity of international cooperation in investigations dealing with gross human rights violations, and the importance of ending systematic human rights abuses worldwide. It facilitates the review of records and declassification in response to requests from entities such as the UN, Organization of American States, or a country's principal justice or human rights official investigating said violations.

Moreover, the Act establishes grounds for postponement of publ

In [None]:
def get_geval_score_openai(
    criteria: str, steps: str, document: str, summary: str, metric_name: str
):
    openai_eval_client = OpenAI()
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        document=document,
        summary=summary,
    )
    response = openai_eval_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content


evaluation_metrics = {
    "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    "Consistency": (CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
    "Fluency": (FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
}

In [None]:
# Generate evals for one summary
bill = billsum[6]
base_summary = bill["summary"]
generated_summary = openai_summarizer(bill["text"])
generated_summary_text = generated_summary["content"]

scores = {}

for eval_type, (criteria, steps) in evaluation_metrics.items():
    result = get_geval_score_openai(
        criteria,
        steps,
        base_summary,
        generated_summary_text,
        eval_type,
    )
    score_num = float(result.strip())
    scores[eval_type] = score_num

for criteria, score in scores.items():
    print(f"{criteria}: {score}")

Relevance: 10.0
Coherence: 10.0
Consistency: 10.0
Fluency: 5.0


In [None]:
# generate and display scores for test set
all_scores = []
num_summaries = len(large_docs_idxs)
for i in large_docs_idxs:
    print(f"Generating summary for index: {i}")
    base_summary = billsum[i]["summary"]
    summary_result = openai_summarizer(billsum[i]["text"])
    generated_summary = summary_result["content"]
    scores = {}

    for eval_type, (criteria, steps) in evaluation_metrics.items():
        result = get_geval_score_openai(criteria, steps, base_summary, generated_summary, eval_type)
        score_num = float(result.strip())
        scores[eval_type] = score_num

    scores['Duration'] = summary_result['duration']
    scores['Total Tokens'] = summary_result['tokens']
    scores['Content'] = summary_result['content']

    all_scores.append(scores)
    time.sleep(65)  # avoid rate limit

# Create the DataFrame
df = pd.DataFrame(all_scores)

# Label summaries
df.index = [f"Summary {i+1}" for i in range(num_summaries)]

# Display of df
display(df)

Generating summary for index: 1
Generating summary for index: 4
Generating summary for index: 6
Generating summary for index: 7
Generating summary for index: 13


Unnamed: 0,Relevance,Coherence,Consistency,Fluency,Duration,Total Tokens,Content
Summary 1,9.0,9.0,8.0,5.0,6.5916,3822,"The ""Human Rights Information Act"" calls for t..."
Summary 2,10.0,10.0,10.0,5.0,9.299045,4013,"The ""Native American Energy Act"" amends curren..."
Summary 3,9.0,9.0,7.0,5.0,8.867422,3637,This section of legislation amends the Element...
Summary 4,9.0,10.0,10.0,5.0,5.313913,4167,The Gallatin Land Consolidation Act of 1998 re...
Summary 5,10.0,10.0,10.0,5.0,3.958818,3201,"The ""Federal Agency Protection of Privacy Act""..."


In [None]:
df.to_csv('summary_eval_results_gpt4_v1.csv', index=True)