In [None]:
# Install the Hugging Face datasets library
!pip install datasets
!pip install deepeval
!pip install replicate
!pip install openai
!pip install -U datasets

In [None]:
pip install -U datasets

In [None]:
# Sanity check on the sample: report its size, then preview the first five records.
print(f"Number of sampled records: {len(sampled_data)}", sampled_data[:5], sep="\n")

In [None]:
from openai import OpenAI
import time

# Create the OpenAI client without embedding a secret in the notebook.
# With no `api_key` argument the client reads the key from the OPENAI_API_KEY
# environment variable, so set that variable before running this cell.
client = OpenAI()

def generate_summary(text):
    """Return a GPT-4o summary of `text` as a whitespace-stripped string.

    Sends one single-turn chat completion through the module-level `client`
    (model "gpt-4o", at most 512 completion tokens, temperature 0.7).

    Args:
        text: The article text to summarize.

    Returns:
        The stripped summary, or "" when the response carries no text content,
        so callers can treat a falsy result as a failed generation.

    Note:
        A later cell defines another `generate_summary` (Llama 3 via Replicate)
        under the same name; whichever cell ran last is the one in effect.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": f"Provide a summary of the following text without any introductory or concluding remarks:\n\n{text}"}],
        max_tokens=512,
        temperature=0.7
    )
    content = response.choices[0].message.content
    # `content` is optional in the API response; calling .strip() on None would
    # raise AttributeError and abort the whole summarization run.
    return content.strip() if content else ""

# Summarize every sampled article with GPT-4o, one request at a time.
# gpt_summaries stays index-aligned with sampled_data: exactly one entry per record.
gpt_summaries = []

for position, record in enumerate(sampled_data, start=1):
    summary = generate_summary(record['article'])
    if not summary:
        # Falsy result (None / empty): store a placeholder to keep the alignment.
        cleaned = "Error"
    elif isinstance(summary, list):
        # Chunked output: stitch the pieces back into a single string.
        cleaned = ''.join(summary).strip()
    else:
        cleaned = summary.strip()
    gpt_summaries.append(cleaned)

    # Throttle to at most 500 requests per minute (0.12 s between requests).
    time.sleep(0.12)

    # Progress report every ten records.
    if position % 10 == 0:
        print(f"{position} summaries generated")

# Preview the first five summaries, then report the total.
for number, summary in enumerate(gpt_summaries[:5], start=1):
    print(f"Summary {number}: {summary}")

print(f"Number of summaries generated: {len(gpt_summaries)}")

In [None]:
import replicate
import os
import time
# Never store the Replicate token in the notebook: take it from the environment
# and only prompt for it (without echoing) when it is missing.
# NOTE(review): a literal token was previously hardcoded on this line; treat that
# token as leaked and revoke it.
if not os.environ.get("REPLICATE_API_TOKEN"):
    from getpass import getpass
    os.environ["REPLICATE_API_TOKEN"] = getpass("Replicate API token: ")

# Model handle fetched from Replicate. The visible cells call replicate.run() with
# the model name directly and do not read this variable.
model = replicate.models.get("meta/meta-llama-3-70b-instruct")

def generate_summary(text):
    """Return a Llama 3 70B Instruct summary of `text` as a single string.

    Runs "meta/meta-llama-3-70b-instruct" on Replicate with at most 512 tokens
    and temperature 0.7.

    Args:
        text: The article text to summarize.

    Returns:
        The generated text as one string, or "" when the model returned nothing,
        so callers can treat a falsy result as a failed generation.

    Note:
        This definition replaces the earlier GPT-4o `generate_summary` from a
        previous cell; whichever cell ran last is the one in effect.
    """
    output = replicate.run(
        "meta/meta-llama-3-70b-instruct",
        input={"max_tokens": 512, "prompt": f"Provide a summary of the following text without any introductory or concluding remarks: {text}", "temperature": 0.7}
    )
    if output is None:
        return ""
    if isinstance(output, str):
        return output
    # Depending on the client version, text models yield their output as a list
    # or a lazy iterator of string chunks. Join them here so the caller always
    # receives a plain string (a non-list iterator has no .strip()).
    return ''.join(str(chunk) for chunk in output)

# Summarize every sampled article with Llama 3 70B, one request at a time.
# llama_summaries stays index-aligned with sampled_data: exactly one entry per record.
llama_summaries = []

for count, record in enumerate(sampled_data, start=1):
    summary = generate_summary(record['article'])
    if summary:
        # A list result is a sequence of text chunks; anything else is used as a string.
        joined = ''.join(summary) if isinstance(summary, list) else summary
        llama_summaries.append(joined.strip())
    else:
        # Falsy result (None / empty): store a placeholder to keep the alignment.
        llama_summaries.append("Error")

    # Throttle to at most 600 requests per minute (0.1 s between requests).
    time.sleep(0.1)

    # Progress report every ten records.
    if count % 10 == 0:
        print(f"{count} summaries generated")

# Preview the first five summaries, then report the total.
for number, summary in enumerate(llama_summaries[:5], start=1):
    print(f"Summary {number}: {summary}")

print(f"Number of summaries generated: {len(llama_summaries)}")

In [None]:
import pandas as pd
# Both summary lists are paired positionally with sampled_data below, so each of
# them must contain exactly one entry per sampled record.
assert len(llama_summaries) == len(sampled_data), "Mismatch between summaries and sampled data."
assert len(gpt_summaries) == len(sampled_data), "Mismatch between GPT summaries and sampled data."

# One row per article: its id, the source text, and the summary from each model.
# NOTE: the "intruct" spelling in the Llama column name is kept as-is because
# the scoring cell looks the column up by this exact name.
data_records = [
    {
        "id": record['id'],
        "article": record['article'],
        "generated_summary_orig_llama3-70b-intruct": llama_summary,
        "generated_summary_orig_gpt4o": gpt_summary,
    }
    for record, llama_summary, gpt_summary in zip(sampled_data, llama_summaries, gpt_summaries)
]

df = pd.DataFrame(data_records)

# Persist the generations so the scoring steps can be rerun without regenerating them.
df.to_csv("dailymail_generated_summaries.csv", index=False)
print(df.head())

In [None]:
from deepeval.metrics import ToxicityMetric
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import SummarizationMetric

In [None]:
def toxicity_score_generator(data:pd.DataFrame, target_col:str)-> pd.DataFrame:
    """Score the text in `target_col` of every row with deepeval's ToxicityMetric.

    Args:
        data: Frame holding an 'id' column and the column named by `target_col`.
        target_col: Name of the column whose text is evaluated.

    Returns:
        A frame with one row per input row and the columns 'id',
        'toxicity_score' and 'toxicity_justification'. The columns are present
        even when `data` is empty, so the result can always be merged on 'id'.
    """
    scores = []
    for _, row in data.iterrows():
        # A fresh metric per row; only its score and reason are recorded, the
        # threshold's pass/fail outcome is not read here.
        metric = ToxicityMetric(threshold=0)
        test_case = LLMTestCase(
            input="How toxic is this passage?",
            actual_output=row[target_col],
        )
        metric.measure(test_case)
        scores.append({
            "id": row['id'],
            'toxicity_score': metric.score,
            'toxicity_justification': metric.reason,
        })

    # Explicit columns keep the schema stable for empty input.
    return pd.DataFrame(scores, columns=["id", "toxicity_score", "toxicity_justification"])

In [None]:
def summary_score_generator(data: pd.DataFrame, article_col: str, summary_col: str) -> pd.DataFrame:
    """Score each row's summary against its article with deepeval's SummarizationMetric.

    Args:
        data: Frame holding an 'id' column plus the article and summary columns.
        article_col: Name of the column with the source text (the test-case input).
        summary_col: Name of the column with the summary to evaluate.

    Returns:
        A frame with one row per input row and the columns 'id',
        'summary_score' and 'summary_justification'. The columns are present
        even when `data` is empty, so the result can always be merged on 'id'.
    """
    scores = []
    for _, row in data.iterrows():
        # A fresh metric per row, judged by "gpt-4"; only its score and reason
        # are recorded, the threshold's pass/fail outcome is not read here.
        metric = SummarizationMetric(
            threshold=0,
            model="gpt-4",
            include_reason=True
        )
        test_case = LLMTestCase(
            input=row[article_col],
            actual_output=row[summary_col]
        )
        metric.measure(test_case)
        scores.append({
            "id": row['id'],
            'summary_score': metric.score,
            'summary_justification': metric.reason
        })

    # Explicit columns keep the schema stable for empty input.
    return pd.DataFrame(scores, columns=["id", "summary_score", "summary_justification"])


In [None]:
# Score the source articles and both sets of generated summaries.
tox_orig = toxicity_score_generator(df, 'article')
tox_llama = toxicity_score_generator(df, 'generated_summary_orig_llama3-70b-intruct')
tox_gpt = toxicity_score_generator(df, 'generated_summary_orig_gpt4o')
sum_llama = summary_score_generator(df, 'article', 'generated_summary_orig_llama3-70b-intruct')
sum_gpt = summary_score_generator(df, 'article', 'generated_summary_orig_gpt4o')


def _label_scores(scores, label):
    """Return `scores` with every column except 'id' renamed to '<column>_<label>'."""
    return scores.rename(columns=lambda name: name if name == 'id' else f"{name}_{label}")


# Merge all the scores.
# Each frame is labelled BEFORE merging. Relying on merge suffixes is unsafe here:
# suffixes only apply to overlapping column names, so the GPT toxicity columns
# used to come through unsuffixed and were then renamed to '*_orig', duplicating
# the original-article columns and dropping the '*_gpt' ones.
combined = (
    _label_scores(tox_orig, 'orig')
    .merge(_label_scores(tox_llama, 'llama'), on='id')
    .merge(_label_scores(tox_gpt, 'gpt'), on='id')
    .merge(_label_scores(sum_llama, 'llama'), on='id')
    .merge(_label_scores(sum_gpt, 'gpt'), on='id')
)