## Package Installations

In [None]:
%pip install sacremoses==0.0.53
%pip install datasets
%pip install transformers
%pip install torch torchvision torchaudio
%pip install rouge
%pip install bert_score
%pip install llmx typing_extensions openai
%pip install -q -U google-generativeai
%pip install bardapi

In [None]:
from datasets import load_dataset
from transformers import pipeline
from openai import OpenAI
import google.generativeai as genai
from bardapi import Bard

from rouge import Rouge
from bert_score import BERTScorer

import pandas as pd
import os
import json, gzip
from tqdm.notebook import tqdm

## Loading the dataset

In [None]:
xsum_dataset = load_dataset(
  "xsum",
  version="1.2.0",
  cache_dir='/Documents/Huggin_Face/data'
)

xsum_dataset

In [None]:
xsum_sample = xsum_dataset["train"].select(range(10))

display(xsum_sample.to_pandas())

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


## Model initialisation and getting the summary

### HuggingFace models

In [None]:
hf_models = ["t5-small", "facebook/bart-large-cnn", "google/pegasus-xsum", "philschmid/bart-large-cnn-samsum"]

In [None]:
summaries = pd.DataFrame()
summaries["human"] = xsum_sample["summary"]

In [None]:
for mod in hf_models:
  print('Model: ', mod)
  summarizer = pipeline(
      task="summarization",
      model=mod,
      min_length=20,
      max_length=40,
      truncation=True,
      model_kwargs={"cache_dir": '/Documents/Huggin_Face/'},
  )

  summary_gp = summarizer(xsum_sample["document"])
  summaries[mod] = pd.DataFrame(summary_gp)

Model:  t5-small
Model:  facebook/bart-large-cnn
Model:  google/pegasus-xsum


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model:  philschmid/bart-large-cnn-samsum


In [None]:
summaries

In [None]:
summarizer(xsum_sample["document"])

[{'summary_text': 'There was flooding in Newton Stewart after the River Cree overflowed into the town. First Minister Nicola Sturgeon visited the area to inspect the damage. Many roads in Peeblesshire'},
 {'summary_text': 'A fire started under one of the buses at the Holiday Inn in Hope Street at about 04:20 BST on Saturday. One of the tour groups is from Germany, the other'},
 {'summary_text': 'Stoffel Vandoorne out-qualified McLaren team-mate Jenson Button on his Formula 1 debut. Sebastian Vettel will start third ahead of Kimi Raikkonen'},
 {'summary_text': 'John Edward Bates, 67, is accused of sexually abusing two boys when he was a scout leader in South Lincolnshire and Cambridgeshire between 1972 and 1989. Mr Bates'},
 {'summary_text': 'Patients and staff were evacuated from Cerahpasa hospital on Wednesday after a man threatened to shoot himself and others. The man was receiving psychiatric treatment at the clinic for the past two'},
 {'summary_text': 'Simone Favaro scored the cru

In [None]:
a = [{'summary_text': 'Many roads in Peeblesshire remain badly affected by standing water. The full cost of damage in Newton Stewart is still being assessed. First Minister Nicola Sturgeon visited the area to inspect'},
 {'summary_text': 'Fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST. One of the tour groups is from Germany, the other from China and Taiwan. The driver of'},
 {'summary_text': 'Mercedes go half a second clear of Ferrari in qualifying. Sebastian Vettel will start third ahead of team-mate Kimi Raikkonen. Stoffel Vandoorne out-'},
 {'summary_text': 'John Edward Bates faces a total of 22 charges, including two counts of indecency with a child. The 67-year-old is accused of committing the offences between 1972 and 1989.'},
 {'summary_text': 'Patients and staff evacuated from Cerahpasa hospital after man threatens to shoot himself. Officers deployed to negotiate with the man, a young police officer. Incident comes amid tension in Istanbul'},
 {'summary_text': 'Rynard Landman and Ashton Hewitt score tries in either half. Chris Fusaro, Zander Fagerson and Junior Bulumakau also score tries. Simone Fav'},
 {'summary_text': 'Veronica Vanessa Chango-Alverez, 31, was killed and another man injured. Audi A3 struck them in Streatham High Road at 05:30 GMT on Saturday'},
 {'summary_text': 'The 25-year-old died after being hit by a motorbike during a race in Belgium. He was part of the Team Sky team competing in the Three Days of De Panne'},
 {'summary_text': 'Gundogan tore cruciate knee ligaments in December. The 26-year-old missed the 2014 World Cup after back surgery. He also missed Euro 2016 because of a disl'},
 {'summary_text': 'Crash happened at the junction of the A127 and Progress Road in Leigh-on-Sea, Essex. Man in his 20s airlifted to Royal London Hospital for further treatment.'}]

### GPT models

In [None]:
gpt_models = ["gpt-3.5-turbo", "gpt-4"]

In [None]:
client = OpenAI(
    api_key="",
)

for mod in gpt_models:
  summ = []

  for text in tqdm(xsum_sample["document"]):
    response = client.chat.completions.create(
        model = mod,
        messages = [{
            "role": "user",
            "content": "Summarise this: "+text
        }]
    )

    summ.append({'summary': response.choices[0].message.content})

  summaries[mod] = pd.DataFrame(summ)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
summaries

Unnamed: 0,human,t5-small,facebook/bart-large-cnn,google/pegasus-xsum,philschmid/bart-large-cnn-samsum,gpt-3.5-turbo
0,Clean-up operations are continuing across the ...,the full cost of damage in Newton Stewart is s...,Many roads in Peeblesshire remain badly affect...,A clean-up operation is under way in parts of ...,There was flooding in Newton Stewart after the...,"The areas of Newton Stewart, Hawick, and Peebl..."
1,Two tourist buses have been destroyed by fire ...,a fire alarm went off at the Holiday Inn in Ho...,Fire alarm went off at the Holiday Inn in Hope...,Two tourist buses have been destroyed in a sus...,A fire started under one of the buses at the H...,A fire alarm forced guests at the Holiday Inn ...
2,Lewis Hamilton stormed to pole position at the...,Sebastian Vettel will start third ahead of tea...,Mercedes go half a second clear of Ferrari in ...,Lewis Hamilton pipped Mercedes team-mate Nico ...,Stoffel Vandoorne out-qualified McLaren team-m...,"During qualifying for the Bahrain Grand Prix, ..."
3,A former Lincolnshire Police officer carried o...,the 67-year-old is accused of committing the o...,"John Edward Bates faces a total of 22 charges,...",A former Lincolnshire Police officer has gone ...,"John Edward Bates, 67, is accused of sexually ...","John Edward Bates, a 67-year-old man living in..."
4,An armed man who locked himself into a room at...,a man receiving psychiatric treatment at the c...,Patients and staff evacuated from Cerahpasa ho...,A stand-off between police and an armed man at...,Patients and staff were evacuated from Cerahpa...,Patients and staff were evacuated from Cerahpa...
5,Defending Pro12 champions Glasgow Warriors bag...,Gregor Townsend gave a debut to powerhouse win...,Rynard Landman and Ashton Hewitt score tries i...,Glasgow Warriors made it two wins out of two i...,Simone Favaro scored the crucial try with the ...,Simone Favaro scored a crucial try in the last...
6,A man with links to a car that was involved in...,"Veronica Vanessa Chango-Alverez, 31, was kille...","Veronica Vanessa Chango-Alverez, 31, was kille...",A man police want to trace in connection with ...,"Veronica Vanessa Chango-Alverez, 31, was kille...","Veronica Vanessa Chango-Alverez, 31, was kille..."
7,Welsh cyclist Luke Rowe says changes to the sp...,the 25-year-old was hit by a motorbike during ...,The 25-year-old died after being hit by a moto...,Welsh cyclist Luke Rowe has called for a speed...,Belgian cyclist Demoitie died after a collisio...,"A Belgian cyclist, Antoine Demoitie, died afte..."
8,Manchester City midfielder Ilkay Gundogan says...,gundogan will not be fit for the start of the ...,Gundogan tore cruciate knee ligaments in Decem...,Manchester City midfielder Ilkay Gundogan says...,Gundogan tore his cruciate knee ligaments in D...,"Ilkay Gundogan, the Manchester City midfielder..."
9,A jogger has been hit by an unmarked police ca...,the crash happened about 07:20 GMT at the junc...,Crash happened at the junction of the A127 and...,The Independent Police Complaints Commission (...,The crash happened at the junction of the A127...,A crash occurred at the junction of the A127 a...


### Gemini Pro

In [None]:
gemini_models = ['gemini-pro']
genai.configure(api_key="")

In [None]:
for mod in gemini_models:
  model = genai.GenerativeModel(mod)
  summ = []

  for text in tqdm(xsum_sample["document"]):
    prompt = "Summarise this: "+text
    try:
      summ.append({'summary': model.generate_content(prompt).text})
    except:
      summ.append({'summary': "No summary generated"})

  summaries[mod] = pd.DataFrame(summ)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
results = model.generate_content(prompt)

In [None]:
results.candidates

In [None]:
summaries

## Evaluation

### Rouge Evaluator

In [None]:
def get_rouge_scores(text1, text2, r_type='rouge-l'):
    rouge = Rouge()
    return rouge.get_scores(text1, text2)[0][r_type]['f']

In [None]:
models = hf_models + gpt_models + gemini_models
model_rouge = pd.DataFrame()

for mod in models:
  model_rouge[mod] = summaries.apply(lambda x: get_rouge_scores(x[mod], x["human"]), axis=1)

In [None]:
model_rouge

Unnamed: 0,t5-small,facebook/bart-large-cnn,google/pegasus-xsum,philschmid/bart-large-cnn-samsum,gpt-3.5-turbo,gemini-pro
0,0.136364,0.085106,0.228571,0.095238,0.136364,0.111111
1,0.15,0.046512,0.6875,0.095238,0.136986,0.189189
2,0.210526,0.205128,0.625,0.157895,0.211765,0.244444
3,0.047619,0.085106,0.3,0.083333,0.098361,0.0
4,0.2,0.150943,0.5,0.150943,0.183673,0.168421
5,0.162162,0.05,0.4,0.1,0.123894,0.191781
6,0.217391,0.088889,0.45,0.086957,0.138614,0.217391
7,0.058824,0.093023,0.578947,0.153846,0.141176,0.0
8,0.044444,0.095238,0.432432,0.097561,0.164948,0.102564
9,0.0,0.045455,0.222222,0.0,0.109589,0.102564


### BERTScorer

In [None]:
scorer = BERTScorer(lang="en")

def get_bert_scores(text1, text2):
  _, _, sc = scorer.score(text1.tolist(), text2.tolist())
  return sc

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
text1 = summaries

In [None]:
models = hf_models + gpt_models + gemini_models
model_bertscore = pd.DataFrame()

for mod in tqdm(models):
  model_bertscore[mod] = get_bert_scores(summaries['human'], summaries[mod])

  0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
model_bertscore

Unnamed: 0,t5-small,facebook/bart-large-cnn,google/pegasus-xsum,philschmid/bart-large-cnn-samsum,gpt-3.5-turbo,gemini-pro
0,0.841193,0.850072,0.931827,0.84552,0.85496,0.847783
1,0.859544,0.844913,0.952036,0.862819,0.865543,0.876786
2,0.875724,0.87975,0.980554,0.882914,0.893986,0.882008
3,0.851815,0.861004,0.928344,0.875173,0.857155,0.813564
4,0.885647,0.874825,0.924802,0.871389,0.884153,0.855279
5,0.805266,0.802855,0.894445,0.817318,0.821941,0.842248
6,0.852483,0.827069,0.924729,0.831085,0.854789,0.836437
7,0.831627,0.844906,0.954833,0.884319,0.87949,0.814794
8,0.852761,0.841722,0.93919,0.840888,0.874928,0.853159
9,0.822734,0.841315,0.883642,0.831138,0.840303,0.836959


### G-Eval

#### Evaluation Criteria

In [None]:
EVALUATION_PROMPT_TEMPLATE = """
You will be given one summary written for an article. Your task is to rate the summary on one metric.
Please make sure you read and understand these instructions very carefully.
Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Source Text:

{document}

Summary:

{summary}

Evaluation Form (scores ONLY):

- {metric_name}
"""

# Metric 1: Relevance

RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - selection of important content from the source. \
The summary should include only important information from the source document. \
Annotators were instructed to penalize summaries which contained redundancies and excess information.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the summary and the source document carefully.
2. Compare the summary to the source document and identify the main points of the article.
3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.
"""

# Metric 2: Coherence

COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences. \
We align this dimension with the DUC quality question of structure and coherence \
whereby "the summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to a\
coherent body of information about a topic."
"""

COHERENCE_SCORE_STEPS = """
1. Read the article carefully and identify the main topic and key points.
2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 3: Consistency

CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-5) - the factual alignment between the summary and the summarized source. \
A factually consistent summary contains only statements that are entailed by the source document. \
Annotators were also asked to penalize summaries that contained hallucinated facts.
"""

CONSISTENCY_SCORE_STEPS = """
1. Read the article carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
3. Assign a score for consistency based on the Evaluation Criteria.
"""

# Metric 4: Fluency

FLUENCY_SCORE_CRITERIA = """
Fluency(1-5): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
3: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
5: Good. The summary has few or no errors and is easy to read and follow.
"""

FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 5.
"""

#### With GPT-4

In [None]:
client = OpenAI(
    api_key="",
)

In [None]:
def get_geval_score(criteria, steps, document, summary, metric_name):
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        document=document,
        summary=summary,
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content


In [None]:
evaluation_metrics = {
    "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    "Consistency": (CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
    "Fluency": (FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS)
}

In [None]:
data = pd.read_json(path_or_buf='model_annotations.aligned.jsonl', lines=True)

data = data[['decoded', 'expert_annotations', 'turker_annotations', 'references']]
data = data.rename(columns={'decoded': 'summary', 'references': 'document'})

data['document'] = data['document'].apply(lambda x: x[0])

In [None]:
sample = data.sample(10)

In [None]:
sample

In [None]:
for i in range(sample.shape[0]):
  print(i, ': ', sample['summary'].iloc[i])

0 :  Paul Merson is not happy with Andros Townsend's call-up to the England squad last week
1 :  residents on the east coast of japan 's hokkaido island had an unexpected change of scenery this morning after 1,000 ft of seabed was forced to the surface overnight . the extra stretch of coastline on shiretoko peninsula near the town of rausu has risen as high as 50ft from the sea surface in some places , exposing what used to be the ocean floor . geologists believe the emergence is a result of a landslide nearby , when melting ice and snow caused a section of land to drop , pivoting the underwater area into the air .
2 :  manchester city are keen to sign anderlecht teenager evangelos patoulidis . the 14-year-old playmaker is regarded as one of the best talents to emerge . the belgian starlet rejected a move to barcelona 's la masia academy .
3 :  the german star was in fine form as the world cup winners beat georgia 2-0 in their euro 2016 qualifier on sunday , and is now setting his sigh

In [None]:
eval_results = pd.DataFrame()
eval_results.index = ['Relevance', 'Coherence', 'Consistency', 'Fluency']

for i in tqdm(range(sample.shape[0])):
  excerpt = sample['document'].iloc[i]
  summ = sample['summary'].iloc[i]
  result = []

  for eval_type, (criteria, steps) in evaluation_metrics.items():
    result.append(get_geval_score(criteria, steps, excerpt, summ, eval_type))

  eval_results['Summary_'+str(i)] = result

  0%|          | 0/10 [00:00<?, ?it/s]

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-cjie4qUe1tJrKFFvH6KYBXoJ on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
eval_results

In [73]:
mean_df = pd.DataFrame()

for i in range(sample.shape[0]):
  expert_ann = sample['expert_annotations'].iloc[i]
  mean_df['Summary_'+str(i)] = pd.DataFrame(expert_ann).mean().astype('int')

In [74]:
mean_df

Unnamed: 0,Summary_0,Summary_1,Summary_2,Summary_3,Summary_4,Summary_5,Summary_6,Summary_7,Summary_8,Summary_9
coherence,4,5,4,2,3,2,1,4,3,5
consistency,5,5,4,5,5,1,5,2,5,5
fluency,5,5,4,5,5,3,5,3,5,5
relevance,2,5,4,3,4,3,3,4,4,4


In [None]:
# summ_dict = {"Summary 1": summaries['human'][0], "Summary 2": summaries[mod][0]}
# excerpt = xsum_sample['document'][0]

In [None]:
print(xsum_sample[0]['document'])
print(xsum_sample[0]['summary'])
print(summaries['human'][0])

In [None]:
models = ['human'] + hf_models + gpt_models + gemini_models
eval_results = {}

for mod in models:
  eval_results[mod] = pd.DataFrame()
  eval_results[mod].index = ['Relevance', 'Coherence', 'Consistency', 'Fluency']

  for i, sample_data in tqdm(enumerate(xsum_sample)):
    excerpt = sample_data['document']
    summ = summaries[mod][i]
    result = []

    for eval_type, (criteria, steps) in evaluation_metrics.items():
      result.append(get_geval_score(criteria, steps, excerpt, summ, eval_type))

    eval_results[mod]['Summary_'+str(i)] = result

In [None]:
eval_results

## G4F

In [None]:
%pip install -U g4f[all]

In [None]:
import g4f

import nest_asyncio
nest_asyncio.apply()

g4f.debug.logging = True  # Enable debug logging
g4f.debug.version_check = False  # Disable automatic version checking

In [None]:
def get_g4feval_score(criteria, steps, document, summary, metric_name):
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        document=document,
        summary=summary,
    )
    response = g4f.ChatCompletion.create(
        model=g4f.models.gpt_4,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response

In [None]:
eval_results = pd.DataFrame()
eval_results.index = ['Relevance', 'Coherence', 'Consistency', 'Fluency']

for i in tqdm(range(sample.shape[0])):
  excerpt = sample['document'].iloc[i]
  summ = sample['summary'].iloc[i]
  result = []

  for eval_type, (criteria, steps) in evaluation_metrics.items():
    result.append(get_g4feval_score(criteria, steps, excerpt, summ, eval_type))

  eval_results['Summary_'+str(i)] = result

In [None]:
eval_results