## Initialisation

In [None]:
# Install the packages this notebook uses into the kernel's environment.
# Only sacremoses is pinned to an exact version; every other package resolves
# to whatever release is current at install time.
%pip install sacremoses==0.0.53
%pip install datasets
%pip install transformers
%pip install torch torchvision torchaudio
%pip install rouge
%pip install bert_score
%pip install llmx typing_extensions openai
# -q silences pip's output, -U upgrades the package if it is already installed.
%pip install -q -U google-generativeai

In [None]:
from datasets import load_dataset
from transformers import pipeline
from openai import OpenAI
import google.generativeai as genai

from rouge import Rouge
from bert_score import BERTScorer

import pandas as pd
import os
import json, gzip
from tqdm.notebook import tqdm
import enum

## Data for evaluation

In [None]:
# Load the "xsum" dataset (config version 1.2.0), using the given cache directory.
# NOTE(review): the cache directory is an absolute path hard-coded for one machine.
xsum_dataset = load_dataset("xsum", version="1.2.0", cache_dir='/Documents/Huggin_Face/data')

# Bare expression so the notebook renders the loaded dataset object.
xsum_dataset

In [None]:
# Work with the first ten rows of the "train" split only, and show them as a table.
xsum_sample = xsum_dataset["train"].select(range(0, 10))
display(xsum_sample.to_pandas())

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


## Evaluation template and criteria

In [None]:
# Prompt skeleton shared by every metric. It is filled with str.format(), which
# substitutes {criteria}, {steps}, {document}, {summary} and {metric_name}.
# The prompt ends with "- {metric_name}" under "Evaluation Form (scores ONLY)",
# i.e. the model is asked to complete that line with just the score.
EVALUATION_PROMPT_TEMPLATE = """
You will be given one summary written for an article. Your task is to rate the summary on one metric.
Please make sure you read and understand these instructions very carefully.
Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Source Text:

{document}

Summary:

{summary}

Evaluation Form (scores ONLY):

- {metric_name}
"""

# Metric 1: Relevance (scored 1-5)
# CRITERIA fills the template's {criteria} slot, STEPS fills {steps}.
# The trailing backslashes are line continuations inside the string literal:
# the three criteria lines are joined into one line of prompt text.

RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - selection of important content from the source. \
The summary should include only important information from the source document. \
Annotators were instructed to penalize summaries which contained redundancies and excess information.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the summary and the source document carefully.
2. Compare the summary to the source document and identify the main points of the article.
3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.
"""

# Metric 2: Coherence (scored 1-5)
# CRITERIA fills the template's {criteria} slot, STEPS fills {steps}.
# A trailing backslash joins the next source line directly onto the current one,
# so every continued line must end with a space before the backslash; without it
# the words on either side of the break are fused (previously "a" + "coherent").

COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences. \
We align this dimension with the DUC quality question of structure and coherence \
whereby "the summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to a \
coherent body of information about a topic."
"""

COHERENCE_SCORE_STEPS = """
1. Read the article carefully and identify the main topic and key points.
2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 3: Consistency (scored 1-5)
# CRITERIA fills the template's {criteria} slot, STEPS fills {steps}.
# The trailing backslashes join the three criteria lines into one line of prompt text.

CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-5) - the factual alignment between the summary and the summarized source. \
A factually consistent summary contains only statements that are entailed by the source document. \
Annotators were also asked to penalize summaries that contained hallucinated facts.
"""

CONSISTENCY_SCORE_STEPS = """
1. Read the article carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
3. Assign a score for consistency based on the Evaluation Criteria.
"""

# Metric 4: Fluency
# Unlike the other three metrics (1-5), fluency is scored on a 1-3 scale whose
# levels are spelled out in the criteria text itself.
# CRITERIA fills the template's {criteria} slot, STEPS fills {steps}.

FLUENCY_SCORE_CRITERIA = """
Fluency(1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
3: Good. The summary has few or no errors and is easy to read and follow.
"""

FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 3.
"""


## G-Eval

In [None]:
# OpenAI client used by get_geval_score().
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential has to be typed into (and saved with) the notebook. When the
# variable is unset the client is built with an empty key, exactly as before.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
)

In [None]:
def get_geval_score(criteria, steps, document, summary, metric_name, model="gpt-4"):
    """Ask an OpenAI chat model to score one summary on one metric.

    The five text arguments are substituted into EVALUATION_PROMPT_TEMPLATE and
    the resulting prompt is sent as a single user message through the
    module-level `client`.

    Args:
        criteria: Text for the template's {criteria} slot.
        steps: Text for the template's {steps} slot.
        document: Source article the summary was written for.
        summary: Summary to be rated.
        metric_name: Name of the metric, placed on the final "- ..." line of
            the prompt.
        model: Chat model to query. Defaults to "gpt-4", the value that was
            previously hard-coded.

    Returns:
        The message content of the first choice, exactly as returned by the
        API (not parsed into a number).
    """
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        document=document,
        summary=summary,
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        # The prompt asks for the score only, so the reply is capped at a few tokens.
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content


In [None]:
# Metric name -> (criteria text, steps text); iterated in this order when scoring.
evaluation_metrics = dict(
    Relevance=(RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    Coherence=(COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    Consistency=(CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
    Fluency=(FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
)

In [None]:
# Score every sampled summary on every metric with G-Eval.
# Rows are the metrics, columns are "Summary_<i>", values are the raw replies.
# The row labels come from evaluation_metrics itself so they always match the
# order in which the scores are produced below.
eval_results = pd.DataFrame(index=list(evaluation_metrics))

# tqdm wraps the dataset rather than the enumerate() generator: a generator has
# no length, which left the progress bar without a total ("0it ... ?it/s").
for i, sample_data in enumerate(tqdm(xsum_sample)):
  excerpt = sample_data['document']
  summary = sample_data['summary']

  # One column per summary, written as soon as it is complete so that partial
  # results survive if a later API call fails.
  eval_results['Summary_'+str(i)] = [
    get_geval_score(criteria, steps, excerpt, summary, eval_type)
    for eval_type, (criteria, steps) in evaluation_metrics.items()
  ]

0it [00:00, ?it/s]

In [None]:
# Display the score table: one row per metric, one "Summary_<i>" column per sample.
eval_results

Unnamed: 0,Summary_0,Summary_1,Summary_2,Summary_3,Summary_4,Summary_5,Summary_6,Summary_7,Summary_8,Summary_9
Relevance,2,3,2,3,4,4,2,2.0,2,4
Coherence,2,4,2,2,4,4,2,2.0,2,4
Consistency,5,5,5,5,4,5,5,3.5,5,4
Fluency,3,3,3,3,3,3,3,3.0,3,3


## With Gemini?

In [None]:
# Gemini model names available to this notebook; the first one is used below.
gemini_models = ['gemini-pro']

# The key is read from the GOOGLE_API_KEY environment variable so that no
# credential has to be typed into (and saved with) the notebook. When the
# variable is unset an empty key is passed, exactly as before.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Model handle used by get_gemini_score() and the ad-hoc cells that follow.
model = genai.GenerativeModel(gemini_models[0])

In [None]:
# Re-declares the metric table (identical to the one in the G-Eval section):
# metric name -> (criteria text, steps text).
evaluation_metrics = dict(
    Relevance=(RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    Coherence=(COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    Consistency=(CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
    Fluency=(FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
)

In [None]:
def get_gemini_score(criteria, steps, document, summary, metric_name):
    """Ask the configured Gemini model to score one summary on one metric.

    The five text arguments are substituted into EVALUATION_PROMPT_TEMPLATE and
    the resulting prompt is sent through the module-level `model`.

    Args:
        criteria: Text for the template's {criteria} slot.
        steps: Text for the template's {steps} slot.
        document: Source article the summary was written for.
        summary: Summary to be rated.
        metric_name: Name of the metric, placed on the final "- ..." line of
            the prompt.

    Returns:
        The `.text` of the generation response (not parsed into a number).
    """
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        document=document,
        summary=summary,
    )

    # Take `.text` exactly once: the previous code applied it to the response
    # and then again to the resulting str, which raised AttributeError.
    response = model.generate_content(prompt)

    return response.text

In [None]:
# Build one Relevance prompt by hand, for the first sampled article/summary pair.
prompt = EVALUATION_PROMPT_TEMPLATE.format(
    document=xsum_sample['document'][0],
    summary=xsum_sample['summary'][0],
    metric_name="Relevance",
    criteria=RELEVANCY_SCORE_CRITERIA,
    steps=RELEVANCY_SCORE_STEPS,
)

In [None]:
# Send the hand-built prompt to the Gemini model and keep the whole response object.
response = model.generate_content(prompt)

In [None]:
# Inspect the response's `candidates` attribute (bare expression, rendered as cell output).
response.candidates

In [None]:
# Score every sampled summary on every metric with Gemini.
# Rows are the metrics, columns are "Summary_<i>", values are the raw replies.
# The row labels come from evaluation_metrics itself so they always match the
# order in which the scores are produced below.
eval_results = pd.DataFrame(index=list(evaluation_metrics))

# The pre-loop assignments of `excerpt` and `summary` were dropped: the loop
# overwrites both on its first iteration, so they had no effect.
# tqdm wraps the dataset rather than the enumerate() generator: a generator has
# no length, which leaves the progress bar without a total.
for i, sample_data in enumerate(tqdm(xsum_sample)):
  excerpt = sample_data['document']
  summary = sample_data['summary']

  # One column per summary, written as soon as it is complete so that partial
  # results survive if a later API call fails.
  eval_results['Summary_'+str(i)] = [
    get_gemini_score(criteria, steps, excerpt, summary, eval_type)
    for eval_type, (criteria, steps) in evaluation_metrics.items()
  ]

## G4F

In [None]:
# Install (or upgrade, -U) g4f together with its "all" extras; unpinned, so the
# version depends on when the cell is run.
%pip install -U g4f[all]

In [13]:
# Smoke test of g4f: configure it, then send a single "Hello" chat message.
# NOTE(review): these imports live here rather than in the notebook's import cell.
import g4f

# NOTE(review): presumably needed so g4f's async code can run inside the
# notebook's already-running event loop; confirm.
import nest_asyncio
nest_asyncio.apply()

g4f.debug.logging = True  # Enable debug logging
g4f.debug.version_check = False  # Disable automatic version checking
print(g4f.Provider.Bing.params)  # Print the arguments supported by the Bing provider

# No provider is passed below, so g4f selects one for the given model.
# Streamed completion (kept for reference, disabled):
# response = g4f.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=[{"role": "user", "content": "Hello"}],
#     stream=True,
# )
# for message in response:
#     print(message, flush=True, end='')

# Non-streamed completion: the reply is returned as a whole.
response = g4f.ChatCompletion.create(
    model=g4f.models.gpt_4,
    messages=[{"role": "user", "content": "Hello"}],
)  # Model given as a g4f.models object instead of a name string

print(response)

g4f.Provider.Bing supports: (
    model: str
    messages: Messages
    proxy: str = None
    timeout: int = 900
    cookies: dict = None
    connector: BaseConnector = None
    tone: str = Balanced
    image: ImageType = None
    web_search: bool = False
)
Using RetryProvider provider and gpt-4 model
Using Bing provider
Bing: Exception: CaptchaChallenge: Use other cookies or/and ip address
Using Liaobots provider
Hello! How can I assist you today?


In [14]:
# Display the reply held in `response` (bare expression, rendered as cell output).
response

'Hello! How can I assist you today?'