## With reference_summary

In [3]:

import os
from dotenv import load_dotenv
import google.generativeai as genai
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score
import nltk
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bert_score')
warnings.filterwarnings("ignore", category=Warning)

# Ensure punkt tokenizer is available.
# nltk.data.find raises LookupError when the resource is not installed locally;
# only in that case is the download triggered.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# Load .env and configure Gemini.
# The API key is read from the GOOGLE_API_KEY environment variable. os.getenv
# returns None when the variable is unset, and nothing here checks for that.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# Module-level model handle used by get_summary and evaluate_with_gpt.
model = genai.GenerativeModel("models/gemini-2.5-flash-preview-04-17")

# Input and prompts
input_text = """
A groundbreaking AI model developed by researchers at TechLabs has achieved a 30% reduction in 
energy consumption in data centers.
The model uses reinforcement learning to dynamically manage server workloads, avoiding unnecessary energy usage.
This could lead to millions in savings and a significant reduction in environmental impact.
"""
reference_summary = (
    "TechLabs' new AI model reduces data center energy use by 30% using reinforcement learning to optimize workloads."
)

prompt_weak = "Say to me something about the text."
prompt_strong = (
    "You are a technical journalist. In 1–2 sentences, summarize the key innovation and its real-world impact for a general audience."
)

def get_summary(prompt, text):
    """Send *prompt* followed by *text* to the module-level Gemini ``model``.

    The instruction and the text are joined by one blank line and submitted
    as a single request. The reply text is returned with leading and trailing
    whitespace removed.
    """
    request = "{}\n\n{}".format(prompt, text)
    return model.generate_content(request).text.strip()

# Produce one summary per prompt from the same input text. Each line goes
# through get_summary, i.e. one model.generate_content request per prompt.
summary_weak = get_summary(prompt_weak, input_text)
summary_strong = get_summary(prompt_strong, input_text)

def evaluate_metrics(summary, reference):
    """Score *summary* against the gold *reference* with automatic metrics.

    Returns a dict holding the ROUGE-L F-measure, smoothed sentence-level
    BLEU and BERTScore F1 (each rounded to two decimals), plus the number of
    tokens in the summary under ``'Length'``.
    """
    # The summary tokens feed both BLEU and the length count.
    hypothesis_tokens = word_tokenize(summary, preserve_line=True)
    # sentence_bleu takes a list of references; there is exactly one here.
    reference_token_lists = [word_tokenize(reference, preserve_line=True)]

    rouge_result = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(reference, summary)
    rouge_f = rouge_result['rougeL'].fmeasure

    bleu_value = sentence_bleu(
        reference_token_lists,
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    # bert_score yields (precision, recall, F1); only F1 for the single
    # candidate/reference pair is kept.
    _precision, _recall, f1_values = bert_score([summary], [reference], lang="en", verbose=False)

    scores = {}
    scores['ROUGE-L'] = round(rouge_f, 2)
    scores['BLEU'] = round(bleu_value, 2)
    scores['BERTScore-F1'] = round(f1_values[0].item(), 2)
    scores['Length'] = len(hypothesis_tokens)
    return scores

def evaluate_with_gpt(summary, reference, criteria_description=None):
    """Ask the module-level Gemini ``model`` to grade *summary* against *reference*.

    Args:
        summary: Candidate summary to grade.
        reference: Gold summary the candidate is compared with.
        criteria_description: Optional replacement for the built-in rubric
            prompt. The reply must still contain the five default keys.

    Returns:
        The dict parsed from the model reply. If the reply cannot be read or
        parsed, is not a dict, or lacks one of the five expected keys, a
        warning is printed and a neutral dict (3 for every criterion) is
        returned instead.
    """
    # Function-scope imports keep this block usable without touching the
    # cell's import section.
    import ast
    import json
    import re

    expected_keys = ["Relevance", "Clarity", "Conciseness", "Correctness", "Instruction Adherence"]

    criteria_description = criteria_description or """
You are an expert summarization evaluator. Evaluate the following summary according to these five criteria from 1 (poor) to 5 (excellent):

1. Relevance – Is the summary on-topic and aligned with the reference?
2. Clarity – Is the summary easy to read and grammatically correct?
3. Conciseness – Is it brief and free of unnecessary information?
4. Correctness – Are all factual details (like numbers) accurate?
5. Instruction Adherence – Does the summary follow the intended task (e.g., journalistic tone, impact focus)?
Return only a dictionary in Python format with these five keys.
"""
    full_prompt = f"{criteria_description}\n\nReference:\n{reference}\n\nSummary:\n{summary}\n\nEvaluation:"

    response = model.generate_content(full_prompt)

    # Kept outside the try so the failure branch can report it without
    # touching response.text a second time.
    raw_text = None
    try:
        raw_text = response.text

        # Take everything from the first "{" to the last "}". This drops
        # Markdown code fences (```python, ```json, ...) and any prose the
        # model puts around the dictionary.
        match = re.search(r"\{.*\}", raw_text, re.DOTALL)
        if match is None:
            raise ValueError("no dictionary literal found in the model reply")
        payload = match.group(0)

        # The reply is model-generated text, so it is parsed as a literal and
        # never executed. JSON is accepted as a fallback format.
        try:
            result = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError):
            result = json.loads(payload)

        if not isinstance(result, dict):
            raise ValueError(f"expected a dict, got {type(result).__name__}")
        missing = [k for k in expected_keys if k not in result]
        if missing:
            raise ValueError(f"missing expected keys: {missing}")
        return result
    except Exception as e:
        # Deliberate best-effort: a bad reply must not abort the comparison.
        print("⚠️ GPT evaluation parsing failed:", e)
        print("Raw response:", raw_text)
        return {k: 3 for k in expected_keys}


# Evaluate both summaries
# Automatic metrics: each summary is compared with the gold reference summary.
metrics_weak = evaluate_metrics(summary_weak, reference_summary)
metrics_strong = evaluate_metrics(summary_strong, reference_summary)


# LLM-as-judge rubric scores for the same two summaries (stored under the
# name "human_*" although they come from evaluate_with_gpt).
human_weak = evaluate_with_gpt(summary_weak, reference_summary)
human_strong = evaluate_with_gpt(summary_strong, reference_summary)


# Define weighted scoring
# The eight weights below sum to 1.0. "Length" is reported by
# evaluate_metrics but has no weight, so it does not affect the final score.
# NOTE(review): the judge is asked for 1-5 scores while the automatic metrics
# in the recorded output are all below 1, and no rescaling happens before
# they are combined -- the weighted score is therefore dominated by the judge
# terms. Confirm this is intended.
weights = {
    # automatic
    "ROUGE-L":      0.10,
    "BLEU":         0.05,
    "BERTScore-F1": 0.05,
    # LLM-as-judge: keys must match exactly what evaluate_with_gpt returns
    "Relevance":            0.30,
    "Clarity":              0.15,
    "Conciseness":          0.15,
    "Correctness":          0.10,
    "Instruction Adherence":0.10,
}


def compute_weighted(metrics, human):
    """Combine automatic and judge scores into one weighted sum.

    The two dicts are merged (entries in *human* win on a key clash) and each
    key of the module-level ``weights`` is looked up in the merged dict, so a
    weighted key missing from both inputs raises ``KeyError``. Values are
    used as given, without rescaling. The sum is rounded to three decimals.
    """
    merged = dict(metrics)
    merged.update(human)

    total = 0
    for name, weight in weights.items():
        total = total + merged[name] * weight
    return round(total, 3)

# Final weighted score per prompt (automatic metrics + judge rubric).
score_weak = compute_weighted(metrics_weak, human_weak)
score_strong = compute_weighted(metrics_strong, human_strong)



# Print output as requested
print("📝 Weak Prompt Summary:")
print(summary_weak)
print(f"\n  ROUGE-L: {metrics_weak['ROUGE-L']}")
print(f"  BLEU: {metrics_weak['BLEU']}")
print(f"  BERTScore-F1: {metrics_weak['BERTScore-F1']}")
print(f"  Length: {metrics_weak['Length']}\n")

print("📝 Strong Prompt Summary:")
print(summary_strong)
print(f"\n  ROUGE-L: {metrics_strong['ROUGE-L']}")
print(f"  BLEU: {metrics_strong['BLEU']}")
print(f"  BERTScore-F1: {metrics_strong['BERTScore-F1']}")
print(f"  Length: {metrics_strong['Length']}\n")

print(f"🏆 Weighted Score (Weak): {score_weak}")
print(f"🏆 Weighted Score (Strong): {score_strong}\n")

# The verdict compares only the two weighted scores. Equal scores are
# reported as a tie rather than being credited to the weak prompt.
if score_strong > score_weak:
    print("✅ Strong prompt performed better across the majority of metrics.")
elif score_strong < score_weak:
    print("✅ Weak prompt performed better across the majority of metrics.")
else:
    print("🤝 The two prompts are tied under the current weighting scheme.")



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📝 Weak Prompt Summary:
Okay, based on the text:

It describes a new **AI model** developed by **TechLabs researchers**. This model uses **reinforcement learning** to significantly reduce **energy consumption** (specifically a **30% reduction**) in **data centers** by dynamically managing server workloads. The anticipated benefits are **millions in savings** and a **significant reduction in environmental impact**.

  ROUGE-L: 0.26
  BLEU: 0.01
  BERTScore-F1: 0.87
  Length: 90

📝 Strong Prompt Summary:
Here's a summary:

A groundbreaking AI developed by TechLabs can cut energy use in data centers by 30% through smarter server management. This promises massive savings for businesses and a significant positive impact on the environment.

  ROUGE-L: 0.19
  BLEU: 0.03
  BERTScore-F1: 0.9
  Length: 40

🏆 Weighted Score (Weak): 3.42
🏆 Weighted Score (Strong): 3.916

✅ Strong prompt performed better across the majority of metrics.


## Without reference_summary

In [4]:
# ── Imports & setup ────────────────────────────────────────────────────────────
import os, warnings, nltk
from dotenv import load_dotenv
import google.generativeai as genai          # Gemini SDK
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score   # For semantic similarity

warnings.filterwarnings("ignore", category=UserWarning, module="bert_score")

# Make sure NLTK's punkt tokenizer is available.
# nltk.data.find raises LookupError when the resource is not installed locally;
# only in that case is the download triggered.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# Load Gemini API key & choose a model.
# The key is read from the GOOGLE_API_KEY environment variable. os.getenv
# returns None when the variable is unset, and nothing here checks for that.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# Module-level model handle used by get_summary and evaluate_with_gpt_no_ref.
model = genai.GenerativeModel("models/gemini-2.5-flash-preview-04-17")

# ── Source document & prompts ─────────────────────────────────────────────────
source_text = """
A groundbreaking AI model developed by researchers at TechLabs has achieved a 30 % reduction in 
energy consumption in data centers. The model uses reinforcement learning to dynamically manage
server workloads, avoiding unnecessary energy usage. This breakthrough could lead to millions of
dollars in savings and a significant reduction in environmental impact.
"""

prompt_weak   = "Say to me something about the text."
prompt_strong = (
    "You are a technical journalist. In 1-2 sentences, summarize the key innovation and its real-"
    "world impact for a general audience."
)

# ── Summarisation helper ──────────────────────────────────────────────────────
def get_summary(prompt: str, text: str) -> str:
    """Return Gemini's reply to *prompt* applied to *text*, whitespace-stripped."""
    # Instruction first, then a blank line, then the document itself.
    return model.generate_content(f"{prompt}\n\n{text}").text.strip()

# One get_summary call (one model.generate_content request) per prompt, both
# over the same source text.
summary_weak   = get_summary(prompt_weak,   source_text)
summary_strong = get_summary(prompt_strong, source_text)

# ── Automatic metrics (reference-free) ────────────────────────────────────────
def evaluate_metrics(summary: str, source: str) -> dict:
    """
    Reference-free automatic metrics: the source document stands in for a gold summary.

    Returns ROUGE-L recall, smoothed sentence-level BLEU and BERTScore-F1
    (each rounded to two decimals) plus the summary's token count ("Length").
    """
    summary_tokens = word_tokenize(summary, preserve_line=True)
    source_tokens = word_tokenize(source, preserve_line=True)

    # Recall is taken with the source as the scorer's first argument, i.e.
    # how much of the source the summary covers.
    rouge_result = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True).score(source, summary)
    coverage = rouge_result["rougeL"].recall

    # The source is the single "reference" for BLEU.
    bleu_value = sentence_bleu(
        [source_tokens],
        summary_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    # bert_score yields (precision, recall, F1); only F1 for the single pair is kept.
    _precision, _recall, f1_values = bert_score([summary], [source], lang="en", verbose=False)

    scores = {}
    scores["ROUGE-L"] = round(coverage, 2)
    scores["BLEU"] = round(bleu_value, 2)
    scores["BERTScore-F1"] = round(f1_values[0].item(), 2)
    scores["Length"] = len(summary_tokens)
    return scores

# ── LLM-as-Judge (no reference) ───────────────────────────────────────────────
def evaluate_with_gpt_no_ref(summary: str, source_doc: str,
                             criteria_description: str | None = None) -> dict:
    """
    Ask Gemini to rate a summary **without any gold reference**.

    The LLM sees the SOURCE and the SUMMARY and is asked for a 1-5 score per rubric item.

    Args:
        summary: Candidate summary to grade.
        source_doc: The document the summary was generated from.
        criteria_description: Optional replacement for the built-in rubric
            prompt. The reply must still contain the five default keys.

    Returns:
        The dict parsed from the model reply. If the reply cannot be read or
        parsed, is not a dict, or lacks one of the five expected keys, a
        warning is printed and a neutral dict (3 for every criterion) is
        returned instead.
    """
    # Function-scope imports keep this block usable without touching the
    # cell's import section.
    import ast
    import json
    import re

    expected_keys = ["Faithfulness", "Coverage", "Clarity", "Conciseness", "Fluency"]

    criteria_description = criteria_description or """
You are an expert summarisation evaluator. Judge the SUMMARY only against the SOURCE according to
these five criteria, each on a 1-5 scale (1 = poor, 5 = excellent):
1. Faithfulness – No hallucinations; every claim is supported by the source.
2. Coverage     – Captures the main, important points of the source.
3. Clarity      – Easy to read; well structured; no jargon unless explained.
4. Conciseness  – Succinct yet complete.
5. Fluency      – Grammatically correct and natural English.

Return a valid Python dict exactly like:
{"Faithfulness": 4, "Coverage": 5, "Clarity": 5, "Conciseness": 4, "Fluency": 5}
"""
    prompt = f"{criteria_description.strip()}\n\nSOURCE:\n{source_doc}\n\nSUMMARY:\n{summary}\n\nEVALUATION:"
    response = model.generate_content(prompt)

    # Kept outside the try so the failure branch can report it without
    # touching response.text a second time.
    raw_text = None
    try:
        raw_text = response.text

        # Take everything from the first "{" to the last "}". This drops
        # Markdown code fences (```python, ```json, ...) and any prose the
        # model puts around the dictionary.
        match = re.search(r"\{.*\}", raw_text, re.DOTALL)
        if match is None:
            raise ValueError("no dictionary literal found in the model reply")
        payload = match.group(0)

        # The reply is model-generated text, so it is parsed as a literal and
        # never executed. JSON is accepted as a fallback format.
        try:
            result = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError):
            result = json.loads(payload)

        if not isinstance(result, dict):
            raise ValueError(f"expected a dict, got {type(result).__name__}")
        missing = [k for k in expected_keys if k not in result]
        if missing:
            raise ValueError(f"missing expected keys: {missing}")
        return result
    except Exception as err:
        # Deliberate best-effort: a bad reply must not abort the comparison.
        print("⚠️  Gemini evaluation parsing failed:", err)
        print("Raw response:\n", raw_text)
        # Fall back to neutral (3) for each criterion
        return {k: 3 for k in expected_keys}

# ── Gather scores ────────────────────────────────────────────────────────────
# Automatic metrics: each summary is compared with the source text itself,
# since this cell has no gold reference summary.
metrics_weak   = evaluate_metrics(summary_weak,   source_text)
metrics_strong = evaluate_metrics(summary_strong, source_text)

# LLM-as-judge rubric scores (stored under the name "human_*" although they
# come from evaluate_with_gpt_no_ref).
human_weak     = evaluate_with_gpt_no_ref(summary_weak,   source_text)
human_strong   = evaluate_with_gpt_no_ref(summary_strong, source_text)

# Combine all scores
def weighted_score(auto: dict, human: dict, weights: dict) -> float:
    """Return the weighted sum of the scores named in *weights*.

    Each key of *weights* is looked up in *auto* first and in *human* second.
    A value of 0 is a valid score and is used as-is; only a missing key (or
    an explicit ``None``) falls through to the other dict.

    Args:
        auto: Automatic metric scores keyed by metric name.
        human: Rubric scores keyed by criterion name.
        weights: Weight per metric/criterion; only these keys contribute.

    Returns:
        The weighted sum rounded to three decimals.

    Raises:
        KeyError: If a weighted key has no value in either dict.
    """
    total = 0
    for key, weight in weights.items():
        value = auto.get(key)
        # Test for None explicitly: a truthiness test would discard a real
        # 0.0 score (e.g. a BLEU that rounds to zero).
        if value is None:
            value = human.get(key)
        if value is None:
            raise KeyError(f"no score available for weighted metric {key!r}")
        total += weight * value
    return round(total, 3)

# The eight weights below sum to 1.0. "Length" is reported by
# evaluate_metrics but has no weight, so it does not affect the final score.
# NOTE(review): the judge is asked for 1-5 scores while the automatic metrics
# in the recorded output are all below 1, and no rescaling happens before
# they are combined -- the weighted score is therefore dominated by the judge
# terms. Confirm this is intended.
weights = {
    # automatic
    "ROUGE-L":      0.10,
    "BLEU":         0.05,
    "BERTScore-F1": 0.05,
    # LLM-as-judge: keys must match what evaluate_with_gpt_no_ref returns
    "Faithfulness": 0.30,
    "Coverage":     0.10,
    "Clarity":      0.15,
    "Conciseness":  0.15,
    "Fluency":      0.10,
}

# Final weighted score per prompt (automatic metrics + judge rubric).
score_weak   = weighted_score(metrics_weak,   human_weak,   weights)
score_strong = weighted_score(metrics_strong, human_strong, weights)

# ── Pretty print results ──────────────────────────────────────────────────────
def report(name, summary, auto, human):
    """Print one summary followed by its automatic metrics and rubric scores.

    Output starts with a blank line and a ``=== <name> SUMMARY ===`` banner,
    then the summary text, then one titled section per score dict. Each
    score line shows the metric name left-aligned in a 12-character column.
    """
    sections = (
        ("‒-‒- Automatic metrics", auto),
        ("‒-‒- Gemini rubric", human),
    )

    print(f"\n=== {name} SUMMARY ===")
    print(summary)
    for heading, scores in sections:
        print(heading)
        for label, value in scores.items():
            print(f"  {label:12}: {value}")

# Per-summary breakdown: text, automatic metrics, then the Gemini rubric.
report("WEAK",   summary_weak,   metrics_weak,   human_weak)
report("STRONG", summary_strong, metrics_strong, human_strong)

print(f"\n🏆 Weighted Score (Weak)  : {score_weak}")
print(f"🏆 Weighted Score (Strong): {score_strong}\n")

# The verdict compares only the two weighted scores; no per-metric tally is
# made, despite the "majority of metrics" wording in the messages.
if score_strong > score_weak:
    print("✅ The *strong* prompt performed better across the majority of metrics.")
elif score_strong < score_weak:
    print("✅ The *weak* prompt performed better across the majority of metrics.")
else:
    print("🤝 The two prompts are tied under the current weighting scheme.")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== WEAK SUMMARY ===
Okay, here's something about the text:

The text describes a **groundbreaking AI model** developed by **TechLabs researchers**. Its main achievement is significantly reducing **energy consumption in data centers** by **30%** using **reinforcement learning**. This innovation is expected to bring about **large financial savings** and a **major positive environmental impact**.
‒-‒- Automatic metrics
  ROUGE-L     : 0.45
  BLEU        : 0.06
  BERTScore-F1: 0.87
  Length      : 84
‒-‒- Gemini rubric
  Faithfulness: 5
  Coverage    : 5
  Clarity     : 4
  Conciseness : 4
  Fluency     : 4

=== STRONG SUMMARY ===
Researchers have developed an AI that achieves a 30% reduction in data center energy consumption by intelligently managing server workloads. This breakthrough promises substantial cost savings and a significant positive impact on the environment.
‒-‒- Automatic metrics
  ROUGE-L     : 0.35
  BLEU        : 0.13
  BERTScore-F1: 0.91
  Length      : 36
‒-‒- Gemini