In [1]:
import ast
import os

import google.generativeai as genai
import nltk
from bert_score import score as bert_score
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Ensure punkt tokenizer is available
# nltk.data.find raises LookupError when the resource is not installed locally;
# only in that case is it downloaded.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# Load .env and configure Gemini
# load_dotenv() must run before os.getenv so that a key stored in a .env file is visible.
load_dotenv()
# NOTE(review): os.getenv returns None when GOOGLE_API_KEY is unset; there is no
# explicit check here, so a missing key is only noticed later.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# Shared model handle used by get_summary() and evaluate_with_gpt() below.
model = genai.GenerativeModel("models/gemini-2.5-flash-preview-04-17")

# Input and prompts
# Source passage that both prompts ask the model to talk about / summarize.
input_text = """
A groundbreaking AI model developed by researchers at TechLabs has achieved a 30% reduction in 
energy consumption in data centers.
The model uses reinforcement learning to dynamically manage server workloads, avoiding unnecessary energy usage.
This could lead to millions in savings and a significant reduction in environmental impact.
"""
# Reference summary that every generated summary is scored against.
reference_summary = (
    "TechLabs' new AI model reduces data center energy use by 30% using reinforcement learning to optimize workloads."
)

# Two prompts for the same text: a vague one and one that fixes role, length and focus.
prompt_weak = "Say to me something about the text."
prompt_strong = (
    "You are a technical journalist. In 1–2 sentences, summarize the key innovation and its real-world impact for a general audience."
)

def get_summary(prompt, text):
    """Ask the shared Gemini ``model`` to respond to ``prompt`` about ``text``.

    The instruction and the passage are sent as one message, separated by a
    blank line.

    Args:
        prompt: Instruction placed at the top of the message.
        text: Passage appended after the instruction.

    Returns:
        The model's reply text with surrounding whitespace removed.
    """
    reply = model.generate_content(f"{prompt}\n\n{text}")
    reply_text = reply.text
    return reply_text.strip()

# Generate one summary per prompt for the same input text (each call queries the model).
summary_weak = get_summary(prompt_weak, input_text)
summary_strong = get_summary(prompt_strong, input_text)

def evaluate_metrics(summary, reference):
    """Score ``summary`` against ``reference`` with automatic overlap metrics.

    Args:
        summary: Candidate summary text.
        reference: Reference summary text.

    Returns:
        A dict with the keys ``'ROUGE-L'`` (stemmed ROUGE-L F-measure),
        ``'BLEU'`` (smoothed sentence-level BLEU), ``'BERTScore-F1'`` and
        ``'Length'`` (number of tokens in ``summary``). The three scores are
        rounded to two decimals.
    """
    # ROUGE-L: the scorer takes the reference first, then the candidate.
    rouge_l = (
        rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        .score(reference, summary)['rougeL']
        .fmeasure
    )

    # BLEU: one tokenized reference, wrapped in a list as sentence_bleu expects.
    candidate_tokens = word_tokenize(summary, preserve_line=True)
    reference_tokens = word_tokenize(reference, preserve_line=True)
    bleu_value = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    # BERTScore returns (precision, recall, F1); only F1 of the single pair is kept.
    _, _, f1_scores = bert_score([summary], [reference], lang="en", verbose=False)
    bert_f1_value = f1_scores[0].item()

    report = {}
    report['ROUGE-L'] = round(rouge_l, 2)
    report['BLEU'] = round(bleu_value, 2)
    report['BERTScore-F1'] = round(bert_f1_value, 2)
    report['Length'] = len(candidate_tokens)
    return report

def evaluate_with_gpt(summary, reference, criteria_description=None):
    """Ask the shared Gemini ``model`` to grade ``summary`` against ``reference``.

    The model is prompted to answer with a Python dictionary holding one score
    per rubric criterion. The reply is parsed as a literal only; it is never
    executed.

    Args:
        summary: Candidate summary to grade.
        reference: Reference summary shown to the model for comparison.
        criteria_description: Optional replacement for the default rubric
            prompt. A falsy value selects the default.

    Returns:
        The dictionary parsed from the model reply; it contains at least the
        keys "Relevance", "Clarity", "Conciseness", "Correctness" and
        "Instruction Adherence". If the reply cannot be read or parsed, a
        warning is printed and every criterion is given the neutral score 3.
    """
    expected_keys = ["Relevance", "Clarity", "Conciseness", "Correctness", "Instruction Adherence"]
    criteria_description = criteria_description or """
You are an expert summarization evaluator. Evaluate the following summary according to these five criteria from 1 (poor) to 5 (excellent):

1. Relevance – Is the summary on-topic and aligned with the reference?
2. Clarity – Is the summary easy to read and grammatically correct?
3. Conciseness – Is it brief and free of unnecessary information?
4. Correctness – Are all factual details (like numbers) accurate?
5. Instruction Adherence – Does the summary follow the intended task (e.g., journalistic tone, impact focus)?
Return only a dictionary in Python format with these five keys.
"""
    full_prompt = f"{criteria_description}\n\nReference:\n{reference}\n\nSummary:\n{summary}\n\nEvaluation:"

    response = model.generate_content(full_prompt)

    # Kept outside the try so the handler can report it without touching
    # response.text again (reading it may itself be what failed).
    raw_text = None
    try:
        raw_text = response.text

        # Take the span from the first "{" to the last "}". This drops Markdown
        # code fences, any language tag (python, json, ...) and surrounding
        # prose without depending on a particular fence format.
        start = raw_text.find("{")
        end = raw_text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no dictionary literal found in the reply")

        # SECURITY: the reply is untrusted model output. ast.literal_eval only
        # accepts literals, so nothing in the reply can be executed.
        result = ast.literal_eval(raw_text[start:end + 1])

        # Explicit checks instead of assert, which is stripped under -O.
        if not isinstance(result, dict):
            raise ValueError(f"expected a dict, got {type(result).__name__}")
        missing = [key for key in expected_keys if key not in result]
        if missing:
            raise ValueError(f"missing keys: {missing}")
        return result
    except Exception as e:
        # Deliberately broad: grading is best-effort, and any failure to read
        # or parse the reply falls back to neutral scores instead of aborting.
        print("⚠️ GPT evaluation parsing failed:", e)
        print("Raw response:", raw_text)
        return {k: 3 for k in expected_keys}


# Evaluate both summaries
# Automatic metrics (ROUGE-L, BLEU, BERTScore-F1, Length) against the reference.
metrics_weak = evaluate_metrics(summary_weak, reference_summary)
metrics_strong = evaluate_metrics(summary_strong, reference_summary)


# Rubric scores from evaluate_with_gpt. Despite the "human_" prefix these are
# produced by the model acting as a judge, not by human raters.
human_weak = evaluate_with_gpt(summary_weak, reference_summary)
human_strong = evaluate_with_gpt(summary_strong, reference_summary)


# Weight of each signal in the combined score; the six weights sum to 1.0.
# Only the keys listed here contribute (e.g. 'Length', 'Clarity' and
# 'Conciseness' are ignored).
weights = {
    'ROUGE-L': 0.3,
    'BLEU': 0.1,
    'BERTScore-F1': 0.3,
    'Relevance': 0.1,
    'Correctness': 0.1,
    'Instruction Adherence': 0.1
}


def compute_weighted(metrics, human):
    """Combine automatic metrics and rubric scores into one weighted number.

    Args:
        metrics: Dict of automatic metric values keyed by metric name.
        human: Dict of rubric scores keyed by criterion name; on a key
            collision its value takes precedence over ``metrics``.

    Returns:
        The weighted sum over the keys of ``weights``, rounded to three
        decimals.

    Raises:
        KeyError: If a key of ``weights`` is present in neither input dict.
    """
    # NOTE(review): values are combined as given, with no rescaling. If the
    # rubric scores are on a 1-5 scale while the automatic metrics are in 0-1,
    # the rubric terms will dominate the total - confirm this is intended.
    merged = dict(metrics)
    merged.update(human)

    total = 0
    for name, weight in weights.items():
        total += merged[name] * weight
    return round(total, 3)

# Combined score per prompt from the automatic metrics and the rubric scores.
score_weak = compute_weighted(metrics_weak, human_weak)
score_strong = compute_weighted(metrics_strong, human_strong)



# Print output as requested
def _print_summary_report(label, summary, metrics):
    """Print one generated summary followed by its automatic metrics.

    Args:
        label: Prompt label shown in the heading ("Weak" or "Strong").
        summary: Generated summary text.
        metrics: Dict with the keys 'ROUGE-L', 'BLEU', 'BERTScore-F1' and 'Length'.
    """
    print(f"📝 {label} Prompt Summary:")
    print(summary)
    print(f"\n  ROUGE-L: {metrics['ROUGE-L']}")
    print(f"  BLEU: {metrics['BLEU']}")
    print(f"  BERTScore-F1: {metrics['BERTScore-F1']}")
    print(f"  Length: {metrics['Length']}\n")


_print_summary_report("Weak", summary_weak, metrics_weak)
_print_summary_report("Strong", summary_strong, metrics_strong)

print(f"🏆 Weighted Score (Weak): {score_weak}")
print(f"🏆 Weighted Score (Strong): {score_strong}\n")

# The verdict compares the two weighted scores only. A tie is reported as a
# tie instead of being announced as a win for the weak prompt.
if score_strong > score_weak:
    print("✅ Strong prompt performed better across the majority of metrics.")
elif score_weak > score_strong:
    print("✅ Weak prompt performed better across the majority of metrics.")
else:
    print("➖ Both prompts achieved the same weighted score.")



  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📝 Weak Prompt Summary:
Here are a few things about the text:

*   It describes a **new AI model** developed by TechLabs researchers.
*   The model's main achievement is a **30% reduction in energy consumption** in data centers.
*   It uses **reinforcement learning** to manage server workloads efficiently.
*   This innovation is expected to lead to **significant cost savings** (millions) and a **positive environmental impact**.

In short, the text is about a promising AI technology that dramatically cuts energy use in data centers, offering major economic and environmental benefits.

  ROUGE-L: 0.2
  BLEU: 0.01
  BERTScore-F1: 0.87
  Length: 114

📝 Strong Prompt Summary:
Researchers have developed a new AI model that uses reinforcement learning to intelligently manage server workloads, achieving a remarkable 30% reduction in data center energy consumption. This breakthrough innovation promises significant cost savings and a major positive impact on the environment by lowering the energy