# Model Comparison: GPT-4.1 vs Phi-4 Summaries

This notebook scores summaries from both models using the Pi Labs API and compares their quality distributions.

## Setup & Imports

In [None]:
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
import requests
from dotenv import load_dotenv

## Load Data & Config

In [None]:
# Read Pi Labs settings from ../.env into the process environment
load_dotenv("../.env")

# Both variables are mandatory: a missing one raises KeyError right here
PI_LABS_ENDPOINT, PI_LABS_KEY = (
    os.environ["PI_LABS_ENDPOINT"],
    os.environ["PI_LABS_KEY"],
)

# Only the endpoint is echoed; the key is never printed
print(f"Pi Labs endpoint: {PI_LABS_ENDPOINT}")

In [None]:
# Model summaries, one CSV per model
df_gpt = pd.read_csv("gpt_4_1_summaries.csv")
df_phi = pd.read_csv("phi_4_summaries.csv")

# The sampled queries; ranked_results is stored as a JSON string per row,
# so decode it into Python objects once, up front
df_sample = pd.read_csv("sample_100.csv")
decoded_results = df_sample["ranked_results"].map(json.loads)
df_sample["ranked_results"] = decoded_results

# Row counts for a quick sanity check
print(f"GPT-4.1 summaries: {len(df_gpt)}")
print(f"Phi-4 summaries: {len(df_phi)}")
print(f"Sample queries: {len(df_sample)}")

In [None]:
# Load scoring specification.
# JSON files are UTF-8 by convention; pass the encoding explicitly so the
# result does not depend on the platform's locale default.
with open("scoring_spec.json", encoding="utf-8") as f:
    scoring_spec = json.load(f)

# List the criteria by label so the loaded spec can be eyeballed
print(f"Loaded {len(scoring_spec)} scoring criteria:")
for i, criterion in enumerate(scoring_spec, 1):
    print(f"  {i}. {criterion['label']}")

## Define Prompt Reconstruction

Reconstruct the exact prompt that was sent to each LLM during summarization.

In [None]:
# Prompt skeleton with two str.format placeholders:
#   {query}   - the user's query text
#   {results} - the pre-formatted result lines, one per line
# The text is part of the scored input, so it must not be reworded.
PROMPT_TEMPLATE = """Summarize the following search results in 2-3 sentences, highlighting the key information that answers the user's question: {query}

Results:
{results}"""


def extract_result_fields(result: dict) -> dict:
    """Extract name and description from schema.org result data.

    Args:
        result: One ranked-result record. Its ``content`` value may be a JSON
            string or an already-decoded dict; ``url`` is the name of last
            resort.

    Returns:
        A dict with exactly two keys, ``name`` and ``description``.
        ``description`` is an empty string when nothing usable is found.
    """
    content = result.get("content", "{}")
    if isinstance(content, str):
        try:
            content = json.loads(content)
        except json.JSONDecodeError:
            content = {}

    # Valid JSON is not necessarily an object ("null", "[...]", "3"), and the
    # key may be present with a None value. Anything that is not a dict has no
    # fields to read, so treat it as empty instead of failing on .get().
    if not isinstance(content, dict):
        content = {}

    # Try various schema.org fields for name, falling back to the URL
    name = (
        content.get("headline")
        or content.get("name")
        or content.get("caption")
        or result.get("url", "Unknown")
    )

    # Try various fields for description
    description = content.get("description") or content.get("articleSection", "") or ""
    if isinstance(description, list):
        # str() keeps the join from raising on non-string list items
        description = ", ".join(str(item) for item in description)

    return {"name": name, "description": description}


def build_llm_input(query_text: str, ranked_results: list) -> str:
    """Rebuild the summarization prompt for a single query.

    Only the first three ranked results are used. Each one becomes a numbered
    ``<n>. <name>: <description>`` line, and the lines are substituted into
    ``PROMPT_TEMPLATE`` together with the query text.
    """
    numbered_lines = []
    for position, raw_result in enumerate(ranked_results[:3], start=1):
        fields = extract_result_fields(raw_result)
        numbered_lines.append(f"{position}. {fields['name']}: {fields['description']}")

    return PROMPT_TEMPLATE.format(
        query=query_text,
        results="\n".join(numbered_lines),
    )


# Show the reconstructed prompt for the first sampled query
print("Example LLM input:", "-" * 60, sep="\n")
print(
    build_llm_input(
        df_sample["query_text"].iloc[0],
        df_sample["ranked_results"].iloc[0],
    )
)

## Define Scoring Function

In [None]:
def score_summary(llm_input: str, llm_output: str, scoring_spec: list) -> float:
    """Score a summary using Pi Labs API.

    Args:
        llm_input: The prompt that produced the summary.
        llm_output: The summary text to score.
        scoring_spec: The scoring criteria, sent to the API unchanged.

    Returns:
        The ``total_score`` field of the first item in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Normalize the base URL so the request path is correct whether or not
    # PI_LABS_ENDPOINT was configured with a trailing slash.
    url = f"{PI_LABS_ENDPOINT.rstrip('/')}/invocations"

    response = requests.post(
        url,
        headers={"Authorization": f"Bearer {PI_LABS_KEY}"},
        # The payload is a one-element batch; the matching result is item 0
        json=[
            {
                "llm_input": llm_input,
                "llm_output": llm_output,
                "scoring_spec": scoring_spec,
            }
        ],
        timeout=30,
    )
    response.raise_for_status()
    return response.json()[0]["total_score"]


# Smoke-test the scorer on the first sample query and its GPT-4.1 summary
test_input = build_llm_input(*df_sample.iloc[0][["query_text", "ranked_results"]])
test_output = df_gpt.iloc[0]["summary"]
test_score = score_summary(
    llm_input=test_input,
    llm_output=test_output,
    scoring_spec=scoring_spec,
)
print(f"Test score: {test_score}")

## Score GPT-4.1 Summaries

In [None]:
# Summaries are paired with sample queries by row position, so every summary
# needs a sample row. Checking up front avoids failing part-way through the
# loop after API calls have already been made.
if len(df_gpt) > len(df_sample):
    raise ValueError(
        f"df_gpt has {len(df_gpt)} rows but df_sample has only {len(df_sample)}"
    )

gpt_scores = []
gpt_total = len(df_gpt)

# zip stops at the shortest input, which the check above guarantees is df_gpt
gpt_rows = zip(df_sample["query_text"], df_sample["ranked_results"], df_gpt["summary"])
for i, (query_text, ranked_results, llm_output) in enumerate(gpt_rows, 1):
    # "\r" keeps the progress counter on a single console line
    print(f"Scoring GPT-4.1 {i}/{gpt_total}...", end="\r")

    llm_input = build_llm_input(query_text, ranked_results)
    gpt_scores.append(score_summary(llm_input, llm_output, scoring_spec))

df_gpt["score"] = gpt_scores
print(f"\nScored {len(gpt_scores)} GPT-4.1 summaries")
print(f"Mean score: {df_gpt['score'].mean():.3f}")

In [None]:
# Write the scored GPT-4.1 frame back over its source CSV (row index omitted)
df_gpt.to_csv(path_or_buf="gpt_4_1_summaries.csv", index=False)
print("Saved gpt_4_1_summaries.csv with score column")

## Score Phi-4 Summaries

In [None]:
# Summaries are paired with sample queries by row position, so every summary
# needs a sample row. Checking up front avoids failing part-way through the
# loop after API calls have already been made.
if len(df_phi) > len(df_sample):
    raise ValueError(
        f"df_phi has {len(df_phi)} rows but df_sample has only {len(df_sample)}"
    )

phi_scores = []
phi_total = len(df_phi)

# zip stops at the shortest input, which the check above guarantees is df_phi
phi_rows = zip(df_sample["query_text"], df_sample["ranked_results"], df_phi["summary"])
for i, (query_text, ranked_results, llm_output) in enumerate(phi_rows, 1):
    # "\r" keeps the progress counter on a single console line
    print(f"Scoring Phi-4 {i}/{phi_total}...", end="\r")

    llm_input = build_llm_input(query_text, ranked_results)
    phi_scores.append(score_summary(llm_input, llm_output, scoring_spec))

df_phi["score"] = phi_scores
print(f"\nScored {len(phi_scores)} Phi-4 summaries")
print(f"Mean score: {df_phi['score'].mean():.3f}")

In [None]:
# Write the scored Phi-4 frame back over its source CSV (row index omitted)
df_phi.to_csv(path_or_buf="phi_4_summaries.csv", index=False)
print("Saved phi_4_summaries.csv with score column")

## Display Score Statistics

In [None]:
# Side-by-side summary statistics, one column per model.
# Maps each row label to the Series method that computes it.
stat_methods = {
    "Mean": "mean",
    "Median": "median",
    "Std Dev": "std",
    "Min": "min",
    "Max": "max",
}

stats = pd.DataFrame(
    {
        model: [getattr(frame["score"], method)() for method in stat_methods.values()]
        for model, frame in (("GPT-4.1", df_gpt), ("Phi-4", df_phi))
    },
    index=list(stat_methods),
)

print("Score Statistics Comparison")
print("=" * 40)
print(stats.round(3).to_string())

## Create Histogram

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Give both histograms the same range so their 20 bins share identical edges.
# With independently computed bins the bars cover different score intervals
# and their heights cannot be compared.
score_range = (
    min(df_gpt["score"].min(), df_phi["score"].min()),
    max(df_gpt["score"].max(), df_phi["score"].max()),
)
hist_style = {"bins": 20, "range": score_range, "alpha": 0.5}

ax.hist(df_gpt["score"], label="GPT-4.1", color="blue", **hist_style)
ax.hist(df_phi["score"], label="Phi-4", color="orange", **hist_style)

ax.set_xlabel("Score")
ax.set_ylabel("Frequency")
ax.set_title("Summary Quality Scores: GPT-4.1 vs Phi-4")

# Add vertical lines for means
ax.axvline(
    df_gpt["score"].mean(),
    color="blue",
    linestyle="--",
    alpha=0.7,
    label=f"GPT-4.1 mean: {df_gpt['score'].mean():.2f}",
)
ax.axvline(
    df_phi["score"].mean(),
    color="orange",
    linestyle="--",
    alpha=0.7,
    label=f"Phi-4 mean: {df_phi['score'].mean():.2f}",
)

# Build the legend once, after every labelled artist has been added
ax.legend()
plt.tight_layout()
plt.show()