# Summarization Benchmark

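This notebook benchmarks the `ResultsSummarizer` with multiple models (GPT-4.1 and Phi-4) called through Azure OpenAI.
Each model generates a summary for every query in the sample queryset (`sample_100.csv`), and the results are compared side by side.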

In [9]:
import asyncio
import os
import json
from typing import Callable, Dict, Any

import pandas as pd
from dotenv import load_dotenv
from openai import AsyncAzureOpenAI

from nlweb_core.summarizer import ResultsSummarizer

load_dotenv()

True

## Shared Utilities

In [10]:
def extract_result_fields(result: dict) -> dict:
    """Extract a display name and description from one ranked search result.

    ``result["content"]`` may be a dict or a JSON-encoded string. Content that
    is unparseable, ``None``, or valid JSON that is not an object (for example
    a top-level array) is treated as empty, so one malformed result cannot
    abort a whole benchmark run.

    Args:
        result: Ranked-result record; only the ``content`` and ``url`` keys
            are read.

    Returns:
        Dict with two keys:
            ``name``: first truthy value among the content's ``headline``,
                ``name`` and ``caption`` fields, else the result's ``url``,
                else ``"Unknown"``.
            ``description``: the content's ``description``, else its
                ``articleSection``, else ``""``. A list value is flattened to
                a comma-separated string.
    """
    content = result.get("content", "{}")
    if isinstance(content, str):
        try:
            content = json.loads(content)
        except json.JSONDecodeError:
            content = {}

    # json.loads may legitimately return a list, string, number or None, and
    # the raw value itself may be None; none of those support .get().
    if not isinstance(content, dict):
        content = {}

    # Try various schema.org fields for name
    name = (
        content.get("headline")
        or content.get("name")
        or content.get("caption")
        or result.get("url", "Unknown")
    )

    # Try various fields for description
    description = (
        content.get("description")
        or content.get("articleSection")
        or ""
    )
    if isinstance(description, list):
        # str() each part so non-string list items do not break the join.
        description = ", ".join(str(part) for part in description)

    return {"name": name, "description": description}


async def run_summarization(
    df: pd.DataFrame,
    llm: Callable[[str, Dict[str, Any]], Any],
    model_name: str,
    concurrency: int = 4,
    top_k: int = 3,
) -> list:
    """Run summarization on all queries with the given LLM.

    Each row is summarized independently. A row that raises (for example an
    LLM timeout or an unparseable response) is reported and recorded as
    ``None`` instead of aborting the whole batch.

    Args:
        df: DataFrame with query_text and ranked_results columns
        llm: Async callable with signature (prompt, schema) -> dict
        model_name: Name of model for progress output
        concurrency: Max concurrent requests (must be >= 1)
        top_k: Number of leading ranked results passed to the summarizer

    Returns:
        List of summaries (or None for failures), in the same order as the
        rows of ``df``.

    Raises:
        ValueError: If ``concurrency`` is smaller than 1.
    """
    if concurrency < 1:
        # A zero-permit semaphore would make every task wait forever.
        raise ValueError("concurrency must be at least 1")

    summarizer = ResultsSummarizer(llm=llm)
    semaphore = asyncio.Semaphore(concurrency)
    total = len(df)

    async def process_row(position: int, row):
        """Summarize one row; return its summary text or None on failure."""
        async with semaphore:
            try:
                raw_results = row["ranked_results"][:top_k]
                results = [extract_result_fields(r) for r in raw_results]
                result = await summarizer.summarize(row["query_text"], results)
            except Exception as exc:  # keep the remaining rows running
                print(f"[{model_name}] Failed {position + 1}/{total}: {exc!r}")
                return None

            # The number is the row's position, not a completion counter, so
            # messages appear out of order when requests overlap.
            print(f"[{model_name}] Processed {position + 1}/{total}")
            return result.summary if result else None

    # enumerate() yields a 0-based position regardless of the index labels
    # (which may be strings or non-contiguous after filtering).
    tasks = [
        process_row(position, row)
        for position, (_, row) in enumerate(df.iterrows())
    ]
    return await asyncio.gather(*tasks)

## Load Data

In [11]:
# Read the sample queryset and decode the JSON-encoded ranked_results column
# into Python objects in a single chained expression.
df = pd.read_csv("sample_100.csv").assign(
    ranked_results=lambda frame: frame["ranked_results"].map(json.loads)
)

print(f"Loaded {len(df)} queries")
df.head()

Loaded 100 queries


Unnamed: 0,query_text,ranked_results
0,Where can I find reviews and recommendations f...,[{'url': 'https://www.ambitiouskitchen.com/bea...
1,what is big data really skillcrush,[{'url': 'https://skillcrush.com/blog/what-is-...
2,growth secrets from tinder uber twitch andrew ...,[{'url': 'https://tim.blog/2021/11/30/andrew-c...
3,stories about hiking in california trinity alp...,[{'url': 'https://www.backpacker.com/stories/c...
4,Who photographed Paulina Porizkova for the joy...,[{'url': 'https://www.thefashionspot.com/forum...


## Model Client

In [12]:
# One async Azure OpenAI client, reused by every model function below.
# Endpoint and key are read from the environment; a missing variable raises
# KeyError here rather than on the first request.
client = AsyncAzureOpenAI(
    api_version="2024-02-01",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
)


async def _chat_json(model: str, prompt: str, schema: dict) -> dict:
    """Send one JSON-mode chat completion to ``model`` and parse the reply.

    Args:
        model: Model name passed to the chat completions API.
        prompt: User message content.
        schema: JSON-serializable schema description; embedded in the system
            message so the model knows which shape to produce.

    Returns:
        The first choice's message content parsed as JSON.

    Raises:
        ValueError: If the response carries no message content.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    response = await client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": f"Respond with JSON matching: {json.dumps(schema)}",
            },
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
        timeout=20,
    )
    content = response.choices[0].message.content
    if content is None:
        # Without this guard json.loads(None) raises an opaque TypeError.
        raise ValueError(f"{model} returned a response with no content")
    return json.loads(content)


async def gpt_4_1(prompt: str, schema: dict) -> dict:
    """Call GPT-4.1 and return parsed JSON response."""
    return await _chat_json("gpt-4.1", prompt, schema)


async def phi_4(prompt: str, schema: dict) -> dict:
    """Call Phi-4 and return parsed JSON response."""
    return await _chat_json("Phi-4", prompt, schema)

## Run GPT-4.1 Summarization

In [13]:
gpt_summaries = await run_summarization(df, gpt_4_1, "GPT-4.1")

[GPT-4.1] Processed 1/100
[GPT-4.1] Processed 4/100
[GPT-4.1] Processed 3/100
[GPT-4.1] Processed 5/100
[GPT-4.1] Processed 2/100
[GPT-4.1] Processed 7/100
[GPT-4.1] Processed 6/100
[GPT-4.1] Processed 10/100
[GPT-4.1] Processed 9/100
[GPT-4.1] Processed 11/100
[GPT-4.1] Processed 8/100
[GPT-4.1] Processed 12/100
[GPT-4.1] Processed 13/100
[GPT-4.1] Processed 14/100
[GPT-4.1] Processed 15/100
[GPT-4.1] Processed 17/100
[GPT-4.1] Processed 16/100
[GPT-4.1] Processed 18/100
[GPT-4.1] Processed 19/100
[GPT-4.1] Processed 22/100
[GPT-4.1] Processed 20/100
[GPT-4.1] Processed 21/100
[GPT-4.1] Processed 23/100
[GPT-4.1] Processed 25/100
[GPT-4.1] Processed 26/100
[GPT-4.1] Processed 24/100
[GPT-4.1] Processed 27/100
[GPT-4.1] Processed 28/100
[GPT-4.1] Processed 29/100
[GPT-4.1] Processed 30/100
[GPT-4.1] Processed 33/100
[GPT-4.1] Processed 32/100
[GPT-4.1] Processed 31/100
[GPT-4.1] Processed 34/100
[GPT-4.1] Processed 36/100
[GPT-4.1] Processed 35/100
[GPT-4.1] Processed 37/100
[GPT-4.1] 

In [14]:
# Persist the GPT-4.1 summaries alongside the original query columns.
# NOTE(review): ranked_results holds parsed Python objects at this point, so
# the CSV stores their str() form rather than the original JSON text —
# confirm downstream readers expect that.
df_gpt = df.assign(summary=gpt_summaries)
df_gpt.to_csv("gpt_4_1_summaries.csv", index=False)
print("Saved GPT-4.1 results to gpt_4_1_summaries.csv")

Saved GPT-4.1 results to gpt_4_1_summaries.csv


## Run Phi-4 Summarization

In [15]:
phi_summaries = await run_summarization(df, phi_4, "Phi-4")

[Phi-4] Processed 4/100
[Phi-4] Processed 2/100
[Phi-4] Processed 3/100
[Phi-4] Processed 1/100
[Phi-4] Processed 5/100
[Phi-4] Processed 7/100
[Phi-4] Processed 6/100
[Phi-4] Processed 8/100
[Phi-4] Processed 11/100
[Phi-4] Processed 9/100
[Phi-4] Processed 10/100
[Phi-4] Processed 12/100
[Phi-4] Processed 13/100
[Phi-4] Processed 14/100
[Phi-4] Processed 15/100
[Phi-4] Processed 16/100
[Phi-4] Processed 17/100
[Phi-4] Processed 18/100
[Phi-4] Processed 20/100
[Phi-4] Processed 22/100
[Phi-4] Processed 21/100
[Phi-4] Processed 23/100
[Phi-4] Processed 24/100
[Phi-4] Processed 25/100
[Phi-4] Processed 27/100
[Phi-4] Processed 28/100
[Phi-4] Processed 26/100
[Phi-4] Processed 29/100
[Phi-4] Processed 19/100
[Phi-4] Processed 30/100
[Phi-4] Processed 31/100
[Phi-4] Processed 32/100
[Phi-4] Processed 33/100
[Phi-4] Processed 35/100
[Phi-4] Processed 34/100
[Phi-4] Processed 36/100
[Phi-4] Processed 37/100
[Phi-4] Processed 39/100
[Phi-4] Processed 38/100
[Phi-4] Processed 41/100
[Phi-4] P

In [16]:
# Persist the Phi-4 summaries alongside the original query columns.
# NOTE(review): ranked_results holds parsed Python objects at this point, so
# the CSV stores their str() form rather than the original JSON text —
# confirm downstream readers expect that.
df_phi = df.assign(summary=phi_summaries)
df_phi.to_csv("phi_4_summaries.csv", index=False)
print("Saved Phi-4 results to phi_4_summaries.csv")

Saved Phi-4 results to phi_4_summaries.csv


## Compare Results

In [17]:
# Put both models' summaries next to the query text, one row per query.
comparison = (
    df[["query_text"]]
    .rename(columns={"query_text": "query"})
    .assign(
        gpt_4_1_summary=gpt_summaries,
        phi_4_summary=phi_summaries,
    )
)
comparison.head(10)

Unnamed: 0,query,gpt_4_1_summary,phi_4_summary
0,Where can I find reviews and recommendations f...,You can find reviews and recommendations for a...,To find reviews and recommendations for anti-a...
1,what is big data really skillcrush,Big data refers to the large volumes of struct...,Big Data refers to extremely large datasets th...
2,growth secrets from tinder uber twitch andrew ...,"On The Tim Ferriss Show, Andrew Chen shares in...",The search results focus on episodes of The Ti...
3,stories about hiking in california trinity alp...,The search results highlight the Trinity Alps ...,The search results provide insights into hikin...
4,Who photographed Paulina Porizkova for the joy...,The search results indicate that Paulina Poriz...,Paulina Porizkova was photographed by Fabien B...
5,tim ferriss rick rubin podcast transcript,"In episode #649 of The Tim Ferriss Show, Tim F...","In the Tim Ferriss podcast episode #649, Rick ..."
6,once upon a chef green apple celery salad,The Once Upon a Chef green apple celery salad ...,The Green Apple and Celery Salad with Dill is ...
7,Who is Niko Korner and what is their role at Y...,"Niko Korner is a member of the Yoast team, as ...","Niko Korner is a member of the Yoast team, alt..."
8,litz rages in hueco climbing article by dougal...,The article 'Litz Rages in Hueco' highlights c...,In the article 'Litz Rages in Hueco' by Dougal...
9,Shanthi Rexaline entrepreneur author profile,Shanthi Rexaline is a contributor to Entrepren...,Shanthi Rexaline is recognized as an entrepren...
