# Scoring Specification Builder

This notebook collects human preferences between GPT-4.1 and Phi-4 summaries,
then generates a scoring specification via the `/scoring_system/generate` API.

## Setup & Imports

In [1]:
import json
import random
import textwrap

import pandas as pd
import requests
from IPython.display import display, Markdown, clear_output

## Load Data

In [2]:
# Load all three CSV files
df_sample = pd.read_csv("sample_100.csv")
df_gpt = pd.read_csv("gpt_4_1_summaries.csv")
df_phi = pd.read_csv("phi_4_summaries.csv")

# Parse ranked_results JSON
df_sample["ranked_results"] = df_sample["ranked_results"].apply(json.loads)

# Merge summaries with original data
df = df_sample.copy()
df["gpt_summary"] = df_gpt["summary"]
df["phi_summary"] = df_phi["summary"]

print(f"Loaded {len(df)} queries with summaries from both models")
df[["query_text", "gpt_summary", "phi_summary"]].head(3)

Loaded 100 queries with summaries from both models


Unnamed: 0,query_text,gpt_summary,phi_summary
0,Where can I find reviews and recommendations f...,You can find reviews and recommendations for a...,To find reviews and recommendations for anti-a...
1,what is big data really skillcrush,Big data refers to the large volumes of struct...,Big Data refers to extremely large datasets th...
2,growth secrets from tinder uber twitch andrew ...,"On The Tim Ferriss Show, Andrew Chen shares in...",The search results focus on episodes of The Ti...


## Define Prompt Reconstruction

Reconstruct the exact prompt sent to the LLM models during summarization.

In [3]:
PROMPT_TEMPLATE = """Summarize the following search results in 2-3 sentences, highlighting the key information that answers the user's question: {query}

Results:
{results}"""


def extract_result_fields(result: dict) -> dict:
    """Extract name and description from schema.org result data."""
    content = result.get("content", "{}")
    if isinstance(content, str):
        try:
            content = json.loads(content)
        except json.JSONDecodeError:
            content = {}

    # Try various schema.org fields for name
    name = (
        content.get("headline")
        or content.get("name")
        or content.get("caption")
        or result.get("url", "Unknown")
    )

    # Try various fields for description
    description = (
        content.get("description")
        or content.get("articleSection", "")
        or ""
    )
    if isinstance(description, list):
        description = ", ".join(description)

    return {"name": name, "description": description}


def build_llm_input(row) -> str:
    """Reconstruct the exact prompt sent to the LLM."""
    raw_results = row["ranked_results"][:3]
    results = [extract_result_fields(r) for r in raw_results]
    results_text = "\n".join(
        f"{i}. {r['name']}: {r['description']}"
        for i, r in enumerate(results, 1)
    )
    return PROMPT_TEMPLATE.format(query=row["query_text"], results=results_text)


# Test with first row
print("Example LLM input:")
print("-" * 60)
print(build_llm_input(df.iloc[0]))

Example LLM input:
------------------------------------------------------------
Summarize the following search results in 2-3 sentences, highlighting the key information that answers the user's question: Where can I find reviews and recommendations for anti-aging skincare products from real users?

Results:
1. Beauty Favorites: skin care products I love + my anti-aging routine: Favorite Things, Lifestyle
2. Beauty Favorites: skin care products I love + my anti-aging routine: 
3. https://centremarceau.com/medecine-esthetique/rajeunissement-de-la-peau/#primaryimage: 


## Sample 5 Random Examples

In [4]:
# Set seed for reproducibility (optional - remove for true randomness)
# random.seed(42)

# Sample 5 random indices
sample_indices = random.sample(range(len(df)), 5)
print(f"Selected indices: {sample_indices}")

# Preview selected queries
for i, idx in enumerate(sample_indices, 1):
    print(f"\n{i}. [{idx}] {df.iloc[idx]['query_text'][:80]}...")

Selected indices: [76, 26, 52, 72, 60]

1. [76] Is there a recipe for cakey baked donuts with apple cider reduction and cinnamon...

2. [26] Is there a job guarantee with Skillcrush’s Break Into Tech Get Hired Track?...

3. [52] valuewalk how to buy crypto UK article...

4. [72] latest Harper's Bazaar Spain magazine covers April 2025...

5. [60] Can you provide details about the website redesign project for the Trust Company...


## Interactive Preference Collection

For each example, you'll see:
- The query
- Summary A (randomly assigned to GPT or Phi)
- Summary B (the other model)

Enter **A** or **B** to indicate your preference.

In [5]:
WIDTH = 81  # Matches header width: "-" * 35 + " SUMMARY A " + "-" * 35

preferences = []

for i, idx in enumerate(sample_indices, 1):
    row = df.iloc[idx]
    query = row["query_text"]
    gpt_summary = row["gpt_summary"]
    phi_summary = row["phi_summary"]
    
    # Randomly assign A/B to prevent bias
    if random.choice([True, False]):
        summary_a, summary_b = gpt_summary, phi_summary
        model_a, model_b = "gpt", "phi"
    else:
        summary_a, summary_b = phi_summary, gpt_summary
        model_a, model_b = "phi", "gpt"
    
    # Display the comparison
    print("=" * WIDTH)
    print(f"EXAMPLE {i}/5 (Index: {idx})")
    print("=" * WIDTH)
    print(f"\nQUERY: {textwrap.fill(query, width=WIDTH)}")
    print("\n" + "-" * 35 + " SUMMARY A " + "-" * 35)
    print(textwrap.fill(summary_a, width=WIDTH))
    print("\n" + "-" * 35 + " SUMMARY B " + "-" * 35)
    print(textwrap.fill(summary_b, width=WIDTH))
    print()
    
    # Collect preference
    while True:
        choice = input("Which summary is better? Enter A or B: ").strip().upper()
        if choice in ["A", "B"]:
            break
        print("Invalid input. Please enter A or B.")
    
    # Record the preference
    if choice == "A":
        chosen_model = model_a
        chosen_summary = summary_a
        rejected_summary = summary_b
    else:
        chosen_model = model_b
        chosen_summary = summary_b
        rejected_summary = summary_a
    
    preferences.append({
        "index": idx,
        "query": query,
        "chosen_model": chosen_model,
        "chosen_summary": chosen_summary,
        "rejected_summary": rejected_summary,
        "llm_input": build_llm_input(row),
    })
    
    print(f"\n>> You chose: {chosen_model.upper()}\n")

print("\n" + "=" * WIDTH)
print("PREFERENCE COLLECTION COMPLETE")
print("=" * WIDTH)

EXAMPLE 1/5 (Index: 76)

QUERY: Is there a recipe for cakey baked donuts with apple cider reduction and cinnamon
sugar coating?

----------------------------------- SUMMARY A -----------------------------------
There is a recipe for cakey baked donuts with apple cider reduction and cinnamon
sugar coating. The baked apple cider donuts incorporate an apple cider reduction
to enhance their flavor and are coated in cinnamon sugar, along with apple pie
spices, making them a delightful fall treat. Another variation, Baked Cinnamon
Crunch Apple Cider Doughnuts, offers a similar flavor profile to celebrate the
autumn season.

----------------------------------- SUMMARY B -----------------------------------
Yes, there are recipes for cakey baked donuts that use an apple cider reduction
to enhance flavor and are coated in a cinnamon sugar mixture. These donuts are
described as soft, moist, and baked (not fried), making them easy to prepare and
perfect for fall. The recipes highlight the combinat

## Review Collected Preferences

In [6]:
# Summary of preferences
gpt_wins = sum(1 for p in preferences if p["chosen_model"] == "gpt")
phi_wins = sum(1 for p in preferences if p["chosen_model"] == "phi")

print(f"Preferences collected: {len(preferences)}")
print(f"  GPT-4.1 chosen: {gpt_wins}")
print(f"  Phi-4 chosen: {phi_wins}")
print()

for i, p in enumerate(preferences, 1):
    print(f"{i}. [{p['index']}] {p['chosen_model'].upper()} - {p['query'][:60]}...")

Preferences collected: 5
  GPT-4.1 chosen: 4
  Phi-4 chosen: 1

1. [76] PHI - Is there a recipe for cakey baked donuts with apple cider re...
2. [26] GPT - Is there a job guarantee with Skillcrush’s Break Into Tech G...
3. [52] GPT - valuewalk how to buy crypto UK article...
4. [72] GPT - latest Harper's Bazaar Spain magazine covers April 2025...
5. [60] GPT - Can you provide details about the website redesign project f...


## Build Preference Examples

In [7]:
# Build preference examples for the API
preference_examples = [
    {
        "llm_input": p["llm_input"],
        "chosen": p["chosen_summary"],
        "rejected": p["rejected_summary"],
    }
    for p in preferences
]

print(f"Built {len(preference_examples)} preference examples")
print("\nFirst example structure:")
print(json.dumps({
    "llm_input": preference_examples[0]["llm_input"][:200] + "...",
    "chosen": preference_examples[0]["chosen"][:100] + "...",
    "rejected": preference_examples[0]["rejected"][:100] + "...",
}, indent=2))

Built 5 preference examples

First example structure:
{
  "llm_input": "Summarize the following search results in 2-3 sentences, highlighting the key information that answers the user's question: Is there a recipe for cakey baked donuts with apple cider reduction and cinn...",
  "chosen": "There is a recipe for cakey baked donuts with apple cider reduction and cinnamon sugar coating. The ...",
  "rejected": "Yes, there are recipes for cakey baked donuts that use an apple cider reduction to enhance flavor an..."
}


## Generate Scoring Specification

POST to the `/scoring_system/generate` endpoint to create a scoring spec based on preferences.

In [8]:
# Build request body
request_body = {
    "application_description": "Summarize search results in 2-3 sentences, highlighting the key information that answers the user's question.",
    "examples": [],
    "preference_examples": preference_examples,
}

print("Request body preview:")
print(json.dumps({
    "application_description": request_body["application_description"],
    "examples": request_body["examples"],
    "preference_examples": f"[{len(preference_examples)} examples]",
}, indent=2))

Request body preview:
{
  "application_description": "Summarize search results in 2-3 sentences, highlighting the key information that answers the user's question.",
  "examples": [],
  "preference_examples": "[5 examples]"
}


In [9]:
# Send POST request to generate scoring specification
API_URL = "http://localhost:8000/scoring_system/generate"

print(f"Sending POST request to {API_URL}...")
print()

try:
    response = requests.post(
        API_URL,
        json=request_body,
        headers={"Content-Type": "application/json"},
        timeout=120,
    )
    response.raise_for_status()
    
    result = response.json()
    print("SUCCESS! Scoring specification generated.")
    print("\n" + "=" * 70)
    print("RESPONSE:")
    print("=" * 70)
    print(json.dumps(result, indent=2))
    
except requests.exceptions.ConnectionError:
    print("ERROR: Could not connect to the API.")
    print("Make sure the server is running: cd ask_api && make dev")
except requests.exceptions.Timeout:
    print("ERROR: Request timed out.")
except requests.exceptions.HTTPError as e:
    print(f"ERROR: HTTP {response.status_code}")
    print(response.text)

Sending POST request to http://localhost:8000/scoring_system/generate...

SUCCESS! Scoring specification generated.

RESPONSE:
{
  "state": "QUEUED",
  "detailed_status": [
    "LAUNCHING"
  ],
  "job_id": "generate_spec_jobs:localuser:dcbe0c82-2d89-4038-ae7f-05fc34f054f0",
  "scoring_spec": null,
  "threshold": null,
  "num_preference_examples_used": null,
  "num_labeled_examples_used": null,
  "balanced_accuracy": null,
  "precision": null,
  "recall": null,
  "f1": null
}


## Save Results (Optional)

In [10]:
# Optionally save the collected preferences and generated spec
if 'result' in dir():
    output = {
        "preferences": preferences,
        "scoring_spec": result,
    }
    
    with open("scoring_spec_output.json", "w") as f:
        json.dump(output, f, indent=2)
    
    print("Saved results to scoring_spec_output.json")
else:
    print("No scoring spec to save (API call may have failed)")

Saved results to scoring_spec_output.json
