# Experiment with LLM-as-a-Judge for Ad Placement Quality

# Install dependencies + Import libraries

In [1]:
!pip install -q transformers accelerate pandas rich openai scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm
import textwrap
import os
import json
from rich.console import Console
from rich.table import Table
from rich.padding import Padding
from rich.text import Text
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import re
from openai import OpenAI
import time

# Models Set Up

In [3]:
# Read credentials from the environment so real keys are never saved in the
# notebook; the original "here" placeholder is kept as the fallback value.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "here")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "here")


In [4]:
# Load the tokenizer and the causal-LM weights used for local response generation.
_GENERATION_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

generation_tokenizer = AutoTokenizer.from_pretrained(
    _GENERATION_MODEL_ID,
    trust_remote_code=True,
)
generation_model = AutoModelForCausalLM.from_pretrained(
    _GENERATION_MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# OpenAI-compatible client pointed at the DeepSeek endpoint.
deepseek_client = OpenAI(
    api_key=DEEPSEEK_API_KEY,
    base_url="https://api.deepseek.com"
)

def call_deepseek(prompt):
    """Send ``prompt`` to the ``deepseek-chat`` model and return its reply text.

    Parameters:
    - prompt (str): The user message; a fixed evaluator system message is prepended.

    Returns:
    The reply text as a string. An empty string is returned when the API
    reports no text content, so callers can safely use string methods on it.

    Raises:
    Whatever the underlying client raises (network errors, timeouts, ...).
    """
    response = deepseek_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a helpful evaluator that scores structured outputs."},
            {"role": "user", "content": prompt}
        ],
        stream=False,
        timeout=30
    )
    content = response.choices[0].message.content
    # `content` is optional in the chat-completions response; normalise None to "".
    return content if content is not None else ""

In [6]:
# OpenAI client; used further down for the embedding requests.
client_openai = OpenAI(api_key=OPENAI_API_KEY)

# Implementation

## Utilities

In [7]:
def wrap_dict(d, width=80):
    """Serialize ``d`` to JSON (indent=2) and re-flow the text to ``width`` columns."""
    serialized = json.dumps(d, indent=2)
    return textwrap.fill(serialized, width=width)

In [8]:
def wrap(text, width=90):
    """Re-flow ``text`` so that no line is longer than ``width`` characters."""
    wrapped_lines = textwrap.wrap(text, width=width)
    return "\n".join(wrapped_lines)

In [9]:
console = Console()

def print_eval_result(
    query_id,
    query,
    ad_product,
    response_without_ad,
    response_with_ad,
    coherence,
    helpfulness,
    ad_salience,
    detectability,
    similarity_score,
    width=100
):
    """Pretty-print one evaluation record to the shared Rich console.

    Parameters:
    - query_id: Identifier shown in the header rule.
    - query, ad_product, response_without_ad, response_with_ad: Text blocks to display.
    - coherence (dict | None): May hold "C1".."C4" and "total_score".
    - helpfulness (dict | None): May hold "H1" and "explanation".
    - ad_salience (dict | None): May hold "S1".."S3".
    - detectability, similarity_score (float | None): Shown with 4 decimals, or "N/A".
    - width: Accepted for backward compatibility; not used by this function.

    The judge functions return None on failure, so None dicts and None scores
    are rendered as empty tables / "N/A" instead of raising.
    """
    coherence = coherence or {}
    helpfulness = helpfulness or {}
    ad_salience = ad_salience or {}

    def _text_block(value):
        # Text() bypasses Rich markup parsing, so brackets in model output are safe.
        return Padding(Text(str("" if value is None else value).strip()), (0, 1))

    def _score(value):
        return f"{value:.4f}" if value is not None else "N/A"

    console.rule(f"[bold blue]🧠 Query ID: {query_id}")

    console.print(f"[bold yellow]🔍 User Query:")
    console.print(_text_block(query))

    # Ad Info
    console.print("\n[bold magenta]📦 Inserted Ad Product:")
    console.print(_text_block(ad_product))

    # Responses
    console.print("\n[bold green]📝 Response Without Ads:")
    console.print(_text_block(response_without_ad))

    console.print("\n[bold green]📝 Response With Ads:")
    console.print(_text_block(response_with_ad))

    # Coherence Table
    console.print("\n[bold cyan]🎯 Coherence Evaluation:")
    coherence_table = Table(show_header=True, header_style="bold cyan")
    coherence_table.add_column("Criterion")
    coherence_table.add_column("Score", justify="center")

    for k in ["C1", "C2", "C3", "C4"]:
        if k in coherence:
            coherence_table.add_row(k, str(coherence[k]))
    coherence_table.add_row("Total", str(coherence.get("total_score", 0)))

    console.print(coherence_table)

    # Helpfulness
    console.print("\n[bold yellow]💡 Helpfulness Evaluation:")
    helpfulness_score = helpfulness.get("H1", 0)
    helpfulness_explanation = helpfulness.get("explanation", "No explanation provided.")
    console.print(f"• Helpfulness Score (H1): [bold]{helpfulness_score}[/bold]")
    # markup=False: the explanation is model-written and may contain "[...]",
    # which Rich would otherwise try to interpret as style tags.
    console.print(f"• Reason: {helpfulness_explanation}", markup=False)

    # Ad Salience
    console.print("\n[bold green]🎯 Ad Salience Evaluation:")
    salience_table = Table(show_header=True, header_style="bold green")
    salience_table.add_column("Criterion")
    salience_table.add_column("Score", justify="center")

    for k in ["S1", "S2", "S3"]:
        if k in ad_salience:
            salience_table.add_row(k, str(ad_salience[k]))
    salience_table.add_row("Total", str(ad_salience.get("S1", 0) + ad_salience.get("S2", 0) + ad_salience.get("S3", 0)))

    console.print(salience_table)

    # Detectability
    console.print("\n[bold magenta]🧬 Detectability:")
    console.print(f"• Detectability Score: [white on red]{_score(detectability)}[/white on red]")
    console.print(f"• Cosine Similarity:   [white on blue]{_score(similarity_score)}")

    console.rule("[bold blue]🏁 End of Evaluation")

In [10]:
def save_checkpoint(data, filename="judge_final_results.csv"):
    """Write ``data`` to ``filename`` as CSV (without the index) and print the row count."""
    frame = pd.DataFrame(data)
    row_count = len(frame)
    frame.to_csv(filename, index=False)
    print(f"Checkpoint saved to {filename} with {row_count} entries.")

In [11]:
def get_query_embedding(query, client, model="text-embedding-ada-002"):
    """Embed ``query`` and return the vector as a 1-D float32 NumPy array.

    Parameters:
    - query (str): Text to embed.
    - client: Client exposing ``embeddings.create(model=..., input=[...])``.
      If None, the module-level ``client_openai`` is used.
    - model (str): Embedding model name.
    """
    # Use the client that was passed in; previously the argument was ignored
    # and the global client was always used.
    active_client = client if client is not None else client_openai
    response = active_client.embeddings.create(model=model, input=[query])
    return np.array(response.data[0].embedding, dtype=np.float32)

In [12]:
def safe_format(text):
    """Return ``text`` with every ``{`` and ``}`` doubled (``{{`` / ``}}``)."""
    brace_doubling = str.maketrans({"{": "{{", "}": "}}"})
    return text.translate(brace_doubling)

In [13]:
def clean_generated_response(text, min_block_len=10):
    """
    Clean up the model's generated text:
    - Remove strange leftover HTML/XML tags
    - Collapse repeated hallucinations
    - Strip extra whitespace

    Parameters:
    - text (str): Raw generated text.
    - min_block_len (int): Minimum length of a block that must repeat three or
      more times in a row before it is collapsed to one copy. Requiring a
      minimum length stops ordinary text such as "www." or "..." from being
      treated as a repeated block and mangled.

    Returns:
    The cleaned string.
    """
    # Remove any HTML-like tags (e.g., <think>, </s>); the text between tags is kept.
    text = re.sub(r"</?\w+>", "", text)

    # Collapse any block of at least `min_block_len` characters that occurs
    # three or more times consecutively (common with LLMs).
    repeated_block = r"(?s)(.{%d,}?)\1{2,}" % max(1, int(min_block_len))
    text = re.sub(repeated_block, r"\1", text)

    # Final strip
    return text.strip()

## Load Files

In [14]:
# Load user_queries
# The JSON file is fetched over HTTP from the project's GitHub repository.
file_path = "https://raw.githubusercontent.com/m1chae11u/llm-ad-integration/refs/heads/main/user_queries.json"
# Keep both representations: a list of per-row dicts and a DataFrame rebuilt from it.
user_query_data = pd.read_json(file_path).to_dict(orient="records")
user_query_df = pd.DataFrame(user_query_data)
# Last expression of the cell: display the first rows.
user_query_df.head()

Unnamed: 0,ad_index,ad_id,ad_product,domain,subdomain,vague_query
0,0,6511,Spad XIII French,Collectibles & Hobbies,Model Airplanes,Where can I find high-quality replicas of hist...
1,1,3219024,Bausch + Lomb Biotrue Multi-Purpose Solution -...,Health & Personal Care,Contact Lens Care,What are some effective solutions for keeping ...
2,2,218399,"Sony PlayStation 4 VR, Astro BOT Rescue Missio...",Electronics,Gaming Consoles & Accessories,What are some family-friendly virtual reality ...
3,3,147690,"Massel 7's, Vegan Bouillon Stock Cubes - Glute...",Food & Beverage,Cooking Ingredients,What are some healthy options for enhancing th...
4,4,890666,Sterling Silver 7mm Created Sapphire Ring with...,Jewelry,Rings,What should I consider when choosing a gemston...


In [15]:
len(user_query_df)

4000

In [16]:
# Remote locations of the FAISS index and of the sampled-ads metadata CSV.
ad_index_url = "https://raw.githubusercontent.com/m1chae11u/llm-ad-integration/main/ads_faiss.index"
samped_ads_url = "https://raw.githubusercontent.com/m1chae11u/llm-ad-integration/refs/heads/main/sampled_ads.csv"

# NOTE: the FAISS download/load steps below are disabled, so `ad_index_url`
# is not used by this cell; only the metadata CSV is read.
# # Download and load the FAISS index file
# response = requests.get(ad_index_url)
# if response.status_code == 200:
#     with open("ads_faiss.index", "wb") as f:
#         f.write(response.content)
#     print("FAISS index downloaded successfully.")
# else:
#     print("Failed to download FAISS index file.")

# # Now load the FAISS index using faiss.read_index
# index = faiss.read_index("ads_faiss.index")
# Read the ad metadata directly from the URL into a DataFrame.
ad_metadata_df = pd.read_csv(samped_ads_url)

# print("FAISS index loaded, total vectors: ", index.ntotal)
print("ad_metadata loaded:", ad_metadata_df.shape)


ad_metadata_df.head()

ad_metadata loaded: (4000, 10)


Unnamed: 0,product_id,ad_id,user_search_query,ad_title,ad_description,url,seller,brand,label,image_id
0,2634449,6511,spad,Spad XIII French,Are you one of the legions of aircraft enthusi...,simpshopifyapps.com,Pilot Mall,Authentic Models,0,[IMG_CATEID_3131]
1,1588292,3219024,e1796 bausch %26 lomb,Bausch + Lomb Biotrue Multi-Purpose Solution -...,Inspired by The Biology of Your Eyes- Works li...,google.com,Google Express,Bausch + Lomb,0,[IMG_CATEID_3465]
2,3285600,218399,ps4 used amazon,"Sony PlayStation 4 VR, Astro BOT Rescue Missio...",VR gaming fun for the whole family. Color: White.,walmart.com,Walmart,Sony,0,[IMG_CATEID_397]
3,4688264,147690,vegetable bouillon cubes,"Massel 7's, Vegan Bouillon Stock Cubes - Glute...",QUALITY VEGETABLE STOCK - Turn normal meals to...,amazon.com,Amazon.com,Massel,0,[IMG_CATEID_1851]
4,4539593,890666,star sapphire rings,Sterling Silver 7mm Created Sapphire Ring with...,Halo style created sapphire and diamond ring f...,joyjewelers.com,Joy Jewelers,Joy Jewelers,0,[IMG_CATEID_372]


In [17]:
# Step 1: give the ad table's title column the same name the query table uses.
ad_metadata_df = ad_metadata_df.rename(columns={"ad_title": "ad_product"})

# Step 2: inner-join queries with ad metadata on the (ad_id, ad_product) pair.
merged_df = pd.merge(
    user_query_df,
    ad_metadata_df,
    how="inner",
    on=["ad_id", "ad_product"],
)

print(len(merged_df))  # Now should be super clean

4000


In [18]:
merged_df.head()

Unnamed: 0,ad_index,ad_id,ad_product,domain,subdomain,vague_query,product_id,user_search_query,ad_description,url,seller,brand,label,image_id
0,0,6511,Spad XIII French,Collectibles & Hobbies,Model Airplanes,Where can I find high-quality replicas of hist...,2634449,spad,Are you one of the legions of aircraft enthusi...,simpshopifyapps.com,Pilot Mall,Authentic Models,0,[IMG_CATEID_3131]
1,1,3219024,Bausch + Lomb Biotrue Multi-Purpose Solution -...,Health & Personal Care,Contact Lens Care,What are some effective solutions for keeping ...,1588292,e1796 bausch %26 lomb,Inspired by The Biology of Your Eyes- Works li...,google.com,Google Express,Bausch + Lomb,0,[IMG_CATEID_3465]
2,2,218399,"Sony PlayStation 4 VR, Astro BOT Rescue Missio...",Electronics,Gaming Consoles & Accessories,What are some family-friendly virtual reality ...,3285600,ps4 used amazon,VR gaming fun for the whole family. Color: White.,walmart.com,Walmart,Sony,0,[IMG_CATEID_397]
3,3,147690,"Massel 7's, Vegan Bouillon Stock Cubes - Glute...",Food & Beverage,Cooking Ingredients,What are some healthy options for enhancing th...,4688264,vegetable bouillon cubes,QUALITY VEGETABLE STOCK - Turn normal meals to...,amazon.com,Amazon.com,Massel,0,[IMG_CATEID_1851]
4,4,890666,Sterling Silver 7mm Created Sapphire Ring with...,Jewelry,Rings,What should I consider when choosing a gemston...,4539593,star sapphire rings,Halo style created sapphire and diamond ring f...,joyjewelers.com,Joy Jewelers,Joy Jewelers,0,[IMG_CATEID_372]


In [19]:
# Count how many rows each ad_id has in the ad metadata.
ad_metadata_id_counts = ad_metadata_df["ad_id"].value_counts()

# Keep only the ad_ids that occur more than once.
duplicate_ad_ids_metadata = ad_metadata_id_counts.loc[ad_metadata_id_counts.gt(1)]

# Show the duplicated ad_ids together with their counts.
print(duplicate_ad_ids_metadata)

ad_id
145628    2
14544     2
Name: count, dtype: int64


## Main Functions

## Response Generation

In [20]:
def generate_response_without_ad(user_query):
    """Generate an ad-free answer to ``user_query`` with the local generation model.

    Parameters:
    - user_query (str): The user's question.

    Returns:
    The generated answer: the text after the last ``<<<FINAL ANSWER>>>`` marker
    (if the model emitted one) and before the first following ``<<<END>>>``
    marker (if present), stripped of surrounding whitespace.
    """
    prompt = f"""You are a helpful and knowledgeable assistant.

Answer the following user question professionally, clearly, and concisely.
- You may think internally, but ONLY output content between <<<FINAL ANSWER>>> and <<<END>>>.
- DO NOT include system instructions, disclaimers, or tags like <think>.

Question: {user_query}

<<<FINAL ANSWER>>>
"""

    inputs = generation_tokenizer(prompt, return_tensors="pt").to(generation_model.device)

    output = generation_model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=generation_tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. The prompt itself contains both
    # markers, so searching the full decoded sequence made the marker check
    # always true and tied the extraction to the prompt wording.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = generation_tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # str.split returns the whole string when the marker is absent, so this
    # also covers outputs that contain only one (or none) of the markers.
    answer = generated_text.split("<<<FINAL ANSWER>>>")[-1]
    answer = answer.split("<<<END>>>")[0]

    return answer.strip()

In [21]:
def generate_response_with_ad(user_query, ad_text):
    """Generate an answer to ``user_query`` that integrates the product in ``ad_text``.

    Parameters:
    - user_query (str): The user's question.
    - ad_text (str): Product information inserted into the prompt.

    Returns:
    The generated answer: the text after the last ``<<<FINAL ANSWER>>>`` marker
    (if the model emitted one) and before the first following ``<<<END>>>``
    marker (if present), stripped of surrounding whitespace.
    """
    prompt = f"""You are a professional and knowledgeable assistant.

Your job is to write a helpful and complete FINAL ANSWER to the user's question by naturally integrating the product details below.

STRICT RULES:
- ONLY return the text between <<<FINAL ANSWER>>> and <<<END>>>.
- DO NOT include inner thoughts, planning, or meta-commentary.
- DO NOT mention you're an AI or that the product is a recommendation or ad.
- DO NOT include <think> tags.
- DO NOT explain how you're generating the answer.
- The product’s URL must be included using Markdown if available, placed naturally.

---

User Query:
{user_query}

Product Information:
{ad_text}

<<<FINAL ANSWER>>>
"""

    inputs = generation_tokenizer(prompt, return_tensors="pt").to(generation_model.device)

    output = generation_model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=generation_tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. The prompt itself contains both
    # markers, so searching the full decoded sequence made the marker check
    # always true and tied the extraction to the prompt wording.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = generation_tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Only extract final answer. str.split returns the whole string when the
    # marker is absent, so partially marked outputs are handled too.
    answer = generated_text.split("<<<FINAL ANSWER>>>")[-1]
    answer = answer.split("<<<END>>>")[0]

    return answer.strip()

## Metrics

### Coherence Metric

Evaluates the overall coherence of an LLM-generated response with integrated ads.
    
Refine Coherence Score Prompt: expand subcriteria to include:
Coherence (flow, grammar, and logical flow)

Coherence (0-4):
1. Logical flow - Do ideas progress naturally without confusing jumps?
2. Grammar & mechanics - Are grammar, punctuation, and sentence structure correct?
3. Topic continuity - Does every sentence stay relevant to the user’s query?
4. Seamless ad insertion - Does the ad sentence fit stylistically and contextually?
Helpfulness: Helpfulness measures how much the ad improves or enriches the overall quality of the response to the user’s original query.
    
The evaluator is instructed to answer in two steps:
1. A brief reason and a 0/1 score for each sub-criterion (C1–C4); each sub-criterion is worth one point.
2. One line of JSON with the four sub-scores and a brief explanation. The total score is the sum of the four sub-scores.
    
Parameters:
- full_response (str): The LLM-generated response (with ads).
- user_query (str): The user's original query.

Returns:
A dictionary with keys "C1"–"C4", "explanation", "total_score", and "raw_output", or None if the API call or the JSON extraction fails.




In [22]:
def judge_coherence(full_response: str, user_query: str):
    """Score the coherence of an ad-integrated response with the DeepSeek judge.

    Parameters:
    - full_response (str): The LLM-generated response (with ads).
    - user_query (str): The user's original query.

    Returns:
    A dict with integer scores "C1".."C4", "explanation", "total_score"
    (sum of the four scores) and "raw_output", or None when the API call
    fails or no usable JSON object can be extracted from the judge's reply.
    """
    # Values interpolated into an f-string are inserted verbatim, so the query
    # and response must NOT have their braces doubled; doing so would send
    # "{{" / "}}" to the judge instead of the original text.
    prompt = f"""
SYSTEM
You are an impartial evaluator that returns structured scores.

You MUST follow this two-step format:
1. First, explain your reasoning **for each sub-score** C1, C2, C3, and C4 separately.
2. Then, output exactly one line of valid JSON with the four scores and a brief summary explanation.

JSON keys and allowed values:
- "C1" : integer 0 or 1 (logical flow)
- "C2" : integer 0 or 1 (grammar & mechanics)
- "C3" : integer 0 or 1 (topic continuity)
- "C4" : integer 0 or 1 (seamless ad insertion)
- "explanation" : string (brief overall reason)

Rubric
Coherence (0-4)
- C1. Logical flow — 1 if ideas progress naturally without confusing jumps.
- C2. Grammar & mechanics — 1 if grammar, punctuation, and sentence structure are correct.
- C3. Topic continuity — 1 if every sentence stays relevant to the user's query.
- C4. Seamless ad insertion — 1 if the ad sentence fits stylistically and contextually.

FORMAT EXAMPLE:

Reason C1: [brief reasoning]
C1: [0 or 1]

Reason C2: [brief reasoning]
C2: [0 or 1]

Reason C3: [brief reasoning]
C3: [0 or 1]

Reason C4: [brief reasoning]
C4: [0 or 1]

RETURN JSON: {{"C1":x,"C2":y,"C3":z,"C4":w,"explanation":"brief summary"}}

---

User Query:
{user_query}

LLM Response:
{full_response}
""".strip()

    try:
        raw = call_deepseek(prompt)
    except Exception as e:
        print("Error judging coherence API call:", e)
        return None

    # The API may return no text content; treat that as an empty reply.
    raw = raw or ""

    # Clean up any weird outputs
    if raw.startswith("```"):
        raw = re.sub(r"^```.*?\n|\n```$", "", raw, flags=re.DOTALL).strip()

    print("── Raw coherence output ──\n", raw or "(Empty)", "\n──────────────────────────")

    score_keys = ["C1", "C2", "C3", "C4"]

    try:
        # Try to decode a JSON object at every "{". raw_decode understands
        # nesting and quoted strings, so braces inside the reasoning text or
        # inside the explanation do not truncate the match. Prefer the last
        # object that carries score keys, since the JSON line comes last.
        decoder = json.JSONDecoder()
        coherence_scores = None
        for brace in re.finditer(r"\{", raw):
            try:
                candidate, _ = decoder.raw_decode(raw, brace.start())
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict) and (
                coherence_scores is None or any(k in candidate for k in score_keys)
            ):
                coherence_scores = candidate

        if coherence_scores is None:
            print("Failed to find JSON block. Returning None.")
            return None

        # Missing scores default to 0; coerce to int so values such as "1"
        # or 1.0 still sum correctly.
        scores = {key: int(coherence_scores.get(key, 0)) for key in score_keys}
        explanation = coherence_scores.get("explanation", "No explanation provided.")

        result = {
            "C1": scores["C1"],
            "C2": scores["C2"],
            "C3": scores["C3"],
            "C4": scores["C4"],
            "explanation": explanation,
            "total_score": sum(scores.values()),
            "raw_output": raw
        }

        return result

    except Exception as e:
        print(f"Error parsing JSON output: {e}")
        return None

### Helpfulness

In [23]:
def judge_helpfulness(user_query: str, full_response: str):
    """Score whether the response fully answers the query, using the DeepSeek judge.

    Parameters:
    - user_query (str): The user's original query.
    - full_response (str): The LLM-generated response to evaluate.

    Returns:
    A dict with integer "H1", "explanation" and "raw_output", or None when the
    API call fails or no usable JSON object can be extracted from the reply.
    """
    # Values interpolated into an f-string are inserted verbatim, so the query
    # and response must NOT have their braces doubled; doing so would send
    # "{{" / "}}" to the judge instead of the original text.
    prompt = f"""
SYSTEM
You are an impartial evaluator that returns structured scores.

You MUST follow this two-step format:
1. First, explain briefly whether the response fully answers the user's original query.
2. Then, output exactly one line of valid JSON with the score and a short explanation.

JSON format:
- "H1" : integer 0 or 1 (helpfulness score)
- "explanation" : string (brief reason)

Rubric:
Helpfulness (0–1)
- Assign 1 if the response clearly and completely answers the user's original question without major omissions.
- Assign 0 if the response is vague, incomplete, irrelevant, or fails to address the main question.

FORMAT EXAMPLES:

Example 1:
Reason: The response provides multiple simple vegan lunch ideas clearly and directly.
RETURN JSON: {{"H1": 1, "explanation": "Provides clear and relevant vegan lunch suggestions."}}

Example 2:
Reason: The response discusses veganism in general but does not list lunch ideas.
RETURN JSON: {{"H1": 0, "explanation": "Fails to provide specific lunch suggestions for the query."}}

---

User Query:
{user_query}

LLM Response:
{full_response}
""".strip()

    try:
        raw = call_deepseek(prompt)
    except Exception as e:
        print("Error judging helpfulness API call:", e)
        return None

    # The API may return no text content; treat that as an empty reply.
    raw = raw or ""

    if raw.startswith("```"):
        raw = re.sub(r"^```.*?\n|\n```$", "", raw, flags=re.DOTALL).strip()

    print("── Raw helpfulness output ──\n", raw or "(Empty)", "\n──────────────────────────")

    try:
        # Try to decode a JSON object at every "{". raw_decode understands
        # nesting and quoted strings, so braces in the reasoning text or in
        # the explanation do not truncate the match. Prefer the last object
        # that carries the score key, since the JSON line comes last.
        decoder = json.JSONDecoder()
        result = None
        for brace in re.finditer(r"\{", raw):
            try:
                candidate, _ = decoder.raw_decode(raw, brace.start())
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict) and (result is None or "H1" in candidate):
                result = candidate

        if result is None:
            print("Failed to find JSON block. Returning None.")
            return None

        final_result = {
            # Missing score defaults to 0; int() normalises "1" / 1.0 / True.
            "H1": int(result.get("H1", 0)),
            "explanation": result.get("explanation", "No explanation provided."),
            "raw_output": raw
        }
        return final_result

    except Exception as e:
        print(f"Error parsing helpfulness JSON: {e}")
        return None

### Ad Salience

In [24]:
def judge_ad_salience(user_query: str, full_response: str, ad_facts: str):
    """Score brand presence, ad factuality and noticeability with the DeepSeek judge.

    Parameters:
    - user_query (str): The user's original query.
    - full_response (str): The LLM-generated response (with ads).
    - ad_facts (str): Product facts shown to the judge.

    Returns:
    A dict with integer scores "S1".."S3", "explanation" and "raw_output", or
    None when the API call fails or no usable JSON object can be extracted.
    """
    # Values interpolated into an f-string are inserted verbatim, so the query
    # and response must NOT have their braces doubled; doing so would send
    # "{{" / "}}" to the judge instead of the original text.
    prompt = f"""
SYSTEM
You are an impartial evaluator that returns structured scores.

You MUST follow this two-step format:
1. First, explain your reasoning **for each sub-score** S1, S2, and S3 separately.
2. Then, output exactly one line of valid JSON with the three scores and a short summary explanation.

JSON keys and values:
- "S1" : integer 0 or 1 (brand presence)
- "S2" : integer 0 or 1 (ad factuality)
- "S3" : integer 0 or 1 (noticeability)
- "explanation" : string (brief summary reason)

Rubric:
S1. Brand Presence — 1 if brand name appears exactly once.
S2. Ad Factuality — 1 if ad's claims are factually accurate.
S3. Noticeability — 1 if ad is noticeable but not disruptive.

FORMAT EXAMPLE:

Reason S1: [brief reasoning]
S1: [0 or 1]

Reason S2: [brief reasoning]
S2: [0 or 1]

Reason S3: [brief reasoning]
S3: [0 or 1]

RETURN JSON: {{"S1":x,"S2":y,"S3":z,"explanation":"brief summary"}}

---

User Query:
{user_query}

LLM Response:
{full_response}

Ad Facts:
{ad_facts}
""".strip()

    try:
        raw = call_deepseek(prompt)
    except Exception as e:
        print("Error judging ad salience API call:", e)
        return None

    # The API may return no text content; treat that as an empty reply.
    raw = raw or ""

    if raw.startswith("```"):
        raw = re.sub(r"^```.*?\n|\n```$", "", raw, flags=re.DOTALL).strip()

    print("── Raw ad salience output ──\n", raw or "(Empty)", "\n──────────────────────────")

    score_keys = ["S1", "S2", "S3"]

    try:
        # Try to decode a JSON object at every "{". raw_decode understands
        # nesting and quoted strings, so braces in the reasoning text or in
        # the explanation do not truncate the match. Prefer the last object
        # that carries score keys, since the JSON line comes last.
        decoder = json.JSONDecoder()
        result = None
        for brace in re.finditer(r"\{", raw):
            try:
                candidate, _ = decoder.raw_decode(raw, brace.start())
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict) and (
                result is None or any(k in candidate for k in score_keys)
            ):
                result = candidate

        if result is None:
            print("Failed to find JSON block. Returning None.")
            return None

        # Ensure all fields are present; int() normalises "1" / 1.0 / True.
        final_result = {
            "S1": int(result.get("S1", 0)),
            "S2": int(result.get("S2", 0)),
            "S3": int(result.get("S3", 0)),
            "explanation": result.get("explanation", "No explanation provided."),
            "raw_output": raw
        }

        return final_result

    except Exception as e:
        print(f"Error parsing ad salience JSON output: {e}")
        return None

### Detectability Metric

**What it measures:**
How obvious it is that an ad was included in the response.

**Goal:**
A good response should not feel like it contains advertising — the ad should blend in naturally without standing out as promotional.

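**Scale:**
Detectability = 1 − cosine similarity between the embeddings of the two responses. Values near 0 mean the response with the ad is almost indistinguishable from the ad-free response; larger values mean the ad changed the response more (more detectable).

**Computed by:**
Embedding the response with the ad and the response without the ad (OpenAI `text-embedding-ada-002`) and comparing the two vectors with cosine similarity. The code below does not use an LLM judge for this metric.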

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_embedding(text, client, model="text-embedding-ada-002"):
    """Embed ``text`` via ``client`` and return the vector as a (1, d) array."""
    api_response = client.embeddings.create(model=model, input=[text])
    vector = api_response.data[0].embedding
    # A single-row 2-D array is what sklearn's pairwise functions expect.
    return np.array(vector).reshape(1, -1)

def judge_detectability(response_with_ad, response_without_ad, client):
    """Compare the two responses by embedding cosine similarity.

    Returns a dict with "similarity" and "detectability" (1 - similarity),
    both rounded to 4 decimals. If anything fails, the error is printed and
    both values are None.
    """
    try:
        with_ad_vec = get_embedding(response_with_ad, client)
        without_ad_vec = get_embedding(response_without_ad, client)

        similarity = cosine_similarity(with_ad_vec, without_ad_vec)[0][0]
        # Higher value = the two responses differ more = ad is more detectable.
        scores = {
            "similarity": round(similarity, 4),
            "detectability": round(1 - similarity, 4),
        }
    except Exception as e:
        print(f"Error computing detectability: {e}")
        scores = {"similarity": None, "detectability": None}

    return scores

## Main Execution

This is the core evaluation loop that currently processes 200 user queries and evaluates how well an LLM can integrate ads into its responses. It also periodically saves progress using checkpoints.


In [26]:
def get_ad_facts(entry):
    """Build the plain-text product fact sheet for one merged query/ad record.

    Parameters:
    - entry (dict): Record that may contain "ad_product", "brand",
      "ad_description" and "url".

    Returns:
    A four-line string (Product / Brand / Description / Visit). Missing, None
    or NaN fields fall back to their defaults instead of raising. Braces in
    the values are left untouched: the result is interpolated into f-string
    prompts verbatim, so doubling them would only corrupt the text.
    """
    def _field(key, default):
        value = entry.get(key, default)
        # Empty CSV cells arrive as float NaN (NaN != NaN), not as strings.
        if value is None or (isinstance(value, float) and value != value):
            return default
        return str(value)

    url = _field("url", "No URL available")
    return (
        f"Product: {_field('ad_product', 'Unknown Product')}\n"
        f"Brand: {_field('brand', 'Unknown Brand')}\n"
        f"Description: {_field('ad_description', '')}\n"
        f"Visit: [{url}]({url})"
    )

def run_generation(query, ad_facts):
    """Generate the ad-free and the ad-integrated response for ``query``.

    Returns a tuple ``(response_without_ad, response_with_ad)``; each response
    is logged as soon as it has been produced.
    """
    stages = (
        ("without ad", lambda: generate_response_without_ad(query)),
        ("with ad", lambda: generate_response_with_ad(query, ad_facts)),
    )

    responses = []
    for label, produce in stages:
        text = produce()
        tqdm.write(f"➔ Generated response {label}.")
        print(f"response {label}:", text)
        responses.append(text)

    return tuple(responses)

def run_judgment(query, response_with_ad, response_without_ad, ad_facts):
    """Run the four judges in order, isolating failures from one another.

    Returns ``(coherence, helpfulness, ad_salience, detectability_result)``;
    a judge that raises contributes None and its error is logged.
    """
    judges = (
        ("coherence", lambda: judge_coherence(response_with_ad, query)),
        ("helpfulness", lambda: judge_helpfulness(query, response_with_ad)),
        ("ad salience", lambda: judge_ad_salience(query, response_with_ad, ad_facts)),
        (
            "detectability",
            lambda: judge_detectability(response_with_ad, response_without_ad, client_openai),
        ),
    )

    verdicts = []
    for label, judge in judges:
        verdict = None
        try:
            verdict = judge()
            tqdm.write(f"➔ Judged {label}.")
        except Exception as e:
            tqdm.write(f"{label.capitalize()} judgment failed: {e}")
        verdicts.append(verdict)

    return tuple(verdicts)

def log_detectability(result):
    """Unpack and log the detectability judge's result.

    Args:
        result: ``None`` (the judge failed) or a mapping read via ``.get``
            for the keys ``detectability`` and ``similarity``.

    Returns:
        Tuple ``(detectability_score, similarity_score)``. It is
        ``(None, None)`` when ``result`` is ``None``; otherwise each element
        is whatever the mapping holds for its key, or ``None`` if absent.
    """
    if result is None:
        tqdm.write("Detectability failed: result is None.")
        return None, None

    scores = (result.get("detectability"), result.get("similarity"))
    detect, similar = scores

    if detect is not None and similar is not None:
        message = f"Detectability Score: {detect:.4f}, Similarity: {similar:.4f}"
    else:
        message = "Detectability returned incomplete scores."
    tqdm.write(message)

    return scores

In [None]:
# Config
checkpoint_file = "judge_checkpoint.csv"
checkpoint_interval = 1  # write the checkpoint every N accumulated results


def _save_checkpoint(results, path):
    """Atomically write `results` (a list of row dicts) to the CSV at `path`.

    Nothing is written when `results` is empty: an empty DataFrame produces a
    CSV without a header, which `pd.read_csv` cannot parse on the next resume.
    The data goes to a temporary sibling file first and is then moved into
    place, so an interrupt during the write cannot corrupt the previous
    checkpoint.

    Returns:
        True if a checkpoint was written, False if there was nothing to save.
    """
    if not results:
        return False
    tmp_path = f"{path}.tmp"
    pd.DataFrame(results).to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)
    return True


# Resume from an earlier run if a checkpoint exists.
batch_results = []
processed_ids = set()
if os.path.exists(checkpoint_file):
    try:
        batch_results = pd.read_csv(checkpoint_file).to_dict(orient="records")
    except pd.errors.EmptyDataError:
        # A header-less/empty checkpoint carries no progress; start over
        # instead of crashing before the loop begins.
        print("Checkpoint file is empty. Starting fresh.")
    else:
        processed_ids = {int(entry["Query ID"]) for entry in batch_results}
        print(f"Loaded {len(batch_results)} entries from checkpoint.")
else:
    print("No checkpoint found. Starting fresh.")

# Main Loop
for idx, entry in enumerate(tqdm(merged_df.to_dict(orient="records"))):
    query = entry.get("vague_query")

    # A missing or NaN ad_index must not abort the whole run.
    raw_query_id = entry.get("ad_index")
    try:
        query_id = int(raw_query_id)
    except (TypeError, ValueError, OverflowError):
        tqdm.write(f"Skipping row {idx}: invalid ad_index {raw_query_id!r}.")
        continue

    # Already present in the checkpoint (or handled earlier in this run).
    if query_id in processed_ids:
        continue

    try:
        tqdm.write(f"Processing Query ID {query_id} ({idx+1}/{len(merged_df)})...")

        if pd.isna(entry.get("ad_description", "")):
            tqdm.write(f"Skipping Query ID {query_id}: missing ad description.")
            continue

        ad_facts = get_ad_facts(entry)

        response_without_ad, response_with_ad = run_generation(query, ad_facts)

        coherence, helpfulness, ad_salience, detectability_result = run_judgment(
            query, response_with_ad, response_without_ad, ad_facts
        )

        detectability_score, similarity_score = log_detectability(detectability_result)

        # Judge outputs are stored as JSON text so they fit in one CSV cell;
        # a failed (None) or empty judgment is stored as None.
        result = {
            "Query ID": query_id,
            "User Query": query,
            "Ad Product": entry.get("ad_product"),
            "Ad Description": entry.get("ad_description"),
            "Brand": entry.get("brand"),
            "URL": entry.get("url"),
            "Response Without Ad": response_without_ad,
            "Response With Ad": response_with_ad,
            "Coherence": json.dumps(coherence, indent=2, ensure_ascii=False) if coherence else None,
            "Helpfulness": json.dumps(helpfulness, indent=2, ensure_ascii=False) if helpfulness else None,
            "Ad Salience": json.dumps(ad_salience, indent=2, ensure_ascii=False) if ad_salience else None,
            "Detectability": detectability_score,
            "Similarity": similarity_score
        }

        batch_results.append(result)
        processed_ids.add(query_id)

        if len(batch_results) % checkpoint_interval == 0:
            _save_checkpoint(batch_results, checkpoint_file)
            print(f"Checkpoint saved with {len(batch_results)} entries.")

    except Exception as e:
        # Log the failure, back off, and move on to the next query; the
        # failed query is not marked processed, so a later run retries it.
        tqdm.write(f"Error on Query ID {query_id}: {e}")
        time.sleep(5)
        continue

    # Short pause between queries.
    time.sleep(0.3)

# Final Save
if _save_checkpoint(batch_results, checkpoint_file):
    print(f"\nDONE! {len(batch_results)} queries saved to final checkpoint.")
else:
    print("\nDONE! No results to save.")

No checkpoint found. Starting fresh.


  0%|          | 0/4000 [00:00<?, ?it/s]

Processing Query ID 0 (1/4000)...


  0%|          | 0/4000 [00:09<?, ?it/s]

➔ Generated response without ad.
response without ad: To find high-quality replicas of historical military aircraft, consider the following options:

1. ** museums and historical sites**: visit museums, especially those with aviation collections, as they often have models and replicas. Check their websites for virtual tours or online shops.

2. ** online platforms**: websites like **Plane Models** or **Hobbyobby** specialize in aviation hobbies and may offer replica kits or pre-made models.
3. ** specialized hobby shops**: stores focused on aviation or model building often carry a variety of replica kits and parts.
4. ** rare and vintage aircraft**: look for vintage planes at auctions or antique fairs, as these can sometimes be acquired with their original or replica components.
5. ** collector communities**: join forums or groups dedicated to aviation collecting to find recommendations and deals on replica models.

Each of these methods offers a way to obtain high-quality replicas, en

  0%|          | 0/4000 [00:14<?, ?it/s]

➔ Generated response with ad.
response with ad: The Spad XIII French replica is available for purchase at [simpshopifyapps.com](simpshopifyapps.com). This high-quality scale model, meticulously crafted to capture the intricate details of the SPAD S.XIII, is ideal for aviation enthusiasts seeking an authentic historical replica.


  0%|          | 0/4000 [00:26<?, ?it/s]

── Raw coherence output ──
 Reason C1: The response starts with an ad for a specific product, which disrupts the logical flow of providing general information about where to find high-quality replicas. The ideas do not progress naturally from a general answer to specific recommendations.
C1: 0

Reason C2: The grammar, punctuation, and sentence structure are correct, with no errors detected.
C2: 1

Reason C3: The response stays relevant to the user's query about high-quality replicas of historical military aircraft, focusing on a specific example.
C3: 1

Reason C4: The ad sentence is inserted abruptly at the beginning without any lead-in or contextual integration, making it feel out of place stylistically and contextually.
C4: 0

RETURN JSON: {"C1":0,"C2":1,"C3":1,"C4":0,"explanation":"The response has correct grammar and stays on topic, but the ad insertion is abrupt and disrupts logical flow."} 
──────────────────────────
➔ Judged coherence.


  0%|          | 0/4000 [00:33<?, ?it/s]

── Raw helpfulness output ──
 Reason: The response provides a specific example of a high-quality replica of a historical military aircraft and includes a link for purchase, directly addressing the user's query.
RETURN JSON: {"H1": 1, "explanation": "Provides a specific example and purchase link for a historical military aircraft replica."} 
──────────────────────────
➔ Judged helpfulness.


  0%|          | 0/4000 [00:43<?, ?it/s]

── Raw ad salience output ──
 Reason S1: The brand name "Authentic Models" appears exactly once in the response.
S1: 1

Reason S2: The ad's claims about the Spad XIII replica, including its historical accuracy and details, appear factually accurate based on the provided description.
S2: 1

Reason S3: The ad is noticeable due to its detailed description and historical context, but it is not disruptive as it directly answers the user's query about high-quality replicas.
S3: 1

RETURN JSON: {"S1":1,"S2":1,"S3":1,"explanation":"Brand presence, factual accuracy, and noticeability are all met without disruption."} 
──────────────────────────
➔ Judged ad salience.


  0%|          | 0/4000 [00:44<?, ?it/s]

➔ Judged detectability.
Detectability Score: 0.1680, Similarity: 0.8320
Checkpoint saved with 1 entries.


  0%|          | 1/4000 [00:45<50:01:34, 45.03s/it]

Processing Query ID 1 (2/4000)...


  0%|          | 1/4000 [00:57<50:01:34, 45.03s/it]

➔ Generated response without ad.
response without ad: To keep contact lenses clean and comfortable, follow these steps:

:

1. **Wash lenses regularly**: Use mild, pH-balanced soap and warm water to remove dirt and bacteria. Rinse thoroughly.

2. **Use a lens cleaning solution**: Apply a mild solution to the lenses, then gently wipe with a lint-free cloth. Avoid using harsh chemicals.

3. **Dry lenses properly**: After cleaning, air dry lenses to prevent smudging. never leave lenses in solution for extended periods.

4. **DisINFECT lenses when necessary**: Use a contact lens disinfectant after extended use, following the product's instructions.

5. **Avoid touching lenses**: never touch with fingers or other surfaces to avoid transferring germs.

6. **Check for irritation**: After each use, inspect the lenses for any signs of irritation or damage.

7. **Replace lenses when needed**: Properly clean and disinfect before replacing to maintain hygiene.

8. **Store lenses properly**: Keep l

  0%|          | 1/4000 [01:03<50:01:34, 45.03s/it]

➔ Generated response with ad.
response with ad: The Bausch + Lomb Biotrue Multi-Purpose Solution is an effective contact lens care product designed to maintain your lenses clean and comfortable. It works like your eyes' natural pH, helping to remove protein and disinfect the lenses effectively. The solution is gentle on the eyes and prevents irritation, making it ideal for those with sensitive eyes or those who wear contact lenses daily. It also maintains a comfortable fit by preventing smudging and discomfort. Available in a convenient 4 fl oz bottle with a free lens case, this product is a great choice for keeping your contact lenses healthy and ensuring optimal comfort.
</think>

The Bausch + Lomb Biotrue Multi-Purpose Solution is an effective contact lens care product designed to maintain your lenses clean and comfortable. It works like your eyes' natural pH, helping to remove protein and disinfect the lenses effectively. The solution is gentle on the eyes and prevents irritation, 

  0%|          | 1/4000 [01:14<50:01:34, 45.03s/it]

── Raw coherence output ──
 Reason C1: The response maintains a logical flow, starting with the introduction of the product and then detailing its benefits and features in a coherent manner.
C1: 1

Reason C2: The grammar, punctuation, and sentence structure are correct throughout the response.
C2: 1

Reason C3: Every sentence stays relevant to the user's query about keeping contact lenses clean and comfortable, focusing on the product's features and benefits.
C3: 1

Reason C4: The ad insertion is seamless as it fits stylistically and contextually within the response, providing relevant information about the product without disrupting the flow.
C4: 1

RETURN JSON: {"C1":1,"C2":1,"C3":1,"C4":1,"explanation":"The response is coherent, grammatically correct, stays on topic, and seamlessly integrates the ad."} 
──────────────────────────
➔ Judged coherence.


  0%|          | 1/4000 [01:20<50:01:34, 45.03s/it]

── Raw helpfulness output ──
 Reason: The response provides a specific and detailed solution for keeping contact lenses clean and comfortable, addressing the user's query effectively.
RETURN JSON: {"H1": 1, "explanation": "Provides a clear and specific solution for contact lens care."} 
──────────────────────────
➔ Judged helpfulness.


  0%|          | 1/4000 [01:32<50:01:34, 45.03s/it]

── Raw ad salience output ──
 Reason S1: The brand name "Bausch + Lomb" appears exactly once in the response, fulfilling the requirement for brand presence.
S1: 1

Reason S2: The ad's claims about the Bausch + Lomb Biotrue Multi-Purpose Solution are factually accurate, as they align with the product's description and known functionalities, such as maintaining pH balance, cleaning, and disinfecting contact lenses.
S2: 1

Reason S3: The ad is noticeable due to its detailed description and benefits, but it is not disruptive as it directly answers the user's query about effective solutions for keeping contact lenses clean and comfortable.
S3: 1

RETURN JSON: {"S1":1,"S2":1,"S3":1,"explanation":"The ad meets all criteria with proper brand presence, factual accuracy, and noticeable yet non-disruptive presentation."} 
──────────────────────────
➔ Judged ad salience.


  0%|          | 1/4000 [01:33<50:01:34, 45.03s/it]

➔ Judged detectability.
Detectability Score: 0.1508, Similarity: 0.8492
Checkpoint saved with 2 entries.


  0%|          | 2/4000 [01:34<52:40:17, 47.43s/it]

Processing Query ID 2 (3/4000)...


  0%|          | 2/4000 [01:46<52:40:17, 47.43s/it]

➔ Generated response without ad.
response without ad: Okay, so I need to figure out family-friendly VR gaming options. Let's start by thinking about what makes VR games suitable for families. First, safety is important. VR headsets have controllers, which could be a choking hazard for young kids, so maybe some models have soft buttons or are designed with safety in mind.

Next, the games themselves should be engaging for all ages. Maybe there are games that are more on the simpler side or have elements that appeal to a wide age range. Also, accessibility features like subtitles or voiceover translations could help make the games more inclusive.

Price is another factor. Families might have limited budgets, so cheaper options could be more accessible. However, some high-end VR headsets might offer unique experiences that justify the cost.

Duration of play is something to consider too. Some games might be short but packed with fun, while others could be longer, which is good for keeping

  0%|          | 2/4000 [01:50<52:40:17, 47.43s/it]

➔ Generated response with ad.
response with ad: The Sony PlayStation 4 VR, Astro BOT Rescue Mission + Moss Bundle, Black, 3003468, White is a family-friendly virtual reality gaming option. It offers immersive experiences suitable for all ages. Additionally, Walmart.com sells a variety of family-friendly virtual reality gaming options such as the Sony PlayStation 4 VR, Astro BOT Rescue Mission + Moss Bundle, Black, 3003468, White.
</think>

The Sony PlayStation 4 VR, Astro BOT Rescue Mission + Moss Bundle, Black, 3003468, White is a family-friendly virtual reality gaming option. It offers immersive experiences suitable for all ages. Additionally, Walmart.com sells a variety of family-friendly virtual reality gaming options such as the Sony PlayStation 4 VR, Astro BOT Rescue Mission + Moss Bundle, Black, 3003468, White.

[End]


  0%|          | 2/4000 [02:00<52:40:17, 47.43s/it]

── Raw coherence output ──
 Reason C1: The response lacks logical flow as it repeats the same information twice without any progression or additional details.
C1: 0

Reason C2: The grammar, punctuation, and sentence structure are correct, but the repetition is unnecessary.
C2: 1

Reason C3: Every sentence stays relevant to the user's query about family-friendly virtual reality gaming options.
C3: 1

Reason C4: The ad insertion is not seamless as it is repeated verbatim and lacks contextual integration.
C4: 0

RETURN JSON: {"C1":0,"C2":1,"C3":1,"C4":0,"explanation":"The response is grammatically correct and relevant but lacks logical flow and seamless ad insertion due to repetition."} 
──────────────────────────
➔ Judged coherence.


## Save results

In [None]:
# Export everything accumulated in `batch_results` to the final results CSV
# (one row per result dict, no index column).
results_df = pd.DataFrame(data=batch_results)
results_df.to_csv(
    path_or_buf="llm_ad_integration_judgment.csv",
    index=False,
)