# Experiment with LLM-as-a-Judge for Ad Placement Quality

# Install dependencies + Import libraries

In [34]:
!pip install -q transformers accelerate pandas rich openai

In [35]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm
import textwrap
import os
import json
from rich.console import Console
from rich.table import Table
from rich.padding import Padding
from rich.text import Text
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import re
from openai import OpenAI
import time

# Models Set Up

In [None]:
# Read the API credentials from the environment so real keys never have to be
# typed into (and accidentally committed with) the notebook. The original
# "here" placeholder is kept as the fallback, so behaviour is unchanged when
# the environment variables are not set.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "here")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "here")

In [None]:
# Checkpoint used to generate the responses; the tokenizer and the model must
# come from the same checkpoint, so the identifier is spelled out once.
_GENERATION_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

generation_tokenizer = AutoTokenizer.from_pretrained(
    _GENERATION_MODEL_ID,
    trust_remote_code=True,
)

# dtype and device placement are both left on "auto".
generation_model = AutoModelForCausalLM.from_pretrained(
    _GENERATION_MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)

In [37]:
# The DeepSeek API is reached through the OpenAI SDK by overriding the base URL.
deepseek_client = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=DEEPSEEK_API_KEY,
)


def call_deepseek(prompt):
    """Send `prompt` to the "deepseek-chat" model and return the reply text.

    The prompt is sent as the user turn, after a fixed system message that
    frames the model as an evaluator. The request is non-streaming and the
    content of the first choice is returned.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful evaluator that scores structured outputs."},
        {"role": "user", "content": prompt},
    ]
    completion = deepseek_client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [39]:
# OpenAI client used for the embedding requests made by this notebook.
client_openai = OpenAI(api_key=OPENAI_API_KEY)

# Implementation

## Utilities

In [40]:
def wrap_dict(d, width=80):
    """Serialize `d` as indented JSON and re-wrap the text to `width` columns.

    The JSON text goes through `textwrap.wrap`, and the resulting lines are
    joined with newlines into a single string.
    """
    serialized = json.dumps(d, indent=2)
    wrapped_lines = textwrap.wrap(serialized, width=width)
    return "\n".join(wrapped_lines)

In [41]:
def wrap(text, width=90):
    """Return `text` re-flowed so that no line is longer than `width` columns."""
    wrapped_lines = textwrap.wrap(text, width=width)
    return "\n".join(wrapped_lines)

In [42]:
console = Console()


def _format_score(value):
    """Format a numeric score with four decimals, or "N/A" when it is missing."""
    return "N/A" if value is None else f"{value:.4f}"


def print_eval_result(
    query_id,
    query,
    ad_product,
    response_without_ad,
    response_with_ad,
    coherence,
    helpfulness,
    ad_salience,
    detectability,
    similarity_score,
    width=100
):
    """Pretty-print one evaluation record to the shared rich console.

    Parameters:
    - query_id: identifier shown in the header rule.
    - query, ad_product, response_without_ad, response_with_ad: strings that
      are printed after being stripped.
    - coherence: dict with optional keys "C1".."C4" and "total_score", or None.
    - helpfulness: dict with optional keys "H1" and "explanation", or None.
    - ad_salience: dict with optional keys "S1".."S3", or None.
    - detectability, similarity_score: numbers printed with four decimals,
      or None.
    - width: accepted for interface compatibility; currently unused.

    The judge functions return None on failure and the detectability judge
    reports None scores on error, so missing values are rendered as empty
    tables / "N/A" instead of raising.
    """
    # A failed judge yields None; treat it as "no scores available".
    coherence = coherence or {}
    helpfulness = helpfulness or {}
    ad_salience = ad_salience or {}

    console.rule(f"[bold blue]🧠 Query ID: {query_id}")

    console.print(f"[bold yellow]🔍 User Query:")
    console.print(Padding(Text(query.strip()), (0, 1)))

    # Ad Info
    console.print("\n[bold magenta]📦 Inserted Ad Product:")
    console.print(Padding(Text(ad_product.strip()), (0, 1)))

    # Responses
    console.print("\n[bold green]📝 Response Without Ads:")
    console.print(Padding(Text(response_without_ad.strip()), (0, 1)))

    console.print("\n[bold green]📝 Response With Ads:")
    console.print(Padding(Text(response_with_ad.strip()), (0, 1)))

    # Coherence Table
    console.print("\n[bold cyan]🎯 Coherence Evaluation:")
    coherence_table = Table(show_header=True, header_style="bold cyan")
    coherence_table.add_column("Criterion")
    coherence_table.add_column("Score", justify="center")

    for k in ["C1", "C2", "C3", "C4"]:
        if k in coherence:
            coherence_table.add_row(k, str(coherence[k]))
    coherence_table.add_row("Total", str(coherence.get("total_score", 0)))

    console.print(coherence_table)

    # Helpfulness
    console.print("\n[bold yellow]💡 Helpfulness Evaluation:")
    helpfulness_score = helpfulness.get("H1", 0)
    helpfulness_explanation = helpfulness.get("explanation", "No explanation provided.")
    console.print(f"• Helpfulness Score (H1): [bold]{helpfulness_score}[/bold]")
    console.print(f"• Reason: {helpfulness_explanation}")

    # Ad Salience
    console.print("\n[bold green]🎯 Ad Salience Evaluation:")
    salience_table = Table(show_header=True, header_style="bold green")
    salience_table.add_column("Criterion")
    salience_table.add_column("Score", justify="center")

    for k in ["S1", "S2", "S3"]:
        if k in ad_salience:
            salience_table.add_row(k, str(ad_salience[k]))
    salience_table.add_row("Total", str(ad_salience.get("S1", 0) + ad_salience.get("S2", 0) + ad_salience.get("S3", 0)))

    console.print(salience_table)

    # Detectability (either value may be None when the embedding call failed)
    console.print("\n[bold magenta]🧬 Detectability:")
    console.print(f"• Detectability Score: [white on red]{_format_score(detectability)}[/white on red]")
    console.print(f"• Cosine Similarity:   [white on blue]{_format_score(similarity_score)}")

    console.rule("[bold blue]🏁 End of Evaluation")

In [43]:
def save_checkpoint(data, filename="judge_final_results.csv"):
    """Write `data` to `filename` as CSV (no index column) and report the row count.

    `data` is anything `pd.DataFrame` accepts, e.g. a list of result dicts.
    """
    frame = pd.DataFrame(data)
    frame.to_csv(filename, index=False)
    entry_count = len(frame)
    print(f"Checkpoint saved to {filename} with {entry_count} entries.")

In [44]:
def get_query_embedding(query, client, model="text-embedding-ada-002"):
    """Return the embedding of `query` as a 1-D float32 NumPy array.

    Parameters:
    - query (str): text to embed.
    - client: object exposing `embeddings.create(model=..., input=[...])`,
      e.g. an initialized OpenAI client.
    - model (str): name of the embedding model to request.

    The request goes through the `client` argument; previously the parameter
    was ignored and the module-level `client_openai` was always used.
    """
    response = client.embeddings.create(model=model, input=[query])
    return np.array(response.data[0].embedding, dtype=np.float32)

## Load Files

In [45]:
# Load user_queries
# The JSON file is read straight from the project's GitHub repository. It is
# kept both as a list of record dicts (user_query_data) and as a DataFrame
# (user_query_df); the DataFrame is what the merge cell below joins on.
file_path = "https://raw.githubusercontent.com/m1chae11u/llm-ad-integration/refs/heads/main/user_queries.json"
user_query_data = pd.read_json(file_path).to_dict(orient="records")
user_query_df = pd.DataFrame(user_query_data)
# Last expression of the cell: displays the first rows as the cell output.
user_query_df.head()

Unnamed: 0,ad_index,ad_id,ad_product,domain,subdomain,vague_query
0,0,6511,Spad XIII French,Collectibles & Hobbies,Model Airplanes,Where can I find high-quality replicas of hist...
1,1,3219024,Bausch + Lomb Biotrue Multi-Purpose Solution -...,Health & Personal Care,Contact Lens Care,What are some effective solutions for keeping ...
2,2,218399,"Sony PlayStation 4 VR, Astro BOT Rescue Missio...",Electronics,Gaming Consoles & Accessories,What are some family-friendly virtual reality ...
3,3,147690,"Massel 7's, Vegan Bouillon Stock Cubes - Glute...",Food & Beverage,Cooking Ingredients,What are some healthy options for enhancing th...
4,4,890666,Sterling Silver 7mm Created Sapphire Ring with...,Jewelry,Rings,What should I consider when choosing a gemston...


In [46]:
len(user_query_df)

4000

In [47]:
ad_index_url = "https://raw.githubusercontent.com/m1chae11u/llm-ad-integration/main/ads_faiss.index"
samped_ads_url = "https://raw.githubusercontent.com/m1chae11u/llm-ad-integration/refs/heads/main/sampled_ads.csv"

# Download and load the FAISS index file
# A timeout is passed because requests waits indefinitely by default, which
# would hang the whole cell on a stalled connection.
response = requests.get(ad_index_url, timeout=60)
if response.status_code == 200:
    with open("ads_faiss.index", "wb") as f:
        f.write(response.content)
    print("FAISS index downloaded successfully.")
else:
    # Best effort: the index is only saved to disk here and is not loaded
    # below, so a failed download is reported rather than raised.
    print("Failed to download FAISS index file.")

# # Now load the FAISS index using faiss.read_index
# index = faiss.read_index("ads_faiss.index")
ad_metadata_df = pd.read_csv(samped_ads_url)

# print("FAISS index loaded, total vectors: ", index.ntotal)
print("ad_metadata loaded:", ad_metadata_df.shape)


ad_metadata_df.head()

FAISS index downloaded successfully.
ad_metadata loaded: (4000, 10)


Unnamed: 0,product_id,ad_id,user_search_query,ad_title,ad_description,url,seller,brand,label,image_id
0,2634449,6511,spad,Spad XIII French,Are you one of the legions of aircraft enthusi...,simpshopifyapps.com,Pilot Mall,Authentic Models,0,[IMG_CATEID_3131]
1,1588292,3219024,e1796 bausch %26 lomb,Bausch + Lomb Biotrue Multi-Purpose Solution -...,Inspired by The Biology of Your Eyes- Works li...,google.com,Google Express,Bausch + Lomb,0,[IMG_CATEID_3465]
2,3285600,218399,ps4 used amazon,"Sony PlayStation 4 VR, Astro BOT Rescue Missio...",VR gaming fun for the whole family. Color: White.,walmart.com,Walmart,Sony,0,[IMG_CATEID_397]
3,4688264,147690,vegetable bouillon cubes,"Massel 7's, Vegan Bouillon Stock Cubes - Glute...",QUALITY VEGETABLE STOCK - Turn normal meals to...,amazon.com,Amazon.com,Massel,0,[IMG_CATEID_1851]
4,4539593,890666,star sapphire rings,Sterling Silver 7mm Created Sapphire Ring with...,Halo style created sapphire and diamond ring f...,joyjewelers.com,Joy Jewelers,Joy Jewelers,0,[IMG_CATEID_372]


In [48]:
# Give the ad table's title column the same name the query table uses
# ("ad_product") so that it can serve as a join key.
ad_metadata_df = ad_metadata_df.rename(columns={"ad_title": "ad_product"})

# Inner-join the queries to the ad metadata on the (ad_id, ad_product) pair.
merged_df = pd.merge(
    user_query_df,
    ad_metadata_df,
    how="inner",
    on=["ad_id", "ad_product"],
)

# Row count of the joined table.
print(len(merged_df))

4000


In [49]:
# Count how many rows of ad_metadata_df carry each ad_id.
ad_metadata_id_counts = ad_metadata_df["ad_id"].value_counts()

# Keep only the ad_ids that occur on more than one row.
duplicate_ad_ids_metadata = ad_metadata_id_counts.loc[ad_metadata_id_counts.gt(1)]

# Show the duplicated ad_ids together with their counts.
print(duplicate_ad_ids_metadata)

ad_id
145628    2
14544     2
Name: count, dtype: int64


## Main Functions

In [50]:
# Generation with ads shouldn't give any explanation.
# Judge for inclusion: don't give any explanation or reason for excluding or including an ad.
# ^ Make sure the judge can distinguish one ad from another: ask it to identify which of the ads were included,
# not why they were included (e.g. when 10 ads were retrieved).

## Response Generation

In [51]:
def generate_response_without_ad(user_query):
    """Generate an ad-free answer to `user_query` with the local generation model.

    Parameters:
    - user_query (str): the user's question, inserted verbatim into the prompt.

    Returns:
    The generated continuation only (prompt excluded), stripped of leading and
    trailing whitespace, matching `generate_response_with_ad`.

    Uses the module-level `generation_tokenizer` and `generation_model`;
    sampling is enabled (temperature 0.7, top_p 0.9), so output varies between
    calls.
    """
    prompt = f"""
You are a helpful assistant.

Please write a clear, informative, and complete response to the
following user query. **Do not include any advertisements or promotions.**

---
User Query:
{user_query}
---
Response:
""".strip()

    inputs = generation_tokenizer(prompt, return_tensors="pt").to(generation_model.device)

    output = generation_model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=generation_tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Drop them by token
    # count rather than slicing the decoded text by len(prompt): decoding is
    # not guaranteed to reproduce the prompt character-for-character, so a
    # character slice can cut into the answer or leave prompt residue.
    prompt_token_count = inputs["input_ids"].shape[-1]
    new_tokens = output[0][prompt_token_count:]

    response = generation_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return response

In [52]:
def generate_response_with_ad(user_query, ad_text):
    """Generate an answer to `user_query` that weaves in the product from `ad_text`.

    Parameters:
    - user_query (str): the user's question, inserted verbatim into the prompt.
    - ad_text (str): product details to integrate (the prompt asks the model
      to include the URL and not to label the content as advertising).

    Returns:
    The generated continuation only (prompt excluded), stripped of leading and
    trailing whitespace.

    Uses the module-level `generation_tokenizer` and `generation_model`;
    sampling is enabled (temperature 0.7, top_p 0.9), so output varies between
    calls.
    """
    prompt = f"""
You are an AI assistant tasked with answering user queries naturally and helpfully.
Below is the user query and a candidate product suggestion that might be relevant.
Integrate the product details (including the URL) into your response without
explicitly noting that they are advertisements.

User Query:
{user_query}

Product Suggestion:
{ad_text}

When mentioning a product, try not to sound like a salesman and focus on seamless
integration so it feels for the user, that they are not being sold to.
""".strip()

    inputs = generation_tokenizer(prompt, return_tensors="pt").to(generation_model.device)

    output = generation_model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=generation_tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Drop them by token
    # count rather than slicing the decoded text by len(prompt): decoding is
    # not guaranteed to reproduce the prompt character-for-character, so a
    # character slice can cut into the answer or leave prompt residue.
    prompt_token_count = inputs["input_ids"].shape[-1]
    new_tokens = output[0][prompt_token_count:]

    response = generation_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return response

## Metrics

### Coherence Metric

Evaluates the overall coherence of an LLM-generated response with integrated ads.
    
Refine Coherence Score Prompt: expand subcriteria to include:
Coherence (flow, grammar, and logical flow)

Coherence (0-4):
1. Logical flow - Do ideas progress naturally without confusing jumps?
2. Grammar & mechanics - Are grammar, punctuation, and sentence structure correct?
3. Topic continuity - Does every sentence stay relevant to the user’s query?
4. Seamless ad insertion - Does the ad sentence fit stylistically and contextually?
Helpfulness: Helpfulness measures how much the ad improves or enriches the overall quality of the response to the user’s original query.
    
Please instruct the evaluator to answer in exactly two lines:
Score: [Each subcriteria is worth one point, total score is the total of the subcriteria added together]
Explanation: [a brief explanation of the evaluation]
    
Parameters:
- full_response (str): The LLM-generated response (with ads).
- user_query (str): The original user query that the response answers.

Returns:
A dictionary with keys "C1"–"C4", "explanation", "total_score", and "raw_output", or None if the API call or JSON extraction fails.




In [53]:
def judge_coherence(full_response: str, user_query: str):
    """Score the coherence of an ad-integrated response with the DeepSeek judge.

    Parameters:
    - full_response (str): the LLM-generated response (with the ad).
    - user_query (str): the user query the response answers.

    Returns:
    A dict with keys "C1".."C4" (as returned by the judge; missing scores
    default to 0), "explanation", "total_score" (sum of C1..C4) and
    "raw_output" (the judge's full text), or None when the API call fails,
    no JSON object is found, or the JSON cannot be parsed.
    """
    # NOTE: the literal JSON braces in the template are doubled ({{ }}) so the
    # f-string emits them verbatim. Left single, Python parses {"C1":x,...} as
    # a replacement field with an invalid format spec and raises ValueError
    # before any request is made.
    prompt = f"""
SYSTEM
You are an impartial evaluator that returns structured scores.

You MUST follow this two-step format:
1. First, explain your reasoning **for each sub-score** C1, C2, C3, and C4 separately.
2. Then, output exactly one line of valid JSON with the four scores and a brief summary explanation.

JSON keys and allowed values:
- "C1" : integer 0 or 1 (logical flow)
- "C2" : integer 0 or 1 (grammar & mechanics)
- "C3" : integer 0 or 1 (topic continuity)
- "C4" : integer 0 or 1 (seamless ad insertion)
- "explanation" : string (brief overall reason)

Rubric
Coherence (0-4)
- C1. Logical flow — 1 if ideas progress naturally without confusing jumps.
- C2. Grammar & mechanics — 1 if grammar, punctuation, and sentence structure are correct.
- C3. Topic continuity — 1 if every sentence stays relevant to the user's query.
- C4. Seamless ad insertion — 1 if the ad sentence fits stylistically and contextually.

FORMAT EXAMPLE:

Reason C1: [brief reasoning]
C1: [0 or 1]

Reason C2: [brief reasoning]
C2: [0 or 1]

Reason C3: [brief reasoning]
C3: [0 or 1]

Reason C4: [brief reasoning]
C4: [0 or 1]

RETURN JSON: {{"C1":x,"C2":y,"C3":z,"C4":w,"explanation":"brief summary"}}

---

User Query:
{user_query}

LLM Response:
{full_response}
""".strip()

    try:
        raw = call_deepseek(prompt)
    except Exception as e:
        print("Error judging coherence API call:", e)
        return None

    # The API may return no content; treat that as an empty reply so the
    # function reports failure (None) instead of raising on None.startswith.
    if raw is None:
        raw = ""

    # Clean up any weird outputs (strip a surrounding Markdown code fence)
    if raw.startswith("```"):
        raw = re.sub(r"^```.*?\n|\n```$", "", raw, flags=re.DOTALL).strip()

    print("── Raw coherence output ──\n", raw or "(Empty)", "\n──────────────────────────")

    try:
        # Find and parse the JSON part (first "{" up to the next "}")
        match = re.search(r'\{.*?\}', raw, re.DOTALL)
        if not match:
            print("Failed to find JSON block. Returning None.")
            return None

        json_part = match.group(0)
        coherence_scores = json.loads(json_part)

        # Make sure all expected keys exist
        expected_keys = ["C1", "C2", "C3", "C4", "explanation"]
        for key in expected_keys:
            if key not in coherence_scores:
                coherence_scores[key] = 0 if key != "explanation" else "No explanation provided."

        total_score = sum(coherence_scores[key] for key in ["C1", "C2", "C3", "C4"])

        result = {
            "C1": coherence_scores["C1"],
            "C2": coherence_scores["C2"],
            "C3": coherence_scores["C3"],
            "C4": coherence_scores["C4"],
            "explanation": coherence_scores["explanation"],
            "total_score": total_score,
            "raw_output": raw
        }

        return result

    except Exception as e:
        print(f"Error parsing JSON output: {e}")
        return None

### Helpfulness

In [54]:
def judge_helpfulness(user_query: str, full_response: str):
    """Score whether a response fully answers the user query, via the DeepSeek judge.

    Parameters:
    - user_query (str): the original user query.
    - full_response (str): the LLM-generated response to evaluate.

    Returns:
    A dict with keys "H1" (as returned by the judge; defaults to 0 when
    missing), "explanation" and "raw_output" (the judge's full text), or None
    when the API call fails, no JSON object is found, or the JSON cannot be
    parsed.
    """
    # NOTE: the literal JSON braces in the two examples are doubled ({{ }}) so
    # the f-string emits them verbatim. Left single, Python parses {"H1":1,...}
    # as a replacement field with an invalid format spec and raises ValueError
    # before any request is made.
    prompt = f"""
SYSTEM
You are an impartial evaluator that returns structured scores.

You MUST follow this two-step format:
1. First, explain briefly your reasoning whether the LLM's response fully answers the user's original query.
2. Then, output exactly one line of valid JSON with the score and a short explanation.

The JSON keys and allowed values are:
  "H1"           integer 0 or 1   (helpfulness score)
  "explanation"  string           (brief reason for the score)

Rubric
Helpfulness (0-1)
- Assign 1 if the response fully answers the user's original question clearly and without major omissions.
- Assign 0 if the response fails to answer, is vague, irrelevant, or leaves major gaps.

EXAMPLES

Good case:
<query> What are easy vegan lunches?
<response> A list of simple vegan lunch recipes with brief explanations.
Explanation: The response directly provides multiple simple vegan lunch ideas with clarity.
RETURN: {{"H1":1,"explanation":"Provides clear and relevant vegan lunch suggestions."}}

Bad case:
<query> What are easy vegan lunches?
<response> Talks about veganism generally without suggesting meals.
Explanation: Discusses veganism broadly but does not answer the specific request for lunch ideas.
RETURN: {{"H1":0,"explanation":"Fails to provide specific lunch suggestions for the query."}}

TASK
You will now receive:
- the user query,
- the LLM's response.

First, explain your reasoning in 2–3 sentences.
Then, output exactly one line of valid JSON with the final score and explanation.

---

User Query:
{user_query}

LLM Response:
{full_response}
""".strip()

    try:
        raw = call_deepseek(prompt)
    except Exception as e:
        print("Error judging helpfulness API call:", e)
        return None

    # The API may return no content; treat that as an empty reply so the
    # function reports failure (None) instead of raising on None.startswith.
    if raw is None:
        raw = ""

    # Strip a surrounding Markdown code fence, if the judge added one.
    if raw.startswith("```"):
        raw = re.sub(r"^```.*?\n|\n```$", "", raw, flags=re.DOTALL).strip()

    print("── Raw helpfulness output ──\n", raw or "(Empty)", "\n──────────────────────────")

    try:
        # First "{" up to the next "}" is taken as the JSON verdict.
        match = re.search(r'\{.*?\}', raw, re.DOTALL)
        if not match:
            print("Failed to find JSON block. Returning None.")
            return None

        json_part = match.group(0)
        result = json.loads(json_part)

        if "H1" not in result:
            result["H1"] = 0
        if "explanation" not in result:
            result["explanation"] = "No explanation provided."

        final_result = {
            "H1": result["H1"],
            "explanation": result["explanation"],
            "raw_output": raw
        }
        return final_result

    except Exception as e:
        print(f"Error parsing helpfulness JSON: {e}")
        return None

### Ad Salience

In [55]:
def judge_ad_salience(user_query: str, full_response: str, ad_facts: str):
    """Score brand presence, ad factuality and noticeability via the DeepSeek judge.

    Parameters:
    - user_query (str): the original user query.
    - full_response (str): the LLM-generated response containing the ad.
    - ad_facts (str): the ad details given to the judge as ground truth.

    Returns:
    A dict with keys "S1", "S2", "S3" (as returned by the judge; missing
    scores default to 0), "explanation" and "raw_output" (the judge's full
    text), or None when the API call fails, no JSON object is found, or the
    JSON cannot be parsed.
    """
    # NOTE: the literal JSON braces in the template are doubled ({{ }}) so the
    # f-string emits them verbatim. Left single, Python parses {"S1":x,...} as
    # a replacement field with an invalid format spec and raises ValueError
    # before any request is made.
    prompt = f"""
SYSTEM
You are an impartial evaluator that returns structured scores.

You MUST follow this two-step format:
1. First, explain your reasoning **for each sub-score** S1, S2, and S3 separately.
2. Then, output exactly one line of valid JSON with the three scores and a short summary explanation.

JSON keys and values:
- "S1" : integer 0 or 1 (brand presence)
- "S2" : integer 0 or 1 (ad factuality)
- "S3" : integer 0 or 1 (noticeability)
- "explanation" : string (brief summary reason)

Rubric:
S1. Brand Presence — 1 if brand name appears exactly once.
S2. Ad Factuality — 1 if ad's claims are factually accurate.
S3. Noticeability — 1 if ad is noticeable but not disruptive.

FORMAT EXAMPLE:

Reason S1: [brief reasoning]
S1: [0 or 1]

Reason S2: [brief reasoning]
S2: [0 or 1]

Reason S3: [brief reasoning]
S3: [0 or 1]

RETURN JSON: {{"S1":x,"S2":y,"S3":z,"explanation":"brief summary"}}

---

User Query:
{user_query}

LLM Response:
{full_response}

Ad Facts:
{ad_facts}
""".strip()

    try:
        raw = call_deepseek(prompt)
    except Exception as e:
        print("Error judging ad salience API call:", e)
        return None

    # The API may return no content; treat that as an empty reply so the
    # function reports failure (None) instead of raising on None.startswith.
    if raw is None:
        raw = ""

    # Strip a surrounding Markdown code fence, if the judge added one.
    if raw.startswith("```"):
        raw = re.sub(r"^```.*?\n|\n```$", "", raw, flags=re.DOTALL).strip()

    print("── Raw ad salience output ──\n", raw or "(Empty)", "\n──────────────────────────")

    try:
        # First "{" up to the next "}" is taken as the JSON verdict.
        match = re.search(r'\{.*?\}', raw, re.DOTALL)
        if not match:
            print("Failed to find JSON block. Returning None.")
            return None

        json_part = match.group(0)
        result = json.loads(json_part)

        # Ensure all fields are present
        for key in ["S1", "S2", "S3", "explanation"]:
            if key not in result:
                result[key] = 0 if key != "explanation" else "No explanation provided."

        final_result = {
            "S1": result["S1"],
            "S2": result["S2"],
            "S3": result["S3"],
            "explanation": result["explanation"],
            "raw_output": raw
        }

        return final_result

    except Exception as e:
        print(f"Error parsing ad salience JSON output: {e}")
        return None

### Detectability Metric

**What it measures:**
How obvious it is that an ad was included in the response.

**Goal:**
A good response should not feel like it contains advertising — the ad should blend in naturally without standing out as promotional.

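**Scale:**
1 (completely undetectable as an ad) → 5 (very obviously an inserted advertisement)

**Judged by:**
LLM (GPT-4o) reviewing the full response with the ad and assigning a detectability score.

**Note:** the implementation below does not use this 1–5 LLM rating. It embeds the response with the ad and the response without it (OpenAI `text-embedding-ada-002`) and reports detectability = 1 − cosine similarity, so values near 0 mean the two responses are nearly indistinguishable and larger values mean the ad changed the response more.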

In [56]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_embedding(text, client, model="text-embedding-ada-002"):
    """Embed `text` with `client` and return the vector as a (1, n) NumPy array.

    The single-row 2-D shape is what sklearn's `cosine_similarity` expects.
    """
    api_result = client.embeddings.create(model=model, input=[text])
    vector = np.array(api_result.data[0].embedding)
    return vector.reshape(1, -1)  # one row, as many columns as the embedding has

def judge_detectability(response_with_ad, response_without_ad, client):
    """Compare the two responses by embedding similarity.

    Returns a dict with "similarity" (cosine similarity of the two embeddings)
    and "detectability" (1 - similarity), both rounded to four decimals. If
    anything raises, the error is printed and both values are None.
    """
    try:
        # Embed the ad version first, then the ad-free version.
        vec_ad, vec_plain = (
            get_embedding(text, client)
            for text in (response_with_ad, response_without_ad)
        )

        similarity = cosine_similarity(vec_ad, vec_plain)[0][0]
        distance = 1 - similarity  # higher = more detectable
    except Exception as e:
        print(f"Error computing detectability: {e}")
        return {"similarity": None, "detectability": None}

    return {
        "similarity": round(similarity, 4),
        "detectability": round(distance, 4),
    }

## Main Execution

This is the core evaluation loop. It iterates over every row of `merged_df` (skipping query IDs already present in the checkpoint file) and evaluates how well an LLM can integrate ads into its responses. It also periodically saves progress using checkpoints.


In [None]:
# Config
checkpoint_file = "judge_final_results_gemini_last_2k.csv"
checkpoint_interval = 10  # Save every 10 entries

# Load checkpoint if exists
# Rows from a previous run are reloaded so their Query IDs can be skipped.
if os.path.exists(checkpoint_file):
    batch_results = pd.read_csv(checkpoint_file).to_dict(orient="records")
    processed_ids = {int(entry["Query ID"]) for entry in batch_results}
    print(f"Loaded {len(batch_results)} entries from checkpoint.")
else:
    batch_results = []
    processed_ids = set()
    print("No checkpoint found. Starting fresh.")

# Main Loop
# Each row of merged_df pairs one user query with the metadata of its ad.
for idx, entry in enumerate(tqdm(merged_df.to_dict(orient="records"))):
    query = entry.get("vague_query")
    query_id = int(entry.get("ad_index"))  # ad_index doubles as the query identifier

    if query_id in processed_ids:
        continue  # Skip already processed entries

    try:
        tqdm.write(f"🔍 Processing Query ID {query_id} ({idx+1}/{len(merged_df)})...")

        # Retrieve ad info from merged df
        ad_product = entry.get("ad_product", "Unknown Product")
        ad_description = entry.get("ad_description", "")
        brand = entry.get("brand", "Unknown Brand")
        url = entry.get("url", "No URL available")

        # Bundle them together into ad_text
        ad_facts = f"Product: {ad_product}\nBrand: {brand}\nURL: {url}\nDescription: {ad_description}"

        # Rows without a description (empty or NaN) are skipped and are not
        # recorded in processed_ids.
        if not ad_description or pd.isna(ad_description):
            print(f"Missing ad description for Query ID {query_id}. Skipping.")
            continue

        # Generation
        response_without_ad = generate_response_without_ad(query)
        tqdm.write("   ➔ Generated response without ad.")

        response_with_ad = generate_response_with_ad(query, ad_facts)
        tqdm.write("   ➔ Generated response with ad.")


        # Judging
        # The judge_* functions return None when the API call or JSON parsing
        # fails; such None values are stored in the result as-is.
        coherence = judge_coherence(response_with_ad, query)
        tqdm.write("   ➔ Judged coherence.")

        helpfulness = judge_helpfulness(query, response_with_ad)
        tqdm.write("   ➔ Judged helpfulness.")

        ad_salience = judge_ad_salience(query, response_with_ad, ad_facts)
        tqdm.write("   ➔ Judged ad salience.")

        detectability_result = judge_detectability(response_with_ad, response_without_ad, client_openai)
        tqdm.write("   ➔ Judged detectability.")


        # Both values are None if the embedding comparison failed.
        detectability_score = detectability_result["detectability"]
        similarity_score = detectability_result["similarity"]

        # Save result
        result = {
            "Query ID": query_id,
            "User Query": query,
            "Ad Product": ad_product,
            "Ad Description": ad_description,
            "Brand": brand,
            "URL": url,
            "Response Without Ad": response_without_ad,
            "Response With Ad": response_with_ad,
            "Coherence": coherence,
            "Helpfulness": helpfulness,
            "Ad Salience": ad_salience,
            "Detectability": detectability_score,
            "Similarity": similarity_score
        }

        batch_results.append(result)
        processed_ids.add(query_id)

        # Save checkpoint every N
        # The whole result list is rewritten each time, not appended.
        if len(batch_results) % checkpoint_interval == 0:
            df_checkpoint = pd.DataFrame(batch_results)
            df_checkpoint.to_csv(checkpoint_file, index=False)
            print(f"Checkpoint saved with {len(batch_results)} entries.")

    except Exception as e:
        # A failed row is logged and skipped. It is not added to
        # processed_ids, so a later run will attempt it again.
        print(f"Error on Query ID {query_id}: {e}")
        time.sleep(5)
        continue

    # Short pause between successfully processed rows.
    time.sleep(0.3)

# Final Save
df_final = pd.DataFrame(batch_results)
df_final.to_csv(checkpoint_file, index=False)
print(f"\nDONE! {len(batch_results)} queries saved to final checkpoint.")

## Save results

In [None]:
# Export everything accumulated in batch_results to a separate CSV file
# (in addition to the checkpoint file), without the DataFrame index.
results_df = pd.DataFrame(batch_results)
results_df.to_csv("llm_ad_integration_judgment_gemini_last_2k.csv", index=False)