### Retrieval Evaluation Metrics

#### Generate Ground Truth datasets

In [34]:
from typing import List
import json, random
from pydantic import BaseModel


In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the API credentials used by the clients below — confirm).
load_dotenv()

True

In [3]:
from openai import OpenAI

# Shared OpenAI client used by generate_ground_truth_questions. It is built
# with no arguments, so its configuration presumably comes from the
# environment — confirm.
openai_client = OpenAI()

In [5]:
from qdrant_client import QdrantClient

# Qdrant connection used by get_all_points (instance at localhost:6333).
client = QdrantClient("http://localhost:6333")
# Name of the collection the artwork points are read from.
collection_name = "met-museum-euro-artworks"

In [6]:
def get_all_points(batch_size: int = 256) -> List:
    """
    Retrieve every point of the collection by paging through ``client.scroll``.

    Only use for small collections (<10k points): all records are accumulated
    in memory before being returned.

    Args:
        batch_size: Number of points requested per scroll call. Passing it
            explicitly avoids depending on the client's default page size
            (10 according to the Qdrant client documentation), which costs
            one round-trip per handful of points.

    Returns:
        List of all point records (payload included, vectors omitted)
    """
    all_points = []
    offset = None

    while True:
        records, next_offset = client.scroll(
            collection_name=collection_name,
            limit=batch_size,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        all_points.extend(records)

        # A next offset of None means the last page has been read.
        if next_offset is None:
            break
        offset = next_offset

    return all_points

In [7]:
# Pull every point of the collection (payload only, no vectors) into memory.
painting_objects = get_all_points()

In [9]:
# Keep only what query generation needs from each point: its id and its text.
painting_description_lst = [
    {"id": point.id, "artwork_text": point.payload['artwork_text']}
    for point in painting_objects
]

In [46]:
def validate_queries(queries: List[str], title_and_artist: str, description: str) -> List[str]:
    """
    Filter generated queries, keeping only those usable as retrieval test cases.

    A query is dropped when any of the following holds:
      * it has fewer than 5 whitespace-separated words;
      * it contains the whole title-and-artist string (case-insensitive);
      * more than 70% of its distinct words also occur in the description
        (case-insensitive, whitespace tokenisation, punctuation not stripped).

    Args:
        queries: Candidate queries to check.
        title_and_artist: Title/artist text that a query must not quote
            verbatim. When empty or blank, this check is skipped.
        description: Description text used for the word-overlap check.

    Returns:
        The queries that passed every check, in their original order.
    """
    # An empty string is a substring of every string, so without this
    # normalisation a missing title would reject every single query.
    title_needle = title_and_artist.strip().lower()

    # Loop-invariant: the description's word set does not depend on the query.
    desc_words = set(description.lower().split())

    valid_queries = []
    for q in queries:
        q_lower = q.lower()
        tokens = q_lower.split()

        # Too short to be a realistic visitor query.
        if len(tokens) < 5:
            continue

        # Quoting the exact title/artist would make retrieval trivial.
        if title_needle and title_needle in q_lower:
            continue

        # Simple lexical-overlap check: reject near-copies of the description.
        query_words = set(tokens)
        overlap = len(desc_words & query_words) / len(query_words)
        if overlap > 0.7:
            continue

        valid_queries.append(q)

    return valid_queries



In [47]:
# Structured-output schema passed as `response_format` to the OpenAI call in
# generate_ground_truth_questions: an object holding a single list of queries.
class QueryResponse(BaseModel):
    queries: List[str]  # the generated user queries, one string each

def generate_ground_truth_questions(num_queries: int, artwork_text: str) -> List[str]:
    """
    Ask the LLM for visitor-style queries that should retrieve one painting.

    ``artwork_text`` is expected to have the form
    ``"<title and artist>. The description of the artwork is: <description>"``.
    Both parts are inserted into the prompt, and the model's queries are then
    filtered with ``validate_queries``.

    Args:
        num_queries: Number of queries the prompt asks the model to produce.
        artwork_text: Combined title/artist and description text of a painting.

    Returns:
        The generated queries that pass ``validate_queries``. This can be
        fewer than ``num_queries``, and is empty when the response carries no
        parsed payload (e.g. the model refused).

    Raises:
        IndexError: If ``artwork_text`` does not contain the separator.
    """
    separator = '. The description of the artwork is: '
    # Split once only: everything after the first separator is the description,
    # even if the separator text happens to occur again inside it.
    parts = artwork_text.split(separator, 1)
    title_artist = parts[0]
    painting_description = parts[1]

    prompt = f"""You are an expert in creating RAG evaluation datasets for museum chatbots. Your task is to generate EXACTLY {num_queries} realistic, diverse user queries that should retrieve information about this specific painting.


        Given this painting information:
        Title and Artist: {title_artist if title_artist else "Not provided"}
        Description: {painting_description}

        CONTEXT: These queries will be used to evaluate a RAG system for a Met Museum chatbot. Visitors use this chatbot to discover paintings they want to see. Your queries must reflect REAL visitor behavior and needs.
        
        GENERATE {num_queries} QUERIES WITH THE FOLLOWING DISTRIBUTION:

        1. SPECIFIC ARTWORK QUERIES (20%): Direct questions about this painting using 2-3 specific visual elements from the description
        - Use partial information (no exact title/artist)
        - Example: "painting with water lilies and Japanese bridge"
        - Difficulty: Easy-Medium

        2. THEMATIC QUERIES (25%): Broader themes that should include this painting
        - Focus on subject matter, symbolism, or narrative that are present in the painting's description
        - Example: "paintings depicting mythology from ancient Greece"
        - Difficulty: Medium

        3. COMPARATIVE QUERIES (20%): Reference similar works, movements, or styles
        - Require understanding of art history context
        - Example: "works similar to Caravaggio's dramatic lighting style"
        - Difficulty: Medium-Hard

        4. NAVIGATIONAL QUERIES (20%): Gallery/collection-based questions
        - Time period, culture, medium, or department-specific
        - Example: "what Impressionist landscapes can I see here?"
        - Difficulty: Easy-Medium

        5. EXPLORATORY QUERIES (15%): Open-ended discovery questions
        - Mood, color palette, artistic techniques
        - Example: "paintings that feel peaceful and contemplative"
        - Difficulty: Medium-Hard

        CRITICAL REQUIREMENTS:

        A. DIFFICULTY CALIBRATION:
        - Easy: Direct visual elements, clear subjects (30% of queries)
        - Medium: Requires semantic understanding or context (50% of queries)
        - Hard: Abstract concepts, multi-hop reasoning, or ambiguous phrasing (20% of queries)

        B. LINGUISTIC DIVERSITY:
        - Vary formality: questions vs. statements vs. keywords
        - Mix specificity: vague → precise
        - Different query lengths: 5-25 words
        - Include natural language variations ("show me", "looking for", "are there any")

        C. EDGE CASES TO INCLUDE (at least 2):
        - Misspellings or informal terms
        - Temporally ambiguous ("Renaissance" vs "1500s")
        - Culturally ambiguous ("Asian art" when more specific available)
        - Under-specified queries that need disambiguation
        - Partial/incomplete descriptions ('that pinting with the blue..')

        D. REALISM PRINCIPLES:
        - Use vocabulary real visitors use (not art historian jargon unless for Hard queries)
        - Include implicit intent ("I want to see..." vs "retrieve documents about...")
        - Add conversational markers occasionally ("I'm interested in...", "Can you help me find...")
        - NO exact title or artist name unless it's a common search pattern

        AVOID:
        - Generic queries that would retrieve 100+ paintings ("famous paintings")
        - Queries that mention the exact title
        - Queries that only an art expert would ask
        - Repetitive phrasings
        - Overly SEO-like keyword stuffing

        Return ONLY a parsable JSON without using code blocks. Format:
        ["query 1", "query 2", ... "query {num_queries}"]
""".strip()

    # NOTE(review): the prompt's closing lines ask for a bare JSON array, while
    # response_format requests the QueryResponse object shape
    # ({"queries": [...]}); the code below reads the object shape.
    response = openai_client.chat.completions.parse(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        response_format=QueryResponse,
    )

    # Use the SDK's already-validated QueryResponse instead of re-parsing the
    # raw content; it is None when there is no structured payload (refusal).
    parsed = response.choices[0].message.parsed
    if parsed is None:
        return []

    return validate_queries(
        queries=parsed.queries,
        title_and_artist=title_artist,
        description=painting_description,
    )


In [48]:
# Pick up to 100 distinct positions in painting_description_lst. Capping the
# sample size at the list length keeps random.sample from raising ValueError
# when fewer than 100 paintings are available.
random_indexes = random.sample(
    range(len(painting_description_lst)),
    min(100, len(painting_description_lst)),
)

In [49]:
# Build the evaluation rows: one {'id', 'question'} dict per validated query,
# requesting 5 queries for each sampled painting.
ground_truth_lst = []

for idx in random_indexes:
    painting = painting_description_lst[idx]
    questions = generate_ground_truth_questions(5, painting['artwork_text'])
    ground_truth_lst.extend(
        {'id': painting['id'], 'question': question} for question in questions
    )

In [51]:
import csv 

# Persist the ground-truth rows as CSV with an `id,question` header.
# The encoding is fixed to UTF-8 so that non-ASCII characters in generated
# queries are written the same way on every platform instead of depending on
# the locale's default encoding. newline='' lets the csv module control line
# endings itself.
with open('ground_truth.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['id', 'question'])
    writer.writeheader()
    writer.writerows(ground_truth_lst)