# Ingestion pipeline

In [30]:
import pandas as pd

# Load raw LLM-generated data after much cleaning
df = pd.read_json("data/data.json")

In [31]:
print(df.columns)

Index(['id', 'book_name', 'author_name', 'series_name', 'series_position',
       'subgenres', 'themes', 'summary', 'page_count', 'publication_year',
       'publisher', 'target_audience', 'pacing', 'tone', 'writing_style',
      dtype='object')


In [32]:
df

Unnamed: 0,id,book_name,author_name,series_name,series_position,subgenres,themes,summary,page_count,publication_year,publisher,target_audience,pacing,tone,writing_style,setting_type,technology_focus,content_warnings
0,0,Dune,Frank Herbert,Dune Chronicles,1,Space Opera; Political SF,Ecology; Religion; Politics; Resource Control;...,"On the desert planet Arrakis, control over the...",608,1965,Chilton Books,Adult,Slow-burn; Character-driven,Philosophical; Epic; Political,Dense; Descriptive; Multi-layered,Desert planet; Feudal space empire,Mentats; Spice-based space travel; No AI,Violence; Political intrigue
1,1,Neuromancer,William Gibson,Sprawl Trilogy,1,Cyberpunk,Artificial Intelligence; Corporate Control; Vi...,A washed-out hacker is drawn into a dangerous ...,271,1984,Ace Books,Adult,Fast-paced; Action-driven,Dark; Gritty; Noir,Dense; Technical; Stylized,Near-future Earth; Cyberspace,AI; Cyberspace; Neural interfaces; Body modifi...,Violence; Drug use; Mature themes
2,2,The Left Hand of Darkness,Ursula K. Le Guin,Hainish Cycle,4,Social SF; Planetary SF,Gender; Politics; Cultural Difference; Loyalty...,An envoy to the icy world of Winter must navig...,336,1969,Ace Books,Adult,Slow-burn; Character-driven,Philosophical; Contemplative; Political,Literary; Descriptive; Thoughtful,Ice planet; Feudal societies,Minimal technology; Ansible communication,Political intrigue
3,3,Foundation,Isaac Asimov,Foundation Series,1,Social SF; Space Opera,Psychohistory; Empire; Civilization; Knowledge...,"As a galactic empire declines, a Foundation is...",255,1951,Gnome Press,Adult,Moderate; Idea-driven,Intellectual; Epic; Optimistic,Clear; Dialogue-heavy; Concept-focused,Galactic empire; Multiple worlds,Atomic power; Space travel; Psychohistory,
4,4,The Hobbit,J.R.R. Tolkien,Middle-earth,0,High Fantasy; Adventure,Heroism; Greed; Home; Adventure; Personal Growth,Bilbo Baggins sets out on a quest with dwarves...,310,1937,George Allen & Unwin,Young Adult; Adult,Moderate; Adventure-driven,Whimsical; Adventurous; Light,Accessible; Descriptive; Fairy-tale-like,Middle-earth; Fantasy world,Pre-industrial; Magic items,Fantasy violence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,187,The Bonehunters,Steven Erikson,Malazan Book of the Fallen,6,Epic fantasy; Military fantasy,Survival; Betrayal; Duty; Endurance,The Malazan armies push deeper into enemy terr...,1232,2006,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate; tension rising,Dark; Gritty; Heroic undertone,Third-person; expansive POV,"Battlefields, cities, warrens, besieged zones",Magic; divine interference; war tech,Violence; Death; Disease
188,188,Reaper’s Gale,Steven Erikson,Malazan Book of the Fallen,7,Epic fantasy; Cataclysmic fantasy,Collapse; Rebirth; Sacrifice; Cosmic stakes,"As gods, armies, and ancient machinations conv...",1280,2007,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate to fast; multiple climaxes,Epic; Tense; Dramatic,Third-person multiple POV,"Continents, warrens, divine realms",Magic; divine-mortal interaction,Violence; Death; Cosmic peril
189,189,Toll the Hounds,Steven Erikson,Malazan Book of the Fallen,8,Epic fantasy; Dark fantasy,Memory; Regret; Fate; Redemption,Amid political and spiritual upheavals in Daru...,1296,2008,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate; deep tension,Somber; Reflective; Dark,Third-person multiple POV,"City, warrens, mythic zones",Magic; prophecy; warrens,Violence; Vengeance; Loss
190,190,Dust of Dreams,Steven Erikson,Malazan Book of the Fallen,9,Epic fantasy; Endgame fantasy,Perseverance; Fate; Transition; Consequence,As the Malazan forces march eastward through t...,1280,2010,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate; heavy with foreshadowing,Ominous; Reflective; Grim,Third-person multiple POV,"Wastelands, borderlands, rising menace",Magic; ancient weapons; prophecy,Violence; Death; Despair


In [None]:
# Check for NaN values:
text_fields = [
    "book_name","author_name","series_name","subgenres","themes","summary",
    "publisher","target_audience","pacing","tone","writing_style",
    "setting_type","technology_focus","content_warnings"
]

print(df[text_fields].isna().sum().sort_values(ascending=False))

book_name           0
author_name         0
series_name         0
subgenres           0
themes              0
summary             0
publisher           0
target_audience     0
pacing              0
tone                0
writing_style       0
setting_type        0
technology_focus    0
dtype: int64


In [37]:
documents = df.to_dict(orient="records")

In [38]:
import minsearch

# Create and fit the index with enriched metadata fields
index = minsearch.Index(
    text_fields=[
        'book_name',
        'author_name',
        'series_name',
        'subgenres',
        'themes',
        'summary',
        'publisher',
        'target_audience',
        'pacing',
        'tone',
        'writing_style',
        'setting_type',
        'technology_focus',
        'content_warnings'
    ],
    keyword_fields=[
        'id',
        'series_position',
        'page_count',
        'publication_year'
    ]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x217d1651190>

In [45]:
query = "Which books are written by Robert Jordan?"
results = index.search(query, num_results=5)

In [46]:
results

[{'id': 93,
  'book_name': 'The Eye of the World',
  'author_name': 'Robert Jordan',
  'series_name': 'The Wheel of Time',
  'series_position': 1,
  'subgenres': 'High fantasy; Epic fantasy',
  'themes': 'Prophecy; Coming of age; Light vs Shadow; Fellowship',
  'summary': 'Villagers are driven from their home by dark forces and join Moiraine and Lan on a journey that reveals a prophesied hero and a world-spanning conflict.',
  'page_count': 782,
  'publication_year': 1990,
  'publisher': 'Tor Books (US); Orbit (UK)',
  'target_audience': 'Adult / epic fantasy readers',
  'pacing': 'Moderate; quest-driven with dense worldbuilding',
  'tone': 'Mythic; Adventurous; Foreboding',
  'writing_style': 'Third-person limited; multiple POVs',
  'setting_type': 'Broad secondary world across nations and wilds',
  'technology_focus': 'The One Power; Artifacts; Ancient lore',
 {'id': 103,
  'book_name': 'Knife of Dreams',
  'author_name': 'Robert Jordan',
  'series_name': 'The Wheel of Time',
  'seri

# Retrieval (RAG) Flow

In [47]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [48]:
prompt_template = """
You are an assistant for helping people decide on which fantasy and sci-fi books to read. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: {context}
""".strip()
    
entry_template = """
book_name: {book_name}
author_name: {author_name}
series_name: {series_name}
subgenres: {subgenres}
themes: {themes}
summary: {summary}
publisher: {publisher}
target_audience: {target_audience}
pacing: {pacing}
tone: {tone}
writing_style: {writing_style}
setting_type: {setting_type}
technology_focus: {technology_focus}
content_warnings: {content_warnings}
"""


def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [49]:
def llm(prompt):
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', 
        messages=[ {'role': 'user','content': prompt}]
        )
    
    return response.message.content

In [52]:
query = 'What is the release order of Steven Erikson books? Include all 10 books written by the author.'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [53]:
answer = rag(query)
print(answer)

Based on the provided context, here is the release order of Steven Erikson's books in the Malazan Book of the Fallen series:

1. Gardens of the Moon (1999)
2. Deadhouse Gates (not listed in the provided context, but released in 2000)
3. Memories of Ice (2001)
4. House of Chains (2002)
5. Midnight Tides (2004)
6. The Bonehunters (2006)
7. Reaper's Gale (2007)
8. Toll the Hounds (2008)
9. Dust of Dreams (2010)
10. The Crippled God (2010)

Note: This list only includes the main books in the series and does not include any companion novels, short stories, or other works by Steven Erikson.


# Retrieval evaluation

## Data generation

In [1]:
import pandas as pd
df = pd.read_json('data/data.json')
documents = df.to_dict(orient='records')

In [2]:
documents

[{'id': 0,
  'book_name': 'Dune',
  'author_name': 'Frank Herbert',
  'series_name': 'Dune Chronicles',
  'series_position': 1,
  'subgenres': 'Space Opera; Political SF',
  'themes': 'Ecology; Religion; Politics; Resource Control; Prophecy',
  'summary': 'On the desert planet Arrakis, control over the spice melange triggers a clash of prophecy, politics, and power.',
  'page_count': 608,
  'publication_year': 1965,
  'publisher': 'Chilton Books',
  'target_audience': 'Adult',
  'pacing': 'Slow-burn; Character-driven',
  'tone': 'Philosophical; Epic; Political',
  'writing_style': 'Dense; Descriptive; Multi-layered',
  'setting_type': 'Desert planet; Feudal space empire',
  'technology_focus': 'Mentats; Spice-based space travel; No AI',
 {'id': 1,
  'book_name': 'Neuromancer',
  'author_name': 'William Gibson',
  'series_name': 'Sprawl Trilogy',
  'series_position': 1,
  'subgenres': 'Cyberpunk',
  'themes': 'Artificial Intelligence; Corporate Control; Virtual Reality; Identity; Techno

In [3]:
import json

In [4]:
prompt_template = """
You are creating evaluation questions for a book metadata database.

You will receive ONE record with the fields:
book_name, author_name, series_name, series_position, subgenres, themes, summary,
page_count, publication_year, publisher, target_audience, pacing, tone, writing_style,
setting_type, technology_focus, content_warnings.

TASK:
- Write 5 questions that can be answered **solely** from THIS record.
- DO NOT ask about plot events, character actions, endings, or trivia not present.
- Keep questions specific to this book but metadata-focused.
- Questions must be complete and not too short.

The record:

book_name: {book_name}
author_name: {author_name}
series_name: {series_name}
series_position: {series_position}
subgenres: {subgenres}
themes: {themes}
summary: {summary}
page_count: {page_count}
publication_year: {publication_year}
publisher: {publisher}
target_audience: {target_audience}
pacing: {pacing}
tone: {tone}
writing_style: {writing_style}
setting_type: {setting_type}
technology_focus: {technology_focus}
content_warnings: {content_warnings}

Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question5"]}}

Make sure they are completely CONSISTENT with each other in format.

""".strip()

#### Test that the output makes sense

In [70]:
prompt_test = prompt_template.format(**documents[0])

In [71]:
def llm(prompt):
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', 
        messages=[ {'role': 'user','content': prompt}]
        )
    
    return response.message.content

In [72]:
questions = llm(prompt_test)

In [73]:
import json
json.loads(questions)

{'questions': ["What is the publication year of the book 'Dune'?",
  "Who is the author of the book series 'Dune Chronicles'?",
  "What is the target audience for the book 'Dune'?",
  "How many pages are in the book 'Dune'?",
  "What subgenres does the book 'Dune' belong to?"]}

#### Generate all

In [5]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', # llama3.2:latest, deepseek-r1, qwen3:8b
        messages=[ {'role': 'user','content': prompt}]
        )
    
    json_response = response.message.content
    return json_response

In [6]:
from tqdm.auto import tqdm

In [7]:
results = {}

In [8]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/192 [00:00<?, ?it/s]

In [9]:
# Outputs the very last one
questions

'{\n  "questions": [\n    "What is the publication year of \'The Crippled God\'?",\n    "Which publisher released \'The Crippled God\' in the UK and the US?",\n    "How many pages does the book \'The Crippled God\' have?",\n    "In what year was \'The Crippled God\' published?",\n    "For whom is the target audience of \'The Crippled God\' designated?"\n  ]\n}'

In [10]:
# Full results
results

{0: '{\n  "questions": [\n    "What is the publication year of the book Dune?",\n    "In what genre does the book Dune primarily belong?",\n    "Who is the author of the book Dune?",\n    "How many pages are in the hardcover edition of the book Dune?",\n    "Which audience is the target demographic for the book Dune?"\n  ]\n}',
 1: '{\n    "questions": [\n        {"type": "Metadata", "text": "What is the title of the book in question?", "answerKey": "Neuromancer"},\n        {"type": "Metadata", "text": "Who wrote this book?", "answerKey": "William Gibson"},\n        {"type": "Metadata", "text": "In what series does Neuromancer belong and where is it positioned within that series?", "answerKey": "Sprawl Trilogy, 1st position"},\n        {"type": "Metadata", "text": "What genres or subgenres categorize this book?", "answerKey": "Cyberpunk"},\n        {"type": "Metadata", "text": "How many pages does the book contain?", "answerKey": "271"}\n    ]\n}',
 2: '{"questions": [\n    "What is th

#### The results are of course not fully consistent + the following gives errors.

In [None]:
'''final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

final_results[0]

df_results = pd.DataFrame(final_results, columns=['id', 'question'])
df_results.to_csv('data/ground-truth-retrieval.csv', index=False)
'''

### ChatGPT cleaning script to fix the above and give a correct output.

In [14]:
import json, re, ast
import pandas as pd

def parse_any(text_or_obj):
    """Return a Python object from possibly-messy LLM output."""
    x = text_or_obj
    if isinstance(x, (list, dict)):
        return x
    if not isinstance(x, str):
        return None

    s = x.strip()
    # Strip ```json fences
    s = re.sub(r"^```(?:json)?\s*|\s*```$", "", s, flags=re.IGNORECASE|re.MULTILINE)
    # Normalize quotes
    s = s.replace("“","\"").replace("”","\"").replace("’","'").replace("‘","'")

    # Try direct JSON
    try:
        return json.loads(s)
    except Exception:
        pass

    # Try to extract the first [...] array
    m = re.search(r"\[(?:.|\n)*\]", s)
    if m:
        arr = m.group(0)
        try:
            return json.loads(arr)
        except Exception:
            try:
                return ast.literal_eval(arr)
            except Exception:
                pass

    # As a last resort, return the raw string
    return s

def normalize_questions(obj):
    """Always return List[str] of questions."""
    if obj is None: return []

    # Already a list
    if isinstance(obj, list):
        out = []
        for item in obj:
            if isinstance(item, str):
                out.append(item.strip())
            elif isinstance(item, dict) and "question" in item:
                out.append(str(item["question"]).strip())
        return [q for q in out if q]

    # Dict forms
    if isinstance(obj, dict):
        if "questions" in obj and isinstance(obj["questions"], list):
            return normalize_questions(obj["questions"])
        # keys like question1, question2, ...
        numbered = [obj[k] for k in sorted(obj) if k.lower().startswith("question")]
        if numbered:
            return [str(v).strip() if not isinstance(v, dict) else str(v.get("question","")).strip()
                    for v in numbered if v]
        # look for nested list
        for v in obj.values():
            n = normalize_questions(v)
            if n: return n
        return []

    # Raw string: split by newlines/bullets if needed
    if isinstance(obj, str):
        # Try to parse again as JSON array if it looks like one
        if obj.strip().startswith("["):
            try:
                return normalize_questions(json.loads(obj))
            except Exception:
                pass
        # Fallback: naive split (keeps only question-like lines)
        lines = [ln.strip(" -•\t") for ln in obj.splitlines()]
        return [ln for ln in lines if ln.endswith("?") and len(ln) > 3]

    return []

# Build a tidy dataframe
rows = []
for doc_id, raw in results.items():
    parsed = parse_any(raw)
    qs = normalize_questions(parsed)
    for q in qs:
        rows.append((doc_id, q))

df_results = pd.DataFrame(rows, columns=["id", "question"])

# Optional cleaning
df_results["question"] = (
    df_results["question"]
      .astype(str)
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)

# Drop empties/very short entries
df_results = df_results[df_results["question"].str.len() > 5]

# Save safely (handles commas/newlines with proper quoting)
df_results.to_csv("data/ground-truth-retrieval.csv", index=False)


## Evaluation

### Generate embeddings

In [1]:
import pandas as pd
df_question = pd.read_csv('data/ground-truth-retrieval.csv')

In [2]:
df_question.head()

Unnamed: 0,id,question
0,0,What is the publication year of the book Dune?
1,0,In what genre does the book Dune primarily bel...
2,0,Who is the author of the book Dune?
3,0,How many pages are in the hardcover edition of...
4,0,Which audience is the target demographic for t...


In [3]:
ground_truth = df_question.to_dict(orient='records')

In [4]:
ground_truth[0]

{'id': 0, 'question': 'What is the publication year of the book Dune?'}

In [5]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [6]:
from tqdm.auto import tqdm

vectors = []

for doc in tqdm(ground_truth):
    question = doc['question']
    vector = model.encode(question)
    vectors.append(vector)

  0%|          | 0/838 [00:00<?, ?it/s]

In [7]:
import numpy as np

vectors = np.array(vectors)

In [8]:
from minsearch import VectorSearch

vindex = VectorSearch(keyword_fields=['id'])
vindex.fit(vectors, ground_truth)

<minsearch.vector.VectorSearch at 0x1f686019e20>

### Retrieval evaluation

In [9]:
def minsearch_vector_search(vector, question):
    return vindex.search(
        vector,
        filter_dict={'question': question},
        num_results=5
    )

def question_text_vector(q):
    question = q['question']
    v_q = model.encode(question)

    return minsearch_vector_search(v_q, question)

In [10]:
question_text_vector(dict(
    question='What themes does the Dune series follow?'
))

[{'id': 0, 'question': 'In what genre does the book Dune primarily belong?'},
 {'id': 0, 'question': 'Who is the author of the book Dune?'},
 {'id': 0,
  'question': 'Which audience is the target demographic for the book Dune?'},
 {'id': 0, 'question': 'What is the publication year of the book Dune?'},
 {'id': 112,
  'question': 'What are the primary themes associated with this book?'}]

In [11]:
prompt_template = """
You are an assistant for helping people decide on which fantasy and sci-fi books to read. 
Answer the QUESTION based on the CONTEXT (book metadata from the provided database).
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}
CONTEXT: {context}
""".strip()
    

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [12]:
def llm(prompt):
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', 
        messages=[ {'role': 'user','content': prompt}]
        )
    
    return response.message.content

In [13]:
def rag(query: dict) -> str:
    search_results = question_text_vector(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt)
    return answer

In [14]:
ground_truth[20]

{'id': 5,
 'question': "What is the publication year of the book 'A Game of Thrones'?"}

In [15]:
rag(ground_truth[20])

'The publication year of the book "A Game of Thrones" by George R.R. Martin is 1996.'

#### I don't have original "reference answers" to these questions readily available in order to use a cosine similarity metric, and as such we move on to LLM-as-a-Judge evaluation immediately.

### LLM-as-a-Judge

In [None]:
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [31]:
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Classify the generated answer's relevance to the given question as one of:
"NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

QUESTION:
{question}

GENERATED_ANSWER:
{answer_llm}

Reply ONLY with a single JSON object (no markdown, no explanation outside JSON), exactly:
{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "a brief reason"
}}
""".strip()

#### Test

In [17]:
len(ground_truth)

838

In [18]:
record = ground_truth[0]
record

{'id': 0, 'question': 'What is the publication year of the book Dune?'}

In [19]:
answer_llm = rag(record)
print(answer_llm)

The publication year of the book "Dune" by Frank Herbert is 1965.


In [21]:
prompt = prompt2_template.format(
    question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: For whom is the target audience of 'The Crippled God' designated?
Generated Answer: The publication year of the book "Dune" by Frank Herbert is 1965.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


#### Actual

In [32]:
import json

In [None]:
evaluations = {}

for record in tqdm(ground_truth):
    id = record['id']
    question = record["question"]   

    if id in evaluations:
        continue

    answer_llm = rag(record) 

    prompt = prompt2_template.format(
        question=question, 
        answer_llm=answer_llm
        )
    
    evaluation = llm(prompt)
    eval_json = json.loads(evaluation)  # parse output   
    evaluations[id] = {
        "id": id,
        "question": question,
        "answer_llm": answer_llm,
        "evaluation": eval_json
    }

#### ChatGPT script for parsing JSON without any LLM-induced errors and incosistencies to avoid crashing the run.

In [36]:
import re

# --- robust parser for "JSON-ish" model outputs ---
def parse_judge_json(text: str):
    raw = text.strip()

    # strip code fences if any
    if raw.startswith("```"):
        raw = raw.strip("`")
        raw = re.sub(r"^\s*json\s*", "", raw, flags=re.IGNORECASE)

    # extract first {...} block if there's surrounding prose
    start = raw.find("{")
    end = raw.rfind("}")
    candidate = raw[start:end+1] if (start != -1 and end != -1 and end > start) else raw

    # normalize common issues
    sane = (candidate
            .replace("\u201c", '"').replace("\u201d", '"')   # smart quotes
            .replace("\u2018", "'").replace("\u2019", "'")
            .replace("True", "true").replace("False", "false").replace("None", "null"))

    # remove trailing commas before } or ]
    sane = re.sub(r",\s*([}\]])", r"\1", sane)

    # if it used single quotes for keys/strings, try swapping
    if "'" in sane and '"' not in sane[:80]:
        sane = sane.replace("'", '"')

    try:
        obj = json.loads(sane)
        # minimal validation
        if obj.get("Relevance") not in {"NON_RELEVANT", "PARTLY_RELEVANT", "RELEVANT"}:
            raise ValueError("bad enum")
        return obj
    except Exception:
        # fallback: keep raw text but don't crash the run
        return {"Relevance": None, "Explanation": raw[:1000]}

In [None]:
evaluations = {}   # dict keyed by id
seen = set()
ckpt_path = "evaluations_checkpoint.json"
save_every = 25

for i, record in enumerate(tqdm(ground_truth)):
    rid = record["id"]
    if rid in seen:
        continue

    question = record["question"]
    answer_llm = rag(record)

    prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
    judge_text = llm(prompt)  # set temperature=0 on your client if possible

    eval_json = parse_judge_json(judge_text)

    evaluations[rid] = {
        "id": rid,
        "question": question,
        "answer_llm": answer_llm,
        "evaluation": eval_json
    }
    seen.add(rid)

    # checkpoint periodically so one bad item doesn’t waste the run
    if (i + 1) % save_every == 0:
        with open(ckpt_path, "w", encoding="utf-8") as f:
            json.dump([evaluations[k] for k in sorted(evaluations)], f, ensure_ascii=False, indent=2)

# final save
with open("evaluations.json", "w", encoding="utf-8") as f:
    json.dump([evaluations[k] for k in sorted(evaluations)], f, ensure_ascii=False, indent=2)

  0%|          | 0/838 [00:00<?, ?it/s]

In [38]:
evaluations

{0: {'id': 0,
  'question': 'What is the publication year of the book Dune?',
  'answer_llm': 'The context does not provide information about a specific book titled "Dune". However, it does mention that there are multiple books and adaptations associated with the name "Dune".\n\nIf you\'d like to know the publication years for different titles related to the "Dune" franchise, I can help with that.',
  'evaluation': {'Relevance': 'NON_RELEVANT',
   'Explanation': "Does not provide relevant publication year for a specific 'Dune' book title"}},
 2: {'id': 2,
  'question': "What is the page count of the book 'The Left Hand of Darkness'?",
  'answer_llm': 'According to the metadata, "The Left Hand of Darkness" by Ursula K. Le Guin has 272 pages.',
  'evaluation': {'Relevance': 'RELEVANT',
   'Explanation': 'The answer directly addresses the question about page count.'}},
 3: {'id': 3,
  'question': 'What is the publication year of the book "Foundation"?',
  'answer_llm': 'The publication ye

In [46]:
# Load JSON
with open("evaluations.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)

# Convert to DataFrame
df_eval = pd.DataFrame(eval_data)

# Flatten the nested "evaluation" dict into separate columns
df_eval["relevance"]   = df_eval["evaluation"].apply(lambda d: d.get("Relevance") if isinstance(d, dict) else None)
df_eval["explanation"] = df_eval["evaluation"].apply(lambda d: d.get("Explanation") if isinstance(d, dict) else None)

# Drop original nested column if you want
df_eval = df_eval.drop(columns=["evaluation"])

# Inspect
print(df_eval.head())

   id                                           question  \
0   0     What is the publication year of the book Dune?   
1   2  What is the page count of the book 'The Left H...   
2   3  What is the publication year of the book "Foun...   
3   4  What is the publication year of the book 'The ...   
4   5  What is the publication year of the book 'A Ga...   

                                          answer_llm        relevance  \
0  The context does not provide information about...     NON_RELEVANT   
1  According to the metadata, "The Left Hand of D...         RELEVANT   
2  The publication year of the book "Foundation" ...             None   
3  The publication year of "The Hobbit" by J.R.R....  PARTLY_RELEVANT   
4  The publication year of the book 'A Game of Th...     NON_RELEVANT   

                                         explanation  
0  Does not provide relevant publication year for...  
1  The answer directly addresses the question abo...  
2  {\n  "Relevance": "PARTLY_RELEVA

In [47]:
# Distribution of relevance labels
print(df_eval["relevance"].value_counts(normalize=True))

# Look at some non-relevant cases
print(df_eval[df_eval["relevance"] == "NON_RELEVANT"].sample(5))


relevance
NON_RELEVANT       0.503497
PARTLY_RELEVANT    0.370629
RELEVANT           0.125874
Name: proportion, dtype: float64
      id                                           question  \
44    47  What is the position of this book within its s...   
32    35  What are the subgenres associated with this book?   
123  146  What is the page count of 'Parable of the Tale...   
70    80    What is the page count of the book 'Insurgent'?   
39    42     What is the title of the book being evaluated?   

                                            answer_llm     relevance  \
44   I'm ready to help. Please go ahead and provide...  NON_RELEVANT   
32   I'm ready to help. Please provide the book met...  NON_RELEVANT   
123  The page count for "Parable of the Talents" by...  NON_RELEVANT   
70   I couldn't find any information about the page...  NON_RELEVANT   
39   I'm ready to help. Please go ahead and provide...  NON_RELEVANT   

                                           explanation  
44  

Due to using a free and open-source LLM-as-a-judge (llama3.2 here), we find that half of the answers are not relevant to the questions posed by the user. 

The judge is also sometimes being too conservative and marking clearly relevant answers as 'PARTLY RELEVANT'.

In [49]:
df_eval.to_csv('data/evaluations.csv', index=False)

## Monitoring

## Interface