# Ingestion pipeline

In [30]:
import pandas as pd

# Load raw LLM-generated data after much cleaning
df = pd.read_json("data/data.json")

In [31]:
print(df.columns)

Index(['id', 'book_name', 'author_name', 'series_name', 'series_position',
       'subgenres', 'themes', 'summary', 'page_count', 'publication_year',
       'publisher', 'target_audience', 'pacing', 'tone', 'writing_style',
      dtype='object')


In [32]:
df

Unnamed: 0,id,book_name,author_name,series_name,series_position,subgenres,themes,summary,page_count,publication_year,publisher,target_audience,pacing,tone,writing_style,setting_type,technology_focus,content_warnings
0,0,Dune,Frank Herbert,Dune Chronicles,1,Space Opera; Political SF,Ecology; Religion; Politics; Resource Control;...,"On the desert planet Arrakis, control over the...",608,1965,Chilton Books,Adult,Slow-burn; Character-driven,Philosophical; Epic; Political,Dense; Descriptive; Multi-layered,Desert planet; Feudal space empire,Mentats; Spice-based space travel; No AI,Violence; Political intrigue
1,1,Neuromancer,William Gibson,Sprawl Trilogy,1,Cyberpunk,Artificial Intelligence; Corporate Control; Vi...,A washed-out hacker is drawn into a dangerous ...,271,1984,Ace Books,Adult,Fast-paced; Action-driven,Dark; Gritty; Noir,Dense; Technical; Stylized,Near-future Earth; Cyberspace,AI; Cyberspace; Neural interfaces; Body modifi...,Violence; Drug use; Mature themes
2,2,The Left Hand of Darkness,Ursula K. Le Guin,Hainish Cycle,4,Social SF; Planetary SF,Gender; Politics; Cultural Difference; Loyalty...,An envoy to the icy world of Winter must navig...,336,1969,Ace Books,Adult,Slow-burn; Character-driven,Philosophical; Contemplative; Political,Literary; Descriptive; Thoughtful,Ice planet; Feudal societies,Minimal technology; Ansible communication,Political intrigue
3,3,Foundation,Isaac Asimov,Foundation Series,1,Social SF; Space Opera,Psychohistory; Empire; Civilization; Knowledge...,"As a galactic empire declines, a Foundation is...",255,1951,Gnome Press,Adult,Moderate; Idea-driven,Intellectual; Epic; Optimistic,Clear; Dialogue-heavy; Concept-focused,Galactic empire; Multiple worlds,Atomic power; Space travel; Psychohistory,
4,4,The Hobbit,J.R.R. Tolkien,Middle-earth,0,High Fantasy; Adventure,Heroism; Greed; Home; Adventure; Personal Growth,Bilbo Baggins sets out on a quest with dwarves...,310,1937,George Allen & Unwin,Young Adult; Adult,Moderate; Adventure-driven,Whimsical; Adventurous; Light,Accessible; Descriptive; Fairy-tale-like,Middle-earth; Fantasy world,Pre-industrial; Magic items,Fantasy violence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,187,The Bonehunters,Steven Erikson,Malazan Book of the Fallen,6,Epic fantasy; Military fantasy,Survival; Betrayal; Duty; Endurance,The Malazan armies push deeper into enemy terr...,1232,2006,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate; tension rising,Dark; Gritty; Heroic undertone,Third-person; expansive POV,"Battlefields, cities, warrens, besieged zones",Magic; divine interference; war tech,Violence; Death; Disease
188,188,Reaper’s Gale,Steven Erikson,Malazan Book of the Fallen,7,Epic fantasy; Cataclysmic fantasy,Collapse; Rebirth; Sacrifice; Cosmic stakes,"As gods, armies, and ancient machinations conv...",1280,2007,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate to fast; multiple climaxes,Epic; Tense; Dramatic,Third-person multiple POV,"Continents, warrens, divine realms",Magic; divine-mortal interaction,Violence; Death; Cosmic peril
189,189,Toll the Hounds,Steven Erikson,Malazan Book of the Fallen,8,Epic fantasy; Dark fantasy,Memory; Regret; Fate; Redemption,Amid political and spiritual upheavals in Daru...,1296,2008,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate; deep tension,Somber; Reflective; Dark,Third-person multiple POV,"City, warrens, mythic zones",Magic; prophecy; warrens,Violence; Vengeance; Loss
190,190,Dust of Dreams,Steven Erikson,Malazan Book of the Fallen,9,Epic fantasy; Endgame fantasy,Perseverance; Fate; Transition; Consequence,As the Malazan forces march eastward through t...,1280,2010,Bantam (UK); Tor (US),Adult / epic fantasy readers,Moderate; heavy with foreshadowing,Ominous; Reflective; Grim,Third-person multiple POV,"Wastelands, borderlands, rising menace",Magic; ancient weapons; prophecy,Violence; Death; Despair


In [33]:
# Text fields minsearch will vectorize:
text_fields = [
    "book_name","author_name","series_name","subgenres","themes","summary",
    "publisher","target_audience","pacing","tone","writing_style",
    "setting_type","technology_focus","content_warnings"
]

In [34]:
print(df[text_fields].isna().sum().sort_values(ascending=False))

book_name           0
author_name         0
series_name         0
subgenres           0
themes              0
summary             0
publisher           0
target_audience     0
pacing              0
tone                0
writing_style       0
setting_type        0
technology_focus    0
dtype: int64


In [37]:
documents = df.to_dict(orient="records")

In [38]:
import minsearch

# Create and fit the index with enriched metadata fields
index = minsearch.Index(
    text_fields=[
        'book_name',
        'author_name',
        'series_name',
        'subgenres',
        'themes',
        'summary',
        'publisher',
        'target_audience',
        'pacing',
        'tone',
        'writing_style',
        'setting_type',
        'technology_focus',
        'content_warnings'
    ],
    keyword_fields=[
        'id',
        'series_position',
        'page_count',
        'publication_year'
    ]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x217d1651190>

In [45]:
query = "Which books are written by Robert Jordan?"
results = index.search(query, num_results=5)

In [46]:
results

[{'id': 93,
  'book_name': 'The Eye of the World',
  'author_name': 'Robert Jordan',
  'series_name': 'The Wheel of Time',
  'series_position': 1,
  'subgenres': 'High fantasy; Epic fantasy',
  'themes': 'Prophecy; Coming of age; Light vs Shadow; Fellowship',
  'summary': 'Villagers are driven from their home by dark forces and join Moiraine and Lan on a journey that reveals a prophesied hero and a world-spanning conflict.',
  'page_count': 782,
  'publication_year': 1990,
  'publisher': 'Tor Books (US); Orbit (UK)',
  'target_audience': 'Adult / epic fantasy readers',
  'pacing': 'Moderate; quest-driven with dense worldbuilding',
  'tone': 'Mythic; Adventurous; Foreboding',
  'writing_style': 'Third-person limited; multiple POVs',
  'setting_type': 'Broad secondary world across nations and wilds',
  'technology_focus': 'The One Power; Artifacts; Ancient lore',
 {'id': 103,
  'book_name': 'Knife of Dreams',
  'author_name': 'Robert Jordan',
  'series_name': 'The Wheel of Time',
  'seri

# Retrieval (RAG) Flow

In [10]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [None]:
prompt_template = """
You are an assistant for helping people decide on which fantasy and sci-fi books to read. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: {context}
""".strip()
    
entry_template = """
book_name: {book_name}
author_name: {author_name}
subgenres: {subgenres}
summary: {summary}
rating: {rating}
release_date: {release_date}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [12]:
def llm(prompt):
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', 
        messages=[ {'role': 'user','content': prompt}]
        )
    
    return response.message.content

In [13]:
query = 'What is the release order of Steven Erikson books?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [14]:
answer = rag(query)
print(answer)

To determine the release order of Steven Erikson books, we need to look at the release dates. 

In that case, here are the release dates in chronological order:

1. Gardens of the Moon - 1999
2. Memories of Ice - 2001
3. House of Chains - 2002
4. Deadhouse Gates - 2000
5. Midnight Tides - 2004
6. Reaper's Gale - 2007
7. Toll the Hounds - 2008
8. Dust of Dreams - 2009
9. The Bonehunters - 2006 (out of order in initial context, placed correctly here)
10. The Crippled God - 2011


# Retrieval evaluation

## Data generation

In [74]:
import pandas as pd
df = pd.read_csv('data/data.csv')
df.insert(0, 'id', df.index)
documents = df.to_dict(orient='records')

In [75]:
documents

[{'id': 0,
  'book_name': 'Dune',
  'author_name': 'Frank Herbert',
  'subgenres': 'Space Opera; Political SF',
  'summary': 'On the desert planet Arrakis, control over the spice melange triggers a clash of prophecy, politics, and power.',
  'rating': 'Four and a half out of Five',
  'release_date': 1965},
 {'id': 1,
  'book_name': 'Neuromancer',
  'author_name': 'William Gibson',
  'subgenres': 'Cyberpunk',
  'summary': 'A washed-out hacker is drawn into a dangerous AI heist across cyberspace and corporate intrigue.',
  'rating': 'Four out of Five',
  'release_date': 1984},
 {'id': 2,
  'book_name': 'The Left Hand of Darkness',
  'author_name': 'Ursula K. Le Guin',
  'subgenres': 'Social SF; Planetary SF',
  'summary': 'An envoy to the icy world of Winter must navigate a culture without fixed gender amid political tension.',
  'rating': 'Four out of Five',
  'release_date': 1969},
 {'id': 3,
  'book_name': 'Foundation',
  'author_name': 'Isaac Asimov',
  'subgenres': 'Social SF; Space

In [76]:
import json

In [None]:
prompt_template = """
You emulate a book reader who's excited to learn about popular fantasy and sci-fi books.
Formulate 5 specific questions this reader might ask based on a provided book. 
Make the questions specific to this book.
The record should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as few words as possible from the record. 

The record:

book_name: {book_name}
author_name: {author_name}
subgenres: {subgenres}
summary: {summary}
rating: {rating}
release_date: {release_date}

Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question5"]}}

Make sure they are completely CONSISTENT with each other in format.

""".strip()

#### Test that the output makes sense

In [94]:
prompt_test = prompt_template.format(**documents[0])

In [95]:
def llm(prompt):
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', 
        messages=[ {'role': 'user','content': prompt}]
        )
    
    return response.message.content

In [96]:
questions = llm(prompt_test)

In [97]:
import json
json.loads(questions)

JSONDecodeError: Expecting ',' delimiter: line 6 column 130 (char 565)

#### Generate all

In [34]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', # llama3.2:latest, deepseek-r1, qwen3:8b
        messages=[ {'role': 'user','content': prompt}]
        )
    
    json_response = response.message.content
    return json_response

In [35]:
from tqdm.auto import tqdm

In [36]:
results = {}

In [37]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/193 [00:00<?, ?it/s]

In [None]:
# Outputs the very last one
questions

'{"questions": \n["What is the Malazan saga\'s conclusion like in The Crippled God?", \n"What role does sacrifice play in the story of The Crippled God?", \n"How does the book\'s release date impact its reception by readers?", \n"In what way does the cosmic scale affect the plot of The Crippled God?", \n"Why does the author, Steven Erikson, receive a rating of Five out of Five for this novel?"]}'

In [12]:
# Full results
results

{0: '{\n    "questions": [\n        "What is the significance of the spice melange in the novel, and how does it impact the plot?",\n        "Can you elaborate on the character of Paul Atreides and his role in the story, including his prescience abilities?",\n        "How does Frank Herbert\'s depiction of politics and power struggle on Arrakis reflect real-world issues or inspirations?",\n        "What is the nature of the Fremen people, and how do they relate to the Atreides family and the overall narrative?",\n        "In what ways does the novel Dune explore themes of ecology, environmentalism, and the consequences of human actions?"\n    ]\n}',
 1: '{\n  "questions": [\n    "What is the primary occupation of the protagonist, Case, at the beginning of Neuromancer?",\n    "How does the character of Molly use her abilities to aid Case throughout the story?",\n    "What is the purpose of the \'Wintermute\' AI in relation to the heist across cyberspace?",\n    "In what way do the corpo

### The results are of course not fully consistent + the following gives errors.

In [None]:
"""final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))"""

In [None]:
"""final_results[0]"""

(0, '{')

In [None]:
"""df_results = pd.DataFrame(final_results, columns=['id', 'question'])
df_results.to_csv('data/ground-truth-retrieval.csv', index=False)"""

### ChatGPT cleaning script to fix the above and give a correct output.

In [70]:
import json, re, ast
import pandas as pd

def parse_any(text_or_obj):
    """Return a Python object from possibly-messy LLM output."""
    x = text_or_obj
    if isinstance(x, (list, dict)):
        return x
    if not isinstance(x, str):
        return None

    s = x.strip()
    # Strip ```json fences
    s = re.sub(r"^```(?:json)?\s*|\s*```$", "", s, flags=re.IGNORECASE|re.MULTILINE)
    # Normalize quotes
    s = s.replace("“","\"").replace("”","\"").replace("’","'").replace("‘","'")

    # Try direct JSON
    try:
        return json.loads(s)
    except Exception:
        pass

    # Try to extract the first [...] array
    m = re.search(r"\[(?:.|\n)*\]", s)
    if m:
        arr = m.group(0)
        try:
            return json.loads(arr)
        except Exception:
            try:
                return ast.literal_eval(arr)
            except Exception:
                pass

    # As a last resort, return the raw string
    return s

def normalize_questions(obj):
    """Always return List[str] of questions."""
    if obj is None: return []

    # Already a list
    if isinstance(obj, list):
        out = []
        for item in obj:
            if isinstance(item, str):
                out.append(item.strip())
            elif isinstance(item, dict) and "question" in item:
                out.append(str(item["question"]).strip())
        return [q for q in out if q]

    # Dict forms
    if isinstance(obj, dict):
        if "questions" in obj and isinstance(obj["questions"], list):
            return normalize_questions(obj["questions"])
        # keys like question1, question2, ...
        numbered = [obj[k] for k in sorted(obj) if k.lower().startswith("question")]
        if numbered:
            return [str(v).strip() if not isinstance(v, dict) else str(v.get("question","")).strip()
                    for v in numbered if v]
        # look for nested list
        for v in obj.values():
            n = normalize_questions(v)
            if n: return n
        return []

    # Raw string: split by newlines/bullets if needed
    if isinstance(obj, str):
        # Try to parse again as JSON array if it looks like one
        if obj.strip().startswith("["):
            try:
                return normalize_questions(json.loads(obj))
            except Exception:
                pass
        # Fallback: naive split (keeps only question-like lines)
        lines = [ln.strip(" -•\t") for ln in obj.splitlines()]
        return [ln for ln in lines if ln.endswith("?") and len(ln) > 3]

    return []

# Build a tidy dataframe
rows = []
for doc_id, raw in results.items():
    parsed = parse_any(raw)
    qs = normalize_questions(parsed)
    for q in qs:
        rows.append((doc_id, q))

df_results = pd.DataFrame(rows, columns=["id", "question"])

# Optional cleaning
df_results["question"] = (
    df_results["question"]
      .astype(str)
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)

# Drop empties/very short entries
df_results = df_results[df_results["question"].str.len() > 5]

# Save safely (handles commas/newlines with proper quoting)
df_results.to_csv("data/ground-truth-retrieval.csv", index=False)


## Evaluation

### Generate embeddings

In [1]:
import pandas as pd
df_question = pd.read_csv('data/ground-truth-retrieval.csv')

In [2]:
df_question.head()

Unnamed: 0,id,question
0,0,What is the significance of the spice melange ...
1,0,Can you elaborate on the character of Paul Atr...
2,0,How does Frank Herbert's depiction of politics...
3,0,"What is the nature of the Fremen people, and h..."
4,0,In what ways does the novel Dune explore theme...


In [3]:
ground_truth = df_question.to_dict(orient='records')

In [None]:
ground_truth[0]

{'id': 0,
 'question': 'What is the significance of the spice melange in the novel, and how does it impact the plot?'}

In [7]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [21]:
from tqdm.auto import tqdm

vectors = []

for doc in tqdm(ground_truth):
    question = doc['question']
    vector = model.encode(question)
    vectors.append(vector)

100%|██████████| 771/771 [00:10<00:00, 74.07it/s]


In [22]:
import numpy as np

vectors = np.array(vectors)

In [24]:
from minsearch import VectorSearch

vindex = VectorSearch(keyword_fields=['id'])
vindex.fit(vectors, ground_truth)

<minsearch.vector.VectorSearch at 0x1efc99cfcb0>

### Retrieval evaluation

In [25]:
def minsearch_vector_search(vector, question):
    return vindex.search(
        vector,
        filter_dict={'question': question},
        num_results=5
    )

def question_text_vector(q):
    question = q['question']
    v_q = model.encode(question)

    return minsearch_vector_search(v_q, question)

In [40]:
question_text_vector(dict(
    question='Which fantasy or sci-fi books in the list have to do with romance?'
))

[{'id': 117,
  'question': 'What themes or moral lessons do you think are most prominent in this epic fantasy novel?'},
 {'id': 8,
  'question': "Can you describe the subgenre of Science Fantasy and its influence on the book's plot?"},
 {'id': 142,
  'question': 'How does the science fantasy element contribute to the Gothic atmosphere in the book?'},
 {'id': 151,
  'question': 'What commentary does the author offer through the character of the hybrid offspring regarding societal norms and expectations of romantic relationships?'},
 {'id': 29,
  'question': 'Why is the novel classified as a Gothic SF, and what elements contribute to this classification?'}]

In [54]:
prompt_template = """
You are an assistant for helping people decide on which fantasy and sci-fi books to read. 
Answer the QUESTION based on the CONTEXT from the provided database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}
CONTEXT: {context}
""".strip()
    

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [55]:
def llm(prompt):
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        
        model='llama3.2:latest', 
        messages=[ {'role': 'user','content': prompt}]
        )
    
    return response.message.content

In [56]:
def rag(query: dict) -> str:
    search_results = question_text_vector(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt)
    return answer

In [69]:
ground_truth[16]

{'id': 4,
 'question': 'Who is the wizard that joins forces with Bilbo and the dwarves to reclaim treasure from the dragon?'}

In [68]:
rag(ground_truth[20])

"I'm ready to help. However, I don't see a CONTEXT provided. Please share the context about A Game of Thrones, including the noble houses, and I'll be happy to answer your question based on that information."

In [67]:
df.iloc[20]['summary'] 

'The Mars trilogy concludes with political struggles, terraforming triumphs, and humanity’s expansion into space.'

In [None]:
# Cosine similarity metric
answer_orig = 'Yes, sessions are recorded if you miss one. Everything is recorded, allowing you to catch up on any missed content. Additionally, you can ask questions in advance for office hours and have them addressed during the live stream. You can also ask questions in Slack.'
answer_llm = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

v_llm.dot(v_orig)

### LLM-as-a-Judge

## Monitoring

## Interface