# Ingestion pipeline

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/data.csv')

In [3]:
# Add a stable integer `id` column (the row position) as the first column.
# Guarded because DataFrame.insert raises ValueError when the column already
# exists, which made this cell fail when re-run on the same `df`.
if 'id' not in df.columns:
    df.insert(0, 'id', df.index)

In [4]:
df

Unnamed: 0,id,book_name,author_name,subgenres,summary,rating,release_date
0,0,Dune,Frank Herbert,Space Opera; Political SF,"On the desert planet Arrakis, control over the...",Four and a half out of Five,1965
1,1,Neuromancer,William Gibson,Cyberpunk,A washed-out hacker is drawn into a dangerous ...,Four out of Five,1984
2,2,The Left Hand of Darkness,Ursula K. Le Guin,Social SF; Planetary SF,An envoy to the icy world of Winter must navig...,Four out of Five,1969
3,3,Foundation,Isaac Asimov,Social SF; Space Opera,"As a galactic empire declines, a Foundation is...",Four and a half out of Five,1951
4,4,The Hobbit,J.R.R. Tolkien,High Fantasy; Adventure,Bilbo Baggins sets out on a quest with dwarves...,Four and a half out of Five,1937
...,...,...,...,...,...,...,...
188,188,The Bonehunters,Steven Erikson,Epic Fantasy,Armies clash while hidden powers pull strings ...,Four and a half out of Five,2006
189,189,Reaper’s Gale,Steven Erikson,Epic Fantasy,The Malazan saga grows darker as betrayals and...,Four out of Five,2007
190,190,Toll the Hounds,Steven Erikson,Epic Fantasy,Death and fate converge in Darujhistan in a tr...,Four out of Five,2008
191,191,Dust of Dreams,Steven Erikson,Epic Fantasy,The armies march toward the final convergence ...,Four out of Five,2009


In [5]:
documents = df.to_dict(orient='records')

In [6]:
documents

[{'id': 0,
  'book_name': 'Dune',
  'author_name': 'Frank Herbert',
  'subgenres': 'Space Opera; Political SF',
  'summary': 'On the desert planet Arrakis, control over the spice melange triggers a clash of prophecy, politics, and power.',
  'rating': 'Four and a half out of Five',
  'release_date': 1965},
 {'id': 1,
  'book_name': 'Neuromancer',
  'author_name': 'William Gibson',
  'subgenres': 'Cyberpunk',
  'summary': 'A washed-out hacker is drawn into a dangerous AI heist across cyberspace and corporate intrigue.',
  'rating': 'Four out of Five',
  'release_date': 1984},
 {'id': 2,
  'book_name': 'The Left Hand of Darkness',
  'author_name': 'Ursula K. Le Guin',
  'subgenres': 'Social SF; Planetary SF',
  'summary': 'An envoy to the icy world of Winter must navigate a culture without fixed gender amid political tension.',
  'rating': 'Four out of Five',
  'release_date': 1969},
 {'id': 3,
  'book_name': 'Foundation',
  'author_name': 'Isaac Asimov',
  'subgenres': 'Social SF; Space

In [7]:
import minsearch

# Create and fit the index over the book records.
# NOTE(review): presumably `text_fields` are matched against the free-text query
# and `keyword_fields` are exact-match filter fields — confirm against the
# minsearch documentation. `release_date` is not listed in either group.
index = minsearch.Index(
    text_fields=['book_name', 'author_name', 'subgenres', 'summary', 'rating'],
    keyword_fields=['id']
)
# Left as the last expression of the cell, so the notebook echoes fit()'s return value.
index.fit(documents)

<minsearch.minsearch.Index at 0x22b7eb02660>

In [8]:
query = "Which books are written by Robert Jordan?"
results = index.search(query, num_results=10)

In [9]:
results

[{'id': 94,
  'book_name': 'The Eye of the World',
  'author_name': 'Robert Jordan',
  'subgenres': 'Epic Fantasy',
  'summary': 'Rand al’Thor and his friends flee their village and are drawn into a world-spanning struggle with the Dark One.',
  'rating': 'Four out of Five',
  'release_date': 1990},
 {'id': 95,
  'book_name': 'The Great Hunt',
  'author_name': 'Robert Jordan',
  'subgenres': 'Epic Fantasy',
  'summary': 'Rand and his companions pursue the Horn of Valere, while destiny tightens its grip.',
  'rating': 'Four and a half out of Five',
  'release_date': 1990},
 {'id': 96,
  'book_name': 'The Dragon Reborn',
  'author_name': 'Robert Jordan',
  'subgenres': 'Epic Fantasy',
  'summary': 'Prophecy takes shape as Rand embraces his fate as the Dragon Reborn.',
  'rating': 'Four and a half out of Five',
  'release_date': 1991},
 {'id': 101,
  'book_name': 'The Path of Daggers',
  'author_name': 'Robert Jordan',
  'subgenres': 'Epic Fantasy',
  'summary': 'Armies clash as prophecy 

# Retrieval (RAG) Flow

In [10]:
def search(query):
    """Look up `query` in the module-level `index` and return its top 10 hits.

    Both the filter and the boost dictionaries are passed empty.
    """
    no_filter = {}
    no_boost = {}
    return index.search(
        query=query,
        filter_dict=no_filter,
        boost_dict=no_boost,
        num_results=10,
    )

In [11]:
# Prompt for answering a user question from retrieved book records.
# It lives under its own name because `prompt_template` is rebound further down
# in this notebook (question-generation section) to a template with different
# placeholders; build_prompt must keep using this one.
# NOTE(review): the wording says "FAQ database" although the context holds book
# records. Left unchanged because it is part of the prompt text sent to the model.
RAG_PROMPT_TEMPLATE = """
You are an assistant for helping people decide on which fantasy and sci-fi books to read. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: {context}
""".strip()

# Backward-compatible alias for code that still reads `prompt_template`.
prompt_template = RAG_PROMPT_TEMPLATE

# One retrieved record rendered as "field: value" lines.
entry_template = """
book_name: {book_name}
author_name: {author_name}
subgenres: {subgenres}
summary: {summary}
rating: {rating}
release_date: {release_date}
""".strip()

def build_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved records.

    Args:
        query: The user's question, substituted for {question}.
        search_results: Iterable of record dicts providing every field used by
            `entry_template`; extra keys (such as `id`) are ignored.

    Returns:
        The filled-in prompt with surrounding whitespace stripped. Entries are
        separated by one blank line.

    Raises:
        KeyError: If a record lacks a field used by `entry_template`.
    """
    entries = [entry_template.format(**doc) for doc in search_results]
    context = "\n\n".join(entries)
    # Uses the dedicated constant rather than the `prompt_template` global, so
    # a later rebinding of that name cannot break prompt construction.
    return RAG_PROMPT_TEMPLATE.format(question=query, context=context).strip()

In [12]:
def llm(prompt, model='llama3.2:latest'):
    """Send `prompt` as a single user message to an Ollama model and return the reply text.

    Args:
        prompt: Full prompt text, sent with role 'user'.
        model: Ollama model tag. Defaults to the model the notebook was using,
            so existing `llm(prompt)` calls behave as before.

    Returns:
        The `content` of the response message.
    """
    # Kept as a function-level import, as in the original cell.
    from ollama import ChatResponse, chat

    response: ChatResponse = chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.message.content

In [13]:
query = 'What is the release order of Steven Erikson books?'

def rag(query):
    """Answer `query` end to end: retrieve records, build the prompt, ask the LLM."""
    hits = search(query)
    return llm(build_prompt(query, hits))

In [14]:
answer = rag(query)
print(answer)

To determine the release order of Steven Erikson books, we need to look at the release dates. 

In that case, here are the release dates in chronological order:

1. Gardens of the Moon - 1999
2. Memories of Ice - 2001
3. House of Chains - 2002
4. Deadhouse Gates - 2000
5. Midnight Tides - 2004
6. Reaper's Gale - 2007
7. Toll the Hounds - 2008
8. Dust of Dreams - 2009
9. The Bonehunters - 2006 (out of order in initial context, placed correctly here)
10. The Crippled God - 2011


# Retrieval evaluation

## Data generation

In [47]:
import pandas as pd
df = pd.read_csv('data/data.csv')
df.insert(0, 'id', df.index)
documents = df.to_dict(orient='records')

In [48]:
documents

[{'id': 0,
  'book_name': 'Dune',
  'author_name': 'Frank Herbert',
  'subgenres': 'Space Opera; Political SF',
  'summary': 'On the desert planet Arrakis, control over the spice melange triggers a clash of prophecy, politics, and power.',
  'rating': 'Four and a half out of Five',
  'release_date': 1965},
 {'id': 1,
  'book_name': 'Neuromancer',
  'author_name': 'William Gibson',
  'subgenres': 'Cyberpunk',
  'summary': 'A washed-out hacker is drawn into a dangerous AI heist across cyberspace and corporate intrigue.',
  'rating': 'Four out of Five',
  'release_date': 1984},
 {'id': 2,
  'book_name': 'The Left Hand of Darkness',
  'author_name': 'Ursula K. Le Guin',
  'subgenres': 'Social SF; Planetary SF',
  'summary': 'An envoy to the icy world of Winter must navigate a culture without fixed gender amid political tension.',
  'rating': 'Four out of Five',
  'release_date': 1969},
 {'id': 3,
  'book_name': 'Foundation',
  'author_name': 'Isaac Asimov',
  'subgenres': 'Social SF; Space

In [49]:
import json

In [50]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [51]:
# Prompt used to generate ground-truth questions for a single book record.
# NOTE: this rebinds the name `prompt_template`, which the RAG-flow section of
# this notebook also assigns, there with the placeholders {question}/{context}.
# The doubled braces around the JSON example are literal braces escaped for str.format.
prompt_template = """
You emulate a book reader who's excited to learn about popular fantasy and sci-fi books.
Formulate 5 specific questions this reader might ask based on a provided book. 
Make the questions specific to this book.
The record should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as few words as possible from the record. 

The record:

book_name: {book_name}
author_name: {author_name}
subgenres: {subgenres}
summary: {summary}
rating: {rating}
release_date: {release_date}

Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question5"]}}

Make sure they are completely CONSISTENT with each other in format.

""".strip()

#### Test that the output makes sense

In [52]:
prompt_test = prompt_template.format(**documents[0])

In [53]:
def llm(prompt, model='llama3.2:latest'):
    """Send `prompt` as a single user message to an Ollama model and return the reply text.

    With the default `model` this behaves like the `llm` helper defined earlier
    in the notebook.

    Args:
        prompt: Full prompt text, sent with role 'user'.
        model: Ollama model tag. Defaults to the model the notebook was using,
            so existing `llm(prompt)` calls behave as before.

    Returns:
        The `content` of the response message.
    """
    # Kept as a function-level import, as in the original cell.
    from ollama import ChatResponse, chat

    response: ChatResponse = chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.message.content

In [None]:
questions = llm(prompt_test)

In [46]:
import json
json.loads(questions)

{'questions': ["What is the significance of the spice melange to the plot of Dune, and how does it contribute to the novel's themes of power and control?",
  "How does Paul Atreides' prescience and Bene Gesserit training influence his actions throughout the novel, particularly in relation to his role as the leader of the Fremen?",
  'What is the purpose of the stillsuits worn by the characters on Arrakis, and how do they reflect the harsh environment of the desert planet?',
  "In what ways does Frank Herbert use the concept of ecological balance to explore the consequences of exploiting a fragile ecosystem, as seen in the novel's portrayal of Arrakis' native flora and fauna?",
  "How does the novel's exploration of interstellar politics and diplomacy relate to the broader themes of colonialism and imperialism, particularly in relation to the Bene Tleilaxu and their role in shaping the course of human history?"]}

#### Generate all

In [34]:
def generate_questions(doc, model='llama3.2:latest'):
    """Ask the LLM for questions about one book record.

    Args:
        doc: Record dict providing the placeholders used by `prompt_template`.
        model: Ollama model tag. Defaults to the model used so far; other tags
            noted in the original cell: deepseek-r1, qwen3:8b.

    Returns:
        The raw response text. The prompt asks for a JSON object of the form
        {"questions": [...]}; the text is returned unparsed.
    """
    # Kept as a function-level import, as in the original cell.
    from ollama import ChatResponse, chat

    prompt = prompt_template.format(**doc)

    response: ChatResponse = chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        # Request Ollama's JSON mode so the reply is syntactically valid JSON
        # instead of relying on the prompt wording alone.
        format='json',
    )
    return response.message.content

In [35]:
from tqdm.auto import tqdm

In [36]:
results = {}

In [37]:
# Generate questions for every document. Ids already present in `results` are
# skipped, so re-running the cell only fills in what is missing.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions = generate_questions(doc)
        results[doc_id] = questions

  0%|          | 0/193 [00:00<?, ?it/s]

In [None]:
# Outputs the very last one
questions

'{"questions": \n["What is the Malazan saga\'s conclusion like in The Crippled God?", \n"What role does sacrifice play in the story of The Crippled God?", \n"How does the book\'s release date impact its reception by readers?", \n"In what way does the cosmic scale affect the plot of The Crippled God?", \n"Why does the author, Steven Erikson, receive a rating of Five out of Five for this novel?"]}'

In [12]:
# Full results
results

{0: '{\n    "questions": [\n        "What is the significance of the spice melange in the novel, and how does it impact the plot?",\n        "Can you elaborate on the character of Paul Atreides and his role in the story, including his prescience abilities?",\n        "How does Frank Herbert\'s depiction of politics and power struggle on Arrakis reflect real-world issues or inspirations?",\n        "What is the nature of the Fremen people, and how do they relate to the Atreides family and the overall narrative?",\n        "In what ways does the novel Dune explore themes of ecology, environmentalism, and the consequences of human actions?"\n    ]\n}',
 1: '{\n  "questions": [\n    "What is the primary occupation of the protagonist, Case, at the beginning of Neuromancer?",\n    "How does the character of Molly use her abilities to aid Case throughout the story?",\n    "What is the purpose of the \'Wintermute\' AI in relation to the heist across cyberspace?",\n    "In what way do the corpo

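### The raw results are JSON strings with inconsistent formatting, so the naive flattening below does not work: it iterates over the characters of each string instead of over the questions.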

In [None]:
"""final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))"""

In [None]:
"""final_results[0]"""

(0, '{')

In [None]:
"""df_results = pd.DataFrame(final_results, columns=['id', 'question'])
df_results.to_csv('data/ground-truth-retrieval.csv', index=False)"""

### Cleaning script (generated with ChatGPT) that parses the raw results above into one row per question.

In [70]:
import json, re, ast
import pandas as pd

def parse_any(text_or_obj):
    """Return a Python object from possibly-messy LLM output.

    Lists and dicts are returned unchanged; any other non-string input yields
    None. For strings, Markdown code fences are stripped and then these are
    tried in order:

    1. ``json.loads`` on the text as-is;
    2. ``json.loads`` after replacing typographic quotes with ASCII quotes;
    3. the span from the first ``[`` to the last ``]`` (as-is, then
       quote-normalized), parsed as JSON and then as a Python literal.

    If nothing parses, the fence-stripped, quote-normalized string is returned
    so the caller can fall back to line-based extraction.
    """
    if isinstance(text_or_obj, (list, dict)):
        return text_or_obj
    if not isinstance(text_or_obj, str):
        return None

    # Strip ```json fences
    s = re.sub(
        r"^```(?:json)?\s*|\s*```$",
        "",
        text_or_obj.strip(),
        flags=re.IGNORECASE | re.MULTILINE,
    )
    # Normalize quotes
    normalized = (
        s.replace("“", "\"").replace("”", "\"").replace("’", "'").replace("‘", "'")
    )

    # The untouched text goes first: typographic quotes are legal *inside* JSON
    # strings, and rewriting them to '"' up front would corrupt valid JSON.
    candidates = [s] if normalized == s else [s, normalized]

    # Try direct JSON
    for text in candidates:
        try:
            return json.loads(text)
        except Exception:
            pass  # best-effort: fall through to the next strategy

    # Try to extract the [...] array (greedy: first "[" to last "]")
    for text in candidates:
        m = re.search(r"\[.*\]", text, flags=re.DOTALL)
        if not m:
            continue
        arr = m.group(0)
        for parser in (json.loads, ast.literal_eval):
            try:
                return parser(arr)
            except Exception:
                pass  # best-effort: try the next parser / candidate

    # As a last resort, return the raw string
    return normalized

def _question_key_order(key):
    """Sort key for names like 'question10': numeric suffix first, then the name."""
    m = re.search(r"(\d+)\s*$", key)
    return (int(m.group(1)) if m else -1, key)


def normalize_questions(obj):
    """Always return List[str] of questions.

    Accepted shapes:
      * list of strings and/or dicts with a "question" key (other items are
        skipped);
      * dict with a "questions" list;
      * dict with string keys starting with "question" (question1, question2,
        ...), returned in numeric order of the key suffix;
      * any other dict: the first value that yields questions is used;
      * string: parsed as a JSON array if it starts with "[", otherwise the
        lines ending in "?" are kept.

    Anything else (including None) gives an empty list. Empty entries are
    dropped.
    """
    if obj is None:
        return []

    # Already a list
    if isinstance(obj, list):
        out = []
        for item in obj:
            if isinstance(item, str):
                out.append(item.strip())
            elif isinstance(item, dict) and "question" in item:
                out.append(str(item["question"]).strip())
        return [q for q in out if q]

    # Dict forms
    if isinstance(obj, dict):
        if isinstance(obj.get("questions"), list):
            return normalize_questions(obj["questions"])

        # keys like question1, question2, ... Non-string keys are ignored, and
        # ordering is numeric so that question10 comes after question2.
        numbered_keys = sorted(
            (k for k in obj if isinstance(k, str) and k.lower().startswith("question")),
            key=_question_key_order,
        )
        texts = []
        for key in numbered_keys:
            value = obj[key]
            if isinstance(value, dict):
                value = value.get("question", "")
            text = str(value).strip() if value else ""
            if text:
                texts.append(text)
        if texts:
            return texts

        # look for nested list
        for value in obj.values():
            found = normalize_questions(value)
            if found:
                return found
        return []

    # Raw string: split by newlines/bullets if needed
    if isinstance(obj, str):
        # Try to parse again as JSON array if it looks like one
        if obj.strip().startswith("["):
            try:
                return normalize_questions(json.loads(obj))
            except Exception:
                pass  # not valid JSON after all; use the line-based fallback
        # Fallback: naive split (keeps only question-like lines)
        lines = [ln.strip(" -•\t") for ln in obj.splitlines()]
        return [ln for ln in lines if ln.endswith("?") and len(ln) > 3]

    return []

# Flatten {doc_id: raw LLM text} into (doc_id, question) pairs.
rows = []
for doc_id, raw in results.items():
    rows.extend((doc_id, q) for q in normalize_questions(parse_any(raw)))

# Build the frame, collapse whitespace runs in each question to a single space,
# trim both ends, and keep only questions longer than 5 characters.
df_results = (
    pd.DataFrame(rows, columns=["id", "question"])
      .assign(
          question=lambda frame: (
              frame["question"]
                .astype(str)
                .str.replace(r"\s+", " ", regex=True)
                .str.strip()
          )
      )
      .loc[lambda frame: frame["question"].str.len() > 5]
)

# Write the ground-truth file without the row index column.
df_results.to_csv("data/ground-truth-retrieval.csv", index=False)


## Evaluation

In [1]:
import pandas as pd
df_question = pd.read_csv('data/ground-truth-retrieval.csv')

In [2]:
df_question.head()

Unnamed: 0,id,question
0,0,What is the significance of the spice melange ...
1,0,Can you elaborate on the character of Paul Atr...
2,0,How does Frank Herbert's depiction of politics...
3,0,"What is the nature of the Fremen people, and h..."
4,0,In what ways does the novel Dune explore theme...


In [3]:
ground_truth = df_question.to_dict(orient='records')

In [None]:
ground_truth[0]

{'id': 0,
 'question': 'What is the significance of the spice melange in the novel, and how does it impact the plot?'}

In [7]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [21]:
from tqdm.auto import tqdm

# Embed every ground-truth question with `model`, one vector per record, in record order.
vectors = []
for doc in tqdm(ground_truth):
    vectors.append(model.encode(doc['question']))

100%|██████████| 771/771 [00:10<00:00, 74.07it/s]


In [22]:
import numpy as np

vectors = np.array(vectors)

In [24]:
from minsearch import VectorSearch

vindex = VectorSearch(keyword_fields=['id'])
vindex.fit(vectors, ground_truth)

<minsearch.vector.VectorSearch at 0x1efc99cfcb0>

## LLM evaluation

## Monitoring

## Interface