# Indexing

This part indexes documents (creates embedding and stores them in a Haystack DocumentStore). It uses different indexing variant s.a. normal indexing and contextualized indexing (used later for contextual RAG evaluation).

In [1]:
import os
# Setting temp dir to /srv/data directory, as it otherwise fills up the home directory too much
# Just comment out on machines that are not "Goober"
os.environ["TMPDIR"] = "/srv/data/tmp"
os.makedirs("/srv/data/tmp", exist_ok=True)

%pip install haystack-ai
%pip install nltk
%pip install openai
%pip install pandas
%pip install sentence-transformers
%pip install hf_xet
%pip install ollama-haystack==2.4.2
%pip install tqdm # For Progress Bar

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from utils.markdown_utils import for_each_markdown_file
import pandas as pd
from haystack.document_stores.in_memory import InMemoryDocumentStore
from tqdm import tqdm
tqdm.pandas()
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S'
)

# silence haystack’s pipeline logs
logging.getLogger("haystack").setLevel(logging.WARNING)
logging.getLogger("haystack.core.pipeline").setLevel(logging.WARNING)

# if you see similar spam from transformers, ragas, etc.
logging.getLogger("transformers").setLevel(logging.WARNING)
logging.getLogger("ragas").setLevel(logging.WARNING)

In [None]:
from config.secret import OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

os.environ["EMBEDDING_MODEL_NAME"] = "Qwen/Qwen3-Embedding-4B"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./model-assets/sentence-transformers"

os.environ["LLM_NAME"] = "gemma3:12b"
os.environ["LLM_CONTEXT_SIZE"] = "8192"
os.environ["LLM_PROVIDER"] = "ollama"

In [4]:
documents = pd.read_pickle("data/all_documents.pkl")["document"].tolist()

## Base Indexing

In [7]:
base_indexing_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

In [8]:
from pipelines.indexing_pipelines.base_indexing_pipeline import get_base_indexing_pipeline

base_indexing_pipeline = get_base_indexing_pipeline(base_indexing_store)
base_indexing_pipeline.run({
    "embedder": { 
        "documents": documents
    },
})

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

{'writer': {'documents_written': 321}}

In [7]:
filepath = f"data/document_stores/{os.environ['EMBEDDING_MODEL_NAME']}"
os.makedirs(filepath, exist_ok=True)
base_indexing_store.save_to_disk(f"{filepath}/base_indexing_store.json")

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


## Context Indexing

In [10]:
context_indexing_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

In [None]:
from pipelines.indexing_pipelines.context_indexing_pipeline import get_context_indexing_pipeline

context_indexing_pipeline = get_context_indexing_pipeline(context_indexing_store)

def index_with_context(filename, bytes):
    documents_from_file = [document for document in documents if document.meta["title"] == filename]
    file_content = bytes.decode("utf-8")
    context_indexing_pipeline.run({
        "contextualiser": {
            "context": file_content,
             "documents": documents_from_file
        }
    })

for_each_markdown_file("./data", index_with_context)

Processing Markdown files:   0%|          | 0/6 [00:00<?, ?it/s]17:44:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:17 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:21 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:25 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:36 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:39 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:42 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:45 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:44:48 INFO HTTP Request: POST http://localhost:11434/api/genera

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Markdown files:  17%|█▋        | 1/6 [02:06<10:30, 126.15s/it]17:46:12 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:46:21 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:46:25 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:46:51 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:46:55 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:46:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:47:02 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:47:05 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:47:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:47:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
17:47:32 INFO HTTP Request: POST http://localhost:11434/a

In [None]:
filepath = f"data/document_stores/{os.environ['EMBEDDING_MODEL_NAME']}"
os.makedirs(filepath, exist_ok=True)
context_indexing_store.save_to_disk(f"{filepath}/context_indexing_store.json")

#### Index already contextualized documents

Use this if only the embedding model changes, but you don't want to re-contextualize all the documents

In [25]:
old_embedding_model_name = "Linq-AI-Research/Linq-Embed-Mistral"
old_store = InMemoryDocumentStore.load_from_disk(f"data/document_stores/{old_embedding_model_name}/context_indexing_store.json")

contextualized_documents = old_store.filter_documents()

In [26]:
from pipelines.indexing_pipelines.base_indexing_pipeline import get_base_indexing_pipeline

context_indexing_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

base_indexing_pipeline = get_base_indexing_pipeline(context_indexing_store)
base_indexing_pipeline.run({
    "embedder": { 
        "documents": contextualized_documents
    },
})

16:12:37 INFO Warming up component embedder...


16:12:37 INFO Running component embedder


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

16:12:47 INFO Running component writer


{'writer': {'documents_written': 321}}

In [28]:
filepath = f"data/document_stores/{os.environ['EMBEDDING_MODEL_NAME']}"
os.makedirs(filepath, exist_ok=True)
context_indexing_store.save_to_disk(f"{filepath}/context_indexing_store.json")

## Validation

In [None]:
base_documents = base_indexing_store.filter_documents()
contextualized_documents = context_indexing_store.filter_documents()

In [None]:
import numpy as np

# Get content lengths
base_lengths = [len(doc.content) for doc in base_documents]
contextualized_lengths = [len(doc.content) for doc in contextualized_documents]

# Compute stats
base_mean = np.mean(base_lengths)
base_std = np.std(base_lengths)

contextualized_mean = np.mean(contextualized_lengths)
contextualized_std = np.std(contextualized_lengths)

print(f"Base documents - Mean: {base_mean:.2f} chars, Std Dev: {base_std:.2f}")
print(f"Contextualized documents - Mean: {contextualized_mean:.2f} chars, Std Dev: {contextualized_std:.2f}")

Base documents - Mean: nan chars, Std Dev: nan
Contextualized documents - Mean: 645.78 chars, Std Dev: 193.25


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [None]:
[document.content for document in contextualized_documents]

['Eine äußere Wendung wird erwogen, wenn sich das Kind am Ende der Schwangerschaft in einer ungünstigen Lage befindet, beispielsweise mit dem Becken nach unten. Diese Lage erschwert eine vaginale Geburt, weshalb versucht wird, das Kind durch gezielten Druck in die Schädellage zu bringen. Die Schädellage ist die günstigste Position für eine vaginale Geburt, da der Kopf zuerst durch den Geburtskanal tritt.\n\nWeshalb wird eine äußere Wendung in Erwägung gezogen?\n\n',
 'Die "Schädellage" ist die ideale Position für eine vaginale Geburt, da der Kopf des Babys zuerst geboren wird. Diese Position erleichtert den Geburtsverlauf und bereitet den Weg für den Rest des Körpers. Abbildung 1 veranschaulicht diese Position.\n\nIm Laufe der Schwangerschaft kann das ungeborene Kind seine Position im Mutterleib mehrfach verändern. In der Regel dreht es sich jedoch rechtzeitig vor der Geburt in die sogenannte Schädellage – das bedeutet, der Kopf zeigt nach unten (siehe Abbildung 1). Diese Kopflage ist 