# Indexing

This part indexes documents (creates embeddings and stores them in a Haystack DocumentStore). It uses different indexing variants, such as normal (base) indexing and contextualized indexing (used later for contextual RAG evaluation).

In [1]:
import os
# Redirect temporary files to /srv/data/tmp, as they otherwise fill up the home directory too much.
# The /srv/data volume only exists on the "Goober" machine, so the redirect is skipped
# automatically everywhere else instead of having to comment these lines out by hand.
if os.path.isdir("/srv/data"):
    # Create the directory first so TMPDIR never points at a missing path
    os.makedirs("/srv/data/tmp", exist_ok=True)
    os.environ["TMPDIR"] = "/srv/data/tmp"

%pip install haystack-ai
%pip install nltk
%pip install openai
%pip install pandas
%pip install sentence-transformers
%pip install hf_xet
%pip install ollama-haystack==2.4.2
%pip install tqdm # For Progress Bar

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from utils.markdown_utils import for_each_markdown_file
import pandas as pd
from haystack.document_stores.in_memory import InMemoryDocumentStore
from tqdm import tqdm
tqdm.pandas()

from pipelines.indexing_pipelines.base_indexing_pipeline import get_base_indexing_pipeline
from pipelines.indexing_pipelines.context_indexing_pipeline import get_context_indexing_pipeline

import logging
from utils.pickle_utils import for_each_pickle_file

# Root logger: INFO and above, each record prefixed with a wall-clock timestamp
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# Loggers that should only surface warnings and errors: haystack (including its
# pipeline logger) and other libraries that can produce similar log spam
for _quiet_logger in (
    "haystack",
    "haystack.core.pipeline",
    "transformers",
    "ragas",
):
    logging.getLogger(_quiet_logger).setLevel(logging.WARNING)

In [2]:
# The OpenAI key is kept in the project-local config.secret module (not in this notebook)
# and is exported through the environment of this kernel process.
# NOTE(review): presumably consumed by an OpenAI client further downstream — confirm.
from config.secret import OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Runtime settings, exported as environment variables in one go.
# EMBEDDING_MODEL_NAME is also used further down to name the document-store output directory.
# NOTE(review): the remaining values are presumably read by the pipeline factories — confirm.
os.environ.update({
    # Embedding model and the location for sentence-transformers model assets
    "EMBEDDING_MODEL_NAME": "Qwen/Qwen3-Embedding-4B",
    "SENTENCE_TRANSFORMERS_HOME": "./model-assets/sentence-transformers",
    # LLM settings
    "LLM_NAME": "gemma3:12b",
    "LLM_CONTEXT_SIZE": "8192",
    "LLM_PROVIDER": "ollama",
})

## Base Indexing

In [3]:
def base_indexing(filename, df):
    """Index the documents of one preprocessed file and save the resulting store as JSON.

    Args:
        filename: Name or path of the source file; its base name without the
            extension becomes the prefix of the output file.
        df: Table whose "document" column holds the documents to index.
    """
    docs_to_index = df["document"].tolist()

    # A fresh in-memory store per input file
    store = InMemoryDocumentStore(embedding_similarity_function="cosine")
    pipeline = get_base_indexing_pipeline(store)

    embedder_inputs = {"documents": docs_to_index}
    pipeline.run({"embedder": embedder_inputs})

    # The output directory is named after the configured embedding model
    output_dir = f"data/document_stores/{os.environ['EMBEDDING_MODEL_NAME']}"
    os.makedirs(output_dir, exist_ok=True)

    stem, _extension = os.path.splitext(os.path.basename(filename))
    store.save_to_disk(f"{output_dir}/{stem}_base_indexing_store.json")


# Hand the function to for_each_pickle_file as the callback for the files in
# data/preprocessed_documents (presumably invoked once per pickle file — confirm in utils.pickle_utils)
for_each_pickle_file("data/preprocessed_documents", base_indexing)

Processing Pickle files:   0%|          | 0/6 [00:00<?, ?it/s]16:18:23 INFO Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-4B


README.md:   0%|          | 0.00/17.3k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

16:20:02 INFO 2 prompts are loaded, with the keys: ['query', 'document']


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Processing Pickle files:  17%|█▋        | 1/6 [01:48<09:03, 108.75s/it]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Processing Pickle files:  33%|███▎      | 2/6 [01:55<03:14, 48.67s/it] 

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processing Pickle files:  50%|█████     | 3/6 [02:01<01:27, 29.08s/it]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Processing Pickle files:  67%|██████▋   | 4/6 [02:07<00:40, 20.30s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Pickle files:  83%|████████▎ | 5/6 [02:14<00:15, 15.40s/it]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Processing Pickle files: 100%|██████████| 6/6 [02:20<00:00, 23.50s/it]


## Context Indexing

In [4]:
def context_indexing(filename, df):
    """Run the context indexing pipeline for one preprocessed file and save the store as JSON.

    For every markdown file handed to the callback, the documents whose
    meta["title"] equals that file's name are passed to the pipeline together
    with the decoded file content as context.

    Args:
        filename: Name or path of the source file; its base name without the
            extension becomes the prefix of the output file.
        df: Table whose "document" column holds the documents to index.
    """
    documents = df["document"].tolist()

    # One store and one pipeline are shared by all markdown files of this input file
    store = InMemoryDocumentStore(embedding_similarity_function="cosine")
    pipeline = get_context_indexing_pipeline(store)

    # NOTE(review): the callback parameters shadow the outer `filename` and the builtin
    # `bytes`; they are kept because the calling convention of for_each_markdown_file
    # (positional vs. keyword) is not visible here.
    def index_with_context(filename, bytes):
        """Index the documents that belong to the markdown file named `filename`."""
        matching_documents = []
        for candidate in documents:
            if candidate.meta["title"] == filename:
                matching_documents.append(candidate)

        contextualiser_inputs = {
            "context": bytes.decode("utf-8"),
            "documents": matching_documents,
        }
        pipeline.run({"contextualiser": contextualiser_inputs})

    for_each_markdown_file("data/md_files", index_with_context)

    # The output directory is named after the configured embedding model
    target_dir = f"data/document_stores/{os.environ['EMBEDDING_MODEL_NAME']}"
    os.makedirs(target_dir, exist_ok=True)

    stem, _extension = os.path.splitext(os.path.basename(filename))
    store.save_to_disk(f"{target_dir}/{stem}_context_indexing_store.json")


# Hand the function to for_each_pickle_file as the callback for the files in
# data/preprocessed_documents (presumably invoked once per pickle file — confirm in utils.pickle_utils)
for_each_pickle_file("data/preprocessed_documents", context_indexing)

Processing Pickle files:   0%|          | 0/5 [00:00<?, ?it/s]13:08:06 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:08 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:13 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:16 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:18 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:21 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:26 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:08:31 INFO HTTP Request: POST http://localhost:11434/api/generate

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:09:02 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:06 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:13 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:16 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:19 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:26 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:30 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:37 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:09:40 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

13:12:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:19 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:24 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:37 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:42 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:46 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:51 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:12:55 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:13:00 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

13:16:40 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:16:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:16:49 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:16:53 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:16:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:17:02 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:17:06 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:17:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:17:14 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:17:19 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:17:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:17:27 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

13:21:52 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:21:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:01 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:05 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:14 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:18 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:22 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:31 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:35 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:22:39 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

13:27:12 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:17 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:20 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:22 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:25 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:30 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:35 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:38 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:27:41 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Markdown files: 100%|██████████| 6/6 [20:33<00:00, 205.62s/it]
Processing Pickle files:  20%|██        | 1/5 [20:34<1:22:19, 1234.93s/it]13:28:41 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:28:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:28:47 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:28:50 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:28:55 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:28:59 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:03 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:08 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:12 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:16 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:29:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:38 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:49 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:29:53 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:00 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:05 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:30:17 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:38 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:42 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:47 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:52 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:30:56 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:31:03 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:08 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:12 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:17 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:22 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:32 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:37 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:31:42 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:45 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:48 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:51 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Markdown files: 100%|██████████| 6/6 [03:13<00:00, 32.30s/it]
Processing Pickle files:  40%|████      | 2/5 [23:48<31:07, 622.60s/it]   13:31:55 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:31:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:00 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:03 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:05 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:07 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:13 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:32:18 INFO HTTP Request: POST http://localhost:11434/api/generate "

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:33:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:13 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:17 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:20 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:30 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:36 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:39 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:42 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:33:46 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

13:36:00 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:05 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:20 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:24 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:29 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:34 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:38 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:43 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:48 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:36:53 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

13:42:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:19 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:31 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:36 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:40 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:48 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:53 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:42:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:43:01 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

13:47:14 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:18 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:31 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:36 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:40 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:48 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:53 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:47:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:48:02 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

13:52:04 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:07 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:12 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:18 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:20 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:25 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:31 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:52:34 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Markdown files: 100%|██████████| 6/6 [21:30<00:00, 215.14s/it]
Processing Pickle files:  60%|██████    | 3/5 [45:20<30:56, 928.28s/it]13:53:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:53:31 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:53:36 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:53:41 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:53:45 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:53:52 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:53:58 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:04 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:10 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:54:22 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:32 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:37 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:41 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:54:48 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:53 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:54:58 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:04 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:55:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:12 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Markdown files: 100%|██████████| 6/6 [01:48<00:00, 18.12s/it]
Processing Pickle files:  80%|████████  | 4/5 [47:09<10:04, 604.77s/it]13:55:16 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:18 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:21 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:24 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:29 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:32 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:35 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:38 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:55:41 INFO HTTP Request: POST http://localhost:11434/api/generate "HTT

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:55:59 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:02 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:06 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:13 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:16 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:20 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:27 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:30 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:34 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:56:38 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

13:57:29 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:57:34 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:57:39 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:57:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:57:48 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:57:53 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:57:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:58:02 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:58:06 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:58:11 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:58:16 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
13:58:21 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

14:00:40 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:00:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:00:48 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:00:53 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:00:57 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:01:00 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:01:04 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:01:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:01:13 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:01:18 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:01:23 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:01:27 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

14:03:15 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:19 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:24 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:28 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:33 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:37 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:42 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:46 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:51 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:55 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:03:59 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:04:03 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

14:05:42 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:05:44 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:05:47 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:05:49 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:05:52 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:05:55 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:05:58 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:06:00 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:06:03 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:06:06 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:06:09 INFO HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
14:06:12 INFO HTTP Request: POST http://loc

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Markdown files: 100%|██████████| 6/6 [11:08<00:00, 111.37s/it]
Processing Pickle files: 100%|██████████| 5/5 [58:18<00:00, 699.73s/it]


#### Index already contextualized documents

Use this if only the embedding model changes and you don't want to re-contextualize all the documents.

In [25]:
# Load the context store that was saved for the previous embedding model.
# NOTE(review): this reads "context_indexing_store.json", whereas context_indexing() writes
# "<clean_name>_context_indexing_store.json" — confirm this file name is still current.
old_embedding_model_name = "Linq-AI-Research/Linq-Embed-Mistral"
old_store = InMemoryDocumentStore.load_from_disk(f"data/document_stores/{old_embedding_model_name}/context_indexing_store.json")

# filter_documents() is called without filters, i.e. every stored document is requested.
# NOTE(review): the Validation section reassigns `contextualized_documents`; re-run this
# cell before indexing if the validation cells were executed in between.
contextualized_documents = old_store.filter_documents()

In [26]:
from pipelines.indexing_pipelines.base_indexing_pipeline import get_base_indexing_pipeline

# Fresh store that receives the documents loaded from the old store
context_indexing_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

# The base pipeline is used here (not the context pipeline): the documents are already
# contextualized, so they are passed straight to the embedder.
# NOTE(review): presumably the embedder computes new embeddings with the currently
# configured model — confirm in base_indexing_pipeline.
base_indexing_pipeline = get_base_indexing_pipeline(context_indexing_store)
base_indexing_pipeline.run({
    "embedder": { 
        "documents": contextualized_documents
    },
})

16:12:37 INFO Warming up component embedder...


16:12:37 INFO Running component embedder


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

16:12:47 INFO Running component writer


{'writer': {'documents_written': 321}}

In [28]:
# Save the store under the directory named after the currently configured embedding model.
# NOTE(review): the file name has no "<clean_name>_" prefix, unlike the files written by
# context_indexing() — confirm which naming the downstream notebooks expect.
filepath = f"data/document_stores/{os.environ['EMBEDDING_MODEL_NAME']}"
os.makedirs(filepath, exist_ok=True)
context_indexing_store.save_to_disk(f"{filepath}/context_indexing_store.json")

## Validation

In [11]:
# Load the base and the contextualized store of the same preprocessed file (docs_passage_1_0)
# so their documents can be compared below. The directory is derived from
# EMBEDDING_MODEL_NAME — the same setting the indexing functions use when writing the
# stores — so validation follows the configured model instead of a hard-coded one.
validation_store_dir = f"data/document_stores/{os.environ['EMBEDDING_MODEL_NAME']}"
base_documents = InMemoryDocumentStore.load_from_disk(
    f"{validation_store_dir}/docs_passage_1_0_base_indexing_store.json"
).filter_documents()
contextualized_documents = InMemoryDocumentStore.load_from_disk(
    f"{validation_store_dir}/docs_passage_1_0_context_indexing_store.json"
).filter_documents()

In [12]:
import numpy as np

# Character count of every document, per store
base_lengths = [len(document.content) for document in base_documents]
contextualized_lengths = [len(document.content) for document in contextualized_documents]

# Mean and standard deviation of the lengths (np.std with its default settings)
base_mean, base_std = np.mean(base_lengths), np.std(base_lengths)
contextualized_mean, contextualized_std = (
    np.mean(contextualized_lengths),
    np.std(contextualized_lengths),
)

print(f"Base documents - Mean: {base_mean:.2f} chars, Std Dev: {base_std:.2f}")
print(f"Contextualized documents - Mean: {contextualized_mean:.2f} chars, Std Dev: {contextualized_std:.2f}")

Base documents - Mean: 317.14 chars, Std Dev: 179.72
Contextualized documents - Mean: 654.38 chars, Std Dev: 189.42


In [13]:
# Display the content of the contextualized documents for manual inspection.
# NOTE(review): this renders every document at once; consider slicing (e.g. [:5]) to keep the output short.
[document.content for document in contextualized_documents]

['Die "Schädellage" ist die ideale Position für eine vaginale Geburt, da der Kopf des Babys zuerst geboren wird. Diese Position erleichtert den Geburtsverlauf und bereitet den Weg für den Rest des Körpers. Abbildung 1 veranschaulicht diese Position.\n\nIm Laufe der Schwangerschaft kann das ungeborene Kind seine Position im Mutterleib mehrfach verändern. In der Regel dreht es sich jedoch rechtzeitig vor der Geburt in die sogenannte Schädellage – das bedeutet, der Kopf zeigt nach unten (siehe Abbildung 1). Diese Kopflage ist für eine natürliche Entbindung besonders günstig, da der größte Körperteil – der Kopf – zuerst durch den Geburtskanal tritt und so den Weg für den restlichen Körper bereitet.\n\n',
 'Eine "äußere Wendung" ist ein Eingriff, bei dem versucht wird, das Baby durch Druck von außen in eine günstigere Geburtslage zu bringen. Dies geschieht, wenn das Baby in einer ungünstigen Lage (z.B. Beckenendlage) liegt, die eine natürliche Geburt erschweren könnte. Die Technik wird von 