# RAG Tooling Layer — Interactive Explorer

Test and explore the RAG tools for handwritten note transcription disambiguation.

## 1. Setup & Connection

In [None]:
from rag.client import get_client
from rag.schema import ensure_collection_exists

# Open the vector-store connection once; `client` is reused by the
# "Collection Stats" cell at the end of the notebook.
client = get_client()
# Make sure the target collection is present before any ingest/search cell runs
# (in the recorded run it already existed, so creation was skipped).
ensure_collection_exists(client)

[32m2026-02-18 16:35:47.531[0m | [1mINFO    [0m | [36mRAG.client[0m:[36mget_client[0m:[36m34[0m - [1mConnected to Qdrant Cloud at https://0e38d8b1-8469-4e1e-8b41-4a2b56139430.eu-central-1-0.aws.cloud.qdrant.io. Found 1 collection(s).[0m
[32m2026-02-18 16:35:47.556[0m | [1mINFO    [0m | [36mRAG.schema[0m:[36mensure_collection_exists[0m:[36m35[0m - [1mCollection 'handwritten_notes' already exists — skipping creation.[0m


## 2. Ingest Sample Data

Upsert a few synthetic handwritten-note transcriptions so the search tools below have data to query.

In [None]:
from rag.ingest.upsert import upsert_transcription

In [2]:


# Synthetic fixtures: eight confirmed transcriptions paired with a noisy OCR
# reading, spread over three source images (img_001 biology, img_002 physics,
# img_003 math). Every confidence score is 0.86 or higher.
#
# Each row of the table below is
#   (confirmed_text, raw_ocr_text, source_image_id, (x, y, w, h),
#    confidence_score, topic_tags, chunk_type)
# and is expanded into the keyword-argument dict that upsert_transcription
# receives via ``**note``.
sample_notes = [
    {
        "confirmed_text": confirmed,
        "raw_ocr_text": raw_ocr,
        "source_image_id": image_id,
        "region_coords": {"x": x, "y": y, "w": w, "h": h},
        "confidence_score": confidence,
        "topic_tags": tags,
        "chunk_type": kind,
    }
    for confirmed, raw_ocr, image_id, (x, y, w, h), confidence, tags, kind in [
        # --- img_001: biology ---
        ("mitochondria is the powerhouse of the cell",
         "mitochondna is the powerhoue of the cel",
         "img_001", (10, 20, 300, 40), 0.95, ["biology", "cell-biology"], "sentence"),
        ("photosynthesis converts light energy",
         "photosyntheis converts light enrgy",
         "img_001", (10, 70, 300, 40), 0.90, ["biology", "botany"], "sentence"),
        ("chloroplast",
         "chloroplst",
         "img_001", (10, 120, 100, 30), 0.88, ["biology", "botany"], "word"),
        # --- img_002: physics ---
        ("Newton's second law F equals ma",
         "Newtons secnd law F equls ma",
         "img_002", (5, 10, 350, 45), 0.92, ["physics", "mechanics"], "sentence"),
        ("acceleration",
         "accleration",
         "img_002", (5, 60, 120, 30), 0.87, ["physics"], "word"),
        ("velocity is the rate of change of displacement",
         "velocty is the rate of chage of displacmnt",
         "img_002", (5, 110, 380, 40), 0.91, ["physics", "kinematics"], "sentence"),
        # --- img_003: math ---
        ("the quadratic formula",
         "the quadratc formla",
         "img_003", (20, 15, 200, 35), 0.93, ["math", "algebra"], "phrase"),
        ("derivative",
         "derivtive",
         "img_003", (20, 60, 100, 30), 0.86, ["math", "calculus"], "word"),
    ]
]

# Ingest every sample note and report the point ID it was stored under.
# upsert_transcription can return None instead of an ID (section 6 shows this
# for a record below the confidence threshold), so report that case explicitly
# rather than printing a misleading "Upserted: ... -> None".
for note in sample_notes:
    pid = upsert_transcription(**note)
    # Truncate/pad the text to 40 columns so the output lines up.
    label = f"{note['confirmed_text'][:40]:<40}"
    if pid is None:
        print(f"Skipped:  {label} -> not stored (upsert returned None)")
    else:
        print(f"Upserted: {label} -> {pid}")

[32m2026-02-18 16:29:45.547[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_dense_model[0m:[36m25[0m - [1mLoading BGE-large-en-v1.5 dense model (first call — ~1.2GB)...[0m
[32m2026-02-18 16:29:46.103[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_dense_model[0m:[36m27[0m - [1mBGE dense model loaded.[0m
[32m2026-02-18 16:29:46.119[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_sparse_model[0m:[36m37[0m - [1mLoading BGE-M3 sparse model (first call)...[0m
[32m2026-02-18 16:29:46.177[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_sparse_model[0m:[36m39[0m - [1mBGE-M3 sparse model loaded.[0m
[32m2026-02-18 16:29:47.322[0m | [1mINFO    [0m | [36mRAG.embeddings.visual_embedder[0m:[36m_load_clip[0m:[36m31[0m - [1mLoading CLIP ViT-B-32 model (first call — ~350MB RAM)...[0m
[32m2026-02-18 16:29:48.316[0m | [1mINFO    [0m | [36mRAG.embeddings.visual_embedder

Upserted: mitochondria is the powerhouse of the ce -> c21904c7-321b-53ba-b6f0-0f499aa8225d
Upserted: photosynthesis converts light energy     -> de58b753-78bb-550c-851f-00eac6387a12
Upserted: chloroplast                              -> deb56ab4-381c-51fe-ae0c-fce95b0df8d1


[32m2026-02-18 16:29:48.649[0m | [1mINFO    [0m | [36mRAG.ingest.upsert[0m:[36mupsert_transcription[0m:[36m107[0m - [1mUpserted point 8d8dab4c-9a13-501f-a71b-b18b768680d7 for 'Newton's second law F equals ma'[0m
[32m2026-02-18 16:29:48.715[0m | [1mINFO    [0m | [36mRAG.ingest.upsert[0m:[36mupsert_transcription[0m:[36m107[0m - [1mUpserted point 6016c103-dba1-5b2c-8ca0-59e9d855a851 for 'acceleration'[0m
[32m2026-02-18 16:29:48.779[0m | [1mINFO    [0m | [36mRAG.ingest.upsert[0m:[36mupsert_transcription[0m:[36m107[0m - [1mUpserted point d1f921dd-d9af-5c35-b671-7460692c581f for 'velocity is the rate of change of displacement'[0m
[32m2026-02-18 16:29:48.842[0m | [1mINFO    [0m | [36mRAG.ingest.upsert[0m:[36mupsert_transcription[0m:[36m107[0m - [1mUpserted point 4356af46-5f67-5cc8-924c-aa8a042a277f for 'the quadratic formula'[0m


Upserted: Newton's second law F equals ma          -> 8d8dab4c-9a13-501f-a71b-b18b768680d7
Upserted: acceleration                             -> 6016c103-dba1-5b2c-8ca0-59e9d855a851
Upserted: velocity is the rate of change of displa -> d1f921dd-d9af-5c35-b671-7460692c581f
Upserted: the quadratic formula                    -> 4356af46-5f67-5cc8-924c-aa8a042a277f


[32m2026-02-18 16:29:48.905[0m | [1mINFO    [0m | [36mRAG.ingest.upsert[0m:[36mupsert_transcription[0m:[36m107[0m - [1mUpserted point 9467a63b-96e2-5485-9be3-973e831fe6ad for 'derivative'[0m


Upserted: derivative                               -> 9467a63b-96e2-5485-9be3-973e831fe6ad


## 3. Search Context (Hybrid Dense + Sparse)

In [None]:
from rag.tools.search_context import search_context

# Free-text query against the ingested notes, asking for the five best hits.
# NOTE(review): no score cutoff is applied here, so with only eight stored notes
# the tail of the list is off-topic (the recorded run ends with "acceleration"
# and "derivative") — read the scores, not just the ordering.
results = search_context("cell biology energy", top_k=5)
for r in results:
    # One line per hit: similarity score, confirmed text, then chunk type and tags.
    print(f"  [{r.score:.3f}] {r.confirmed_text}  (type={r.chunk_type}, tags={r.topic_tags})")

[32m2026-02-18 16:36:00.143[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_dense_model[0m:[36m25[0m - [1mLoading BGE-large-en-v1.5 dense model (first call — ~1.2GB)...[0m
[32m2026-02-18 16:36:00.698[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_dense_model[0m:[36m27[0m - [1mBGE dense model loaded.[0m
[32m2026-02-18 16:36:00.710[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_sparse_model[0m:[36m37[0m - [1mLoading BGE-M3 sparse model (first call)...[0m
[32m2026-02-18 16:36:00.775[0m | [1mINFO    [0m | [36mRAG.embeddings.bge_embedder[0m:[36m_get_sparse_model[0m:[36m39[0m - [1mBGE-M3 sparse model loaded.[0m
[32m2026-02-18 16:36:00.904[0m | [34m[1mDEBUG   [0m | [36mRAG.tools.search_context[0m:[36msearch_context[0m:[36m98[0m - [34m[1msearch_context returned 5 results for query='cell biology energy'[0m


  [0.734] photosynthesis converts light energy  (type=sentence, tags=['biology', 'botany'])
  [0.704] mitochondria is the powerhouse of the cell  (type=sentence, tags=['biology', 'cell-biology'])
  [0.693] chloroplast  (type=word, tags=['biology', 'botany'])
  [0.600] acceleration  (type=word, tags=['physics'])
  [0.542] derivative  (type=word, tags=['math', 'calculus'])


In [4]:
# Noisy query — every word is misspelled, much like a raw OCR reading — to check
# whether retrieval still surfaces the confirmed "mitochondria ..." sentence.
# (In the recorded run it is the top hit, though with a lower score than a clean query.)
results = search_context("mitochndra powrhouse cel", top_k=3)
for r in results:
    print(f"  [{r.score:.3f}] {r.confirmed_text}")

[32m2026-02-18 16:36:00.993[0m | [34m[1mDEBUG   [0m | [36mRAG.tools.search_context[0m:[36msearch_context[0m:[36m98[0m - [34m[1msearch_context returned 3 results for query='mitochndra powrhouse cel'[0m


  [0.589] mitochondria is the powerhouse of the cell
  [0.549] chloroplast
  [0.497] the quadratic formula


In [5]:
# Filter by chunk_type: only "word" chunks are eligible.
# The sample data contains just three word chunks (chloroplast, acceleration,
# derivative), which is why the recorded run returns 3 results despite top_k=5.
# NOTE(review): this assumes the collection holds only the sample notes.
results = search_context("physics", chunk_types=["word"], top_k=5)
for r in results:
    print(f"  [{r.score:.3f}] {r.confirmed_text}  (type={r.chunk_type})")

[32m2026-02-18 16:36:01.501[0m | [34m[1mDEBUG   [0m | [36mRAG.tools.search_context[0m:[36msearch_context[0m:[36m98[0m - [34m[1msearch_context returned 3 results for query='physics'[0m


  [0.706] acceleration  (type=word)
  [0.631] chloroplast  (type=word)
  [0.626] derivative  (type=word)


## 4. Search Similar Word (OCR Disambiguation)

In [None]:
from rag.tools.search_similar_word import search_similar_word

# Misspelled word lookup: "accleration" is the raw OCR reading stored for the
# confirmed word "acceleration" in the sample data.
# No image crop is passed, so this exercises the text-only path; in the recorded
# run every visual score is 0.000 and the fused score equals the text score.
matches = search_similar_word("accleration", top_k=3)
for m in matches:
    # Fused score first, then its text and visual components.
    print(f"  [{m.fused_score:.4f}] {m.confirmed_text}  (text={m.text_score:.3f}, visual={m.visual_score:.3f})")

[32m2026-02-18 16:36:15.372[0m | [34m[1mDEBUG   [0m | [36mRAG.tools.search_similar_word[0m:[36msearch_similar_word[0m:[36m147[0m - [34m[1msearch_similar_word returned 3 matches for word_ocr='accleration'[0m


  [0.7578] acceleration  (text=0.758, visual=0.000)
  [0.6141] derivative  (text=0.614, visual=0.000)
  [0.5740] the quadratic formula  (text=0.574, visual=0.000)


In [7]:
# With a dummy image crop (just to verify the visual path works).
# The crop is a flat grey square, so it carries no handwriting signal; the
# recorded visual scores are nearly identical across candidates (0.219–0.222).
import base64, io
from PIL import Image

# Build a 32x32 uniform light-grey RGB image entirely in memory.
img = Image.new("RGB", (32, 32), color=(200, 200, 200))
buf = io.BytesIO()
img.save(buf, format="PNG")
# The tool takes the crop as a base64 string, so encode the PNG bytes and decode to str.
dummy_b64 = base64.b64encode(buf.getvalue()).decode()

# NOTE(review): once an image is supplied the fused scores drop to ~0.03, a
# different scale from the text-only path (~0.76) — looks like rank-based
# fusion rather than a weighted average; confirm in search_similar_word before
# comparing fused scores across the two modes.
matches = search_similar_word("derivtive", image_crop_base64=dummy_b64, top_k=3)
for m in matches:
    print(f"  [{m.fused_score:.4f}] {m.confirmed_text}  (text={m.text_score:.3f}, visual={m.visual_score:.3f})")

[32m2026-02-18 16:36:23.677[0m | [1mINFO    [0m | [36mRAG.embeddings.visual_embedder[0m:[36m_load_clip[0m:[36m31[0m - [1mLoading CLIP ViT-B-32 model (first call — ~350MB RAM)...[0m
[32m2026-02-18 16:36:24.676[0m | [1mINFO    [0m | [36mRAG.embeddings.visual_embedder[0m:[36m_load_clip[0m:[36m37[0m - [1mCLIP model loaded on CPU.[0m
[32m2026-02-18 16:36:24.732[0m | [34m[1mDEBUG   [0m | [36mRAG.tools.search_similar_word[0m:[36msearch_similar_word[0m:[36m147[0m - [34m[1msearch_similar_word returned 3 matches for word_ocr='derivtive'[0m


  [0.0323] derivative  (text=0.795, visual=0.219)
  [0.0320] the quadratic formula  (text=0.578, visual=0.220)
  [0.0164] chloroplast  (text=0.000, visual=0.222)


## 5. Search by Topic

In [None]:
from rag.tools.search_by_topic import search_by_topic

# Topic-level query: ask for the five notes closest to a free-text topic.
# In the recorded run the three physics notes rank first; the remaining slots
# are filled by lower-scoring notes from other subjects.
results = search_by_topic("physics and motion", top_k=5)
for r in results:
    print(f"  [{r.score:.3f}] {r.confirmed_text}  (tags={r.topic_tags})")

[32m2026-02-18 16:37:04.897[0m | [34m[1mDEBUG   [0m | [36mRAG.tools.search_by_topic[0m:[36msearch_by_topic[0m:[36m43[0m - [34m[1msearch_by_topic returned 5 results for topic='physics and motion'[0m


  [0.750] acceleration  (tags=['physics'])
  [0.717] velocity is the rate of change of displacement  (tags=['physics', 'kinematics'])
  [0.684] Newton's second law F equals ma  (tags=['physics', 'mechanics'])
  [0.599] photosynthesis converts light energy  (tags=['biology', 'botany'])
  [0.589] the quadratic formula  (tags=['math', 'algebra'])


In [9]:
# Just the unique tags: return the distinct topic tags of the matches instead
# of the matching notes themselves.
# NOTE(review): in the recorded run the query matched all 8 stored notes, so the
# tag list covers every topic (biology, physics, ...), not only math-related
# ones — presumably because no top_k/score cutoff narrows the matches; confirm
# whether that is the intended behaviour of return_unique_tags_only.
tags = search_by_topic("mathematics", return_unique_tags_only=True)
print("Unique tags:", tags)

[32m2026-02-18 16:37:09.518[0m | [34m[1mDEBUG   [0m | [36mRAG.tools.search_by_topic[0m:[36msearch_by_topic[0m:[36m43[0m - [34m[1msearch_by_topic returned 8 results for topic='mathematics'[0m


Unique tags: ['algebra', 'biology', 'botany', 'calculus', 'cell-biology', 'kinematics', 'math', 'mechanics', 'physics']


## 6. Confidence Threshold Gating

In [14]:
# Confidence gating: a record scored below the ingest threshold must not be stored.
# This should be skipped (confidence 0.50 < threshold 0.85)
result = upsert_transcription(
    confirmed_text="low confidence test",
    raw_ocr_text="low confidnce tst",
    source_image_id="img_skip",
    region_coords={"x": 0, "y": 0, "w": 50, "h": 20},
    confidence_score=0.50,
    topic_tags=["test"],
    chunk_type="word",
)
print(f"Result (should be None): {result}")
# Turn the stated expectation into a check so Restart & Run All fails loudly if
# the gate ever lets this record through, instead of relying on someone reading
# the printed value.
assert result is None, f"expected the low-confidence upsert to be skipped, got {result!r}"



Result (should be None): None


## 7. Collection Stats

In [None]:
from rag.config import settings

# Fetch collection metadata using the `client` created in the Setup & Connection
# cell; the collection name comes from the project settings.
info = client.get_collection(settings.COLLECTION_NAME)
print(f"Collection: {settings.COLLECTION_NAME}")
# The recorded run shows 8 points, consistent with the eight sample notes being
# stored and the low-confidence test record from section 6 being skipped.
print(f"Points:     {info.points_count}")
print(f"Status:     {info.status}")

Collection: handwritten_notes
Points:     8
Status:     green
