In [16]:
import os
import yaml
import pickle
import pprint
import warnings
from typing import List, Any, Dict


In [17]:

# --- LangChain Core Imports ---
# Make sure you have run: pip install langchain-chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.stores import BaseStore
from langchain_chroma import Chroma

# --- Model Imports ---
from sentence_transformers import SentenceTransformer

# Suppress deprecation warnings for a cleaner output
warnings.filterwarnings("ignore")

print("Libraries imported successfully.\n")



Libraries imported successfully.



In [18]:
# --- Helper Classes (Copied from your project files) ---

class LocalHuggingFaceEmbeddings(Embeddings):
    """
    This class is from src/pipeline/embed.py
    """
    def __init__(self, model_id: str):
        try:
            self.model = SentenceTransformer(model_id)
            print(f"Embedding model loaded from: {model_id}")
        except Exception as e:
            print(f"Failed to load SentenceTransformer model from {model_id}: {e}")
            raise

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, show_progress_bar=True, normalize_embeddings=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text, normalize_embeddings=True).tolist()



In [19]:
class SimpleInMemoryStore(BaseStore[str, Document]):
    """
    This class is from src/pipeline/transform.py
    """
    def __init__(self):
        self._dict = {}

    def mget(self, keys: List[str]) -> List[Document]:
        return [self._dict[key] for key in keys if key in self._dict]
    
    def mset(self, key_value_pairs: List[tuple[str, Document]]) -> None:
        for key, value in key_value_pairs:
            self._dict[key] = value
    
    def mdelete(self, keys: List[str]) -> None:
        for key in keys:
            if key in self._dict:
                del self._dict[key]

    def yield_keys(self) -> List[str]:
        return list(self._dict.keys())



In [20]:
# --- 1. Load Configuration and Components ---
project_root = "/Users/adityanbhatt/fhir_rag"
os.chdir(project_root)

print(f"Current working directory: {os.getcwd()}")
print("Files in this directory:", os.listdir("."))

print("\n--- Loading Configuration ---")
# Load the main config file
with open("config/embedding_config.yaml", 'r') as f:
    config = yaml.safe_load(f)

CHROMA_PATH = config['vector_store']['chroma_path']
COLLECTION_NAME = config['vector_store']['collection_name']
MODEL_PATH = config['embedding']['model_path']
DOCSTORE_PATH = f"{CHROMA_PATH}/parent_docstore.pkl"

print(f"Chroma Path:     {CHROMA_PATH}")
print(f"Collection Name: {COLLECTION_NAME}")
print(f"Docstore Path:   {DOCSTORE_PATH}")
print(f"Model Path:      {MODEL_PATH}")

# 1. Load the Parent Document Store
print("\n--- Loading Parent Docstore (parent_docstore.pkl) ---")
try:
    with open(DOCSTORE_PATH, "rb") as f:
        docstore = pickle.load(f)
    print(" Docstore loaded.")
except FileNotFoundError:
    print(f"Error: Could not find docstore at {DOCSTORE_PATH}.")
    print("Please run the build script (src/main.py) first.")
    exit()



Current working directory: /Users/adityanbhatt/fhir_rag
Files in this directory: ['test_model.py', '.DS_Store', 'requirements.txt', 'config', '.dist', 'models', '.dockerignore', 'logs', '.gitignore', '.env', '.venv', 'app.py', 'temp_env', '.git', 'main.py', 'data', 'notebooks', 'src']

--- Loading Configuration ---
Chroma Path:     ./data/indexes/chroma_db_advanced
Collection Name: spice_healthcare_advanced
Docstore Path:   ./data/indexes/chroma_db_advanced/parent_docstore.pkl
Model Path:      ./models/bge-m3

--- Loading Parent Docstore (parent_docstore.pkl) ---
 Docstore loaded.


In [21]:
# 2. Load the Embedding Model
print("\n--- Loading Embedding Model (bge-m3) ---")
embedding_function = LocalHuggingFaceEmbeddings(model_id=MODEL_PATH)
print(" Embedding Model loaded.")




--- Loading Embedding Model (bge-m3) ---
Embedding model loaded from: ./models/bge-m3
 Embedding Model loaded.


In [22]:
# 3. Load the Chroma Vector Store (Child Chunks)
print("\n--- Loading Chroma Vector Store ---")
try:
    vector_store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=embedding_function,
        collection_name=COLLECTION_NAME
    )
    print(" Vector Store loaded.")
except Exception as e:
    print(f"Error loading Chroma DB: {e}")
    print("Ensure the database exists at the path.")
    exit()




--- Loading Chroma Vector Store ---
 Vector Store loaded.


In [23]:
# --- 2. Explore the Data ---

print("\n\n--- INDEX STATISTICS ---")

# Get stats from the parent docstore
parent_doc_keys = list(docstore.yield_keys())
print(f"Total Parent Documents: {len(parent_doc_keys)}")

# Get stats from the vector store (child chunks)
child_chunks_count = vector_store._collection.count()
print(f"Total Child Chunks: {child_chunks_count}")

if parent_doc_keys:
    print("\n---  Example Parent Document ---")
    example_parent = docstore.mget([parent_doc_keys[0]])[0]
    pprint.pprint(example_parent.metadata)
    print("Content (snippet):", example_parent.page_content[:300] + "...")

if child_chunks_count > 0:
    print("\n---  Example Child Chunk (from DB) ---")
    example_child = vector_store._collection.peek(1)
    pprint.pprint(example_child)





--- INDEX STATISTICS ---
Total Parent Documents: 128995
Total Child Chunks: 315817

---  Example Parent Document ---
{'source_id': '10', 'source_table': 'account'}
Content (snippet): id: 10
name: Rangpur Medical Care
is_users_restricted: False
max_no_of_users: 0
country_id: 1
tenant_id: 213
created_by: 284
updated_by: 284
created_at: 2023-09-19 15:32:16.581000+00:00
updated_at: 2023-09-27 14:12:49.600000+00:00
is_active: True
is_deleted: False...

---  Example Child Chunk (from DB) ---
{'data': None,
 'documents': ['id: 10\n'
               'name: Rangpur Medical Care\n'
               'is_users_restricted: False\n'
               'max_no_of_users: 0\n'
               'country_id: 1\n'
               'tenant_id: 213\n'
               'created_by: 284\n'
               'updated_by: 284\n'
               'created_at: 2023-09-19 15:32:16.581000+00:00\n'
               'updated_at: 2023-09-27 14:12:49.600000+00:00\n'
               'is_active: True'],
 'embeddings': array([[-0.02814954, 

In [24]:
# --- 3. Run a Test Query (Simulating the RAG Pipeline) ---

print("\n\n--- üîç RUNNING TEST QUERY ---")
QUERY = "patient with high glucose"
K_CHILD_DOCS = 3 # How many child docs to find

print(f"Searching for: '{QUERY}'...\n")

# 1. Search the vector store for child chunks
child_chunks = vector_store.similarity_search(QUERY, k=K_CHILD_DOCS)

print(f"--- Found {len(child_chunks)} relevant child chunks ---")
for i, chunk in enumerate(child_chunks):
    print(f"Chunk {i+1} (Parent ID: {chunk.metadata.get('doc_id')})")
    print(f"Content: {chunk.page_content[:150]}...")
    print("---")

# 2. Get the unique parent IDs from the child chunks
parent_ids = sorted(list(set(
    chunk.metadata['doc_id'] for chunk in child_chunks if 'doc_id' in chunk.metadata
)))

print(f"\nFound {len(parent_ids)} unique parent document IDs: {parent_ids}")

# 3. Retrieve the full parent documents from the docstore
if parent_ids:
    parent_documents = docstore.mget(parent_ids)
    print(f"\n--- Retrieved {len(parent_documents)} Full Parent Documents ---")
    
    for i, doc in enumerate(parent_documents):
        if doc is None:
            print(f"Could not find parent document for ID: {parent_ids[i]}")
            continue
            
        print(f"========= PARENT DOCUMENT {i+1} (ID: {parent_ids[i]}) ==========")
        print(f"Source Table: {doc.metadata.get('source_table')}")
        print(f"Source ID: {doc.metadata.get('source_id')}")
        print("\nFull Content:")
        print(doc.page_content)
        print("===========================================================\n")
else:
    print("\nNo parent documents found for the retrieved chunks.")

print("\nExploration complete.")



--- üîç RUNNING TEST QUERY ---
Searching for: 'patient with high glucose'...

--- Found 3 relevant child chunks ---
Chunk 1 (Parent ID: 3a70cae7-ded2-45b9-ac9f-0a279a4c75aa)
Content: glucose_unit: mmol/L
is_initial_review: True
height: 156.0
weight: 55.0
is_red_risk_patient: False
bmi: 22.6
avg_diastolic: 80.0
avg_systolic: 130.0
i...
---
Chunk 2 (Parent ID: cc688b02-11b5-4028-8e28-574ad72a9643)
Content: glucose_unit: mg/dL
is_initial_review: True
height: 178.0
weight: 65.0
is_red_risk_patient: False
bmi: 20.52
avg_diastolic: 90.0
avg_systolic: 100.0
i...
---
Chunk 3 (Parent ID: 5f2706de-9397-4535-a27b-681943f24407)
Content: glucose_value: 6.9
glucose_type: fbs
glucose_unit: mmol/L
is_initial_review: False
height: 156.0
weight: 58.0
is_red_risk_patient: False
bmi: 23.83
av...
---

Found 3 unique parent document IDs: ['3a70cae7-ded2-45b9-ac9f-0a279a4c75aa', '5f2706de-9397-4535-a27b-681943f24407', 'cc688b02-11b5-4028-8e28-574ad72a9643']

--- Retrieved 3 Full Parent Documents ---
Sourc