In [12]:
import pandas as pd
from sqlalchemy import create_engine
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from tqdm import tqdm
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
# --- 0. CUSTOM EMBEDDING CLASS ---
class LocalHuggingFaceEmbeddings(Embeddings):
    def __init__(self, model_id):
        self.model = SentenceTransformer(model_id)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, show_progress_bar=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text).tolist()

# --- 1. DATABASE CONFIGURATION ---
DB_USER = "admin"
DB_PASSWORD = "admin" 
DB_NAME = "views"
DB_URI = f"postgresql://{DB_USER}:{DB_PASSWORD}@localhost:5432/{DB_NAME}"

engine = create_engine(DB_URI)

In [14]:
# --- 2. SQL QUERIES FOR ALL RELEVANT TABLES (NO LIMIT) ---
sql_queries = {
    "condition": """
        SELECT p.id AS patient_id, p.gender, p.birth_date, c.code_display AS condition_text, c.clinical_status
        FROM patient_flat AS p JOIN condition_flat AS c ON p.id = c.patient_id
        WHERE p.gender IS NOT NULL AND p.birth_date IS NOT NULL;
    """,
    "diagnostic_report": """
        SELECT p.id AS patient_id, p.gender, p.birth_date, dr.code_display AS report_text, dr.conclusion
        FROM patient_flat AS p JOIN diagnostic_report_flat AS dr ON p.id = dr.patient_id
        WHERE p.gender IS NOT NULL AND p.birth_date IS NOT NULL;
    """,
    "medication_request": """
        SELECT p.id AS patient_id, p.gender, p.birth_date, mr.medication_display AS medication, mr.status
        FROM patient_flat AS p JOIN medication_request_flat AS mr ON p.id = mr.patient_id
        WHERE p.gender IS NOT NULL AND p.birth_date IS NOT NULL;
    """,
    "observation": """
        SELECT 
            p.id AS patient_id, 
            p.gender, 
            p.birth_date, 
            o.code_display AS observation, 
            COALESCE(o.val_quantity::text, o.value_display, o.value_code) AS observation_value
        FROM patient_flat AS p 
        JOIN observation_flat AS o ON p.id = o.patient_id
        WHERE p.gender IS NOT NULL AND p.birth_date IS NOT NULL;
    """,
    "procedure": """
        SELECT p.id AS patient_id, p.gender, p.birth_date, pr.code_display AS procedure, pr.status
        FROM patient_flat AS p JOIN procedure_flat AS pr ON p.id = pr.patient_id
        WHERE p.gender IS NOT NULL AND p.birth_date IS NOT NULL;
    """,
    "immunization": """
        SELECT p.id AS patient_id, p.gender, p.birth_date, i.vaccinecode_display AS vaccine, i.status
        FROM patient_flat AS p JOIN immunization_flat AS i ON p.id = i.patient_id
        WHERE p.gender IS NOT NULL AND p.birth_date IS NOT NULL;
    """,
    "encounter": """
        SELECT p.id AS patient_id, p.gender, p.birth_date, e.type_display AS encounter_type, e.status
        FROM patient_flat AS p JOIN encounter_flat AS e ON p.id = e.patient_id
        WHERE p.gender IS NOT NULL AND p.birth_date IS NOT NULL;
    """
}

In [15]:
# --- 3. LOAD DATA FROM ALL TABLES ---
dataframes = {}
for name, query in sql_queries.items():
    print(f"Loading data for '{name}'...")
    try:
        df = pd.read_sql(query, engine)
        dataframes[name] = df
        print(f"Successfully loaded {len(df)} records for '{name}'.")
    except Exception as e:
        print(f"--- QUERY FAILED for '{name}' ---")
        print(f"The actual database error is: {e}")

Loading data for 'condition'...
Successfully loaded 6470 records for 'condition'.
Loading data for 'diagnostic_report'...
Successfully loaded 27333 records for 'diagnostic_report'.
Loading data for 'medication_request'...
Successfully loaded 9052 records for 'medication_request'.
Loading data for 'observation'...
Successfully loaded 438299 records for 'observation'.
Loading data for 'procedure'...
Successfully loaded 0 records for 'procedure'.
Loading data for 'immunization'...
Successfully loaded 3280 records for 'immunization'.
Loading data for 'encounter'...
Successfully loaded 169400 records for 'encounter'.


In [16]:
# --- 4. ADVANCED CHUNKING AND DOCUMENT CREATION ---
documents = []
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
    length_function=len,
)

# --- Create Patient Documents First ---
print("Processing unique patients...")
unique_patient_ids = set()
# Calculate total number of rows across all dataframes for the progress bar
total_rows = sum(len(df_table) for df_table in dataframes.values())
with tqdm(total=total_rows, desc="Finding Unique Patients") as pbar:
    for name, df_table in dataframes.items():
        for _, row in df_table.iterrows():
            patient_id = row.get('patient_id', 'N/A')
            if patient_id not in unique_patient_ids:
                patient_doc = Document(
                    page_content=f"Patient Information: ID {patient_id}, Gender: {row.get('gender', 'N/A')}, Date of Birth: {row.get('birth_date', 'N/A')}.",
                    metadata={"source_patient_id": patient_id, "resource_type": "Patient"}
                )
                documents.append(patient_doc)
                unique_patient_ids.add(patient_id)
            pbar.update(1)

# --- Create Documents for Other Resources ---
print("\nProcessing all patient records for detailed document creation...")
for name, df_table in dataframes.items():
    for _, row in tqdm(df_table.iterrows(), total=df_table.shape[0], desc=f"Creating Documents for {name}"):
        patient_id = row.get('patient_id', 'N/A')
        
        if name == "condition":
            text = row.get('condition_text', 'N/A')
            if text and text != 'N/A':
                for chunk in text_splitter.split_text(text):
                    doc = Document(page_content=f"Condition Record for Patient {patient_id}: Condition: '{chunk}', Status: '{row.get('clinical_status', 'N/A')}'.", metadata={"source_patient_id": patient_id, "resource_type": "Condition"})
                    documents.append(doc)
        
        elif name == "diagnostic_report":
            text = row.get('report_text', 'N/A')
            if text and text != 'N/A':
                doc = Document(page_content=f"Diagnostic Report for Patient {patient_id}: Report: '{text}', Conclusion: '{row.get('conclusion', 'N/A')}'.", metadata={"source_patient_id": patient_id, "resource_type": "DiagnosticReport"})
                documents.append(doc)

        elif name == "medication_request":
            text = row.get('medication', 'N/A')
            if text and text != 'N/A':
                doc = Document(page_content=f"Medication Request for Patient {patient_id}: Medication: '{text}', Status: '{row.get('status', 'N/A')}'.", metadata={"source_patient_id": patient_id, "resource_type": "MedicationRequest"})
                documents.append(doc)

        elif name == "observation":
            text = row.get('observation', 'N/A')
            if text and text != 'N/A':
                doc = Document(page_content=f"Observation for Patient {patient_id}: '{text}', Value: '{row.get('observation_value', 'N/A')}'.", metadata={"source_patient_id": patient_id, "resource_type": "Observation"})
                documents.append(doc)

        elif name == "procedure":
            text = row.get('procedure', 'N/A')
            if text and text != 'N/A':
                doc = Document(page_content=f"Procedure for Patient {patient_id}: Procedure: '{text}', Status: '{row.get('status', 'N/A')}'.", metadata={"source_patient_id": patient_id, "resource_type": "Procedure"})
                documents.append(doc)
        
        elif name == "immunization":
            text = row.get('vaccine', 'N/A')
            if text and text != 'N/A':
                doc = Document(page_content=f"Immunization for Patient {patient_id}: Vaccine: '{text}', Status: '{row.get('status', 'N/A')}'.", metadata={"source_patient_id": patient_id, "resource_type": "Immunization"})
                documents.append(doc)
        
        elif name == "encounter":
            text = row.get('encounter_type', 'N/A')
            if text and text != 'N/A':
                doc = Document(page_content=f"Encounter for Patient {patient_id}: Type: '{text}', Status: '{row.get('status', 'N/A')}'.", metadata={"source_patient_id": patient_id, "resource_type": "Encounter"})
                documents.append(doc)

print(f"Created a total of {len(documents)} documents for indexing.")

Processing unique patients...


Finding Unique Patients: 100%|██████████| 653834/653834 [00:07<00:00, 92555.18it/s]



Processing all patient records for detailed document creation...


Creating Documents for condition: 100%|██████████| 6470/6470 [00:00<00:00, 73367.04it/s]
Creating Documents for diagnostic_report: 100%|██████████| 27333/27333 [00:00<00:00, 75658.11it/s]
Creating Documents for medication_request: 100%|██████████| 9052/9052 [00:00<00:00, 75090.41it/s]
Creating Documents for observation: 100%|██████████| 438299/438299 [00:05<00:00, 86707.63it/s]
Creating Documents for procedure: 0it [00:00, ?it/s]
Creating Documents for immunization: 100%|██████████| 3280/3280 [00:00<00:00, 87247.78it/s]
Creating Documents for encounter: 100%|██████████| 169400/169400 [00:02<00:00, 84628.61it/s]

Created a total of 32557 documents for indexing.





In [17]:
# --- 5. EMBED AND STORE WITH BGE-M3 (WITH A CLEAN PROGRESS BAR) ---
from tqdm.notebook import tqdm # Use the notebook-friendly version of tqdm

# --- Update your custom embedding class to disable the nested progress bar ---
class LocalHuggingFaceEmbeddings(Embeddings):
    def __init__(self, model_id):
        self.model = SentenceTransformer(model_id)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # --- FIX: Disable the internal progress bar ---
        return self.model.encode(texts, show_progress_bar=False).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text).tolist()

if documents:
    print("Initializing BGE-M3 embedding model from local path...")
    
    # This will use the model you cloned with git
    model_path = "./models/bge-m3" 
    
    embedding_model = LocalHuggingFaceEmbeddings(model_id=model_path)
    print("Model initialized.")
    persist_directory = 'chroma_db_index_bge_m3_full'
    
    print("Creating and persisting the vector store (this may take a long time)...")
    
    # Initialize Chroma with the first document
    vector_db = Chroma.from_documents(
        documents=[documents[0]],
        embedding=embedding_model,
        persist_directory=persist_directory
    )
    
    batch_size = 64
    
    # Loop through the rest of the documents in batches with a single, clean progress bar
    for i in tqdm(range(1, len(documents), batch_size), 
                  desc="Embedding Documents", 
                  unit="batch"):
        batch = documents[i:i + batch_size]
        vector_db.add_documents(documents=batch)

    print("\n--- RAG Indexing Complete! ---")
else:
    print("\nNo data was loaded. RAG indexing was skipped.")

Initializing BGE-M3 embedding model from local path...
Model initialized.
Creating and persisting the vector store (this may take a long time)...


Embedding Documents:   0%|          | 0/509 [00:00<?, ?batch/s]


--- RAG Indexing Complete! ---


In [20]:
from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from typing import List

# --- Make sure the custom embedding class is defined in your notebook ---
class LocalHuggingFaceEmbeddings(Embeddings):
    def __init__(self, model_id):
        self.model = SentenceTransformer(model_id)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, show_progress_bar=False).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text).tolist()

# --- 1. SETTINGS ---
model_path = "./models/bge-m3"
persist_directory = 'chroma_db_index_bge_m3_full'

# --- 2. LOAD THE EMBEDDING MODEL AND VECTOR STORE ---
print("Loading embedding model and vector store...")
embedding_model = LocalHuggingFaceEmbeddings(model_id=model_path)
vector_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
print("Vector store loaded successfully.")

# --- 3. PERFORM A TEST QUERY ---
query = "any patients born after 2016??"
print(f"\nPerforming similarity search for: '{query}'\n")

# Retrieve the 4 most relevant documents
retrieved_docs = vector_db.similarity_search(query, k=4)

# --- 4. DISPLAY THE RESULTS ---
if retrieved_docs:
    print("--- Top 4 Retrieved Documents ---")
    for i, doc in enumerate(retrieved_docs):
        print(f"\n--- Document {i+1} ---")
        print(f"Content: {doc.page_content}")
        print(f"Metadata: {doc.metadata}")
else:
    print("No relevant documents were found.")
    
# --- 5. VERIFY SOURCE DATA ---
print("Checking for 'diabetes' in the source condition data...")

# Access the 'condition' dataframe from the dictionary we created earlier
condition_df = dataframes.get('condition')

if condition_df is not None:
    # Search for rows where 'condition_text' contains 'diabetes' (case-insensitive)
    diabetes_records = condition_df[condition_df['condition_text'].str.contains('diabetes', case=False, na=False)]

    if not diabetes_records.empty:
        print(f"\nFound {len(diabetes_records)} records related to diabetes in the source data.")
        print("Here are the first 5:")
        print(diabetes_records.head())
    else:
        print("This is likely why the similarity search did not return relevant results.")
else:
    print("Could not find the 'condition' dataframe.")

Loading embedding model and vector store...
Vector store loaded successfully.

Performing similarity search for: 'any patients born after 2016??'

--- Top 4 Retrieved Documents ---

--- Document 1 ---
Content: Patient Information: ID 789399, Gender: male, Date of Birth: 2025-05-17.
Metadata: {'source_patient_id': '789399', 'resource_type': 'Patient'}

--- Document 2 ---
Content: Patient Information: ID 789399, Gender: male, Date of Birth: 2025-05-17.
Metadata: {'source_patient_id': '789399', 'resource_type': 'Patient'}

--- Document 3 ---
Content: Patient Information: ID 789399, Gender: male, Date of Birth: 2025-05-17.
Metadata: {'source_patient_id': '789399', 'resource_type': 'Patient'}

--- Document 4 ---
Content: Patient Information: ID 789399, Gender: male, Date of Birth: 2025-05-17.
Metadata: {'resource_type': 'Patient', 'source_patient_id': '789399'}
Checking for 'diabetes' in the source condition data...
This is likely why the similarity search did not return relevant results.
