This playbook is inspired by lessons in [Advanced Retrieval for AI with Chroma](https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/)

## Setup

In [None]:
# %pip install glob langchain-huggingface langchain-community langchain-chroma --user

In [1]:
# Document Loading
import glob
from langchain_community.document_loaders import JSONLoader

FHIR_BUNDLE_JQ_SCHEMA = ".entry[]"
FHIR_BUNDLE_CONTENT_KEY = ".resource"

synthea_bundles = glob.glob("../fhir/*.json")

loaders = [
    JSONLoader(
        file_path=xpath,
        jq_schema=FHIR_BUNDLE_JQ_SCHEMA,
        content_key=FHIR_BUNDLE_CONTENT_KEY,
        is_content_key_jq_parsable=True,
        text_content=False
    )
    for xpath in synthea_bundles[:10] # limiting to just the first 10 for now
]

docs = []
for loader in loaders:
    docs.extend(loader.load())
print(len(docs))
print(docs[5])

2961
page_content='{"resourceType": "Condition", "id": "0472ed1c-4efa-4c86-bcf1-5eea6bda2432", "clinicalStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active"}]}, "verificationStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "confirmed"}]}, "code": {"coding": [{"system": "http://snomed.info/sct", "code": "429007001", "display": "History of cardiac arrest (situation)"}], "text": "History of cardiac arrest (situation)"}, "subject": {"reference": "urn:uuid:5cbc121b-cd71-4428-b8b7-31e53eba8184"}, "encounter": {"reference": "urn:uuid:f78d73fc-9f9b-46d5-93aa-f5db86ba914c"}, "onsetDateTime": "1965-11-15T06:22:41-05:00", "recordedDate": "1965-11-15T06:22:41-05:00"}' metadata={'source': 'C:\\Users\\Peter\\Documents\\GitHub\\agentic-healthcare-analytics\\fhir\\Aaron697_Brekke496_2fa15bc7-8866-461a-9000-f739e425860a.json', 'seq_num': 6}


In [3]:
# Embedding
from langchain_community.embeddings import OllamaEmbeddings
embedding = OllamaEmbeddings(model="llama3.2:1b")

In [None]:
# Vector Store
from langchain_chroma import Chroma

persist_directory = 'docs/chroma'
!rmdir /s /q ".\\docs\\chroma" # clear out the persist directory

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=persist_directory
)
print(vectordb._collection.count())

The system cannot find the file specified.
The system cannot find the file specified.
The system cannot find the file specified.
The system cannot find the file specified.
The system cannot find the file specified.
The system cannot find the file specified.


In [None]:
# LLM
# !ollama pull llama3.2:1b
from langchain_community.llms import Ollama

llm = Ollama(model="llama3.2:1b") # using smaller model due to resource limitations on my old laptop 🙃

## Visualize Embeddings Space
Use UMAP projection to visualize the high-dimensional embedding space in 2D

In [None]:
# %pip install umap tqdm matplotlib --user

In [None]:
import umap
import numpy as np
from tqdm import tqdm

chroma_embeddings = vectordb._collection.get(include=['embeddings'])['embeddings']
umap_transform = umap.UMAP(random_state=0, transform_seed=0).fit(chroma_embeddings)

In [None]:
def project_embeddings(embeddings, umap_transform):
    umap_embeddings = np.empty((len(embeddings),2))
    for i, embedding in enumerate(tqdm(embeddings)): 
        umap_embeddings[i] = umap_transform.transform([embedding])
    return umap_embeddings

In [None]:
projected_dataset_embeddings = project_embeddings(chroma_embeddings, umap_transform)

In [None]:
import matplotlib.pyplot as plt

plt.figure()
plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10)
plt.gca().set_aspect('equal', 'datalim')
plt.title('Projected Embeddings')
plt.axis('off')

## Try Query Expansion

Ask LLM to come up with additional questions along the lines of the provided user question and apply those to the VectorDB in addition to the provided user question.

## Try Cross-encoder Re-ranking

Cross-encoders are a different method for encoding queries that are related, rather than treating them as independent.... I think 🤔

## Try Embedding Adaptors

## Try Other Techniques