# Ingest Chunks to ChromaDB

This notebook loads the chunked documents and ingests them into a ChromaDB vector store for retrieval-augmented generation (RAG) applications.


In [1]:
import json
from pathlib import Path
from datetime import datetime
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations used by the rest of the notebook (relative to the working directory)
chunks_file = Path("../data/chunks/all_chunks.json")  # JSON file holding the chunks
persist_directory = Path("../chroma_db")  # on-disk location of the vector store

# Make sure the store directory exists; no error if it is already there
persist_directory.mkdir(parents=True, exist_ok=True)

print(
    f"Chunks file: {chunks_file}",
    f"ChromaDB directory: {persist_directory}",
    sep="\n",
)


Chunks file: ../data/chunks/all_chunks.json
ChromaDB directory: ../chroma_db


In [3]:
# Read the whole chunk file as UTF-8 text and parse it as JSON
chunks_data = json.loads(chunks_file.read_text(encoding='utf-8'))

print(f"✓ Loaded {len(chunks_data)} chunks from {chunks_file}")


✓ Loaded 131 chunks from ../data/chunks/all_chunks.json


In [4]:
# Convert chunks to LangChain Document objects.
# One timestamp is taken up front and stamped on every document of this run.
# datetime.now() is naive local time, so the ISO string carries no UTC offset.
ingestion_timestamp = datetime.now().isoformat()

documents = [
    Document(
        page_content=chunk['text'],
        metadata={
            **chunk['metadata'],
            # Listed after the spread so these keys override any same-named
            # keys already present in the chunk's own metadata.
            'chunk_id': chunk['chunk_id'],
            'chunk_index': chunk['chunk_index'],
            'ingestion_timestamp': ingestion_timestamp,
        },
    )
    for chunk in chunks_data
]

print(f"✓ Converted {len(documents)} chunks to LangChain Documents")
print(f"✓ Ingestion timestamp: {ingestion_timestamp}")

# Preview the first document. Guarded so an empty chunk file produces a
# readable message instead of an IndexError.
if documents:
    sample_metadata = documents[0].metadata
    print(f"\nSample document metadata:")
    print(f"  Chunk ID: {sample_metadata['chunk_id']}")
    # 'filename' comes from the chunk's own metadata and may be absent, so it
    # is read with the same 'N/A' fallback as the other optional fields.
    print(f"  Source: {sample_metadata.get('filename', 'N/A')}")
    print(f"  Type: {sample_metadata.get('type', 'N/A')}")
    print(f"  Condition: {sample_metadata.get('condition', 'N/A')}")
    print(f"  Ingestion timestamp: {sample_metadata.get('ingestion_timestamp', 'N/A')}")
else:
    print("\n⚠ No chunks were loaded; there is no sample document to preview")


✓ Converted 131 chunks to LangChain Documents
✓ Ingestion timestamp: 2025-11-16T12:57:25.183898

Sample document metadata:
  Chunk ID: hypertension_medlineplus_overview_chunk_0000
  Source: hypertension_medlineplus_overview.md
  Type: conditions
  Condition: hypertension
  Ingestion timestamp: 2025-11-16T12:57:25.183898


In [5]:
# Set up the OpenAI embedding client used to vectorize the documents.
# OPENAI_API_KEY must be set in the environment before running this cell.
embedding_model_name = "text-embedding-3-small"
embeddings_model = OpenAIEmbeddings(model=embedding_model_name)

print("✓ Initialized OpenAI Embeddings")
# Report the model name as stored on the client rather than the local variable
print(f"  Model: {embeddings_model.model}")


✓ Initialized OpenAI Embeddings
  Model: text-embedding-3-small


In [6]:
# Create ChromaDB Vector Store
collection_name = "health_education_chunks"

# Use each chunk's own ID as its vector-store ID. Without explicit IDs the
# store assigns fresh random ones on every call, so re-running this cell
# against the same persist directory would add a second copy of every chunk
# and break the count check that follows. With stable IDs a re-run is expected
# to overwrite the existing entries instead (NOTE(review): relies on
# langchain-chroma writing via upsert; confirm for the installed version).
document_ids = [doc.metadata['chunk_id'] for doc in documents]

# Stable IDs are only usable if they are unique; fail early with a clear message.
if len(set(document_ids)) != len(document_ids):
    raise ValueError(
        "Duplicate chunk_id values found; chunk IDs must be unique "
        "to be used as vector store IDs"
    )

vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embeddings_model,
    ids=document_ids,
    persist_directory=str(persist_directory),
    collection_name=collection_name,
)

print(f"✓ Created ChromaDB vector store")
print(f"  Persist directory: {persist_directory}")
print(f"  Collection name: {collection_name}")
print(f"  Documents ingested: {len(documents)}")


✓ Created ChromaDB vector store
  Persist directory: ../chroma_db
  Collection name: health_education_chunks
  Documents ingested: 131


In [7]:
# Sanity check: the collection should hold exactly one entry per document
expected_count = len(documents)
# Reads the count from the underlying Chroma collection object
collection_count = vector_store._collection.count()

print(f"✓ Vector store verification")
print(f"  Documents in collection: {collection_count}")

if collection_count != expected_count:
    print(f"  ⚠ Warning: Expected {expected_count} documents, found {collection_count}")
else:
    print("  ✓ All documents successfully ingested!")


✓ Vector store verification
  Documents in collection: 131
  ✓ All documents successfully ingested!


In [8]:
# Smoke-test retrieval with a simple similarity search
test_query = "What is high blood pressure?"
top_k = 3  # number of results requested; also used in the printed header
results = vector_store.similarity_search(test_query, k=top_k)

print(f"\n{'='*60}")
print("SAMPLE SIMILARITY SEARCH")
print(f"{'='*60}")
print(f"\nQuery: '{test_query}'")
print(f"\nTop {top_k} results:\n")

for rank, doc in enumerate(results, 1):
    meta = doc.metadata
    # Every field is read with an 'N/A' fallback so an entry that lacks one
    # of them does not abort the preview with a KeyError.
    print(f"{rank}. Chunk ID: {meta.get('chunk_id', 'N/A')}")
    print(f"   Source: {meta.get('filename', 'N/A')}")
    print(f"   Type: {meta.get('type', 'N/A')}")
    print(f"   Condition: {meta.get('condition', 'N/A')}")
    print(f"   Ingestion timestamp: {meta.get('ingestion_timestamp', 'N/A')}")
    # Show only the first 150 characters of the chunk text
    print(f"   Text preview: {doc.page_content[:150]}...")
    print()



SAMPLE SIMILARITY SEARCH

Query: 'What is high blood pressure?'

Top 3 results:

1. Chunk ID: hypertension_medlineplus_treatment_chunk_0001
   Source: hypertension_medlineplus_treatment.md
   Type: treatments
   Condition: hypertension
   Ingestion timestamp: 2025-11-16T12:57:25.183898
   Text preview: ## Summary

### What is high blood pressure?

[High blood pressure](https://medlineplus.gov/highbloodpressure.html) , also called hypertension, is whe...

2. Chunk ID: hypertension_medlineplus_overview_chunk_0004
   Source: hypertension_medlineplus_overview.md
   Type: conditions
   Condition: hypertension
   Ingestion timestamp: 2025-11-16T12:57:25.183898
   Text preview: | High Blood Pressure Stage 2                                                          | 140 or higher             | **or**  | 90 or higher           ...

3. Chunk ID: hypertension_medlineplus_overview_chunk_0002
   Source: hypertension_medlineplus_overview.md
   Type: conditions
   Condition: hypertension
   Ingestio

In [9]:
# Final recap of the ingestion run, assembled line by line and printed once
banner_rule = "=" * 60
summary_lines = [
    "\n" + banner_rule,
    "INGESTION SUMMARY",
    banner_rule,
    f"\n✓ Chunks loaded: {len(chunks_data)}",
    f"✓ Documents created: {len(documents)}",
    # absolute() prefixes the working directory; it does not collapse '..'
    f"✓ Vector store location: {persist_directory.absolute()}",
    "✓ Collection name: health_education_chunks",
    f"✓ Embedding model: {embeddings_model.model}",
    "\nThe vector store is now ready for RAG applications!",
]
print("\n".join(summary_lines))



INGESTION SUMMARY

✓ Chunks loaded: 131
✓ Documents created: 131
✓ Vector store location: /home/mateusdelai/Desktop/applied-ai-health-rag/notebooks/../chroma_db
✓ Collection name: health_education_chunks
✓ Embedding model: text-embedding-3-small

The vector store is now ready for RAG applications!
