In [1]:
# setup env
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# our prompt
prompt = "What services does RMHC offer?"


In [3]:
# setup chroma client
import chromadb
chroma_client = chromadb.PersistentClient('./tmp/chroma-db')

In [4]:
# custom embedding function using Gemini Embeddings API
# ref: https://github.com/google-gemini/cookbook/blob/main/examples/chromadb/Vectordb_with_chroma.ipynb
import google.genai as genai

client = genai.Client(api_key=os.getenv("GENAI_API_KEY"))

class GeminiEmbeddingFunction(chromadb.EmbeddingFunction):
  def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
    EMBEDDING_MODEL_ID = "gemini-embedding-001"  # @param ["gemini-embedding-001", "text-embedding-004"] {"allow-input": true, "isTemplate": true}
    title = "Custom query"
    response = client.models.embed_content(
        model=EMBEDDING_MODEL_ID,
        contents=input,
        config=genai.types.EmbedContentConfig(
          task_type="retrieval_document",
          title=title
        )
    )

    return response.embeddings[0].values

In [5]:
# create collection for storage
collection = chroma_client.create_collection(name="rmhc",
                                             embedding_function=GeminiEmbeddingFunction()
                                             )

  embedding_function=GeminiEmbeddingFunction()


In [6]:
# function for chunking text into smaller pieces for better embedding and retrieval
# makes use of huggingface tokenizers library to tokenize text
#  and chunk it based on a specified chunk size

import transformers
from transformers import AutoTokenizer, GPT2Tokenizer
# define tokenizer
tokenizer:GPT2Tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2-medium')
# chunking function
def chunker(text, chunk_size=5) -> list[str]:
    tokens = tokenizer.tokenize(text)
    # get length of tokens
    _len = len(tokens)
    chunks = []
    chunk = []
    for i in range(_len):
        if i%chunk_size==0 and i!=0:
            print(chunk)    
            print('chunk limit reached')
            chunks.append(tokenizer.convert_tokens_to_string(chunk))
            chunk = []
        chunk.append(tokens[i])
        print(f'Token {i}: {tokens[i]}')
        
    if chunk:  # Append the last chunk if it exists
        print(chunk)
        chunks.append(tokenizer.convert_tokens_to_string(chunk))
    return chunks

  from .autonotebook import tqdm as notebook_tqdm
PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
# read text from rmhc_docs dir
# note we will not be doing file readong for the scraper most likely
# as far as i (chi) understand
# companies usually run scrapers through timed jobs or specific triggers
# that manage the scraping and data ingestion process
# a common approach is to have sth like an aws lambda function or a serverless function
# that gets triggered when new data is available or at scheduled intervals 
# to perform the scraping and then directly ingest the data 
# into the vector db

import os
import uuid

docs = {}

rmhc_docs_dir = './rmhc_docs'
for filename in os.listdir(rmhc_docs_dir):
    print(filename)
    if filename.endswith(('.txt','.md')):
        with open(os.path.join(rmhc_docs_dir, filename), 'rb') as f:
            # read doc
            content = f.read().decode('utf-8', errors='ignore')
            # chunk doc into 500 token chunks
            chunks = chunker(content, chunk_size=500)
            # add chunks to docs dict with unique id
            for i in chunks:
                doc_id = str(uuid.uuid4())
                docs[doc_id] = i

Token indices sequence length is longer than the specified maximum sequence length for this model (4917 > 1024). Running this sequence through the model will result in indexing errors


H4I AI Chatbot Workshop - RMHC Info.txt
Token 0: ï
Token 1: »
Token 2: ¿
Token 3: About
Token 4: ĠRonald
Token 5: ĠMcDonald
Token 6: ĠHouse
Token 7: ĠChar
Token 8: ities
Token 9: Ġ(
Token 10: RM
Token 11: HC
Token 12: ):
Token 13: č
Token 14: Ċ
Token 15: *
Token 16: ĠA
Token 17: Ġglobal
Token 18: Ġnonprofit
Token 19: Ġorganization
Token 20: Ġdedicated
Token 21: Ġto
Token 22: Ġsupporting
Token 23: Ġthe
Token 24: Ġhealth
Token 25: Ġand
Token 26: Ġwell
Token 27: -
Token 28: being
Token 29: Ġof
Token 30: Ġchildren
Token 31: Ġand
Token 32: Ġtheir
Token 33: Ġfamilies
Token 34: .
Token 35: č
Token 36: Ċ
Token 37: *
Token 38: ĠCore
Token 39: Ġmission
Token 40: :
Token 41: Ġprovide
Token 42: Ġessential
Token 43: Ġservices
Token 44: Ġthat
Token 45: Ġremove
Token 46: Ġbarriers
Token 47: ,
Token 48: Ġstrengthen
Token 49: Ġfamilies
Token 50: Ġand
Token 51: Ġpromote
Token 52: Ġhealing
Token 53: Ġwhen
Token 54: Ġchildren
Token 55: Ġneed
Token 56: Ġhealthcare
Token 57: .
Token 58: č
Token 59: Ċ
Token 

In [8]:
import pprint
print(list(docs.keys()))
pprint.pprint(docs)

['9f0e4c7b-bd0f-4f5e-831c-6980b2451005', '971cceae-c0e8-4832-802a-a8fc3afb666b', 'ccd1e26f-4bb0-40a8-93ac-511786643286', 'ef6648fc-7c4a-4b61-9266-437f61467a35', '2d2d7722-c7d6-46f4-837b-bc55668c0409', '05a62d39-a848-401d-be3e-18a6c6b20c35', '294b81b6-f7d8-4104-8ff8-b1dbdab8f7ad', 'a75200a0-9fb4-46cd-a569-a86a14b3f11c', 'e27526e3-f1cb-46c5-b2a1-1a626fae5fee', '6570a6c4-5f11-4718-aa2a-c2e65f617fe4']
{'05a62d39-a848-401d-be3e-18a6c6b20c35': 'ella, and chickenpox (unless they’ve '
                                         'had chickenpox in the past). Signed '
                                         'documentation by a doctor must be '
                                         'provided for medically exempt '
                                         'patients. \r\n'
                                         '   * CAREGIVER AGE: Caregivers 21 or '
                                         'under must have a support person (22 '
                                         'years or older) througho

In [9]:
for i in docs:
    collection.add(ids=[i], documents=[docs[i]])

In [10]:
# another search test
from pprint import pprint
res = collection.query(
    query_texts=[prompt],
    n_results=4
)
# pprint(res)
# print(len(res['documents']))
doc_chunks = [i for i in res['documents'][0]]
pprint(doc_chunks)

['\ufeffAbout Ronald McDonald House Charities (RMHC):\r\n'
 '* A global nonprofit organization dedicated to supporting the health and '
 'well-being of children and their families.\r\n'
 '* Core mission: provide essential services that remove barriers, strengthen '
 'families and promote healing when children need healthcare.\r\n'
 '* Vision: A world where every family has what they need to ensure the best '
 'health outcomes for their children.\r\n'
 '* Every Ronald McDonald House is a haven for families with children in the '
 'hospital. It provides all the comforts of home, plus the compassion of '
 'staff, volunteers, and other families — all just steps away from the '
 'hospital.\r\n'
 '* When the best medical treatment for a child is far from home, families '
 'don’t have to worry about where to stay. At a Ronald McDonald House, '
 'families can be together, enjoy home-cooked meals and receive compassionate '
 'hospitality and support from staff and volunteers — all near the chil

In [11]:
RAG_prompt_template = '''
CONTEXT:
{retrieved_documents}

QUESTION:
{user_question}

INSTRUCTIONS:
Answer the QUESTION using only the information provided in the CONTEXT above.
Keep your answer grounded in the facts of the CONTEXT.
Use [chunk_id] notation immediately after each statement to cite sources.
If the CONTEXT doesn't contain enough information to fully answer the QUESTION, state: "I don't have enough information to answer this completely" and explain what's missing.
Match the language of the user's QUESTION in your response.

Provide a clear, factual answer based solely on the CONTEXT provided.
'''

In [12]:
# little llm test for fun
from google import genai
from google.genai.types import HttpOptions

# format prompt with context from retrieved doc chunks
formatted_prompt = RAG_prompt_template.format(
    retrieved_documents='\n\n'.join(doc_chunks),
    user_question=prompt
)

client = genai.Client(http_options=HttpOptions(api_version="v1"))
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=formatted_prompt,
    config=genai.types.GenerateContentConfig(
        max_output_tokens=1024,
        temperature=0.2,
        top_p=0.8,
        stop_sequences=["###"]
    )
)
print(response.text)

RMHC offers the following services:

*   **Core Programs:**
    *   Ronald McDonald House: A "home away from home" near hospitals where families can stay together while a child receives medical treatment [chunk_id: 10]. This includes comforting meals, quality sleep, and playtime for children [chunk_id: 11]. Houses typically include bedrooms, kitchens, dining areas, common spaces, play rooms, and support services [chunk_id: 17].
    *   Ronald McDonald Family Room: Calm, private lounge spaces inside hospitals [chunk_id: 10]. These rooms offer a quiet place for families to recharge, grab a bite to eat, shower, or get rest [chunk_id: 12]. Amenities typically include a kitchen and snacks, private bathrooms and shower facilities, sleeping rooms, laundry facilities, internet access, reading materials, a seating area with a television, and a quiet room [chunk_id: 14].

*   **Additional Services (Global):**
    *   Nutritious meal programs [chunk_id: 24].
    *   Education programs for outpati