In [None]:
# setup env
# Load variables from a local .env file (if present) into the process environment so
# that later os.getenv lookups, e.g. GENAI_API_KEY for the Gemini client, can resolve.
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
# our prompt
# The user question: used as the retrieval query against the collection and as the
# QUESTION section of the RAG prompt sent to the LLM.
prompt = "what is a pod in kubernetes? is it just a docker container?"


In [None]:
# setup chroma client
import chromadb

# Directory where the persistent client keeps its data on disk.
CHROMA_DB_PATH = './tmp/chroma-db'
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

In [None]:
# custom embedding function using Gemini Embeddings API
# ref: https://github.com/google-gemini/cookbook/blob/main/examples/chromadb/Vectordb_with_chroma.ipynb
import google.genai as genai

# Module-level Gemini client. GeminiEmbeddingFunction.__call__ looks up this global
# name each time it is invoked, so whatever `client` is bound to at call time is used.
# NOTE(review): os.getenv returns None when GENAI_API_KEY is unset -- confirm the key
# is present in the environment / .env file before running.
client = genai.Client(api_key=os.getenv("GENAI_API_KEY"))

class GeminiEmbeddingFunction(chromadb.EmbeddingFunction):
  """Chroma embedding function backed by the Gemini Embeddings API.

  Chroma calls an instance of this class with a batch of documents and expects
  one embedding vector back per document, in the same order.
  """

  def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
    """Embed every document in ``input`` with the Gemini embedding model.

    Args:
      input: The batch of document strings handed over by Chroma.

    Returns:
      A list with one embedding vector per input document, in input order.
    """
    EMBEDDING_MODEL_ID = "gemini-embedding-001"  # @param ["gemini-embedding-001", "text-embedding-004"] {"allow-input": true, "isTemplate": true}
    title = "Custom query"
    # Uses the module-level `client`; it is resolved at call time, not at class definition.
    response = client.models.embed_content(
        model=EMBEDDING_MODEL_ID,
        contents=input,
        config=genai.types.EmbedContentConfig(
          task_type="retrieval_document",
          title=title
        )
    )

    # Return the vector of *every* embedding in the response. Returning only
    # response.embeddings[0].values would hand Chroma a single vector no matter
    # how many documents were in the batch.
    return [embedding.values for embedding in response.embeddings]

In [None]:
# create collection for storage
# get_or_create_collection (rather than get_collection) so the notebook also runs on a
# fresh database where the "test" collection does not exist yet; an existing collection
# with that name is simply reused.
collection = chroma_client.get_or_create_collection(
    name="test",
    embedding_function=GeminiEmbeddingFunction(),
)

In [None]:
# function for chunking text into smaller pieces for better embedding and retrieval
# makes use of huggingface tokenizers library to tokenize text
#  and chunk it based on a specified chunk size

import transformers
from transformers import AutoTokenizer, GPT2Tokenizer
# define tokenizer
# Created once at module level and shared by chunker(), which reads this global.
# NOTE(review): AutoTokenizer may hand back a "fast" tokenizer class rather than
# GPT2Tokenizer -- the annotation is only a hint; confirm if the exact type matters.
tokenizer:GPT2Tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2-medium')
# chunking function
def chunker(text, chunk_size=5) -> list[str]:
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` tokens.

    The text is tokenized with the module-level ``tokenizer``; the token list is
    cut into windows of ``chunk_size`` tokens (the last window may be shorter)
    and every window is converted back into a string.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk. Must be at least 1.

    Returns:
        The chunk strings in document order; an empty list when ``text``
        produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # Fail with a clear message instead of a ZeroDivisionError deep in the loop.
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')

    tokens = tokenizer.tokenize(text)
    # Step through the token list one window at a time. No per-token printing:
    # with realistic chunk sizes that would flood the notebook output.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

In [None]:
# read text from test_docs dir
# note we will not be doing file reading for the scraper most likely
# as far as i (chi) understand
# companies usually run scrapers through timed jobs or specific triggers
# that manage the scraping and data ingestion process
# a common approach is to have sth like an aws lambda function or a serverless function
# that gets triggered when new data is available or at scheduled intervals 
# to perform the scraping and then directly ingest the data 
# into the vector db

import os
import uuid

# Maps chunk id -> chunk text for every chunk of every ingested file.
docs = {}

test_docs_dir = './test_docs'
# sorted() keeps the ingestion order stable; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(test_docs_dir)):
    print(filename)
    # Only plain-text / markdown files are ingested.
    if not filename.endswith(('.txt', '.md')):
        continue
    with open(os.path.join(test_docs_dir, filename), 'rb') as f:
        # read doc; bytes that are not valid UTF-8 are dropped instead of raising
        content = f.read().decode('utf-8', errors='ignore')
    # chunk doc into 500 token chunks
    chunks = chunker(content, chunk_size=500)
    # add chunks to docs dict with a deterministic id derived from the file name, the
    # chunk position and the chunk text. Unlike uuid4, re-running the notebook on
    # unchanged files yields the same ids again instead of minting fresh ones for
    # content that is already stored in the persistent collection.
    for chunk_index, chunk_text in enumerate(chunks):
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f'{filename}#{chunk_index}:{chunk_text}'))
        docs[doc_id] = chunk_text

In [None]:
# Import the pprint *function*, the same form the search cell further down uses.
# `import pprint` would bind the module to the name `pprint`; once the search cell has
# rebound that name to the function, re-running `pprint.pprint(docs)` here would fail
# with AttributeError.
from pprint import pprint
print(list(docs.keys()))
pprint(docs)

In [None]:
# Store every chunk in the collection, one document per call.
for doc_id, chunk_text in docs.items():
    collection.add(ids=[doc_id], documents=[chunk_text])

In [None]:
# another search test
from pprint import pprint

# Ask the collection for the four closest chunks to the user prompt.
res = collection.query(query_texts=[prompt], n_results=4)

# Results come back as one list per query text; a single query was sent, so take index 0.
top_documents = res['documents'][0]
doc_chunks = list(top_documents)
pprint(doc_chunks)

In [None]:
# Prompt skeleton for the RAG call. It is filled in with str.format, which substitutes
# {retrieved_documents} (the retrieved chunk text) and {user_question} (the user's prompt).
RAG_prompt_template = '''
CONTEXT:
{retrieved_documents}

QUESTION:
{user_question}

INSTRUCTIONS:
Answer the QUESTION using only the information provided in the CONTEXT above.
Keep your answer grounded in the facts of the CONTEXT.
Use [chunk_id] notation immediately after each statement to cite sources.
If the CONTEXT doesn't contain enough information to fully answer the QUESTION, state: "I don't have enough information to answer this completely" and explain what's missing.
Match the language of the user's QUESTION in your response.

Provide a clear, factual answer based solely on the CONTEXT provided.
'''

In [None]:
# little llm test for fun
from google import genai
from google.genai.types import HttpOptions

# Prefix every retrieved chunk with its Chroma id so the model has something concrete
# to put into the [chunk_id] citations that RAG_prompt_template asks for.
labelled_chunks = [
    f'[{chunk_id}] {chunk_text}'
    for chunk_id, chunk_text in zip(res['ids'][0], doc_chunks)
]

# format prompt with context from retrieved doc chunks
formatted_prompt = RAG_prompt_template.format(
    retrieved_documents='\n\n'.join(labelled_chunks),
    user_question=prompt
)

# Use a dedicated name for the generation client. GeminiEmbeddingFunction reads the
# module-level `client` at call time, so rebinding `client` here would silently swap
# the client used for embeddings whenever an earlier cell is re-run afterwards.
llm_client = genai.Client(http_options=HttpOptions(api_version="v1"))
response = llm_client.models.generate_content(
    model="gemini-2.5-flash",
    contents=formatted_prompt,
    config=genai.types.GenerateContentConfig(
        max_output_tokens=1024,
        temperature=0.2,
        top_p=0.8,
        stop_sequences=["###"]
    )
)
print(response.text)