In [1]:
# setup env
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# our prompt
prompt = "what is a pod in kubernetes? is it just a docker container?"


In [3]:
# setup chroma client
import chromadb
chroma_client = chromadb.PersistentClient('./tmp/chroma-db')

In [4]:
# custom embedding function using Gemini Embeddings API
# ref: https://github.com/google-gemini/cookbook/blob/main/examples/chromadb/Vectordb_with_chroma.ipynb
import google.genai as genai

client = genai.Client(api_key=os.getenv("GENAI_API_KEY"))

class GeminiEmbeddingFunction(chromadb.EmbeddingFunction):
  def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
    EMBEDDING_MODEL_ID = "gemini-embedding-001"  # @param ["gemini-embedding-001", "text-embedding-004"] {"allow-input": true, "isTemplate": true}
    title = "Custom query"
    response = client.models.embed_content(
        model=EMBEDDING_MODEL_ID,
        contents=input,
        config=genai.types.EmbedContentConfig(
          task_type="retrieval_document",
          title=title
        )
    )

    return response.embeddings[0].values

In [9]:
# create collection for storage
collection = chroma_client.get_collection(name="test",
                                             embedding_function=GeminiEmbeddingFunction()
                                             )

  embedding_function=GeminiEmbeddingFunction()


In [6]:
# function for chunking text into smaller pieces for better embedding and retrieval
# makes use of huggingface tokenizers library to tokenize text
#  and chunk it based on a specified chunk size

import transformers
from transformers import AutoTokenizer, GPT2Tokenizer
# define tokenizer
tokenizer:GPT2Tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2-medium')
# chunking function
def chunker(text, chunk_size=5) -> list[str]:
    tokens = tokenizer.tokenize(text)
    # get length of tokens
    _len = len(tokens)
    chunks = []
    chunk = []
    for i in range(_len):
        if i%chunk_size==0 and i!=0:
            print(chunk)    
            print('chunk limit reached')
            chunks.append(tokenizer.convert_tokens_to_string(chunk))
            chunk = []
        chunk.append(tokens[i])
        print(f'Token {i}: {tokens[i]}')
        
    if chunk:  # Append the last chunk if it exists
        print(chunk)
        chunks.append(tokenizer.convert_tokens_to_string(chunk))
    return chunks

  from .autonotebook import tqdm as notebook_tqdm
PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
# read text from test_docs dir
# note we will not be doing file readong for the scraper most likely
# as far as i (chi) understand
# companies usually run scrapers through timed jobs or specific triggers
# that manage the scraping and data ingestion process
# a common approach is to have sth like an aws lambda function or a serverless function
# that gets triggered when new data is available or at scheduled intervals 
# to perform the scraping and then directly ingest the data 
# into the vector db

import os
import uuid

docs = {}

test_docs_dir = './test_docs'
for filename in os.listdir(test_docs_dir):
    print(filename)
    if filename.endswith(('.txt','.md')):
        with open(os.path.join(test_docs_dir, filename), 'rb') as f:
            # read doc
            content = f.read().decode('utf-8', errors='ignore')
            # chunk doc into 500 token chunks
            chunks = chunker(content, chunk_size=500)
            # add chunks to docs dict with unique id
            for i in chunks:
                doc_id = str(uuid.uuid4())
                docs[doc_id] = i

Token indices sequence length is longer than the specified maximum sequence length for this model (3216 > 1024). Running this sequence through the model will result in indexing errors


compprog-c5-snippet.txt
Token 0: Chapter
Token 1: Ġ5
Token 2: č
Token 3: Ċ
Token 4: Complete
Token 5: Ġsearch
Token 6: č
Token 7: Ċ
Token 8: Complete
Token 9: Ġsearch
Token 10: Ġis
Token 11: Ġa
Token 12: Ġgeneral
Token 13: Ġmethod
Token 14: Ġthat
Token 15: Ġcan
Token 16: Ġbe
Token 17: Ġused
Token 18: Ġto
Token 19: Ġsolve
Token 20: Ġalmost
Token 21: Ġany
Token 22: č
Token 23: Ċ
Token 24: al
Token 25: gorithm
Token 26: Ġproblem
Token 27: .
Token 28: ĠThe
Token 29: Ġidea
Token 30: Ġis
Token 31: Ġto
Token 32: Ġgenerate
Token 33: Ġall
Token 34: Ġpossible
Token 35: Ġsolutions
Token 36: Ġto
Token 37: Ġthe
Token 38: Ġproblem
Token 39: č
Token 40: Ċ
Token 41: using
Token 42: Ġbrute
Token 43: Ġforce
Token 44: ,
Token 45: Ġand
Token 46: Ġthen
Token 47: Ġselect
Token 48: Ġthe
Token 49: Ġbest
Token 50: Ġsolution
Token 51: Ġor
Token 52: Ġcount
Token 53: Ġthe
Token 54: Ġnumber
Token 55: Ġof
Token 56: č
Token 57: Ċ
Token 58: s
Token 59: olutions
Token 60: ,
Token 61: Ġdepending
Token 62: Ġon
Token 63:

In [12]:
import pprint
print(list(docs.keys()))
pprint.pprint(docs)

['c363322d-fc0f-4bb1-aad6-d6d8f097475e', '5279d4bc-a3c5-4b0c-98b0-4e9a07ae4bf9', 'b1823673-9ad6-40e9-822d-bcd9967c1577', 'b7d757ed-ca64-4c7f-af7d-7c73de9c8ce7', 'f6874ee2-cb3d-40b3-bd97-c14ee4d59b37', 'd9a47b1e-7c09-4434-8f6a-eb893d648a25', '279d4892-972a-4068-b233-c68e9b31a33b', 'abef84eb-bd1e-437a-ac73-ba049a904b46', '76cd9a12-caf6-48b0-9dc6-e00486783b6b', 'fea4f6b4-913a-4d16-b373-20e80830d567', '53702b42-a3eb-4a0f-b792-ebd84c7bd4ea', '00b03d04-996e-403c-bb79-4408764a8bb2', 'c5c0d08d-a50c-4866-a4e4-878d7311e9f6', 'f7cb7c85-58a0-4b95-927f-695ae66baee3']
{'00b03d04-996e-403c-bb79-4408764a8bb2': ' to each other?\r\n'
                                         '\r\n'
                                         'That’s where **Services** come '
                                         'in.\r\n'
                                         '\r\n'
                                         'A **Service** provides a stable '
                                         'endpoint (like a permanent address) 

In [10]:
for i in docs:
    collection.add(ids=[i], documents=[docs[i]])

In [13]:
# another search test
from pprint import pprint
res = collection.query(
    query_texts=[prompt],
    n_results=4
)
# pprint(res)
# print(len(res['documents']))
doc_chunks = [i for i in res['documents'][0]]
pprint(doc_chunks)

['# **Introduction to Kubernetes (K8s)**\r\n'
 '\r\n'
 '## **1. What is Kubernetes?**\r\n'
 '\r\n'
 'Kubernetes (often abbreviated as **K8s**, where “8” represents the eight '
 'letters between K and s) is an open-source system for automating the '
 'deployment, scaling, and management of containerized applications. '
 'Originally designed by Google based on their experience running massive '
 'containerized workloads (like Search, Gmail, and YouTube), Kubernetes was '
 'later donated to the Cloud Native Computing Foundation (CNCF), where it is '
 'now maintained by a large global community.\r\n'
 '\r\n'
 'At its core, Kubernetes is a **container orchestration platform**. This '
 'means that instead of manually running individual containers on individual '
 'machines, Kubernetes manages groups of machines (called a **cluster**) and '
 'automatically schedules, restarts, scales, and monitors your containers '
 'across that infrastructure.\r\n'
 '\r\n'
 'If Docker is like a way to packag

In [14]:
RAG_prompt_template = '''
CONTEXT:
{retrieved_documents}

QUESTION:
{user_question}

INSTRUCTIONS:
Answer the QUESTION using only the information provided in the CONTEXT above.
Keep your answer grounded in the facts of the CONTEXT.
Use [chunk_id] notation immediately after each statement to cite sources.
If the CONTEXT doesn't contain enough information to fully answer the QUESTION, state: "I don't have enough information to answer this completely" and explain what's missing.
Match the language of the user's QUESTION in your response.

Provide a clear, factual answer based solely on the CONTEXT provided.
'''

In [15]:
# little llm test for fun
from google import genai
from google.genai.types import HttpOptions

# format prompt with context from retrieved doc chunks
formatted_prompt = RAG_prompt_template.format(
    retrieved_documents='\n\n'.join(doc_chunks),
    user_question=prompt
)

client = genai.Client(http_options=HttpOptions(api_version="v1"))
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=formatted_prompt,
    config=genai.types.GenerateContentConfig(
        max_output_tokens=1024,
        temperature=0.2,
        top_p=0.8,
        stop_sequences=["###"]
    )
)
print(response.text)

A Pod is the smallest deployable unit in Kubernetes [4]. It typically contains one or more containers that share the same network, storage, and lifecycle [4]. In Kubernetes, you don't run containers directly; instead, you run Pods [4].

A Pod is not just a Docker container [4]. While a Pod contains containers, it can hold one or more containers [4]. The container runtime, which is usually Docker or containerd, actually runs the containers within a worker node [3.2].
