### RAG

In [6]:
import os
from dotenv import load_dotenv

In [7]:
# Load variables from a .env file (if any) before looking up the Gemini key.
load_dotenv()
# None when GEMINI_KEY is not defined in the environment.
api_key = os.environ.get("GEMINI_KEY")

In [8]:
import requests 

# Download the FAQ dump: a JSON list of courses, each holding a 'course' name
# and a list of 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with its course name.
# Records are copied so documents_raw is left untouched and the cell can be
# re-run without side effects on the raw data.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [4]:
# Inspect the first flattened record; it should carry the added 'course' key.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
import minsearch

# 'question', 'text' and 'section' are registered as text fields; 'course' is
# registered as a keyword field, which search() uses in its filter.
index = minsearch.Index(keyword_fields=["course"], text_fields=["question", "text", "section"])

# Fit on the flattened FAQ records; left as the last expression so the cell
# displays the returned value.
index.fit(documents)

<minsearch.minsearch.Index at 0x7df2d7f86900>

In [9]:
from google import genai
# Pass the key read from the environment. The quoted literal "api_key" was
# previously sent as the key itself, so this client could never authenticate.
gemini_client = genai.Client(api_key=api_key)

In [7]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search over the FAQ index, restricted to one course.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field must equal. Defaults to
            the course that used to be hard-coded here.
        num_results: Maximum number of hits to request. Defaults to 5.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    # Per-field boost factors handed to the index: 'question' 3.0, 'section' 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [8]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # Same template text as a triple-quoted block, spelled out line by line so
    # the trailing space after "CONTEXT:" is visible.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" block per hit, each followed by a blank line.
    context = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    return template.format(question=query, context=context).strip()

In [9]:
def llm(prompt):
    """Send ``prompt`` to the Gemini model and return the generated text.

    Args:
        prompt: The full prompt string to submit.

    Returns:
        ``response.text`` from the ``gemini-2.0-flash`` generation call.
        Wrap the call in ``print(...)`` to see it rendered with line breaks.
    """
    # A client is built per call from the module-level api_key.
    gemini_client = genai.Client(api_key=api_key)
    response = gemini_client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt
    )

    # print() returns None, so the former `return print(response.text)` made
    # every caller (including rag) receive None instead of the answer.
    return response.text

In [10]:
def rag(query):
    """Answer ``query`` by chaining keyword search, prompt building and the LLM.

    Returns whatever ``llm`` returns for the assembled prompt.
    """
    return llm(build_prompt(query, search(query)))

In [11]:
# Try the pipeline end to end with a sample question.
rag('how do I run kafka?')

To run kafka with Java, in the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

To run kafka with Python, create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once):
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate). Also, the virtual environment should be created only to run the python file, and Docker images should first all be up and running.

If you encounter a "ModuleNotFoundError: No module named 'kafka.vendor.six.moves'" error, use pip install kafka-python-ng instead.



In [None]:
import os

# Report only whether the key is present. Printing the value itself would
# store the secret in the notebook's saved output.
print("GEMINI_KEY is set" if os.environ.get("GEMINI_KEY") else "GEMINI_KEY is not set")

In [None]:
# Second sample question for the same pipeline.
rag('the course has already started, can I still enroll?')

### RAG with Vector Search

In [11]:
from qdrant_client import QdrantClient, models

In [12]:
# Initialise the client and connect to the local Qdrant instance on port 6333.
# timeout=60.0 is passed through to the client - presumably seconds, to
# tolerate slow requests; confirm against the qdrant-client documentation.
qd_client = QdrantClient("http://localhost:6333", timeout=60.0)

In [13]:
# Vector size used when the collection is created. It must match the output
# size of the embedding model below - TODO confirm 512 for this model.
EMBEDDING_DIMENSIONALITY = 512
# Embedding model identifier passed to models.Document for documents and queries.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [None]:
#qd_client.delete_collection(collection_name="zoomcamp-faq")

True

In [19]:
# Name of the collection used by the Qdrant calls in this notebook.
# To start from scratch, delete it first with
# qd_client.delete_collection(collection_name=collection_name);
# otherwise the existence check in the next cell decides whether to create it.

collection_name = "zoomcamp-faq"

In [20]:
# Create the collection only when it is missing, so re-running this cell
# leaves an existing collection alone.
if qd_client.collection_exists(collection_name):
    print(f"Collection '{collection_name}' already exists")
else:
    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            distance=models.Distance.COSINE,  # similarity metric used for search
            size=EMBEDDING_DIMENSIONALITY,  # length of each stored vector
        ),
    )
    print(f"Collection '{collection_name}' created successfully")

Collection 'zoomcamp-faq' already exists


In [16]:
# Create a keyword payload index on 'course', the field vector_search filters on.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [5]:
# Earlier, non-batched version, kept for reference only. It embeds the question
# together with the answer text and builds one PointStruct per document with
# the full document as payload.
# It is wrapped in a string literal so it does not execute; the batched cell
# below replaced it to see whether that would stop the crashes.
'''
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)
'''

"\npoints = []\n\nfor i, doc in enumerate(documents):\n    text = doc['question'] + ' ' + doc['text']\n    vector = models.Document(text=text, model=model_handle)\n    point = models.PointStruct(\n        id=i,\n        vector=vector,\n        payload=doc\n    )\n    points.append(point)\n"

In [17]:
# Embed and upload the FAQ in fixed-size batches, so only one batch of points
# is held in memory at a time.
batch_size = 100
points_batch = []
for i, doc in enumerate(documents):
    # Embed the question together with its answer text.
    text = doc['question'] + ' ' + doc['text']
    point = models.PointStruct(
        id=i,
        vector=models.Document(text=text, model=model_handle),
        # Store the whole record: vector_search filters on 'course' and
        # build_prompt reads 'section', 'question' and 'text'. A payload with
        # only 'question' would make the filter match nothing and build_prompt
        # raise KeyError.
        payload=doc,
    )
    points_batch.append(point)
    if len(points_batch) >= batch_size:
        qd_client.upsert(collection_name=collection_name, points=points_batch)
        points_batch = []
# Flush the final, partially filled batch.
if points_batch:
    qd_client.upsert(collection_name=collection_name, points=points_batch)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

In [30]:
#points[-1]

In [15]:
# Sample user question; the rag() calls visible below pass their own literals.
question = 'I just discovered the course. Can I still join it?'

In [22]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Semantic search over the Qdrant FAQ collection, restricted to one course.

    Args:
        question: Free-text query; embedded with ``model_handle``.
        course: Value the ``course`` payload field must match. Defaults to
            the course that used to be hard-coded here.
        limit: Maximum number of points to request. Defaults to 5.

    Returns:
        A list with the payload of each returned point, in the order given
        by the query response.
    """
    print('vector_search is used')

    # Only points whose 'course' payload equals the requested course qualify.
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course)
            )
        ]
    )

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=course_filter,
        limit=limit,
        with_payload=True
    )

    return [point.payload for point in query_points.points]

In [25]:
def build_prompt(query, search_results):
    """Fill the teaching-assistant prompt with the question and retrieved entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The completed prompt, stripped of surrounding whitespace.
    """
    # Template lines are joined with newlines; note the trailing space kept
    # after "CONTEXT:".
    template_lines = [
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT: ",
        "{context}",
    ]
    prompt_template = "\n".join(template_lines)

    # Render each entry as a section/question/answer block ending in a blank line.
    blocks = []
    for entry in search_results:
        blocks.append(
            f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        )

    filled = prompt_template.format(question=query, context="".join(blocks))
    return filled.strip()

In [27]:
def llm(prompt):
    """Submit ``prompt`` to Gemini and return the model's text reply.

    Args:
        prompt: The full prompt string to submit.

    Returns:
        ``response.text`` from the ``gemini-2.0-flash`` generation call.
        Use ``print(llm(...))`` to render it with line breaks.
    """
    # A client is built per call from the module-level api_key.
    gemini_client = genai.Client(api_key=api_key)
    response = gemini_client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt
    )

    # Return the text itself: `return print(...)` always evaluated to None,
    # which left rag() with nothing to hand back.
    return response.text

In [28]:
def rag(query):
    """Answer ``query`` using vector search, prompt building and the LLM.

    Returns whatever ``llm`` returns for the assembled prompt.
    """
    hits = vector_search(query)
    return llm(build_prompt(query, hits))

In [29]:
# Same sample question as in the keyword section, now routed through vector_search.
rag('how do I run kafka?')

vector_search is used


To run kafka, use `docker ps` to confirm that your kafka broker docker container is working. If it is not working, navigate to the docker compose yaml file folder and run `docker compose up -d` to start all instances.

To run producer/consumer/kstreams in the terminal, in the project directory, run: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`.

If you are running a producer.py file, create a virtual environment and run requirements.txt in that environment. Then activate the virtual environment.
*   To create a virtual env and install packages (run only once):
    `python -m venv env`
    `source env/bin/activate`
    `pip install -r ../requirements.txt`
*   To activate it (you'll need to run it every time you need the virtual env):
    `source env/bin/activate`
*   To deactivate it:
    `deactivate`


### Lessons learnt
Because the Docker container persists its data, I don't need to re-run the whole notebook every time. All I need is:

- `docker ps -a`
- `docker start containerID`
- Ensure the collection exists on the Qdrant dashboard and is healthy (contains points) before proceeding to the next step.
- For Qdrant:
    - initialise the client, the embedding settings and the collection name
    - run the "if collection exists" check cell, which skips creation when the collection is already there
    - move on to the question, `vector_search`, `build_prompt`, `llm` and `rag` cells
- If you proceed with recreating the points and upserting them, the kernel will crash.