### RAG

In [2]:
import os
from dotenv import load_dotenv

In [3]:
# Let python-dotenv populate the process environment, then read the Gemini
# key from it. `api_key` is None when GEMINI_KEY is not defined.
load_dotenv()
api_key = os.environ.get("GEMINI_KEY")

In [4]:
import requests

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Bound the wait and fail loudly on an HTTP error, rather than hanging forever
# or trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the course it belongs to. Copies are built so `documents_raw`
# is left untouched and re-running this cell gives the same result.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [5]:
# Inspect the first flattened record, including the 'course' key added above.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
import minsearch

# Fields handed to minsearch as free text vs. as keyword fields. `search`
# later boosts 'question'/'section' and filters on 'course'.
faq_text_fields = ["question", "text", "section"]
faq_keyword_fields = ["course"]

index = minsearch.Index(
    text_fields=faq_text_fields,
    keyword_fields=faq_keyword_fields,
)

# Fit on the flattened FAQ records; left as the last expression so the cell
# still displays the fitted index.
index.fit(documents)

<minsearch.minsearch.Index at 0x706604515040>

In [6]:
from google import genai

# Pass the key loaded from the environment (the `api_key` variable). The
# original passed the literal string "api_key", which is not a credential.
gemini_client = genai.Client(api_key=api_key)

In [7]:
def search(query):
    """Query the module-level minsearch ``index`` for ``query``.

    The call passes a ``filter_dict`` selecting the
    'data-engineering-zoomcamp' course, boosts of 3.0 for 'question' and
    0.5 for 'section', and asks for 5 results.

    Args:
        query: The user's question as a string.

    Returns:
        Whatever ``index.search`` returns (iterated as a sequence of FAQ
        records by ``build_prompt``).
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [8]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and retrieved FAQ records.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.

    Raises:
        KeyError: If a record lacks one of the three expected keys.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" stays
    # visible; the resulting text equals the original stripped template.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" entry per record, blank-line separated.
    entries = [
        f"section: {record['section']}\nquestion: {record['question']}\nanswer: {record['text']}\n\n"
        for record in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

In [9]:
def llm(prompt):
    """Send ``prompt`` to the Gemini model and return the generated text.

    Args:
        prompt: The complete prompt string to pass as ``contents``.

    Returns:
        The ``text`` attribute of the model response.
    """
    # Build the client from the environment-loaded key on each call so this
    # function does not depend on a client object created in another cell.
    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
    )

    # Return the text itself. The original `return print(...)` always
    # returned None, so callers such as `rag` could never use the answer.
    return response.text

In [10]:
def rag(query):
    """Run the retrieve -> prompt -> generate pipeline for ``query``.

    The question goes to ``search``, the hits and the question go to
    ``build_prompt``, and the resulting prompt goes to ``llm``.

    Args:
        query: The user's question as a string.

    Returns:
        Whatever ``llm`` returns for the built prompt.
    """
    return llm(build_prompt(query, search(query)))

In [11]:
# Try the pipeline end to end on a sample question.
rag('how do I run kafka?')

To run kafka with Java, in the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

To run kafka with Python, create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once):
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate). Also, the virtual environment should be created only to run the python file, and Docker images should first all be up and running.

If you encounter a "ModuleNotFoundError: No module named 'kafka.vendor.six.moves'" error, use pip install kafka-python-ng instead.



In [None]:
import os

# Report only whether the key is present. Printing the value itself would
# store the secret in the saved notebook output.
print("GEMINI_KEY is set" if os.environ.get("GEMINI_KEY") else "GEMINI_KEY is not set")

In [13]:
# Second sample question for the same pipeline.
rag('the course has already started, can I still enroll?')

Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects, so don't leave everything for the last minute.



### RAG with Vector Search

In [8]:
from qdrant_client import QdrantClient, models

In [9]:
# Create the Qdrant client pointed at the local instance on port 6333.
# timeout=60.0 is presumably in seconds — confirm against the qdrant-client docs.
qd_client = QdrantClient("http://localhost:6333", timeout=60.0)

In [10]:
# Vector size used when the collection is created below. It has to agree with
# the output size of the embedding model named in `model_handle`
# (presumably 512 for this model — verify against the model card).
EMBEDDING_DIMENSIONALITY = 512
# Embedding model identifier attached to every models.Document built below.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [11]:
# Drop any existing "zoomcamp-faq" collection before it is created below.
# NOTE(review): the name is hard-coded here because `collection_name` is only
# assigned in a later cell; keep the two values in sync.
qd_client.delete_collection(collection_name="zoomcamp-faq")

True

In [12]:
# Name of the Qdrant collection that will hold the FAQ vectors.
# To discard an existing collection with this name and start over, run:
#   qd_client.delete_collection(collection_name=collection_name)
collection_name = "zoomcamp-faq"

In [13]:
# Create the collection: vectors have EMBEDDING_DIMENSIONALITY components and
# are compared using cosine distance.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE,
    )
)

True

In [14]:
# Add a keyword index on the "course" payload field — presumably so searches
# can be filtered by course; confirm against how the collection is queried.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [15]:
# Upload the FAQ in fixed-size batches so that no single upsert request has
# to carry the whole corpus.
batch_size = 100
for start in range(0, len(documents), batch_size):
    points_batch = [
        models.PointStruct(
            # Point id is the record's position in `documents`.
            id=i,
            # Question and answer text are embedded together.
            vector=models.Document(
                text=doc['question'] + ' ' + doc['text'],
                model=model_handle,
            ),
            # Store the whole record, not just the question: the "course"
            # keyword index needs the `course` field, and building a prompt
            # from a hit needs `section` and `text` as well.
            payload=doc,
        )
        for i, doc in enumerate(documents[start:start + batch_size], start=start)
    ]
    qd_client.upsert(collection_name=collection_name, points=points_batch)

: 

: 

: 

In [23]:
# Build one point per FAQ record. The text to embed is the question followed
# by the answer, and the full record is stored as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=record['question'] + ' ' + record['text'],
            model=model_handle,
        ),
        payload=record,
    )
    for position, record in enumerate(documents)
]

In [16]:
# Inspect the last point to check its id, document text and payload.
points[-1]

PointStruct(id=947, vector=Document(text='How to destroy infrastructure created via GitHub Actions Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin', model='jinaai/jina-embeddings-v2-small-en', options=None), payload={'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin', 'section': 'Module 6: Best practices', 'question': 'How to destroy infrastructure created via GitHub Actions', 'course': 'mlops-zoomcamp'})

In [24]:
# Upload every prepared point to the collection in a single upsert call.
# NOTE(review): the vectors are models.Document objects, so embedding
# presumably happens as part of this call — it may be slow for the full
# list; confirm, and consider chunking `points` if it times out.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

: 

: 

: 

In [None]:
question = 'I just discovered the course. Can I still join it?'