In [1]:
import requests
# Download the raw FAQ dump (a JSON list of per-course groups).
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP errors here instead of as a confusing JSON decode error below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [2]:
# Flatten the per-course groups into one list of documents, tagging each
# document with the name of the course it came from. Every entry is a shallow
# copy, so `documents_raw` stays exactly as downloaded and no loop variables
# leak into the notebook namespace.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
# Display the first three flattened documents.
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines 

In [4]:
import pandas as pd
# Load the flattened documents into a DataFrame and show the first rows
# as a tabular preview.
df_docs = pd.DataFrame.from_records(documents)
df_docs.head()

Unnamed: 0,text,section,question,course
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp
2,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp
3,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp


In [5]:
import minsearch

In [6]:
# Build the in-memory minsearch index over the flattened documents.
# `course` is registered as a keyword field; `search` filters on it.
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)
# Kept as the last expression so the fitted index is displayed as cell output.
index.fit(documents)

<minsearch.minsearch.Index at 0x7b9f32a9e630>

In [7]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search over the notebook-level minsearch ``index``.

    Args:
        query: Free-text user question.
        course: Value the documents' ``course`` field must match. Defaults to
            the course that used to be hard-coded here.
        num_results: Number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Matches in `question` are weighted up and matches in `section` down.
    # `text` is not listed, so it is left to the index's default weight
    # (presumably 1.0 — confirm against minsearch).
    boost = {'question': 3.0, 'section': 0.5}
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [8]:
import ollama

In [9]:
def build_prompt(query, search_result):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_result: Iterable of documents; each must provide the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The prompt with leading/trailing whitespace stripped. With an empty
        ``search_result`` the prompt simply ends at ``CONTEXT:``.

    Raises:
        KeyError: If a document lacks one of the required keys.
    """
    # Adjacent literals instead of an indented triple-quoted string, so the
    # function's own source indentation does not leak into the prompt lines.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output NONE\n"
        "\n"
        "QUESTION: {question}\n"
        "CONTEXT:\n"
        "{context}"
    )
    # One join instead of repeated string concatenation in a loop.
    context = ''.join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_result
    )
    # The final strip removes the blank lines left after the last entry.
    return prompt_template.format(question=query, context=context).strip()

def llm(prompt, model_name='llama3.2:1b'):
    """Send a single user message to an Ollama model and return its reply.

    Args:
        prompt: Full prompt text, sent as the content of one ``user`` message.
        model_name: Ollama model tag to use. Defaults to the model that used
            to be hard-coded here.

    Returns:
        The ``content`` of the ``message`` in the chat response.
    """
    # Printed so the cell output records which model produced the answer.
    print(model_name)
    response = ollama.chat(
        model=model_name,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response['message']['content']  # ollama version

def rag(query):
    """Answer ``query`` with the keyword-search RAG pipeline.

    Retrieves documents with ``search``, turns them into a prompt with
    ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))

In [10]:
from datetime import datetime
# Print a timestamp before and after the call so the cell output shows how
# long one end-to-end `rag` request takes.
print(datetime.now())
print(rag('i just found out about this course. can i still join?'))
print(datetime.now())

2025-06-27 03:24:16.632419
llama3.2:1b
Since the course starts on January 15th, 2024 at 17h00, it is possible for students to join the course after its start date.
2025-06-27 03:24:32.109103


In [11]:
import google.generativeai as genai
import os

# Configure the Google client from the environment; when the variable is
# missing, print a hint instead of raising.
if 'GOOGLE_API_KEY' in os.environ:
    genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
else:
    print('set the GOOGLE_API_KEY environment variable first')

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def llm_google(prompt, model_name='gemma-3-1b-it'):
    """Generate a reply to ``prompt`` with a Google generative model.

    Args:
        prompt: Full prompt text passed to ``generate_content``.
        model_name: Model identifier handed to ``genai.GenerativeModel``.
            Defaults to the model that used to be hard-coded here.

    Returns:
        The ``text`` attribute of the generation response.
    """
    # Printed so the cell output records which model produced the answer.
    print(model_name)
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text  # google version

def rag_google(query):
    """Answer ``query`` using keyword search and the Google-hosted model.

    Retrieves documents with ``search``, builds the prompt with
    ``build_prompt`` and returns the reply from ``llm_google``.
    """
    retrieved_docs = search(query)
    return llm_google(build_prompt(query, retrieved_docs))

In [13]:
from qdrant_client import QdrantClient
# Create a client for the Qdrant instance at localhost:6333 and list the
# collections it currently holds.
client = QdrantClient('http://localhost:6333')
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='vector+rag'), CollectionDescription(name='sparse'), CollectionDescription(name='sparse+dense'), CollectionDescription(name='zoomcamp-rag')])

In [14]:
from qdrant_client import models

In [15]:
# Embedding model used for both indexing and querying, and the vector size
# the collection is created with.
model_handle = 'jinaai/jina-embeddings-v2-small-en'
# NOTE(review): assumed to equal the model's output dimensionality — confirm.
embed_size = 512
print(embed_size, model_handle, sep='\n')

512
jinaai/jina-embeddings-v2-small-en


In [16]:
# Start from a clean slate: drop the 'vector+rag' collection, then create it
# again with cosine distance and `embed_size`-dimensional vectors.
client.delete_collection(collection_name='vector+rag')
client.create_collection(
    collection_name='vector+rag',
    vectors_config=models.VectorParams(size=embed_size, distance=models.Distance.COSINE),
)

True

In [17]:
# Build one Qdrant point per sampled document. The text to embed is the
# question followed by the answer; the whole document is stored as payload.
#
# NOTE(review): the `[0::9]` slice indexes only every 9th document,
# presumably to keep embedding time short — confirm. Answers that live in a
# skipped document cannot be retrieved by vector search.
#
# A comprehension is used so the counter no longer shadows the builtin `id`
# (and no loop variables leak into the notebook namespace).
points = [
    models.PointStruct(
        id=point_id,
        vector=models.Document(
            text=doc['question'] + ' ' + doc['text'],
            model=model_handle,
        ),
        payload=doc,
    )
    for point_id, doc in enumerate(documents[0::9])
]

len(points)

106

In [18]:
# Upload the points to the 'vector+rag' collection, printing a timestamp
# before and after so the cell output shows how long the upload takes.
# NOTE(review): the text is presumably embedded as part of this call — confirm.
print(datetime.now())
client.upsert(collection_name='vector+rag', points=points)
print(datetime.now())

2025-06-27 03:25:05.637453
2025-06-27 03:25:27.889991


In [19]:
# Create a keyword payload index on `course`, the field that `vector_search`
# filters on.
client.create_payload_index(
    collection_name='vector+rag',
    field_name='course',
    field_schema='keyword'
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [20]:
def vector_search(query, limit=1, course='data-engineering-zoomcamp'):
    """Retrieve the payloads of the points closest to ``query`` in Qdrant.

    Args:
        query: Free-text user question; wrapped in a ``models.Document`` with
            the notebook-level ``model_handle``.
        limit: Maximum number of points to request.
        course: Value the ``course`` payload field must match. Defaults to
            the course that used to be hard-coded here.

    Returns:
        A list with the payload of each returned point, in the order the
        client returned them.
    """
    # Printed so the cell output shows which retrieval path was taken.
    print('vector search is used')
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key='course',
                match=models.MatchValue(value=course),
            )
        ]
    )
    query_points = client.query_points(
        collection_name='vector+rag',
        query=models.Document(text=query, model=model_handle),
        query_filter=course_filter,
        limit=limit,
        with_payload=True,
    )
    return [point.payload for point in query_points.points]

In [21]:
# Try the vector search on a sample question, using the default limit of 1.
vector_search('the course has already started. can i still join?')

vector search is used


[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'}]

In [23]:
def rag(query, limit=1):
    """Answer ``query`` with the vector-search RAG pipeline.

    Note: this rebinds the name ``rag`` that an earlier cell defined on top
    of keyword search; after this cell runs, ``rag`` retrieves via
    ``vector_search``.

    Args:
        query: The user's question.
        limit: Number of documents to request from ``vector_search``.

    Returns:
        The reply from ``llm`` for the prompt built by ``build_prompt``.
    """
    retrieved_docs = vector_search(query, limit=limit)
    return llm(build_prompt(query, retrieved_docs))

In [24]:
# Time the vector-search `rag` on the same question as the keyword-search
# run, using its default limit of 1 retrieved document.
print(datetime.now())
print(rag('i just found out about this course. can i still join?'))
print(datetime.now())

2025-06-27 03:26:20.063438
vector search is used
llama3.2:1b
I couldn't find any information about the course in the provided CONTEXT, so I'll answer based on the available text:

i just found out about this course. can i still join?

NONE
2025-06-27 03:26:26.983354


In [25]:
# Time the vector-search `rag` on a second question, this time requesting
# 5 documents (the second positional argument is `limit`).
print(datetime.now())
print(rag('how do i run kafka?', 5))
print(datetime.now())

2025-06-27 03:26:37.720018
vector search is used
llama3.2:1b
I do not have any information in the CONTEXT about running Kafka or streaming data from it. TheCONTEXT only contains questions and answers related to building a project called "java-kafka-rides" which uses Java, PySpark, and possibly other libraries for various tasks such as data processing, streaming, and workflow orchestration.

Therefore, I cannot provide any information on how to run Kafka or what to do if you encounter errors running it.
2025-06-27 03:27:06.687667
