In [1]:
import minsearch
import json
import openai
import os
from openai import OpenAI
from tqdm import tqdm
from elasticsearch import Elasticsearch

In [2]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict["documents"]:
        doc["course"] = course_dict['course']
        documents.append(doc)

In [4]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [5]:
index.fit(documents)

<minsearch.minsearch.Index at 0x77248ca8a750>

In [6]:
q = 'the course has already started, can I still enroll?'

In [7]:
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

In [8]:
completion = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=[{'role':'user', 'content':q}]
)

In [9]:
completion.choices[0].message.content

'It depends on the institution and the specific course. Here are some general steps to check if late enrollment is possible:\n\n1. **Check the Course Policy**: Some courses allow late enrollment with penalties (e.g., missed assignments), while others enforce strict deadlines.  \n2. **Contact the Instructor/Registrar**: Email the instructor or administrative office to ask if late registration is permitted and what steps to take.  \n3. **Review Missed Work**: If allowed, confirm whether you can make up missed lectures, assignments, or exams.  \n4. **Online Courses (Flexible Start)**: Some online platforms (e.g., Coursera, Udemy) allow self-paced enrollment anytime.  \n\n**Tip**: If the course is ongoing but critical to your goals, ask about auditing (attending without credit) as an alternative.  \n\nWould you like help finding contact details for the institution or program?'

In [10]:
context = ""
for doc in results:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

NameError: name 'results' is not defined

In [None]:
prompt = prompt_template.format(question=q, context=context).strip()

In [None]:
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=[{'role':'user', 'content':prompt}]
)

response.choices[0].message.content

In [11]:
def search(query):
    boost = {'question':3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course':'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [12]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the question.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT: {context}
""".strip()
    
    context = ""
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [13]:
def llm(prompt):
    response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=[{'role':'user', 'content':prompt}]
    )
    
    return response.choices[0].message.content

In [14]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [15]:
query = 'how do i run kafka'
rag(query)

"To run Kafka in Java, you can use the following command in the project directory (as mentioned in the CONTEXT):  \n\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nFor Python Kafka, ensure you are in a virtual environment (with dependencies installed via `requirements.txt`) and that Docker containers are running. If you encounter a `Permission denied` error when running `build.sh`, execute:\n\n```\nchmod +x build.sh\n```\n\nIf you face a module import issue (`No module named 'kafka.vendor.six.moves'`), consider using `kafka-python-ng` instead:\n\n```\npip install kafka-python-ng\n```"

In [7]:
es_client = Elasticsearch('http://localhost:9200')

In [8]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions'

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████| 948/948 [00:09<00:00, 104.61it/s]


In [10]:
query = 'I just discovered this course, can I still join it?'

In [11]:
def elastic_search(query):
    search_query ={
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [36]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [39]:
print(rag(query))

Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


In [12]:
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta