In [1]:
import minsearch

In [2]:
import json

In [3]:
# Load the raw FAQ data. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default text encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course structure into one list of FAQ documents,
# tagging every document with the course it came from.
documents = []

for course_entry in docs_raw:
    for faq in course_entry['documents']:
        # The dict is tagged in place, so the entry inside docs_raw gains the key too.
        faq['course'] = course_entry['course']
        documents.append(faq)

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# Text fields are the ones searched by the query; `course` is registered as a
# keyword field and is the field passed in `filter_dict` by the search helper.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

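The `course` keyword field is used to filter results by value, similar to this SQL query: `SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';`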

### Querying the model WITHOUT CONTEXT

In [7]:
q = 'the course has already started, can I still enroll?'

In [8]:
index.fit(documents)

<minsearch.minsearch.Index at 0x229bdb39130>

In [9]:
from openai import OpenAI

# OpenAI-style client pointed at a local endpoint on port 11434.
# NOTE(review): no cell visible in this notebook uses `client`; the chat calls
# go through the `ollama` package instead. The api_key value is presumably a
# placeholder -- confirm before reusing this against a real OpenAI endpoint.
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

In [11]:
from ollama import chat
from ollama import ChatResponse

# Baseline: send only the raw question `q` (no FAQ context) to the model.
baseline_messages = [{'role': 'user', 'content': q}]
response: ChatResponse = chat(model='gemma3:4b', messages=baseline_messages)

# The reply text can also be read as response['message']['content'];
# attribute access is used here and it is printed only once.
print(response.message.content)

Whether you can still enroll in a course that's already started depends entirely on the course and the institution offering it. Here's a breakdown of the possibilities and what you should do:

**1. Check the Course Website:**

* **Most Important Step:** The first thing you should do is go to the official website of the course or the institution offering it.  Look for a section called "Enrollment," "Registration," "Add/Drop," or something similar.  This is where they'll clearly state the enrollment deadlines and any policies.

**2. Common Scenarios and Possibilities:**

* **Open Enrollment (Rare):** Some courses, especially online ones, might have open enrollment for a short period after the start date.  This is uncommon but does happen.
* **Drop/Add Period:** Many courses have a "drop/add" period shortly after the course starts. During this time, you can usually add the course to your schedule, even if it’s already in progress.  This period is typically a few days (often 7-14 days).
* 

In [12]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for ``query`` in the module-level minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field is filtered on. Defaults to the
            course that used to be hard-coded here.
        num_results: Maximum number of results requested from the index.

    Returns:
        The result of ``index.search`` unchanged; callers in this notebook
        iterate over it and read the 'section', 'question' and 'text' keys.
    """
    # Per-field boosts passed to the index: 'question' is weighted up and
    # 'section' down; 'text' is left at the index's default.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [13]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each becomes one entry under ``CONTEXT:``.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    # Written as explicit literals so the trailing space after "CONTEXT:" is visible.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" entry per document, each followed by a blank line.
    context = "".join(
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

### Querying the model WITH CONTEXT

In [14]:
def llm(prompt, model='gemma3:4b'):
    """Send ``prompt`` as a single user message to an Ollama chat model.

    Args:
        prompt: Full prompt text to send as the user message.
        model: Name of the model to query. Defaults to the model that used
            to be hard-coded here.

    Returns:
        The ``message.content`` of the chat response.
    """
    # Local imports keep the function self-contained.
    from ollama import chat
    from ollama import ChatResponse

    response: ChatResponse = chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )

    return response.message.content

In [15]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves FAQ entries with ``search``, turns them into a prompt with
    ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    return llm(build_prompt(query, search(query)))

In [16]:
rag(query)

'To run the Java Kafka producer, run: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`. If you are running `build.sh`, you need to run `chmod +x build.sh` in the same directory.'

In [17]:
rag('the course has already started, can I still enroll?')

'Yes, you can still enroll.'

In [18]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

### Elasticsearch

In [7]:
from elasticsearch import Elasticsearch

In [8]:
es_client = Elasticsearch('http://localhost:9200') 

In [9]:
# Index definition: one shard, no replicas. The three free-text fields are
# mapped as `text`, while `course` is mapped as `keyword`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Only create the index when it is missing, so re-running this cell does not
# fail because the index already exists.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x00000203CA78CE90>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it))

In [None]:
documents[0]

In [None]:
from tqdm.auto import tqdm

In [None]:
# Send the FAQ documents to the index one by one, with a progress bar.
for faq_doc in tqdm(documents):
    es_client.index(document=faq_doc, index=index_name)

In [None]:
query = 'I just disovered the course. Can I still join it?'

In [None]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ entries for ``query`` through ``es_client``.

    Args:
        query: Free-text question to search for.
        course: Value used in the ``term`` filter on the ``course`` field.
            Defaults to the course that used to be hard-coded here.
        num_results: Value sent as the request's ``size``.

    Returns:
        A list with the ``_source`` of every hit in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question^3" boosts the question field, mirroring
                        # the 3.0 boost used in the minsearch-based `search`.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation via ``elastic_search``.

    Running this cell replaces the ``rag`` defined earlier in the notebook,
    which retrieved through ``search``; the prompt building and LLM call are
    the same.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))

In [None]:
rag(query)