In [4]:
pip install minsearch

Note: you may need to restart the kernel to use updated packages.


In [1]:
import minsearch

In [2]:
import json

In [13]:
# Load the raw FAQ export. The loop that consumes `docs_raw` treats it as a
# list of per-course entries, each with a 'course' name and a 'documents' list.
# The encoding is pinned to UTF-8 because the FAQ text contains non-ASCII
# punctuation (curly quotes); without it, decoding would depend on the
# platform's default locale encoding.
with open('Module 1: Introduction/documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [14]:
# Flatten the per-course structure into a single list of FAQ records, tagging
# every record with the name of the course it came from.
# Each record is copied ({**doc, ...}) instead of being modified in place, so
# `docs_raw` stays exactly as it was loaded and this cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [15]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [16]:
# Build the (still empty) minsearch index.
# `course` is registered as a keyword field and is the field later used in
# `filter_dict`; `question`, `text` and `section` are registered as text fields.
index = minsearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

In [17]:
q = 'the course has already started, can I still enroll?'

In [18]:
index.fit(documents)

<minsearch.minsearch.Index at 0x72a872b4e4e0>

In [19]:
from openai import OpenAI

In [20]:
client = OpenAI()

In [22]:
# Baseline: send the bare question `q` to the model as a single user message,
# without any FAQ context attached.
response = client.chat.completions.create(
    messages=[dict(role='user', content=q)],
    model='gpt-4o',
)

In [23]:
response.choices[0].message.content

"Whether you can still enroll in a course that has already started depends on the specific policies of the institution or organization offering the course. Here are a few steps you can take:\n\n1. **Check the Course Enrollment Policies:** Look at the course website or contact the registrar's office to understand the late enrollment policy.\n\n2. **Contact the Instructor:** Reach out to the course instructor to inquire whether they allow late enrollment. They might require you to catch up on missed work.\n\n3. **Institution-Specific Guidelines:** Some institutions have a grace period for late registration, so check if this applies.\n\n4. **Consider the Impact:** Assess your ability to catch up on missed material and participate fully in the remaining coursework.\n\n5. **Online/Catch-Up Resources:** Find out if there are resources available, like recorded lectures, to help you catch up.\n\nIf you receive approval to enroll late, make sure to understand any requirements for making up miss

In [27]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the module-level ``index`` for FAQ entries matching ``query``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` field must have; passed to the index as
            ``filter_dict={'course': course}``. Defaults to the course that
            was previously hard-coded here.
        num_results: Number of results requested from the index (default 5).

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Per-field boosts handed to the index: `question` 3.0, `section` 0.5.
    # NOTE(review): fields not listed presumably keep the index's default
    # weight -- confirm against the minsearch documentation.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [28]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of mappings, each read via its ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    # The template is spelled out line by line so that the trailing spaces
    # at the end of the first two lines are explicit.
    instructions = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. \n"
        "Use only the facts from the CONTEXT when answering the QUESTION. \n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One "section / question / answer" block per hit, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = instructions.format(question=query, context="".join(entries))
    return filled.strip()

In [32]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.
        model: Name of the model to call. Defaults to ``'gpt-4o'``, the value
            that was previously hard-coded here.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [33]:
query = 'how do i run kafka?'

def rag(query):
    """Answer ``query`` in three steps: retrieve, build the prompt, ask the LLM.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the prompt built from the
        ``search`` results.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [34]:
rag(query)

"To run Kafka using Java in the terminal, go to the project directory and execute the following command:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nFor running Kafka with Python, you should create and activate a virtual environment, install the necessary packages from `requirements.txt`, and then run your Python files within that environment:\n\n1. Create and activate the virtual environment:\n\n   ```bash\n   python -m venv env\n   source env/bin/activate   # Use `env/Scripts/activate` on Windows\n   ```\n\n2. Install the required packages:\n\n   ```bash\n   pip install -r ../requirements.txt\n   ```\n\n3. Deactivate the virtual environment after you're done:\n\n   ```bash\n   deactivate\n   ```\n\nMake sure all Docker images are running before working with the virtual environment for the Python Kafka."

In [35]:
rag('the course has already started, can i still enroll?')

'Yes, you can still enroll in the course even after it has started. You are eligible to submit the homework assignments, but be aware of the deadlines for turning in the final projects to avoid leaving everything until the last minute.'

In [37]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [38]:
from elasticsearch import Elasticsearch

In [39]:
es_client = Elasticsearch('http://localhost:9200')

In [40]:
# Definition of the Elasticsearch index: a single shard with no replicas.
# The three free-text fields are mapped as `text`, and `course` as `keyword`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Only create the index when it is missing, so the cell can be re-run:
# creating an index that already exists is rejected by Elasticsearch.
# Both calls need a reachable server at the URL given to `es_client`;
# otherwise they raise ConnectionError.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x72a870472b40>: Failed to establish a new connection: [Errno 111] Connection refused))