In [97]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [98]:
import urllib.request

url = "https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py"
urllib.request.urlretrieve(url, "minsearch.py")

('minsearch.py', <http.client.HTTPMessage at 0x1d8c9e8b2d0>)

In [99]:
import minsearch

In [100]:
import json

In [101]:
import os

In [102]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [103]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [104]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [105]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [106]:
q = 'the course has already started, can I still enroll?'

In [107]:
index.fit(documents)

<minsearch.Index at 0x1d8ca703a50>

In [108]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

True

In [109]:
client = OpenAI()

In [110]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"Whether you can enroll in a course that has already started depends on the institution or platform offering the course and their specific policies. Here are some steps you can take to find out:\n\n1. **Check the Course Website:** Visit the course's website or the institution's enrollment page to see if they provide information about late enrollment.\n\n2. **Contact the Instructor or Administration:** Reach out to the course instructor or the administrative office responsible for enrollments. They might be able to make exceptions or give you guidance on late enrollment procedures.\n\n3. **Review the Course Policy:** Some courses have specific policies about late enrollment, which might be detailed in the course syllabus or guidelines.\n\n4. **Consider Audit Options:** If formal enrollment is not possible, ask if you can audit the course, which might allow you to attend classes without earning credit.\n\n5. **Look for Online Options:** If the course is part of an online platform, check 

In [111]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [112]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [113]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [114]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [115]:
rag(query)

'To run Kafka in a terminal, you can execute the following command from the project directory:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nMake sure to replace `<jar_name>` with the actual name of your jar file.'

In [116]:
rag('the course has already started, can I still enroll?')

'Yes, you can enroll and participate in the course even after it has started. You are still eligible to submit homework assignments, but keep in mind that there will be deadlines for the final projects. Make sure not to leave everything for the last minute.'

In [117]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [118]:
from elasticsearch import Elasticsearch

In [119]:
es_client = Elasticsearch('http://localhost:9200') 

In [120]:
import requests
print(requests.get("https://localhost:9200", verify=False).text)#print(requests.get("http://localhost:9200/_cat/indices?v").text)

{"error":{"root_cause":[{"type":"security_exception","reason":"missing authentication credentials for REST request [/]","header":{"WWW-Authenticate":["Basic realm=\"security\" charset=\"UTF-8\"","Bearer realm=\"security\"","ApiKey"]}}],"type":"security_exception","reason":"missing authentication credentials for REST request [/]","header":{"WWW-Authenticate":["Basic realm=\"security\" charset=\"UTF-8\"","Bearer realm=\"security\"","ApiKey"]}},"status":401}




In [121]:
ELASTIC_PASSWORD = os.getenv('ELASTIC_PASSWORD')

In [122]:
es_client = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", ELASTIC_PASSWORD),
    verify_certs=False  # Only use verify_certs=False for local/testing/self-signed certs
)

  _transport = transport_class(


In [123]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}

index_name = "test-index"

# Delete the index if it already exists (optional, for a clean start)
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index
#es_client.indices.create(index=index_name, body=index_settings,  request_timeout=60)
es_client.indices.create(
    index=index_name,
    body=index_settings,
    request_timeout=60,
    timeout="30s"
)

  es_client.indices.create(


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'test-index'})

In [128]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", ELASTIC_PASSWORD),  # Use the password from your container logs
    verify_certs=False
)


In [140]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [126]:
from tqdm.auto import tqdm

In [134]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:55<00:00, 17.04it/s]


## to set password:

docker start elsticsearch

docker exec -it elasticsearch bin/elasticsearch-reset-password -u elastic

In [135]:
query = 'I just disovered the course. Can I still join it?'
index_name = "course-questions"
#es_client = Elasticsearch(
#    "https://localhost:9200",
#    basic_auth=("elastic", ELASTIC_PASSWORD),  # Use the password from your container logs
#    verify_certs=False
#)

def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

rag(query)



"It seems that the relevant information needed to answer your question is not provided in the given context. Please refer to the course's official registration page or contact the course administrators directly to find out if you can still join the course."

In [136]:
query = 'I just disovered the course. Can I still join it?'

rag(query)



"The context provided does not give any information about whether you can join the course after it has started. Therefore, I cannot determine if you're able to join the course at this time. It's best to check the specific course enrollment policies or reach out to the course organizer or institution for accurate information."

In [143]:
query = 'What are the prerequisites for the data-engineering-zoomcamp?'

rag(query)



'The FAQ database does not provide any specific prerequisites for the data-engineering-zoomcamp. You may need to consult the course description or contact the course organizers for detailed information regarding prerequisites.'