In [1]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [2]:
import urllib.request

url = "https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py"
urllib.request.urlretrieve(url, "minsearch.py")

('minsearch.py', <http.client.HTTPMessage at 0x2a823240450>)

In [3]:
import minsearch

In [4]:
import json

In [5]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [6]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [7]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [9]:
q = 'the course has already started, can I still enroll?'

In [10]:
index.fit(documents)

<minsearch.Index at 0x2a852402750>

In [11]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

True

In [12]:
client = OpenAI()

In [13]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"It depends on the specific course and the institution offering it. Some courses allow late enrollment, especially if they are designed with flexibility in mind, such as online courses or those with rolling admissions. Other courses may have strict deadlines and may not permit late entry. \n\nTo find out if you can still enroll, you should:\n\n1. **Check the Course Website:** Look for any information regarding late enrollment policies.\n2. **Contact the Instructor or Course Coordinator:** Reach out directly to see if exceptions are possible.\n3. **Contact the Admissions Office:** They may provide guidance on the institution's policy regarding late enrollment.\n\nGood luck!"

In [14]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [15]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [16]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [17]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [18]:
rag(query)

'To run Kafka, follow these instructions based on your setup:\n\n1. **Java Kafka**: If you are using Java, you can run a producer, consumer, or KStream by using the following command in your project\'s directory:\n   ```\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **Python Kafka**: If you are encountering a "Module \'kafka\' not found" error when trying to run `producer.py`, you should:\n   - Create a virtual environment and install the required packages using `requirements.txt`.\n   - Run the following commands:\n     ```\n     python -m venv env\n     source env/bin/activate\n     pip install -r ../requirements.txt\n     ```\n   - Activate this virtual environment each time you need it by running `source env/bin/activate`.\n   - Deactivate it with `deactivate` when done.\n   - Make sure Docker images are up and running if applicable.\n\nMake sure to adapt paths and commands to suit your operating system, noting dif

In [19]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course after it has started. You are eligible to submit the homework assignments. However, keep in mind that there will be deadlines for turning in the final projects, so it's important to manage your time and not leave everything for the last minute."

In [20]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [21]:
from elasticsearch import Elasticsearch

In [43]:
#es_client = Elasticsearch('https://localhost:9200', verify_certs=False)
es_client = Elasticsearch('http://localhost:9200') 

#from elasticsearch import Elasticsearch

#es_client = Elasticsearch(
#    "https://localhost:9200",
#    basic_auth=("elastic", "yourpassword"),
#    verify_certs=False  # Only use verify_certs=False for local/testing/self-signed certs
#)

In [44]:
import requests
print(requests.get("http://localhost:9200").text)
#print(requests.get("http://localhost:9200/_cat/indices?v").text)


{
  "name" : "085d86246266",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "Dz7D3rDaRUOB2BFZ8y7EYg",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}



In [49]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}

index_name = "test-index"

# Delete the index if it already exists (optional, for a clean start)
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index
#es_client.indices.create(index=index_name, body=index_settings,  request_timeout=60)
es_client.indices.create(
    index=index_name,
    body=index_settings,
    request_timeout=60,
    timeout="30s"
)



  es_client.indices.create(


ConnectionTimeout: Connection timed out

In [48]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client = Elasticsearch("http://localhost:9200", request_timeout=30)


In [34]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [35]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:10<?, ?it/s]


ConnectionTimeout: Connection timed out

In [None]:
query = 'I just disovered the course. Can I still join it?'

In [37]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [38]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [39]:
rag(query)

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [course-questions]', course-questions, index_or_alias)