In [2]:
import json
from openai import OpenAI
from dotenv import dotenv_values
import minsearch

In [3]:
API_KEY = dotenv_values(".env")["API_KEY"]
with open("documents.json", "rt") as f_in:
    docs_raw = json.load(f_in)
    
documents = []
for course_dict in docs_raw:
    for doc in course_dict["documents"]:
        doc["course"] = course_dict["course"]
        documents.append(doc)

In [4]:
client = OpenAI(api_key=API_KEY, base_url="https://api.perplexity.ai")

In [5]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
    )
index.fit(documents)

<minsearch.Index at 0x7e1895509600>

In [6]:
def search(query):
    boost = {"question": 3.0, "section": 0.5}
    results = index.search(
        query = query,
        filter_dict = {"course": "data-engineering-zoomcamp"}, # when we want to limit to data engineering zoomcamp course
        boost_dict = boost,
        num_results = 10
    )
    return results

In [7]:
def build_prompt(query, search_results): 
    propmt_template = """"You're a course teaching assistant. Answer the QUESTION based on CONTEXT. 
    Use only the facts from CONTEXT when answering the QUESTION.


    QUESTION: {question}

    CONTEXT: {context}""".strip() ## specify the role # prompt engineering 
    context = ""
    for doc in search_results:
        context = context + f"section : {doc['section']} \nquestion: {doc['question']} \nanswer: {doc['text']}\n\n"
    prompt = propmt_template.format(question = query, context = context).strip()
    return prompt

In [8]:
def llm(prompt):
    response = client.chat.completions.create(
    model = "llama-3-sonar-large-32k-chat",
    messages = [{"role": 'user', "content": prompt}],
    )
    return response.choices[0].message.content

In [9]:
def rag(query):
    #query = "how do i run kafka?"
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [10]:
rag("how do i run kafka ?")

'To run Kafka, you need to make sure that your Kafka broker docker container is running. You can check this by running `docker ps` in your terminal. If the container is not running, navigate to the folder where your docker compose yaml file is located and run `docker compose up -d` to start all the instances.'

In [11]:
# REPLACING THE SEARCH WITH ELASTIC SEARCH: elastic search is persistant - it will save the data to disk, when it starts next time it will use that index
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [51]:
from elasticsearch import Elasticsearch


In [52]:
es_client = Elasticsearch("http://localhost:9200")


In [53]:
es_client.info()

ObjectApiResponse({'name': '33f94adb439b', 'cluster_name': 'docker-cluster', 'cluster_uuid': '77UDi3wDT1i8QFm4K4hoAA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [54]:
# creating an index in elastic search:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}


index_name = "course-questions"
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [55]:
from tqdm.auto import tqdm
for doc in tqdm(documents):
    es_client.index(index=index_name, document =doc)

  0%|          | 0/948 [00:00<?, ?it/s]

100%|██████████| 948/948 [00:20<00:00, 46.61it/s]


In [56]:
# now we can query this data:
query = "how do I run kafka?"
def elastic_search_query(query):
    search_query = {
        "size": 5, # WHEN WE WANT 5 ANSWERS IN THE RESULT
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"], # ^3 MEANS QUESTION FIELD IS 3 TIMES MORE IMPORTANT
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp" # FILTERING COMPONENET
                    }
                }
            }
        }
    }
    return search_query

In [57]:
response = es_client.search(index= index_name, body = elastic_search_query(query))


In [None]:
result = []
for hit in response["hits"]["hits"]:
    result.append(hit["_source"])
    

In [60]:
def rag(query):
    search_results = elastic_search_query(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer


In [61]:
result

[{'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'question': 'Java Kafka: How to run producer/consumer/kstreams/etc in terminal',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repositor