In [7]:
!pip install python-dotenv
#!pip install openai elasticsearch

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [8]:
import json
from openai import OpenAI
from dotenv import dotenv_values
import minsearch

In [9]:
API_KEY = dotenv_values(".env")["API_KEY"]
with open("documents.json", "rt") as f_in:
    docs_raw = json.load(f_in)
    
documents = []
for course_dict in docs_raw:
    for doc in course_dict["documents"]:
        doc["course"] = course_dict["course"]
        documents.append(doc)

In [10]:
client = OpenAI(api_key=API_KEY, base_url="https://api.perplexity.ai")

In [11]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
    )
index.fit(documents)

<minsearch.Index at 0x7a26b5c043d0>

In [12]:
def search(query):
    boost = {"question": 3.0, "section": 0.5}
    results = index.search(
        query = query,
        filter_dict = {"course": "data-engineering-zoomcamp"}, # when we want to limit to data engineering zoomcamp course
        boost_dict = boost,
        num_results = 10
    )
    return results

In [13]:
def build_prompt(query, search_results): 
    propmt_template = """"You're a course teaching assistant. Answer the QUESTION based on CONTEXT. 
    Use only the facts from CONTEXT when answering the QUESTION.


    QUESTION: {question}

    CONTEXT: {context}""".strip() ## specify the role # prompt engineering 
    context = ""
    for doc in search_results:
        context = context + f"section : {doc['section']} \nquestion: {doc['question']} \nanswer: {doc['text']}\n\n"
    prompt = propmt_template.format(question = query, context = context).strip()
    return prompt

In [14]:
def llm(prompt):
    response = client.chat.completions.create(
    model = "llama-3-sonar-large-32k-chat",
    messages = [{"role": 'user', "content": prompt}],
    )
    return response.choices[0].message.content

In [15]:
def rag(query):
    #query = "how do i run kafka?"
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [16]:
rag("how do i run kafka ?")

'To run Kafka, you need to make sure that your Kafka broker Docker container is running. You can check this by running `docker ps` in your terminal. If the container is not running, navigate to the folder containing your Docker Compose YAML file and run `docker compose up -d` to start all the instances.'

In [17]:
# REPLACING THE SEARCH WITH ELASTIC SEARCH: elastic search is persistant - it will save the data to disk, when it starts next time it will use that index
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [58]:
from elasticsearch import Elasticsearch


In [59]:
es_client = Elasticsearch("http://localhost:9200")


In [60]:
es_client.info()

ObjectApiResponse({'name': '4892a6b17ed3', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'H3US50BmQzmo5qX-0RlgQQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [61]:
# creating an index in elastic search:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}


index_name = "course-questions"
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [66]:
from tqdm.auto import tqdm
for doc in tqdm(documents):
    es_client.index(index=index_name, document =doc)

100%|██████████| 948/948 [00:21<00:00, 44.53it/s]


In [62]:
# now we can query this data:
query = "how do I run kafka?"
def elastic_search_query(query):
    search_query = {
        "size": 5, # WHEN WE WANT 5 ANSWERS IN THE RESULT
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"], # ^3 MEANS QUESTION FIELD IS 3 TIMES MORE IMPORTANT
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp" # FILTERING COMPONENET
                    }
                }
            }
        }
    }
    response = es_client.search(index = index_name, body = search_query)
    result = []
    for hit in response["hits"]["hits"]:
        result.append(hit["_source"])
    return result

In [63]:
result = []
for hit in response["hits"]["hits"]:
    result.append(hit["_source"])
    

In [67]:
def rag(query):
    search_results = elastic_search_query(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer


In [68]:
rag(query)

'To run Kafka, you need to create a virtual environment, install the required packages, and then activate the virtual environment. Here are the steps:\n\n1. Create a virtual environment: `python -m venv env`\n2. Activate the virtual environment: `source env/bin/activate` (on MacOS/Linux) or `env/Scripts/activate` (on Windows)\n3. Install the required packages: `pip install -r ../requirements.txt`\n\nAfter setting up the virtual environment, you can run the Kafka producer/consumer/kstreams/etc. in the terminal using the command: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n\nNote: Make sure to create the virtual environment only to run the Python file, and ensure that the Docker images are up and running before proceeding.'