In [18]:
import os
from groq import Groq

# Instantiate the Groq client that `llm` uses for chat completions.
# The API key is read from the GROQ_API_KEY environment variable instead of
# being hardcoded; `os.environ.get` yields None when the variable is unset.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY")
)

In [1]:
import minsearch

In [2]:
import json

In [3]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the entries
# contain non-ASCII punctuation (curly quotes); without it `open` falls back to
# the platform's locale encoding, which can fail or garble text on some systems.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course groups into a single list of FAQ entries.
# Each entry is tagged (in place) with the name of the course it came from.
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
    # The entries of this course are now all tagged; add them in one go.
    documents.extend(course_dict['documents'])

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [11]:
# Create the minsearch index over the FAQ entries (it is populated by `index.fit`).
# "question", "text" and "section" are registered as text fields; "course" is a
# keyword field, which `search` uses in its `filter_dict`.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]                 
)

In [9]:
q='the course has already started, can I still enroll?'

In [12]:
index.fit(documents)

<minsearch.Index at 0x7c62a06944f0>

In [62]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Query the minsearch `index` for FAQ entries relevant to `query`.

    Args:
        query: Free-text question to search for.
        course: Value the "course" keyword field must match. Defaults to the
            course this notebook was originally hard-wired to.
        num_results: Maximum number of entries to request from the index.

    Returns:
        Whatever `index.search` returns for these arguments.
    """
    # "question" is boosted and "section" is down-weighted; "text" is not
    # listed in the boost dict.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [63]:
results= search('how do I run Kafka?')

In [73]:
def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    Each entry in `search_results` must provide the keys "section",
    "question" and "text"; they are listed under CONTEXT in the order given.
    Leading and trailing whitespace is stripped from the final prompt.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per hit, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [78]:
context

"section: General course-related questions\nquestion: Course - Can I still join the course after the start date?\nanswer: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nsection: General course-related questions\nquestion: Course - Can I follow the course after it finishes?\nanswer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.\n\nsection: General course-related questions\nquestion: Course - When will the course start?\nanswer: The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course 

In [74]:
def llm(prompt, model="mixtral-8x7b-32768"):
    """Send `prompt` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt text, sent as the content of one "user" message.
        model: Name of the chat model to request from the Groq `client`.
            Defaults to the model this notebook originally hard-coded.

    Returns:
        The message content of the first choice in the completion response.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return response.choices[0].message.content
        

In [75]:
query = 'how do I run kafka?'

def rag(query):
    """Answer `query` by chaining `search`, `build_prompt` and `llm`.

    The hits returned by `search` become the prompt context, and the text
    returned by `llm` for that prompt is the result.
    """
    return llm(build_prompt(query, search(query)))

In [76]:
rag(query)

'Based on the context provided, there are separate instructions for running Kafka with Java and with Python.\n\nFor Java, the command to run the producer is:\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nReplace `<jar_name>` with the actual name of your jar file.\n\nFor Python, you\'ll need to create a virtual environment and run the following commands:\n```\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\n```\nIf you encounter a "Permission denied" error when running the build script, try running `chmod +x build.sh` in the same directory.\n\nIf you encounter a "ModuleNotFoundError: No module named \'kafka.vendor.six.moves\'" error, try using `kafka-python-ng` instead by running `pip install kafka-python-ng`.'

In [77]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. According to the course FAQ, you are still eligible to submit homeworks even if you don't register, but there will be deadlines for turning in the final projects. The course materials will be kept available after the course finishes, so you can follow the course at your own pace after it has ended. It is recommended to install and set up all the dependencies and requirements beforehand, and you can also start by looking over the prerequisites and syllabus. Additionally, support is available through the slack channel even if you take the course in the self-paced mode."

## Elasticsearch

- Open a new terminal.
- Run Elasticsearch in Docker with the following command:

```
    docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3 
```

In [79]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [100]:
from elasticsearch import Elasticsearch

In [106]:
es_client= Elasticsearch('http://localhost:9200')

In [107]:
# Index definition passed as the body of `es_client.indices.create`.
index_settings = {
    "settings": {
        # One shard, no replicas: the notebook runs Elasticsearch as a single
        # node (see the `discovery.type=single-node` Docker flag).
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Free-text fields; `elastic_search` queries them with multi_match.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Keyword field; `elastic_search` applies a `term` filter on it.
            "course": {"type": "keyword"} 
        }
    }
}

# Name of the index that the create, index and search calls all refer to.
index_name= "course-questions"



In [None]:
# Drop any index left over from a previous run so this cell can be re-executed:
# creating an index that already exists is rejected by Elasticsearch, and
# indexing the documents again into the old index would store every entry twice.
# `ignore_unavailable=True` makes the delete a no-op on the very first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [109]:
from tqdm.auto import tqdm

In [110]:
# Send every FAQ entry to Elasticsearch, one request per document; tqdm
# renders the progress bar.
# NOTE(review): no explicit id is passed, so re-running this cell presumably
# stores a second copy of each document — confirm before re-executing.
for doc in tqdm(documents):
    es_client.index(index=index_name, document = doc)
    

100%|██████████████████████████████████████████████████████████████████████████████| 948/948 [00:25<00:00, 37.41it/s]


In [115]:
query = 'I just disovered the course. Can I still join it?'

In [120]:

def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Query the Elasticsearch index `index_name` for FAQ entries matching `query`.

    Args:
        query: Free-text question to search for.
        course: Exact value the "course" field must have (applied as a
            `term` filter). Defaults to the course this notebook was
            originally hard-wired to.
        num_results: Value sent as the request's "size".

    Returns:
        A list with the `_source` dict of every hit, in the order returned
        by Elasticsearch.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question^3" weights matches in the question field
                        # three times as much as the other fields.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                },
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in response["hits"]["hits"]]

In [121]:
def rag(query):
    """Answer `query` using Elasticsearch for retrieval.

    Chains `elastic_search`, `build_prompt` and `llm`. Running this cell
    rebinds the name `rag`, replacing the minsearch-based definition from
    earlier in the notebook.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [122]:
rag(query)

'Yes, you can still join the course even if it has already started. However, please be aware that you will have to catch up on any missed material and assignments. Additionally, the course syllabus and pacing are designed for students who started at the beginning of the course, so you may need to adjust your study schedule accordingly. We recommend reaching out to the instructor or teaching assistant for guidance on how to best join the course at this point.'

In [123]:
search_results

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make change