In [10]:
import json

from openai import OpenAI

In [11]:
# Create the API client with default settings (no arguments are passed here).
# NOTE(review): presumably the API key is picked up from the environment - confirm.
client = OpenAI()

# Send a single user message to gpt-4o; the raw response is kept in `test_msg`.
test_msg = client.chat.completions.create(
    messages=[dict(role="user", content="what the fuck is going on?")],
    model="gpt-4o",
)

Alternative LLM providers: Amazon Bedrock, Mistral AI

----

Playground

In [12]:
# Load the raw course documents. The encoding is pinned to UTF-8 because the
# texts contain non-ASCII characters (e.g. curly quotes) and the platform default
# encoding used by open() is not UTF-8 everywhere.
with open("docs/documents.json", encoding="utf-8") as f:
    docs_raw = json.load(f)

In [13]:
# Flatten the per-course structure into one list of documents, tagging each
# document with the course it belongs to. Copies are built instead of adding the
# key to the dicts inside `docs_raw`, so the raw data stays exactly as loaded and
# re-running this cell never depends on earlier in-place changes.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

In [14]:
import minsearch

In [15]:
# Build the in-memory index over the flattened documents. `course` is registered
# as a keyword field (it is the field the search cells pass in `filter_dict`);
# the remaining three fields are registered as text fields.
idx = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

idx.fit(documents)

<minsearch.Index at 0x16c0c9cd0>

In [16]:
q = "Where is the homework section"

# Per-field weights handed to the index as `boost_dict`.
boost = {"question": 3.0, "section": 0.5}

# Query the index, restricted to a single course, keeping five results.
results = idx.search(
    query=q,
    filter_dict={"course": "mlops-zoomcamp"},
    boost_dict=boost,
    num_results=5,
)

In [17]:
results

[{'text': 'Here',
  'section': 'Module 4: Deployment',
  'question': 'Where is the FAQ for Prefect questions?',
  'course': 'mlops-zoomcamp'},
 {'text': 'I was using an old version of sklearn due to which I got the wrong number of parameters because in the latest version min_impurity_split for randomforrestRegressor was deprecated. Had to upgrade to the latest version to get the correct number of params.',
  'section': 'Module 2: Experiment tracking',
  'question': 'Parameters Mismatch in Homework Q3',
  'course': 'mlops-zoomcamp'},
 {'text': 'In order to obtain the certificate, completion of the final capstone project is mandatory. The completion of weekly homework assignments is optional, but they can contribute to your overall progress and ranking on the top 100 leaderboard.',
  'section': '+-General course questions',
  'question': 'Can I still graduate when I didn’t complete homework for week x?',
  'course': 'mlops-zoomcamp'},
 {'text': 'You can get a few cloud points by using ku

----

Prompting

In [18]:
# Prompt skeleton with two placeholders, {question} and {context}, that are
# filled in with str.format in a later cell. The indentation inside the
# triple-quoted string is part of the text and ends up in the prompt as-is.
prompt_template = """
    You are a course teaching assistant. Answer the QUESTION based on the CONTEXT.
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT does not contain the answer, output something generically helpful.

    QUESTION: {question}
    CONTEXT:
    {context}
"""

In [19]:
# Concatenate one formatted entry (section, question, answer) per search hit
# into a single context string.
context = ""

for doc in results:
    context += f"""
        section: {doc["section"]}\n question:{doc["question"]}\n answer: {doc["text"]}\n\n
    """

In [20]:
# Fill both placeholders of the template, trim leading/trailing whitespace,
# and print the final prompt for inspection.
prompt = prompt_template.format(context=context, question=q).strip()
print(prompt)

You are a course teaching assistant. Answer the QUESTION based on the CONTEXT.
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT does not contain the answer, output something generically helpful.

    QUESTION: Where is the homework section
    CONTEXT:
    
        section: Module 4: Deployment
 question:Where is the FAQ for Prefect questions?
 answer: Here


    
        section: Module 2: Experiment tracking
 question:Parameters Mismatch in Homework Q3
 answer: I was using an old version of sklearn due to which I got the wrong number of parameters because in the latest version min_impurity_split for randomforrestRegressor was deprecated. Had to upgrade to the latest version to get the correct number of params.


    
        section: +-General course questions
 question:Can I still graduate when I didn’t complete homework for week x?
 answer: In order to obtain the certificate, completion of the final capstone project is mandatory. The completio

In [21]:
# Send the assembled prompt to gpt-4o as a single user message.
response = client.chat.completions.create(
    messages=[dict(role="user", content=prompt)],
    model="gpt-4o",
)

In [22]:
print(response.choices[0].message.content)

It seems the CONTEXT does not specifically mention the location of the homework section. However, typically, the homework section can be found in the course platform or learning management system under a specific module or section titled "Assignments" or "Homework." If you have access to the course syllabus or homepage, it might also provide guidance on where to find the homework section. If you're still unable to find it, consider reaching out to your course instructor or peers for assistance.


---

Cleaning Up and Modularizing the RAG Flow

In [23]:
def search(query, course="mlops-zoomcamp", num_results=5):
    """Query the minsearch index ``idx`` for documents matching ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field is filtered on. Defaults to
            ``"mlops-zoomcamp"``, the value that used to be hard-coded.
            Pass ``None`` to send an empty filter.
        num_results: Forwarded to ``idx.search`` as ``num_results``.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        Whatever ``idx.search`` returns (for the index built in this
        notebook, a list of document dicts).
    """
    # Per-field weights forwarded to the index as boost_dict.
    boost = {"question": 3.0, "section": 0.5}
    filter_dict = {} if course is None else {"course": course}

    return idx.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
        filter_dict=filter_dict,
    )

In [24]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved documents.

    The prompt is built from explicit lines rather than an indented
    triple-quoted block, so the indentation of this source file never leaks
    into the text sent to the model and every context entry is laid out the
    same way (``section``/``question``/``answer``, one per line, entries
    separated by a blank line).

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            ``"section"``, ``"question"`` and ``"text"``.

    Returns:
        The complete prompt string with surrounding whitespace stripped.

    Raises:
        KeyError: If a search result lacks one of the three keys above.
    """
    # Joining once avoids repeated string concatenation in a loop.
    context = "\n\n".join(
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['text']}"
        for doc in search_results
    )

    prompt_template = (
        "You are a course teaching assistant. "
        "Answer the QUESTION based on the CONTEXT.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT does not contain the answer, "
        "output something generically helpful.\n"
        "\n"
        "QUESTION: {question}\n"
        "CONTEXT:\n"
        "{context}"
    )

    # str.format only parses the template, so braces inside the question or
    # the documents are inserted verbatim.
    return prompt_template.format(question=query, context=context).strip()

In [25]:
def llm(prompt, model="gpt-4o"):
    """Send ``prompt`` to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.
        model: Model name passed to the API. Defaults to ``"gpt-4o"``, the
            value that used to be hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [26]:
def rag(query):
    """Answer ``query`` via retrieve -> build prompt -> generate.

    Retrieval uses ``search``, the prompt is assembled by ``build_prompt``,
    and the text returned by ``llm`` is passed back to the caller.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [39]:
query = "Docker installation of ElasticSearch did not work. How do I fix this"

In [40]:
x = rag(query)

In [41]:
x

"If your Docker installation of Elasticsearch is not working, here are a few general troubleshooting steps you can try:\n\n1. **Check Docker Version**: Ensure that your version of Docker is up-to-date. Elasticsearch typically requires a recent version of Docker.\n\n2. **Memory Allocation**: By default, Elasticsearch requires more memory than Docker might allocate. Check and increase memory allocation in your Docker settings if needed.\n\n3. **ElasticSearch Version**: Make sure you are using a compatible Elasticsearch version. Sometimes, certain versions of Elasticsearch have specific dependencies or requirements.\n\n4. **Docker Logs**: Check the logs by running `docker logs <container_id>` to get more information about what might be going wrong.\n\n5. **Network Configuration**: Ensure that any necessary ports are exposed and not in conflict with existing services on your machine.\n\n6. **Restart Docker**: Occasionally, simply restarting Docker can solve issues.\n\nIf these steps do not

----


RAG Flow with Elasticsearch

In [29]:
from elasticsearch import Elasticsearch

In [30]:
es_client = Elasticsearch("http://localhost:9200")

In [32]:
# Index definition: a single shard without replicas, three analysed text
# fields, and `course` as a keyword field.
index_setting = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

# Create the index with the explicit mapping above, but only when it does not
# exist yet. Creating an index that already exists raises an error, while
# skipping creation altogether lets Elasticsearch infer the mapping on the first
# insert, and `course` would then not be the keyword field that the `term`
# filter in the search queries relies on.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_setting)

In [33]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [34]:
from tqdm.auto import tqdm

# Index every document under a deterministic id (its position in `documents`),
# so re-running this cell overwrites the same entries instead of appending
# another full copy of the corpus under freshly generated ids.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), document=doc)

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
100%|██████████| 948/948 [00:03<00:00, 287.45it/s]


In [60]:
query = "If I don't finish all the homeworks, can I still get teh certificate"

In [61]:
# Request body for the exploratory Elasticsearch query: at most 5 hits, a
# multi_match over question/text/section, restricted to one course.
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    # NOTE(review): "question" carries ^5 here, whereas
                    # elastic_search() further down uses ^3 - confirm which
                    # weight is intended.
                    "fields": ["question^5", "text", "section"],
                    "type": "best_fields",
                }
            },
            "filter": {"term": {"course": "data-engineering-zoomcamp"}},
        }
    },
}

In [62]:
response = es_client.search(index=index_name, body=search_query)

In [63]:
search_query

{'size': 5,
 'query': {'bool': {'must': {'multi_match': {'query': "If I don't finish all the homeworks, can I still get teh certificate",
     'fields': ['question^5', 'text', 'section'],
     'type': 'best_fields'}},
   'filter': {'term': {'course': 'data-engineering-zoomcamp'}}}}}

In [64]:
response["hits"]["hits"][0]["_source"]["text"]

"No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running."

In [65]:
# Keep only the stored document (`_source`) of every hit in the response.
result_docs = [hit["_source"] for hit in response["hits"]["hits"]]

In [66]:
result_docs

[{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
  'section': 'General course-related questions',
  'question': 'Course - Can I get support if I take the course in the self-paced 

In [67]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Query the Elasticsearch index ``index_name`` for documents matching ``query``.

    Args:
        query: Free-text question used in the ``multi_match`` clause.
        course: Value for the ``term`` filter on the ``course`` field.
            Defaults to ``"data-engineering-zoomcamp"``, the value that used
            to be hard-coded. Pass ``None`` to omit the filter.
        num_results: Value sent as ``size`` in the request body. Defaults to
            5, the value that used to be hard-coded.

    Returns:
        A list with the ``_source`` of every hit in the response, in the
        order the hits were returned.
    """
    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question^3", "text", "section"],
                "type": "best_fields",
            }
        },
    }
    if course is not None:
        bool_query["filter"] = {"term": {"course": course}}

    search_query = {"size": num_results, "query": {"bool": bool_query}}

    response = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in response["hits"]["hits"]]

In [68]:
def elastic_rag(query):
    """Answer ``query`` via retrieve -> build prompt -> generate.

    Same flow as ``rag``, except that retrieval goes through
    ``elastic_search``; the text returned by ``llm`` is passed back.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [69]:
output = elastic_rag(query)

In [70]:
output

"The CONTEXT does not specify whether you can get a certificate if you don't finish all the homeworks. Generally, completing all course requirements, including homeworks and projects, is necessary to earn a certificate. You might want to check the specific course requirements or contact your instructor for more precise information."