In [1]:
import json
# Decode explicitly as UTF-8: without `encoding=` Python uses the platform's
# locale encoding, which can fail or mis-decode non-ASCII text in the file.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [2]:
# Flatten the per-course structure into one list of FAQ entries.
# Each entry is tagged in place with the course it belongs to, so the same
# dict objects inside `docs_raw` gain a 'course' key as well.
documents = []

for course_entry in docs_raw:
    for faq in course_entry['documents']:
        faq.update(course=course_entry['course'])
        documents += [faq]

In [3]:
import os
from openai import OpenAI

# Set your Groq API key in the GROQ_API_KEY environment variable.
# The key is read with os.environ[...] so that a missing variable fails right
# here with a KeyError. With os.getenv the client would receive None and fall
# back to OPENAI_API_KEY, which would send an unrelated key to this endpoint.
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.environ["GROQ_API_KEY"]
)

# System-role text sent with every chat completion that uses this default.
llm_role = (
    "Helpful assistant who provides responses without the preface text of "
    "\"Based on the provided context, here's a concise response to your question:\", etc."
)

# Layout of the user message: the question first, then the retrieved context.
prompt_template = "\n".join([
    "QUESTION: {question}",
    "",
    "CONTEXT:",
    "{context}",
])

In [4]:
def search(query):
    """Search the `index` object for FAQ entries matching `query`.

    The search is restricted to the 'data-engineering-zoomcamp' course,
    boosts the 'question' field (3.0) over 'section' (0.5) and asks for
    5 results. Whatever `index.search` returns is passed back unchanged.

    NOTE(review): `index` is not defined in any cell visible in this
    notebook, so calling this raises NameError on a fresh kernel unless it
    is created elsewhere. `erag` uses `elastic_search` instead.
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [5]:
def build_prompt(query, search_results):
    """Build the user prompt from a question and the retrieved FAQ entries.

    Args:
        query: The user's question.
        search_results: Either an iterable of result dicts carrying
            'question' and 'text' keys, or an already-rendered context
            string.

    Returns:
        The prompt text: the question followed by a CONTEXT section in which
        each result is rendered as a "Q: ... / A: ..." pair, with a blank
        line between pairs.
    """
    prompt_template = """
QUESTION: {question}

CONTEXT:
{context}
""".strip()

    if isinstance(search_results, str):
        # Already-rendered context: pass it through untouched.
        context = search_results
    else:
        # Render each hit as readable text instead of interpolating the
        # Python repr of the whole list (quotes, braces, escaped newlines),
        # which wastes tokens and is harder for the model to read.
        entries = []
        for doc in search_results:
            if isinstance(doc, dict) and 'question' in doc and 'text' in doc:
                entries.append(f"Q: {doc['question']}\nA: {doc['text']}".strip())
            else:
                # Unknown shape: fall back to its string form rather than fail.
                entries.append(str(doc))
        context = "\n\n".join(entries)

    prompt=prompt_template.format(question=query,context=context).strip()
    return prompt

In [6]:
def llm(prompt, llm_role, model="mistral-saba-24b", temperature=0.3, max_tokens=300):
    """Send one system + user message pair to the chat endpoint.

    Args:
        prompt: Text sent as the user message.
        llm_role: Text sent as the system message.
        model: Model identifier passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": llm_role},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )

    return response.choices[0].message.content

In [7]:
from elasticsearch import Elasticsearch

In [8]:
# Client for the Elasticsearch node listening on localhost:9200.
es_client = Elasticsearch(hosts='http://localhost:9200')

In [9]:
# Name of the index used by the create, ingest and search cells.
index_name = "course-questions"

# Index definition: one shard, no replicas. The three free-text fields are
# mapped as `text`; `course` is mapped as `keyword`, and the search cells
# filter on it with a `term` query.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

In [10]:
# Create the index only when it is missing, and report which path was taken.
if es_client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists.")
else:
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"Index '{index_name}' created.")


Index 'course-questions' already exists.


In [11]:
from tqdm.auto import tqdm

In [12]:
# Index every FAQ entry under a deterministic id (its position in `documents`).
# Without an explicit id each call creates a new document, so re-running this
# cell against an existing index stored another full copy of the corpus and
# filled the top search hits with duplicates. With a fixed id a re-run
# overwrites the same documents instead.
# Copies already stored under auto-generated ids by earlier runs are not
# removed by this; delete the index once to clear them.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [13]:
# Sample user question used to exercise the RAG pipeline below.
query = "I just discovered the course, can I still join it?"

In [14]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Run a boosted multi-field FAQ search against Elasticsearch.

    The query is matched against 'question' (boosted x4), 'text' and
    'section' using a `best_fields` multi_match.

    Args:
        query: Free-text question to search for.
        course: Exact value of the `course` field to restrict results to.
            Pass None to search across all courses.
        num_results: Maximum number of hits to return.

    Returns:
        A list with the `_source` dict of each hit, in the order returned
        by Elasticsearch.
    """
    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question^4", "text", "section"],
                "type": "best_fields"
            }
        }
    }
    if course is not None:
        # The term filter narrows the result set to one course.
        bool_query["filter"] = {"term": {"course": course}}

    search_query = {"size": num_results, "query": {"bool": bool_query}}
    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [15]:
def erag(query):
    """Answer `query` end to end: retrieve, build the prompt, ask the LLM.

    Retrieval goes through `elastic_search`, the prompt is assembled by
    `build_prompt`, and the model is called through `llm` with the system
    role defined locally below (it does not use the module-level
    `llm_role`). Returns whatever `llm` returns.
    """
    hits = elastic_search(query)
    # The first line deliberately ends with a space before the line break,
    # matching the original role text exactly.
    llm_role = (
        "Helpful British assistant who provides robust responses without any preface text like \n"
        "\"Based on the provided context, here's a concise response to your question:\", etc.\n"
        "After every answer, provide a random factoid about bread that is related to the question's topic."
    )
    return llm(build_prompt(query, hits), llm_role)

In [16]:
# Run the whole pipeline (Elasticsearch retrieval -> prompt -> LLM) on the
# sample question held in `query`; the returned answer is the cell output.
erag(query)

"Yes, you can still join the course after the start date. Even if you don't register, you're still eligible to submit the homeworks. However, be aware that there will be deadlines for turning in the final projects, so don't leave everything for the last minute.\n\nFun fact: The world's largest bread is a 5,000-pound loaf created in 2016 in the UK."

In [17]:
# Search all courses (no `course` filter) and show the relevance score of
# the best-ranked hit.
query = "How do execute a command on a Kubernetes pod?"
search_query = dict(
    size=5,
    query=dict(
        bool=dict(
            must=dict(
                multi_match=dict(
                    query=query,
                    fields=["question^4", "text", "section"],
                    type="best_fields",
                )
            )
        )
    ),
)
response = es_client.search(index=index_name, body=search_query)
response['hits']['hits'][0]['_score']

44.56891

In [18]:
# Same search shape, limited to three hits from the machine-learning-zoomcamp
# course; the cell output is the question of the third hit.
query = "How do copy a file to a Docker container?"
search_query = dict(
    size=3,
    query=dict(
        bool=dict(
            must=dict(
                multi_match=dict(
                    query=query,
                    fields=["question^4", "text", "section"],
                    type="best_fields",
                )
            ),
            filter=dict(term=dict(course="machine-learning-zoomcamp")),
        )
    ),
)
response = es_client.search(index=index_name, body=search_query)
result_docs = [hit['_source'] for hit in response['hits']['hits']]

result_docs[2]['question']

'How do I debug a docker container?'

In [19]:
# How one retrieved FAQ entry is rendered inside the context.
context_template = "Q: {question}\nA: {text}"

# Full prompt layout: instructions, then the question, then the context.
prompt_template = "\n".join([
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
    "Use only the facts from the CONTEXT when answering the QUESTION.",
    "",
    "QUESTION: {question}",
    "",
    "CONTEXT:",
    "{context}",
])

# Append each rendered entry after a blank-line separator. The leading
# separator this leaves on `context` is removed by .strip() when the prompt
# is built.
context_string = ""
context = ""
for res in result_docs:
    context_string = context_template.format(**res).strip()
    context += "\n\n" + context_string

prompt = prompt_template.format(question=query, context=context.strip())
len(prompt)

1306

In [20]:
import tiktoken
# Tokenizer that tiktoken associates with the "gpt-4o" model name.
# NOTE(review): llm() in this notebook calls "mistral-saba-24b", which
# presumably tokenizes differently, so counts made with this encoding are
# an estimate for that model - confirm if exact numbers matter.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [21]:
# Number of tokens the prompt encodes to under `encoding`.
len(encoding.encode(prompt))

296

In [22]:
# Send the hand-built prompt to the model. `llm_role` here is the
# module-level system role from the setup cell, not the role string that is
# local to erag().
llm(prompt, llm_role)

'To copy a file to a Docker container, you can use the `docker cp` command. This command allows you to copy files or directories between a container and the local filesystem.\n\nHere is the syntax:\n```\ndocker cp <source_path> <container_id>:<destination_path>\n```\n\nFor example, to copy a file named `example.txt` from your local machine to the `/app` directory inside a running container with the ID `container_id`, you would use:\n```\ndocker cp example.txt container_id:/app/\n```\n\nIf you need to copy a directory, the process is similar:\n```\ndocker cp <source_directory> <container_id>:<destination_directory>\n```\n\nFor example, to copy a directory named `myapp` from your local machine to the `/app` directory inside a running container with the ID `container_id`, you would use:\n```\ndocker cp myapp container_id:/app/\n```\n\nMake sure to replace `<source_path>`, `<container_id>`, and `<destination_path>` with the appropriate values for your specific use case.'