In [2]:
import minsearch

In [3]:
import json

In [4]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [5]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [8]:
q = 'the course has already started, can I still enroll?'

In [9]:
index.fit(documents)

<minsearch.Index at 0x7e2482d8cad0>

In [10]:
from openai import OpenAI

In [11]:
client = OpenAI()

In [12]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

In [13]:
response.choices[0].message.content

"It depends on the specific course and the institution offering it. Many courses have a late enrollment period that allows students to join after the official start date, often with certain conditions or limitations. It's best to contact the course instructor or the admissions office of the institution offering the course to inquire about late enrollment options and any potential consequences, such as missed coursework or additional fees."

In [14]:
def search(query): 
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [15]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION. 

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context = ""

    for doc in search_results: 
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [16]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [17]:
query = 'how do i run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [18]:
rag(query)

"To run Kafka, if you are working with Java, you should execute the following command in your project directory:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nIf you are working with Python, consider creating a virtual environment and running the requirements.txt in that environment. This ensures that all necessary dependencies are available for running your Kafka producer/consumer:\n\n```bash\n# Create a virtual environment (run only once)\npython -m venv env\n\n# Activate the virtual environment\nsource env/bin/activate\n\n# Install required packages\npip install -r ../requirements.txt\n```\n\nEnsure you activate the virtual environment every time you need to run the Python files:\n\n```bash\nsource env/bin/activate\n```\n\nDeactivate the virtual environment when you're done:\n\n```bash\ndeactivate\n```\n\nThis process works on MacOS and Linux. For Windows, the activation command is slightly different:\n\n```bash\n

In [19]:
rag('the course has already started, can i still enroll?')

"Yes, you can still enroll in the course after it has started. Even if you don't register right away, you are still eligible to submit the homework assignments. However, be mindful of the deadlines for submitting the final projects, so it’s best not to leave everything until the last minute."

In [20]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [21]:
from elasticsearch import Elasticsearch

In [22]:
es_client = Elasticsearch('http://localhost:9200')

In [23]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [24]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [25]:
from tqdm.auto import tqdm

In [26]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [27]:
query = 'I just disovered the course. Can I still join it?'

In [28]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [29]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [30]:
rag(query)

'Yes, you can still join the course even if it has already started. You are eligible to submit the homework without registration. However, be mindful of the deadlines for turning in the final projects and avoid leaving everything for the last minute.'

# Homework

### Q1

Run Elastic Search 8.17.6, and get the cluster information. If you run it on localhost, this is how you do it:

```curl localhost:9200 ```

What's the ```version.build_hash``` value?

```$ curl localhost:9200
{
  "name" : "c77b8b921161",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "BU90ushSTFqQ7QY_RiebYg",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
} ```

In [31]:
import requests

https://raw.githubusercontent.com/milanimcgraw/LLM-Zoomcamp-2025/refs/heads/main/Module%201%3A%20Introduction/documents.json

In [32]:
#fetch docs
docs_url = 'https://raw.githubusercontent.com/milanimcgraw/LLM-Zoomcamp-2025/refs/heads/main/Module%201%3A%20Introduction/documents.json'

docs_response = requests.get(docs_url)

documents_raw = docs_response.json()

In [33]:
#prep docs
documents = []
for course in documents_raw:
    course_name = course['course']
    
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

### Q2

Which function do you use for adding your data to elastic?

- ```insert```
- ```index```
- ```put```
- ```add```

In [68]:
#index settings/mappings
index_name = "faq-index"

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "question": {"type": "text"},
            "section": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}


In [69]:
#connect to Elasticsearch
es = Elasticsearch("http://localhost:9200")

In [70]:
#create index
es.indices.create(index=index_name, body=index_settings, ignore=400)

  es.indices.create(index=index_name, body=index_settings, ignore=400)


ObjectApiResponse({'error': {'root_cause': [{'type': 'resource_already_exists_exception', 'reason': 'index [faq-index/rsSYsQNNRpmBX7aaIgOVcA] already exists', 'index_uuid': 'rsSYsQNNRpmBX7aaIgOVcA', 'index': 'faq-index'}], 'type': 'resource_already_exists_exception', 'reason': 'index [faq-index/rsSYsQNNRpmBX7aaIgOVcA] already exists', 'index_uuid': 'rsSYsQNNRpmBX7aaIgOVcA', 'index': 'faq-index'}, 'status': 400})

In [71]:
#index docs
for i, doc in enumerate(documents):
    es.index(index=index_name, id=i, body=doc)

### Q3

We will execute a query "How do execute a command on a Kubernetes pod?".

Use only ```question``` and ```text``` fields and give ```question``` a boost of 4, and use ```"type": "best_fields"```.

What's the score for the top ranking result?

- 84.50
- 64.50
- 44.50
- 24.50
- 
Look at the ```_score``` field.

In [81]:
#searching
query = {
    "query": {
        "multi_match": {
            "query": "How do execute a command on a Kubernetes pod?",
            "fields": ["question^4", "text"],
            "type": "best_fields"
        }
    }
}

response = es.search(index=index_name, body=query)

top_score = response['hits']['hits'][0]['_score']

print(top_score)

44.50556


### Q4

Now ask a different question: "How do copy a file to a Docker container?".

This time we are only interested in questions from ```machine-learning-zoomcamp```.

Return 3 results. What's the 3rd question returned by the search engine?

- How do I debug a docker container?
- How do I copy files from a different folder into docker container’s working directory?
- How do Lambda container images work?
- How can I annotate a graph?

In [89]:
#filtering
query =  {
    "size": 3,  # limit to 3 results here
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": "How do copy a file to a Docker container?",
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}


response = es.search(index=index_name, body=query)

for hit in response["hits"]["hits"]:
    print(hit["_score"], hit["_source"].get("question"))

73.38676 How do I debug a docker container?
66.688705 How do I copy files from my local machine to docker container?
59.812744 How do I copy files from a different folder into docker container’s working directory?


### Q5

Now we're ready to build a prompt to send to an LLM.

Take the records returned from Elasticsearch in Q4 and use this template to build the context. Separate context entries by two linebreaks ```(\n\n)```

```python
context_template = """
Q: {question}
A: {text}
""".strip()

In [97]:
# q&a context template for each result
context_template = """Q: {question}
A: {text}""".strip()

In [98]:
# join context parts
context_parts = []
for hit in response["hits"]["hits"]:
    q = hit["_source"]["question"]
    a = hit["_source"]["text"]
    context_parts.append(context_template.format(question=q, text=a))

context = "\n\n".join(context_parts)


In [99]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [100]:
query = "How do copy a file to a Docker container?"

final_prompt = prompt_template.format(question=query, context=context)

In [101]:
print(len(final_prompt))

1446


### Q6
When we use the OpenAI Platform, we're charged by the number of tokens we send in our prompt and receive in the response.

The OpenAI Python package uses `tiktoken` for tokenization:

```bash
pip install tiktoken
```

Let's calculate the number of tokens in our query:

```python
encoding = tiktoken.encoding_for_model("gpt-4o")
```

Use the `encode` function. How many tokens does our prompt have?

- 120  
- 220  
- 320  
- 420  

> 💡 **Note**: To decode a token back into a word, you can use the `decode_single_token_bytes` function:

```python
encoding.decode_single_token_bytes(63842)
```


In [103]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Downloading tiktoken-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m796.9/796.9 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, tiktoken
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [tiktoken]
[1A[2KSuccessfully installed regex-2024.11.6 tiktoken-0.9.0
Note: you may need to restart the kernel to use update

In [104]:
import tiktoken

In [105]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [107]:
token_count = len(encoding.encode(final_prompt))

print("Token count:", token_count)

Token count: 320
