In [29]:
import requests
from elasticsearch8 import Elasticsearch
from tqdm.auto import tqdm
import tiktoken

In [85]:
# Download the FAQ documents (grouped per course) used throughout the notebook.
docs_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json'
# A timeout keeps the cell from hanging indefinitely if the host does not answer.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flat list of FAQ entries; it is populated from `documents_raw` further down.
documents = []

In [32]:
# Flatten the per-course structure into a single list of FAQ entries and tag
# every entry with the name of the course it came from.
# The list is rebuilt from scratch and each entry is a copy, so re-running this
# cell neither appends duplicates nor modifies `documents_raw`.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

## Q1. Running Elastic

In [4]:
# Client for the Elasticsearch instance listening on localhost:9200.
es_client = Elasticsearch(hosts=['http://localhost:9200'])

In [9]:
# Name of the index used by every cell below.
index_name = 'course-questions-hw'

# Index definition: one shard, no replicas. The free-text fields are mapped as
# `text`, while `course` is mapped as `keyword`.
index_settings = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 0,
    },
    'mappings': {
        'properties': {
            'text': {'type': 'text'},
            'section': {'type': 'text'},
            'question': {'type': 'text'},
            'course': {'type': 'keyword'},
        },
    },
}

In [10]:
# Start from a clean index so the notebook can be re-run from the top: creating
# an index that already exists fails, and indexing into a leftover index would
# store every document a second time.
# NOTE: this removes any existing index called `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(
    index=index_name,
    settings=index_settings['settings'],
    mappings=index_settings['mappings'],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-hw'})

## Q2. Indexing the data

In [35]:
# Store every FAQ entry in the index, one request per document; tqdm reports progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

100%|██████████| 948/948 [00:09<00:00, 99.49it/s] 


## Q3. Searching

In [None]:
# Search text used for Q3.
query = "How do execute a command on a Kubernetes pod?"

In [89]:
def get_search_query(query, fields=['question^4', 'text'], field_type='best_fields', size=5, course='machine-learning-zoomcamp'):
    """Build the Elasticsearch request body for a FAQ search.

    Args:
        query: Text to search for.
        fields: Fields handed to ``multi_match``; a ``^N`` suffix boosts a field.
        field_type: Value for the ``multi_match`` ``type`` option.
        size: Maximum number of hits to return.
        course: Course name used in a ``term`` filter on the ``course`` field.
            Pass ``None`` to search without a course filter.

    Returns:
        A dict with ``size`` and ``query`` keys, where ``query`` is a ``bool``
        query holding the ``multi_match`` clause under ``must`` and, unless
        ``course`` is ``None``, the course ``term`` filter under ``filter``.
    """
    multi_match = dict(
        query=query,
        # Copy so the returned query never shares the default list object;
        # editing a returned query must not affect later calls.
        fields=list(fields),
        type=field_type,
    )
    bool_query = dict(must=dict(multi_match=multi_match))

    if course is not None:
        # Use the argument rather than a fixed course name so callers can
        # actually choose which course to search.
        bool_query['filter'] = dict(term=dict(course=course))

    return dict(size=size, query=dict(bool=bool_query))

In [90]:
# Build the request body for `query`, leaving every other argument at its default.
search_query = get_search_query(query=query)

In [91]:
# Run the search against the index created above.
res = es_client.search(index=index_name, body=search_query)

In [92]:
# The matching documents are nested under hits -> hits; display the first one.
hits = res['hits']['hits']
hits[0]

{'_index': 'course-questions-hw',
 '_id': '9u72S5cBrNRyWu8uLPSq',
 '_score': 84.050095,
 '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'}}

## Q4. Filtering

In [93]:
# Search text used for Q4.
query = "How do copy a file to a Docker container?"

In [94]:
# Build the request body for the new `query`, again with the default arguments.
search_query = get_search_query(query=query)

In [99]:
# Run the search and display the first three hits.
# NOTE: `hits` keeps every returned document; only the displayed slice is limited to three.
res = es_client.search(index=index_name, body=search_query)
hits = res['hits']['hits']
hits[:3]

[{'_index': 'course-questions-hw',
  '_id': '9u72S5cBrNRyWu8uLPSq',
  '_score': 73.38676,
  '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'course-questions-hw',
  '_id': 'Fe72S5cBrNRyWu8uLfXI',
  '_score': 66.688705,
  '_source': {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local

## Q5. Building a prompt

In [100]:
# Template for one FAQ entry inside the prompt context.
# .strip() removes the newlines that surround the triple-quoted text.
context_template = """
Q: {question}
A: {text}
""".strip()

In [101]:
# Render every hit with `context_template` and concatenate the entries, leaving
# a blank line after each one (including the last).
context = "".join(
    context_template.format(question=source["question"], text=source["text"]) + "\n\n"
    for source in (hit['_source'] for hit in tqdm(hits))
)

100%|██████████| 5/5 [00:00<00:00, 3299.48it/s]


In [102]:
# Prompt sent to the model: instructions, then the user question, then the FAQ context.
# .strip() removes the newlines that surround the triple-quoted text.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [103]:
# Question that is inserted into the prompt.
query = "How do I execute a command in a running docker container?"

In [104]:
# Fill the template with the question and the FAQ context built earlier.
prompt = prompt_template.format(question=query, context=context)

In [105]:
# Prompt length in characters.
len(prompt)

2177

## Q6. Tokens

In [107]:
# Tokenizer that tiktoken associates with the gpt-4o model.
encoding = tiktoken.encoding_for_model('gpt-4o')

In [108]:
# Token ids of the prompt; surrounding whitespace is stripped before encoding.
encodings = encoding.encode(prompt.strip())

In [109]:
# Number of tokens in the prompt.
len(encodings)

492