In [14]:
!wget https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/parse-faq.ipynb

--2025-06-13 00:20:10--  https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/parse-faq.ipynb
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘parse-faq.ipynb’

parse-faq.ipynb         [ <=>                ] 183.76K  --.-KB/s    in 0.01s   

2025-06-13 00:20:11 (13.3 MB/s) - ‘parse-faq.ipynb’ saved [188167]



In [1]:
import os
from openai import OpenAI
import json
from elasticsearch import Elasticsearch
import tqdm
import dotenv


In [2]:
dotenv.load_dotenv(".env")

True

In [3]:
with open("documents.json", "r") as f_in:
    docs_raw = json.load(f_in)

In [4]:
documents = []
for course in docs_raw:
    for doc in course["documents"]:
        doc["course"] = course["course"]
        documents.append(doc)

In [5]:
es_client = Elasticsearch('http://localhost:9200') 

In [6]:
index_name = "course-questions"

In [10]:
es_client.indices.delete(index=index_name)

ObjectApiResponse({'acknowledged': True})

In [11]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [12]:
for doc in tqdm.tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:03<00:00, 264.83it/s]


In [13]:
def search(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [14]:
def build_prompt(question):
    results = search(question)
    context = ""
    for doc in results:
        context += f"""
Q: {doc['question']}
A: {doc['text']}
""".strip()
        context += "\n\n"
    prompt = f"""
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()
    return prompt


In [15]:
query = "How do copy a file to a Docker container?"

In [16]:
prompt = build_prompt(query)

In [17]:
len(prompt)

1446

In [18]:
import tiktoken

In [19]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [20]:
tokens = encoding.encode(prompt)

In [21]:
len(tokens)

320

In [22]:
for token in tokens:
    decoded_token = encoding.decode_single_token_bytes(token)
    print(decoded_token)

b"You're"
b' a'
b' course'
b' teaching'
b' assistant'
b'.'
b' Answer'
b' the'
b' QUESTION'
b' based'
b' on'
b' the'
b' CONT'
b'EXT'
b' from'
b' the'
b' FAQ'
b' database'
b'.\n'
b'Use'
b' only'
b' the'
b' facts'
b' from'
b' the'
b' CONT'
b'EXT'
b' when'
b' answering'
b' the'
b' QUESTION'
b'.\n\n'
b'QUESTION'
b':'
b' How'
b' do'
b' copy'
b' a'
b' file'
b' to'
b' a'
b' Docker'
b' container'
b'?\n\n'
b'CON'
b'TEXT'
b':\n'
b'Q'
b':'
b' How'
b' do'
b' I'
b' debug'
b' a'
b' docker'
b' container'
b'?\n'
b'A'
b':'
b' Launch'
b' the'
b' container'
b' image'
b' in'
b' interactive'
b' mode'
b' and'
b' overriding'
b' the'
b' entry'
b'point'
b','
b' so'
b' that'
b' it'
b' starts'
b' a'
b' bash'
b' command'
b'.\n'
b'docker'
b' run'
b' -'
b'it'
b' --'
b'entry'
b'point'
b' bash'
b' <'
b'image'
b'>\n'
b'If'
b' the'
b' container'
b' is'
b' already'
b' running'
b','
b' execute'
b' a'
b' command'
b' in'
b' the'
b' specific'
b' container'
b':\n'
b'docker'
b' ps'
b' ('
b'find'
b' the'
b' container'
b'-id'
b'

In [23]:
client = OpenAI(api_key=os.getenv("OPENAI_KEY"))

In [24]:
response = client.chat.completions.create(
    model="gpt-4.1-nano",
    messages = [
        {
            "role": "user",
            "content": prompt
        }
    ]
)

In [25]:
answer = response.choices[0].message.content

In [28]:
answer

'You can copy a file to a Docker container using the `docker cp` command. The syntax is:  \n`docker cp /path/to/local/file_or_directory container_id:/path/in/container`'

In [26]:
len(encoding.encode(answer))

39

In [27]:
0.005/1000*320 + 0.015/1000*41

0.002215