In [68]:
# Question 1
# Run Elastic Search 8.4.3, and get the cluster information. If you run it on localhost
# What's the version.build_hash value?

!curl localhost:9200

{
  "name" : "428ab23ee454",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "9afoeuE7RaeqJBo8lKsKiA",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   539  100   539    0     0   4535      0 --:--:-- --:--:-- --:--:--  4567


In [69]:
import requests 

# Raw FAQ data: a list of courses, each holding its own list of documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with the name of its course.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Copy instead of mutating the parsed payload so documents_raw stays
        # exactly as downloaded; 'course' is still the last key of each record.
        documents.append({**doc, 'course': course_name})

In [70]:
# Peek at the first flattened record; it should carry the added 'course' field.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [71]:
# Question 2
# Which function do you use for adding your data to Elasticsearch?
# Answer: Index

In [72]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

# Client for the Elasticsearch node reachable over HTTP at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 


In [73]:
# Index definition: one primary shard, no replicas.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Free-text fields are analysed for full-text matching.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword = stored verbatim, which is what an exact "term" filter needs.
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed:
# without this, indices.create raises because the index already exists, and
# indexing into the old index again would store every document a second time.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

for doc in tqdm(documents):
    # Store each FAQ record as its own document; no id is supplied here.
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:54<00:00, 17.37it/s]


In [86]:
def elastic_search(query, filter, num_of_results, index_name="course-questions", client=None):
    """Search the FAQ index and return the raw Elasticsearch hits.

    Args:
        query: Free-text question matched against the ``question`` and ``text``
            fields; matches on ``question`` are boosted by a factor of 4.
        filter: Exact value the ``course`` field must equal, or ``None`` to search
            across every course. (The name shadows the builtin ``filter``; it is
            kept so existing positional and keyword calls continue to work.)
        num_of_results: Maximum number of hits to return.
        index_name: Name of the index to search.
        client: Object exposing ``search(index=..., body=...)``. When omitted, the
            notebook-level ``es_client`` is used.

    Returns:
        list: The entries of ``response['hits']['hits']``, unchanged. Callers in
        this notebook read ``_score`` and ``_source`` from each entry.
    """
    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question^4", "text"],  # "^4" boosts question matches
                "type": "best_fields"
            }
        }
    }
    # Only restrict by course when a course was actually requested.
    if filter is not None:
        bool_query["filter"] = {
            "term": {
                "course": filter
            }
        }

    search_query = {
        "size": num_of_results,
        "query": {"bool": bool_query}
    }

    # Resolve the default lazily so the global is only needed when no client is passed.
    if client is None:
        client = es_client

    response = client.search(index=index_name, body=search_query)

    return list(response['hits']['hits'])

In [87]:
# Question 3
# Run the query "How do I execute a command in a running docker container?"
# against the question and text fields only, boosting question by 4 and using
# "type": "best_fields".
#
# What is the score of the top-ranking result?
#
# NOTE(review): the question text does not mention a course filter, yet this call
# restricts the search to data-engineering-zoomcamp — confirm that is intended.

query = 'How do I execute a command in a running docker container?'
result = elastic_search(
    query,
    filter="data-engineering-zoomcamp",
    num_of_results=5,
)
print(f"Top Score: {result[0]['_score']}")

Top Score: 75.54128


In [89]:
# Question 4
# Same query, now limited to the machine-learning-zoomcamp course.
# Return 3 results. What is the 3rd question returned by the search engine?

query = 'How do I execute a command in a running docker container?'
search_result = elastic_search(
    query,
    filter="machine-learning-zoomcamp",
    num_of_results=3,
)
print(f"The third question: {search_result[2]['_source']['question']}")

The third question: How do I copy files from a different folder into docker container’s working directory?


In [90]:
# Question 5
# Take the records returned from Elasticsearch in Q4 and use the context template
# (section / question / answer) to build the context.
# Separate context entries by two linebreaks (\n\n).
# Then combine that context with the question "How do I execute a command in a
# running docker container?" to construct a prompt using the prompt template below.

# What is the length of the resulting prompt? (Use the len function.)

def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for ``query`` from Elasticsearch hits.

    Each hit contributes a ``section`` / ``question`` / ``answer`` entry read from
    its ``_source``; entries are separated by a blank line and placed under
    CONTEXT. Leading and trailing whitespace is stripped from the final prompt.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = []
    for hit in search_results:
        source = hit['_source']
        entries.append(
            f"section: {source['section']}\n"
            f"question: {source['question']}\n"
            f"answer: {source['text']}"
        )

    # Joining with a blank line gives the same text as appending "\n\n" after
    # every entry, because the final strip() drops a trailing separator anyway.
    return template.format(question=query, context="\n\n".join(entries)).strip()

# Build the prompt from the Q4 query and its three hits, then report its size
# in characters.
prompt = build_prompt(query=query, search_results=search_result)

print(f"length of the prompt: {len(prompt)}")


length of the prompt: 1637


In [78]:
# Question 6

# When we use the OpenAI Platform, we're charged by the number of tokens we send
# in our prompt and receive in the response.
# The OpenAI Python package uses tiktoken for tokenization.
# Let's calculate the number of tokens in our prompt:
# use the encode function. How many tokens does our prompt have?

import tiktoken
# Look up the tokenizer that tiktoken associates with the gpt-4o model.
encoding = tiktoken.encoding_for_model("gpt-4o")
# The length of the encoded prompt is reported as its token count.
print(f"tokens that we have from our prompt there are:{len(encoding.encode(prompt))}")

tokens that we have from our prompt there are:356
