In [3]:
pip install requests

Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Downloading charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl (105 kB)
Downloading idna-3.10-py3-none-any.whl (70 kB)
Installing collected packages: idna, charset-normalizer, requests

   ---------------------------------------- 0/3 [idna]
   ------------- -------------------------- 1/3 [charset-normalizer]
   -------------------------- ------------- 2/3 [requests]
   ---------------------------------------- 3/3 [requests]

Successfully installed charset-normalizer-3.4.2 idna-3.10 requests-2.32.3
Note: you may need to restart the kernel to use updated packages.


In [9]:
pip uninstall -y elasticsearch

Found existing installation: elasticsearch 9.0.1
Uninstalling elasticsearch-9.0.1:
  Successfully uninstalled elasticsearch-9.0.1
Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install elasticsearch==8.18.0

Collecting elasticsearch==8.18.0
  Downloading elasticsearch-8.18.0-py3-none-any.whl.metadata (9.2 kB)
Downloading elasticsearch-8.18.0-py3-none-any.whl (895 kB)
   ---------------------------------------- 0.0/895.2 kB ? eta -:--:--
   ---------------------------------------- 0.0/895.2 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/895.2 kB ? eta -:--:--
   ----------------------- ---------------- 524.3/895.2 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 895.2/895.2 kB 1.2 MB/s eta 0:00:00
Installing collected packages: elasticsearch
Successfully installed elasticsearch-8.18.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from elasticsearch import Elasticsearch

# For Elasticsearch 8.x: build a client for the node at http://localhost:9200.
# Only the URL is passed; no credentials or TLS options are configured here.
es = Elasticsearch("http://localhost:9200")

# Check connection: print whatever es.ping() returns (True in the recorded run).
print(f"Connected to Elasticsearch: {es.ping()}")

Connected to Elasticsearch: True


In [1]:
# Check your elasticsearch Python package version.
# In the recorded run __version__ printed as the tuple (8, 18, 0), which matches
# the elasticsearch==8.18.0 install cell.
import elasticsearch
print(f"Elasticsearch Python client version: {elasticsearch.__version__}")

Elasticsearch Python client version: (8, 18, 0)


In [4]:
import requests 
from elasticsearch import Elasticsearch
import json

# Get the documents
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait so a stalled download cannot hang the kernel, and fail loudly on an
# HTTP error instead of trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Display the structure of the data (helpful in VS Code)
print(f"Number of courses: {len(documents_raw)}")
print(f"Sample course: {documents_raw[0]['course']}")
print(f"Sample document: {json.dumps(documents_raw[0]['documents'][0], indent=2)}")

# Process the documents: flatten the per-course lists into one list and tag every
# document with the name of the course it came from. Each entry is a new dict, so
# `documents_raw` keeps the shape it was downloaded with.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

print(f"Total documents: {len(documents)}")

Number of courses: 3
Sample course: data-engineering-zoomcamp
Sample document: {
  "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
  "section": "General course-related questions",
  "question": "Course - When will the course start?"
}
Total documents: 948


In [5]:
# Bulk helper from the already-installed elasticsearch client package
from elasticsearch.helpers import streaming_bulk

# Connect to Elasticsearch
es = Elasticsearch("http://localhost:9200")

# Check if connection is successful; stop here rather than letting the index
# calls below fail later with a less obvious error.
if not es.ping():
    # If you're having connection issues, make sure Elasticsearch is running
    # and check if there are any network restrictions
    raise ConnectionError("Could not connect to Elasticsearch")
print("Connected to Elasticsearch")

# Define index settings
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Free-text fields
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword type, so the exact-value term filter used by the filtered search cell works
            "course": {"type": "keyword"}
        }
    }
}

# Name of the index the search cells query
index_name = "faq_documents"

# Delete the index if it already exists, so re-running this cell starts from an empty index
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Deleted existing index: {index_name}")

# Create the index
es.indices.create(index=index_name, body=index_settings)
print(f"Created index: {index_name}")

# Index the documents in bulk instead of one HTTP request per document.
# The list position is kept as the document id, as before.
actions = (
    {"_index": index_name, "_id": i, "_source": doc}
    for i, doc in enumerate(documents)
)

# streaming_bulk yields one result per document and, with its default settings,
# raises if a document fails to index, so every yielded item counts as indexed.
for indexed, _ in enumerate(streaming_bulk(es, actions), start=1):
    # Print progress every 50 documents (helpful in VS Code)
    if indexed % 50 == 0:
        print(f"Indexed {indexed}/{len(documents)} documents")

# Make the new documents visible to search immediately; without this, a search
# issued right after indexing can miss the most recently written documents.
es.indices.refresh(index=index_name)

print(f"Indexed all {len(documents)} documents")

Connected to Elasticsearch
Deleted existing index: faq_documents
Created index: faq_documents
Indexed 50/948 documents
Indexed 100/948 documents
Indexed 150/948 documents
Indexed 200/948 documents
Indexed 250/948 documents
Indexed 300/948 documents
Indexed 350/948 documents
Indexed 400/948 documents
Indexed 450/948 documents
Indexed 500/948 documents
Indexed 550/948 documents
Indexed 600/948 documents
Indexed 650/948 documents
Indexed 700/948 documents
Indexed 750/948 documents
Indexed 800/948 documents
Indexed 850/948 documents
Indexed 900/948 documents
Indexed all 948 documents


In [None]:
# Q3: free-text search over every course in the index.
query = "How do execute a command on a Kubernetes pod?"
search_query = {
    "size": 5,  # return at most 5 hits
    "query": {
        "multi_match": {
            "query": query,
            # "question^4" boosts matches in the question field 4x relative to "text"
            "fields": ["question^4", "text"],
            "type": "best_fields"
        }
    }
}
response = es.search(index=index_name, body=search_query)
hits = response['hits']['hits']
# Print the results in a readable format
print(f"Total hits: {response['hits']['total']['value']}")
print("\nTop 5 results:")
for i, hit in enumerate(hits):
    print(f"\nResult {i+1}:")
    print(f"Score: {hit['_score']}")
    print(f"Course: {hit['_source']['course']}")
    print(f"Question: {hit['_source']['question']}")
    print(f"Text: {hit['_source']['text'][:100]}...")  # Show first 100 chars
# Print the top result score; guard against an empty result list, where
# indexing hits[0] would raise IndexError.
if hits:
    top_score = hits[0]['_score']
    print(f"\nTop score (answer to Q3): {top_score}")
else:
    top_score = None
    print("\nNo results returned")

Total hits: 739

Top 5 results:

Result 1:
Score: 44.50556
Course: machine-learning-zoomcamp
Question: How do I debug a docker container?
Text: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a ba...

Result 2:
Score: 35.433445
Course: machine-learning-zoomcamp
Question: Kubernetes-dashboard
Text: Deploy and Access the Kubernetes Dashboard
Luke...

Result 3:
Score: 33.70974
Course: machine-learning-zoomcamp
Question: How do I copy files from a different folder into docker container’s working directory?
Text: You can copy files from your local machine into a Docker container using the docker cp command. Here...

Result 4:
Score: 33.2635
Course: machine-learning-zoomcamp
Question: How to run a script while a web-server is working?
Text: Problem description:
I started a web-server in terminal (command window, powershell, etc.). How can ...

Result 5:
Score: 32.589073
Course: machine-learning-zoomcamp
Question: How can I annotate a graph?
Tex

In [7]:
# Q4: the same boosted multi_match search, restricted to one course by a term filter.
query = "How do copy a file to a Docker container?"
search_query = {
    "size": 3,
    "query": {
        "bool": {
            # Scored clause: matches in "question" weigh 4x those in "text"
            "must": {
                "multi_match": {"query": query, "fields": ["question^4", "text"], "type": "best_fields"}
            },
            # Exact-value restriction on the keyword field "course"
            "filter": {"term": {"course": "machine-learning-zoomcamp"}},
        }
    },
}
response = es.search(index=index_name, body=search_query)

# Summary line and heading
print(f"Total hits: {response['hits']['total']['value']}")
print("\nTop 3 results from machine-learning-zoomcamp:")

# Keep the hit list around; it is reused for Q5
filtered_results = response['hits']['hits']

# One four-line entry per hit
for i, hit in enumerate(filtered_results):
    print(
        f"\nResult {i+1}:",
        f"Score: {hit['_score']}",
        f"Question: {hit['_source']['question']}",
        f"Text: {hit['_source']['text'][:100]}...",  # first 100 characters only
        sep="\n",
    )

# The Q4 answer is the question of the third hit, when there is one
if len(filtered_results) < 3:
    print("\nLess than 3 results returned")
else:
    third_question = filtered_results[2]['_source']['question']
    print(f"\nThird question (answer to Q4): {third_question}")

Total hits: 340

Top 3 results from machine-learning-zoomcamp:

Result 1:
Score: 73.38676
Question: How do I debug a docker container?
Text: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a ba...

Result 2:
Score: 66.688705
Question: How do I copy files from my local machine to docker container?
Text: You can copy files from your local machine into a Docker container using the docker cp command. Here...

Result 3:
Score: 59.812744
Question: How do I copy files from a different folder into docker container’s working directory?
Text: You can copy files from your local machine into a Docker container using the docker cp command. Here...

Third question (answer to Q4): How do I copy files from a different folder into docker container’s working directory?
