In [7]:
import json
import os
import google.generativeai as genai
from elasticsearch import Elasticsearch
import requests
from tqdm.auto import tqdm

In [8]:
# getting the data

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [11]:
#es_client.indices.delete(index=index_name)

In [16]:
# q2 - indexing the data

es_client = Elasticsearch('http://localhost:9200')

index_settings = {
    "settings" : {
        "number_of_shards" : 1,
        "number_of_replicas" : 0
    },
    "mappings": {
        "properties" : {
            "text" : {"type" : "text"},
            "section" : {"type" : "text"},
            "question" : {"type" : "text"},
            "course" : {"type" : "keyword"}
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index = index_name, body = index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [17]:
for doc in tqdm(documents):
    es_client.index(index = index_name, document = doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [18]:
# q3 - searching
query = "How do execute a command on a Kubernetes pod?"

In [19]:
def elastic_search(query):
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
               #  "filter": {
               #      "term": {
               #          "course": "data-engineering-zoomcamp"
               #      }
               # }
            }
        }
    }
    # ctrl + / for commenting a block of code

    response = es_client.search(index = index_name, body = search_query)

    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return response

In [20]:
elastic_search(query)

ObjectApiResponse({'took': 203, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 551, 'relation': 'eq'}, 'max_score': 43.415455, 'hits': [{'_index': 'course-questions', '_id': 'kYWNWJcBCHLGK404huws', '_score': 43.415455, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}}, {'_index': 'course-questions', '_id': 'sYWNWJcBCHLGK404huyi', '_score': 33.15944, '_source': {'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nI

In [21]:
# q4 - filtering
query = "How do copy a file to a Docker container?"

In [22]:
def elastic_search(query):
    
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
               }
            }
        }
    }
    # ctrl + / for commenting a block of code

    response = es_client.search(index = index_name, body = search_query)

    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [23]:
elastic_search(query)

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\nHrithik Kumar Advani",
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I copy files from my local machine to docker container?',
 

In [33]:
# q5 - building a prompt
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    #This part takes the 'results' from minsearch and formats them into a single string.
    llmcontext = ""

    for doc in search_results:
        # llmcontext += f"section: {doc['section']}\n"
        llmcontext += f"q: {doc['question']}\n"
        llmcontext += f"a: {doc['text']}\n\n"
    
    # Integrating the user's question 'q' and the 'context'
    prompt = prompt_template.format(question = 'How do I execute a command in a running docker container?', context = llmcontext.strip())

    return prompt

In [34]:
search_results = elastic_search(query)
prompt_response = build_prompt(query, search_results)

len(prompt_response)

1478

In [35]:
# q6 - tokens - using gemini api key instead of openai
api_key = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key = api_key)
model = genai.GenerativeModel('gemini-1.5-flash-latest')

In [36]:
tokens = model.count_tokens(prompt_response)

tokens

total_tokens: 359

HOMEWORK FOR MODULE 1