In [None]:
# pip install fast-autocomplete

### Searching the documents

In [1]:
# Loads the FAQ document in json format
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-06-27 23:38:34--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 20.26.156.215
Connecting to github.com (github.com)|20.26.156.215|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-06-27 23:38:34--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-06-27 23:38:34 (103 MB/s) - ‘documents.json’ saved [658332/658332]



In [2]:
# Displays the head of the document
!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {


In [3]:
# Import the JSON module for working with JSON data
import json

# Load the document
with open('./documents.json', 'rt') as f_in:
    # Read the JSON file and parse it into a Python dictionary
    documents_file = json.load(f_in)

# Initialize an empty list to hold the processed documents
documents = []

# Iterate through each course entry in the loaded JSON data
for course in documents_file:
    # Extract the course name from the current course entry
    course_name = course['course']

    # Iterate through each document associated with the current course
    for doc in course['documents']:
        # Add the course name to the current document's data
        doc['course'] = course_name
        
        # Append the updated document to the documents list
        documents.append(doc)


In [4]:
# Access and print the third document in the documents list (index 2 because indexing starts from 0)
documents[2]


{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
# Get and print the number of documents in the documents list
len(documents)


948

Now I need to load the documents into a database. With Elastic serach we ave indexces instead of tables in 

In [6]:
# Import the Elasticsearch class from the elasticsearch module
from elasticsearch import Elasticsearch

# Create an instance of the Elasticsearch client, connecting to the Elasticsearch server running on localhost at port 9200
es = Elasticsearch("http://localhost:9200")

# Retrieve and print information about the Elasticsearch cluster, such as its version, name, and other metadata
es.info()


ObjectApiResponse({'name': '82f2efaf5ca3', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'dQOmSE3yR7O6Twkbyfhkzg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Creating an Index

Before we can index the documents, we need to create an index (an index in elasticsearch is like a table in a "usual" databases):

In [7]:
from elasticsearch import exceptions as es_exceptions

# Define the settings and mappings for the Elasticsearch index
index_settings = {
    "settings": {
        "number_of_shards": 1,  # Specify the number of primary shards (partitions) for the index
        "number_of_replicas": 0  # Specify the number of replica shards (copies) for the index
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},  # Define the "text" field as a full-text searchable field
            "section": {"type": "text"},  # Define the "section" field as a full-text searchable field
            "question": {"type": "text"},  # Define the "question" field as a full-text searchable field
            "course": {"type": "keyword"}  # Define the "course" field as a keyword (exact match) field
        }
    }
}

# Name of the Elasticsearch index to be created
index_name = "course-questions"

try:
    # Attempt to create the index in Elasticsearch with the specified settings and mappings
    response = es.indices.create(index=index_name, body=index_settings)
    print(response)  # Print the response from Elasticsearch if successful

except es_exceptions.RequestError as e:
    # Handle the specific error for index already exists
    if e.error == 'resource_already_exists_exception':
        print("resource_already_exists")  # Output "resource_already_exists" for this specific error
    else:
        raise  # Raise the error if it's not the expected "index already exists" error


resource_already_exists


Now we're ready to index all the documents:

In [8]:
# Import the tqdm module for creating a progress bar
from tqdm.auto import tqdm

# Iterate through each document in the documents list with a progress bar
for doc in tqdm(documents):
    # Index each document into the specified Elasticsearch index
    es.index(index=index_name, document=doc)


  from .autonotebook import tqdm as notebook_tqdm
100%|████████████████████████████████████████████| 948/948 [00:25<00:00, 37.72it/s]


### Retrieving the docs

In [9]:
# The user's search query
user_question = "How do I join the course after it has started?"

In [10]:
# Construct the search query for Elasticsearch
search_query = {
    "size": 5,  # Limits the number of search results returned to 5.
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,  # The user's search query.
                    "fields": ["question^3", "text", "section"],  # Fields to search in, with a boost on the "question" field.
                    "type": "best_fields"  # Multi-match query type, selecting the best matching fields.
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"  # Filters results to only include documents where the "course" field matches "data-engineering-zoomcamp".
                }
            }
        }
    }
}


This query is intended to search an Elasticsearch index for documents that match a user's question, with specific boosting and filtering criteria. Let's break down each part of the query:

**Detailed Explanation:**

1. **'"size": 5:'**

    - Limits the number of search results returned to 5. This is useful when you only want to see the top results.
      
<br>

2. **'"query": { "bool": { ... } }:'**

    - The **`bool`** query allows combining multiple query clauses (must, filter, should, must_not) to build complex search queries.

<br>

3. **'"must": { "multi_match": { ... } }:'**

    - **'multi_match' Query**: A type of query that allows searching for a query string across multiple fields.
        - **'"query":'** user_question: The user's search input.
        - **'"fields": ["question^3", "text", "section"]':** Specifies the fields to search within. The caret (**'^3'**) denotes a boost, meaning matches in the "question" field are considered three times more relevant than matches in the "text" or "section" fields.
        - **'"type": "best_fields"':** Specifies the multi-match query type. The best_fields type finds the single best matching field and uses its score.
     
<br>

4. **'"filter": { "term": { "course": "data-engineering-zoomcamp" } }':**

- **'term' Filter:** Filters documents to include only those where the "course" field exactly matches the value "data-engineering-zoomcamp".

- The **'filter'** clause is used to restrict the search results to documents that meet the filter criteria, without affecting the relevance score.

**Summary:**

The query is searching an Elasticsearch index for documents related to the user's question. It searches the "question", "text", and "section" fields, giving a higher relevance to matches in the "question" field due to the boost (^3). The search results are filtered to include only documents where the "course" field is "data-engineering-zoomcamp", and it limits the returned results to the top 5 matches.

In [11]:
# Execute the search query against the specified Elasticsearch index
response = es.search(index=index_name, body=search_query)


In [12]:
# Print the response from Elasticsearch, which includes the search results
response


ObjectApiResponse({'took': 221, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2035, 'relation': 'eq'}, 'max_score': 53.79927, 'hits': [{'_index': 'course-questions', '_id': 'ZOrYW5ABnVrkGru2X54i', '_score': 53.79927, '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}}, {'_index': 'course-questions', '_id': 'GOrqW5ABnVrkGru2-qJp', '_score': 53.79927, '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 

In [13]:
# Iterate through each search hit (document) in the response
for hit in response['hits']['hits']:
    # Extract the document source from the hit
    doc = hit['_source']
    
    # Print the section of the document
    print(f"Section: {doc['section']}")
    
    # Print the question from the document
    print(f"Question: {doc['question']}")
    
    # Print the first 60 characters of the answer text from the document followed by "..."
    print(f"Answer: {doc['text'][:60]}...\n")


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...



### Cleaning the Code for Reusability

I can make it cleaner by putting it into a function:

In [14]:
# Initialize Elasticsearch connection
es = Elasticsearch("http://localhost:9200")

from elasticsearch import Elasticsearch

def retrieve_documents(query, index_name="course-questions", max_results=5):
    """
    Retrieve documents from Elasticsearch based on a search query.

    Args:
        query (str): The search query.
        index_name (str, optional): The name of the Elasticsearch index to search in. Defaults to "course-questions".
        max_results (int, optional): Maximum number of results to retrieve. Defaults to 5.

    Returns:
        list: List of documents matching the search query.
    """
    
    # Construct Elasticsearch search query
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    
    # Execute search query
    response = es.search(index=index_name, body=search_query)
    
    # Extract relevant documents from Elasticsearch response
    documents = [hit['_source'] for hit in response['hits']['hits']]
    
    return documents


In [15]:
# Example user question
user_question = "How do I join the course after it has started?"

# Retrieve documents related to the user's question
response = retrieve_documents(user_question)

# Iterate over each retrieved document and print relevant information
for doc in response:
    print(f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n")  
    # Print the section of the document
    # Print the original question related to the document
    # Print the beginning of the answer for preview


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section

In [16]:
# Example user question
user_question = "How do I join the course after it has started?"

# Retrieve documents related to the user's question
response = retrieve_documents(user_question)

# Iterate over each retrieved document and print relevant information
for doc in response:
    print(f"Section: {doc['section']}")  # Print the section of the document
    print(f"Question: {doc['question']}")  # Print the original question related to the document
    print(f"Answer: {doc['text'][:60]}...")  # Print the beginning of the answer for preview
    print()  # Empty line for separation between documents


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...



### Generation - Answering Questions

In [21]:
# Import the OpenAI client
from openai import OpenAI, APIError

# Initialize the OpenAI client
client = OpenAI()

# Define the message to send to the chat model
user_message = {
    "role": "user",
    "content": "The course already started. Can I still join?"
}

try:
    # Try sending the message to the GPT-4o model
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message]
    )

except APIError as e:
    # Handle specific APIError if model `gpt-4o` is not found or accessible
    if e.status_code == 404 and 'model_not_found' in str(e):
        print("GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...")
        
        # Fallback to GPT-3.5-turbo model
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[user_message]
        )
    else:
        # Handle other API errors
        print(f"Error with GPT-4o model: {e}")

# Print the content of the response from the chat model
print(response.choices[0].message.content)


GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

### Building a Prompt

First, I'll concatenate all the documents into one string

In [22]:
# Context template for formatting document information
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# Retrieve documents related to the user's question
context_docs = retrieve_documents(user_question)

# Initialize an empty string to accumulate formatted document strings
context_result = ""

# Iterate over each document and format it using the context template
for doc in context_docs:
    doc_str = context_template.format(**doc)  # Format the document using context_template
    context_result += ("\n\n" + doc_str)  # Append formatted document to context_result with extra newline

# Clean up the leading and trailing whitespace in the final context result
context = context_result.strip()

# Print the final formatted context
print(context)


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: G

Now I'll build the actual prompt

In [23]:
# Define the user prompt template
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contain the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()


In [24]:
try:
    # Try sending the message to the GPT-4o model
    response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}]
    )
    answer = response.choices[0].message.content
    
except APIError as e:
    # Handle specific APIError if model `gpt-4o` is not found or accessible
    if e.status_code == 404 and 'model_not_found' in str(e):
        print("GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...")
        
        # Fallback to GPT-3.5-turbo model
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        answer = response.choices[0].message.content
    else:
        # Handle other API errors
        print(f"Error with GPT-4o model: {e}")

# Print the content of the response from the chat model
print(answer)

GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

### Cleaning the Code for Reusability

In [25]:
# Template for formatting document information in the context
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# Template for the prompt presented to the OpenAI model
prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

def build_context(documents):
    """
    Build a formatted context string based on retrieved documents.

    Args:
        documents (list of dict): List of documents retrieved from the FAQ database.

    Returns:
        str: Formatted context string containing document information.
    """
    context_result = ""
    
    for doc in documents:
        doc_str = context_template.format(**doc)
        context_result += ("\n\n" + doc_str)
    
    return context_result.strip()

def build_prompt(user_question, documents):
    """
    Build a prompt string based on the user question and retrieved documents.

    Args:
        user_question (str): User's question.
        documents (list of dict): List of documents retrieved from the FAQ database.

    Returns:
        str: Formatted prompt string for OpenAI model.
    """
    context = build_context(documents)
    prompt = prompt_template.format(
        user_question=user_question,
        context=context
    )
    return prompt

def ask_openai(prompt, model="gpt-4o"):
    """
    Send a prompt to the OpenAI chat model and retrieve the model's response.

    Args:
        prompt (str): Prompt string to send to the OpenAI model.
        model (str, optional): Model name to use (default is "gpt-4o").

    Returns:
        str: Response from the OpenAI model.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        answer = response.choices[0].message.content
    except APIError as e:
        if e.status_code == 404 and 'model_not_found' in str(e):
            print("GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...")
            # Fallback to GPT-3.5-turbo model
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}]
            )
            answer = response.choices[0].message.content
        else:
            # Handle other API errors
            print(f"Error with GPT-4o model: {e}")
            answer = "NONE"  # Provide a default answer or indication of failure

    return answer

def qa_bot(user_question):
    """
    Answer a user question by querying the FAQ database, constructing a prompt, 
    and using an OpenAI model to generate the answer.

    Args:
        user_question (str): User's question.

    Returns:
        str: Answer generated by the OpenAI model.
    """
    context_docs = retrieve_documents(user_question)
    prompt = build_prompt(user_question, context_docs)
    answer = ask_openai(prompt)
    return answer


In [26]:
qa_bot("I'm getting invalid reference format: repository name must be lowercase")  # Example call to qa_bot to answer a specific user question


GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [27]:
qa_bot("I can't connect to postgres port 5432, my password doesn't work")  # Example call to qa_bot to answer a specific user question about PostgreSQL connection issues


GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [28]:
qa_bot("how can I run kafka?")  # Example call to qa_bot to answer a specific user question about running Kafka


GPT-4o model not found or access denied. Falling back to GPT-3.5-turbo...


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}