# Retrieval-Augmented Generation (RAG) Demo

This notebook demonstrates a simple retrieval-augmented generation workflow using a custom search engine.

I practiced using Github Copilot to write the code cell by cell. 

I then made the code modular using functions. 

In [2]:
#Download the custom search engine and documents json file from the Datatalks Club GitHub repository.
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py
#!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json

In [1]:
#Import necessary libraries
import json # for loading JSON data
import minsearch # for custom search engine
from openai import OpenAI # for LLM
from elasticsearch import Elasticsearch # for vector search
from tqdm.auto import tqdm # progress bar for loops

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the documents from the JSON file
with open('documents.json', 'r') as f:
    docs_raw = json.load(f)

In [3]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [4]:
# Create the search index
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [5]:
# Fit the index with the documents
index.fit(documents)

<minsearch.Index at 0x1053d73a0>

In [6]:
# Search the index for relevant documents based on the query.
def search(query):
    # Set boost values to give more importance to certain fields during search
    boost = { "question": 3, "section": 0.5}
    # Search the index with the query, filtering by course and applying boost
    results = index.search(
        query=query,
        num_results=5,
        filter_dict={"course": "data-engineering-zoomcamp"},
        boost_dict=boost
    )
    return results
    

In [7]:
# Build a prompt for the LLM using the query and search results.
def prompt_builder(query, search_results):
    # Create prompt template for the question and context
    prompt_template = """
    You are a course assistant for the Data Engineering Zoomcamp. 
    You will be given a question. Your task is to answer the question based on the CONTEXT from the FAQ Documents. Use only facts from the CONTEXT to answer the question. If you don't know the answer, say "I don't know".
    Question: {question}
    Context: {context}
    Answer: 
    """.strip()

    # Create context of documents from our search results
    context = ""

    for doc in search_results: 
        context += f"Question: {doc['question']}\n"
        context += f"Answer: {doc['text']}\n"
        context += f"Section: {doc['section']}\n\n"

        # Create the prompt
        prompt = prompt_template.format(
            question=query,
            context=context
        ).strip()
    return prompt
    

In [8]:
# Send the prompt to the OpenAI LLM and returns the response.
def llm(prompt):
    # Create an OpenAI client instance
    client = OpenAI()
    # Send the prompt with context  to the OpenAI model and get the response
    response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt},
    ])
    return response.choices[0].message.content

In [9]:
# Implement the full RAG pipeline: search, prompt building, and LLM call.
def rag(query): 
    # Get search results for the query
    search_results = search(query)
    # Build the prompt with the query and search results
    prompt = prompt_builder(query, search_results)
    # Get the answer from the LLM
    answer = llm(prompt)
    return answer

In [10]:
# Set the query 
query = 'the course has already started, can I still enroll?'


In [11]:
# Example usage
answer = rag(query)
# Print the answer
print(rag(query))

Yes, you can still enroll in the course even after it has started. You are eligible to submit the homeworks, but be aware that there will be deadlines for turning in the final projects, so it's advisable not to leave everything for the last minute.


In [12]:
# Initialize the Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')

In [13]:
# Define the index settings and mappings for Elasticsearch
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "text": {"type": "text"},
            "section": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}
# Create the Elasticsearch index with the specified settings and mappings
index_name = 'course_questions'
es_client.indices.create(index=index_name, body=index_settings)

  es_client.indices.create(index=index_name, body=index_settings)


RequestError: RequestError(400, 'resource_already_exists_exception', 'index [course_questions/vm2ZETlcQQ6-mg0fx-spgQ] already exists')

In [14]:
# Index the documents into Elasticsearch with a progress bar (tqdm)
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:01<00:00, 577.08it/s]


In [15]:
def elastic_search(query):

    # Define the search query for Elasticsearch
    search_query = {
        "query": {
            "bool": {
                "must": [
                    {"multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                        }
                        }
                ],
                "filter": [
                    {"term": {"course": "data-engineering-zoomcamp"}}
                ]
            }
        },
        "size": 5
    }
    # Perform the search in Elasticsearch using the defined query
    response = es_client.search(index=index_name, body=search_query)
    # Extract the relevant documents from the search response
    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [16]:
result_docs = elastic_search(query)

  response = es_client.search(index=index_name, body=search_query)


In [17]:
# Implement the full RAG pipeline: search, prompt building, and LLM call. This time with Elasticsearch.
def rag(query): 
    # Get search results for the query
    search_results = elastic_search(query)
    # Build the prompt with the query and search results
    prompt = prompt_builder(query, search_results)
    # Get the answer from the LLM
    answer = llm(prompt)
    return answer

In [18]:
# Example usage
answer = rag(query)
# Print the answer
print(rag(query))

  response = es_client.search(index=index_name, body=search_query)


Yes, even if the course has already started, you can still join. You are eligible to submit the homeworks without registering. However, be aware that there will be deadlines for turning in the final projects, so don't leave everything for the last minute.
