# Multimodal RAG with presentation slides

## Install libraries

In [None]:
%pip install openai azure-identity azure-search-documents

## Initialize clients for Azure AI Search and Azure OpenAI Service

In [None]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents  import SearchClient
from openai import AzureOpenAI

# Connection settings. Each value is taken from the environment variable of the
# same name when it is set, so keys do not have to be pasted into the notebook;
# otherwise the placeholder below is used and must be replaced by hand.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "<your-azure-openai-endpoint>")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "<your-azure-openai-api-key>")
AZURE_AI_SEARCH_ENDPOINT = os.getenv("AZURE_AI_SEARCH_ENDPOINT", "<your-azure-ai-search-endpoint>")
AZURE_AI_SEARCH_ADMIN_KEY = os.getenv("AZURE_AI_SEARCH_ADMIN_KEY", "<your-azure-ai-search-admin-key>")

# Create a search client to connect to the Azure AI Search service
# (queries are issued against the "docs" index)
search_client = SearchClient(
    index_name="docs",
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(key=AZURE_AI_SEARCH_ADMIN_KEY),
)

# Create an Azure OpenAI client to connect to the Azure OpenAI service
aoai_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2024-10-01-preview",
)

## RAG pipeline

In [None]:
import json
from azure.search.documents.models import QueryAnswerType, QueryCaptionType, QueryType, VectorizedQuery

# Define a function to generate a search query
def get_search_query(user_question: str):
    """Ask the chat model to turn a user question into a search query.

    Args:
        user_question: The question as asked by the user.

    Returns:
        The model's reply with surrounding whitespace removed. If the whole
        reply is wrapped in a single pair of matching quotation marks (as in
        the example given in the prompt), that pair is removed as well. A
        non-string reply is returned unchanged.
    """
    system_prompt = """
    # Instructions
    - You are an AI assistant.
    - Given the user's question, respond with a search query that can be used to retrieve relevant documents for the user's question based on the intent.
    - Be specific in what the user is asking about.
    - Provide only the search query in the response.

    # Example
    With a user query like below:
    "What was the total revenue in 2024?"

    Respond with:
    "total revenue in 2024"
    """

    user_prompt = f"Return the search query for the following user question: {user_question}"

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt,
        }
    ]

    response = aoai_client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.1
    )

    search_query = response.choices[0].message.content
    if not isinstance(search_query, str):
        # Nothing to clean up; hand the value back exactly as received
        return search_query

    search_query = search_query.strip()

    # The prompt's example shows the query in quotes, so the model may echo
    # them. Quotes in the search text request a phrase match, which narrows
    # retrieval, so drop one wrapping pair. Queries that contain the same
    # quote character inside are left alone to avoid breaking a real phrase.
    is_wrapped = (
        len(search_query) >= 2
        and search_query[0] == search_query[-1]
        and search_query[0] in "\"'"
        and search_query[0] not in search_query[1:-1]
    )
    if is_wrapped:
        search_query = search_query[1:-1].strip()

    return search_query

# Define a function that retrieves documents from the search index based on a search query
def get_documents(search_query: str):
    """Retrieve document chunks from the search index for a search query.

    The query text is embedded with the "text-embedding-3-large" model, and
    both the text and the embedding are sent to the search client in a single
    semantic query.

    Args:
        search_query: The text to search for.

    Returns:
        A list with one dict per search result, each holding the "id",
        "page", "base64_image" and "content" fields of that result.
    """
    selected_fields = ["id", "page", "base64_image", "content"]

    # Turn the query text into an embedding vector
    embedding_response = aoai_client.embeddings.create(
        model="text-embedding-3-large",
        input=search_query,
    )
    query_vector = embedding_response.data[0].embedding

    # Vector part of the request, matched against the "content_vector" field
    vector_part = VectorizedQuery(
        vector=query_vector,
        k_nearest_neighbors=50,
        fields="content_vector",
    )

    # Query the index with the text and the vector together
    hits = search_client.search(
        search_text=search_query,
        vector_queries=[vector_part],
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name="default",
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=10,
        select=selected_fields,
    )

    # Copy the selected fields of every hit into a plain dict
    documents = []
    for hit in hits:
        documents.append({field: hit[field] for field in selected_fields})

    return documents

# Define a function that generates an answer with documents
def get_answer_from_documents(user_question: str, documents: list):
    """Generate a JSON answer to the user's question from retrieved documents.

    The text of every document is placed in the user prompt and every
    document's image is attached to the same message as a PNG data URL.

    Args:
        user_question: The question as asked by the user.
        documents: Dicts that each provide the keys "id", "page", "content"
            and "base64_image".

    Returns:
        The model's reply parsed with ``json.loads``. The prompt asks for an
        object with the fields "answer" and "sources".

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        TypeError: If the reply is not a string (raised by ``json.loads``).
    """
    system_prompt = """
    - You are an expert helping employees from AVL to find information in the knowledge base of AVL.
    - You are given a user question and a set of text descriptions and screenshots of slides from a presentation.
    - Use the text descriptions and screenshots as context to answer the questions as completely, correctly, and concisely as possible.
    - Not all documents are relevant to the question, so only use the relevant documents to answer the question.
    - Don't try to make up any answers. If the answer cannot be retrieved from the context and you do not know the answer, then answer 'Sorry, I do not know.'.
    - Add sources to the answer listing the page numbers you used and that are relevant to answer the question.
    - The final response must be in JSON format with two fields:
        - answer: The generated answer to the user's question.
        - sources: A list of page numbers for each cited source
    - Do not use ```json```

    Here is an example of the final response:
    {
        "answer": "The total revenue in 2024 was $245,122 million.",
        "sources": [1]
    }
    """

    context = "\n\n".join(f"Document ID: {document['id']}\nPage: {document['page']}\nContent:\n{document['content']}" for document in documents)
    user_prompt = f"""User: {user_question}\n\n
    Answer the user's question based on the following context:\n\nDocuments:\n\n{context}"""
    user_content = [
        {
            "type": "text",
            "text": user_prompt,
        }
    ]
    # One image part per document, appended after the text part
    images = [{ "type": "image_url", "image_url": {"url": f"data:image/png;base64,{document['base64_image']}"} } for document in documents]
    user_content += images

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_content,
        }
    ]

    response = aoai_client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.1
    )

    result_str = response.choices[0].message.content

    # The prompt forbids Markdown fences, but the reply is not guaranteed to
    # comply; remove a wrapping fence so it does not break JSON parsing.
    if isinstance(result_str, str):
        result_str = result_str.strip()
        if result_str.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence
            result_str = result_str.partition("\n")[2].rstrip()
            if result_str.endswith("```"):
                result_str = result_str[:-3]

    result_json = json.loads(result_str)

    return result_json

# RAG pipeline function
def rag(user_question: str):
    """Run the full pipeline: build a search query, retrieve, then answer.

    Args:
        user_question: The question as asked by the user.

    Returns:
        A dict with the keys "answer", "documents", "sources" and
        "search_query".
    """
    # Step 1: derive a search query from the question
    search_query = get_search_query(user_question=user_question)

    # Step 2: fetch the documents matching that query
    documents = get_documents(search_query=search_query)

    # Step 3: let the model answer using the retrieved documents
    generated = get_answer_from_documents(
        user_question=user_question,
        documents=documents,
    )

    return dict(
        answer=generated["answer"],
        documents=documents,
        sources=generated["sources"],
        search_query=search_query,
    )

In [None]:
import base64
from IPython.display import display, Markdown, Image

user_question = "The customer complained about an issue with a noisy E-Axle. What can I offer?"

# Run the pipeline once for the question above
result = rag(user_question=user_question)

# Render the search query, the answer and the cited sources as Markdown
summary_lines = (
    f"🔎 Search Query: {result['search_query']}",
    f"💬 Answer: {result['answer']}",
    f"📕 Sources: {result['sources']}",
)
for summary_line in summary_lines:
    display(Markdown(summary_line))

# Show every retrieved document: its ID, its decoded image, then a separator
separator = "-" * 100
for document in result["documents"]:
    print(f"Document ID: {document['id']}")
    image_bytes = base64.b64decode(document["base64_image"])
    display(Image(image_bytes))
    print(separator)