## Azure AI Search Multi-Modal RAG - Simple Image Verbalisation Lab

![simple_image_verbalisation](./Assets/simple_image_verbalisation.png)

### Installing Required Packages

In [None]:
%pip install openai PyMuPDF requests python-dotenv matplotlib

### Loading Variables from the .env file

In [None]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv
# Load key/value pairs from a .env file before reading the settings below.
load_dotenv()

# Azure AI Search connection settings; each is None when the variable is unset.
service_endpoint, index_name, key = (
    os.getenv(variable_name)
    for variable_name in (
        "AZURE_SEARCH_SERVICE_ENDPOINT",
        "AZURE_SEARCH_INDEX_NAME",
        "AZURE_SEARCH_API_KEY",
    )
)

### Creating Azure OpenAI Client

In [None]:
from openai import AzureOpenAI

# Credentials for the Azure OpenAI resource, read from the environment.
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")

# Single client instance reused by the embedding and chat-completion cells.
azure_openai_client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version="2024-02-15-preview",
)

### Creating the Embedding Generator Function

In [None]:
def generate_embeddings(client, text, model=None):
    """Return the embedding vector for *text*.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``;
            the notebook passes its ``AzureOpenAI`` client.
        text: The text to embed.
        model: Name of the embedding model/deployment. When omitted, the
            ``AZURE_OPENAI_EMBEDDING_MODEL`` environment variable is used.

    Returns:
        The ``embedding`` value of the first entry in the response's
        ``data`` list.

    Raises:
        ValueError: If no model is passed and the environment variable is
            unset or empty.
    """
    embedding_model = model or os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")
    if not embedding_model:
        # Fail here with a clear message instead of sending model=None to
        # the service and getting an opaque API error back.
        raise ValueError(
            "No embedding model configured: pass `model` or set "
            "AZURE_OPENAI_EMBEDDING_MODEL."
        )

    response = client.embeddings.create(input=text, model=embedding_model)
    return response.model_dump()["data"][0]["embedding"]
    

In [None]:
# The question to answer; it is embedded here and reused in the chat prompt.
user_query = (
    "can you tell me something about the sustainability initiatives at BMW?"
)

# Embed the question so it can be sent as the vector in the search request.
vectorised_user_query = generate_embeddings(
    client=azure_openai_client,
    text=user_query,
)
print(vectorised_user_query)


In [None]:
# Start with an empty list; the search cell fills it with one dict per hit.
context = []

### Sending API Call to the Search Index

In [None]:
import requests
import json


# REST endpoint for querying documents in the configured search index.
url = f"{service_endpoint}/indexes/{index_name}/docs/search?api-version=2023-11-01"

headers = {
    "Content-Type": "application/json",
    "api-key": key,
}

# Pure vector query: return the 10 nearest neighbours of the embedded user
# query, compared against the index's `content_embedding` field.
body = {
    "count": True,
    "select": "document_title, content_text, locationMetadata, image_document_id",
    "vectorQueries": [
        {
            "kind": "vector",
            "vector": vectorised_user_query,
            "fields": "content_embedding",
            "k": 10,
        }
    ],
}

# A timeout keeps the cell from hanging on an unresponsive service, and
# raise_for_status() surfaces auth/HTTP errors directly instead of as a
# confusing KeyError on the missing 'value' field.
response = requests.post(url, headers=headers, json=body, timeout=30)
response.raise_for_status()
documents = response.json()["value"]

# Rebuild the list on every run so re-executing this cell does not
# accumulate duplicate hits from earlier runs.
context = [
    {
        "document_title": doc["document_title"],
        "chunk": doc["content_text"],
        "score": doc["@search.score"],
        # These two fields may be absent from a hit; default them to None.
        "locationMetadata": doc.get("locationMetadata"),
        "image_document_id": doc.get("image_document_id"),
    }
    for doc in documents
]

for item in context:
    print(item)



### Function for locating and displaying context in the PDF docs

In [None]:
import fitz  # PyMuPDF
import requests
import os
import json
from PIL import Image, ImageDraw
from io import BytesIO
import matplotlib.pyplot as plt


def _clean_search_text(text: str) -> str:
    """Return the first line of *text*, cleaned for use as a PDF search needle.

    Carriage returns are removed, surrounding whitespace is stripped (so
    leading blank lines do not produce an empty needle), and everything
    after the first remaining newline is discarded. Returns an empty string
    when *text* contains only whitespace.
    """
    first_line, _, _ = text.replace('\r', '').strip().partition('\n')
    return first_line.strip()


def _download_pdf(pdf_url: str, pdf_path: str, timeout: float = 30.0) -> bool:
    """Download *pdf_url* to *pdf_path*; return True on success, else False."""
    print(f"[INFO] Downloading PDF from: {pdf_url}")
    try:
        # The timeout stops a stalled connection from hanging the notebook.
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        # Check the magic bytes before writing so an HTML error page is
        # never cached on disk under the PDF's name.
        if not response.content.startswith(b'%PDF'):
            raise ValueError("Downloaded file is not a valid PDF.")

        with open(pdf_path, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        # Deliberately broad: this is a best-effort display helper, and one
        # failed download should not abort the caller's loop.
        print(f"[ERROR] Failed to download or save PDF: {e}")
        return False

    print(f"[INFO] PDF saved to: {pdf_path}")
    return True


def _render_page(page, zoom: float):
    """Rasterise a PyMuPDF page at the given zoom and return a PIL image."""
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    return Image.open(BytesIO(pix.tobytes("png")))


def highlight_pdf_chunk(
    document_title: str,
    storage_account_url: str,
    content_text: str = None,
    location_metadata: dict = None,
    download_dir: str = ".",
    zoom: float = 2.0
):
    """Display the PDF page containing a retrieved chunk, with it outlined.

    The PDF is fetched from ``{storage_account_url}/{document_title}`` unless
    a file of that name already exists in *download_dir*. The chunk is then
    located in one of two ways:

    * If *location_metadata* is given, its ``pageNumber`` (1-based) selects
      the page and the first polygon in its JSON-encoded
      ``boundingPolygons`` is outlined in red.
    * Otherwise, if *content_text* is given, its first line is searched for
      page by page and every match on the first matching page is outlined
      in blue.

    Failures are reported with a printed message rather than raised.

    Args:
        document_title: File name of the PDF, used for both the URL and the
            local path.
        storage_account_url: Base URL of the folder that holds the PDF.
        content_text: Chunk text used for the text-search fallback.
        location_metadata: Dict with ``pageNumber`` and ``boundingPolygons``.
        download_dir: Directory where the PDF is cached.
        zoom: Scale factor applied when rasterising the page.

    Returns:
        None. The page is shown with matplotlib as a side effect.
    """
    # Construct download URL and local path
    pdf_url = f"{storage_account_url.rstrip('/')}/{document_title}"
    os.makedirs(download_dir, exist_ok=True)
    pdf_path = os.path.join(download_dir, document_title)

    # Download the PDF only if it is not already cached locally
    if not os.path.exists(pdf_path) and not _download_pdf(pdf_url, pdf_path):
        return

    # Open the PDF
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"[ERROR] Failed to open PDF with PyMuPDF: {e}")
        return

    img = None
    title = ""

    # try/finally guarantees the document is closed on every exit path,
    # including unexpected exceptions raised while searching or rendering.
    try:
        # Highlight via bounding box metadata
        if location_metadata:
            try:
                page_index = location_metadata["pageNumber"] - 1  # 0-based indexing
                page = doc.load_page(page_index)

                polygons = json.loads(location_metadata["boundingPolygons"])
                x_vals = [pt["x"] for pt in polygons[0]]
                y_vals = [pt["y"] for pt in polygons[0]]
                bbox = (min(x_vals), min(y_vals), max(x_vals), max(y_vals))

                img = _render_page(page, zoom)
                # The page was rasterised at `zoom`, so scale the box to match.
                scaled_bbox = tuple(v * zoom for v in bbox)
                ImageDraw.Draw(img).rectangle(scaled_bbox, outline="red", width=4)

                title = f"{document_title} - Page {page_index + 1} [via bounding box]"
            except Exception as e:
                print(f"[ERROR] Failed to highlight using location metadata: {e}")
                return

        # Highlight via text search
        elif content_text:
            needle = _clean_search_text(content_text)
            if not needle:
                print("[WARNING] content_text contains no searchable text.")
                return

            for page_index, page in enumerate(doc):
                matches = page.search_for(needle, quads=True)
                if not matches:
                    continue

                img = _render_page(page, zoom)
                draw = ImageDraw.Draw(img)
                for quad in matches:
                    rect = quad.rect
                    draw.rectangle(
                        [rect.x0 * zoom, rect.y0 * zoom, rect.x1 * zoom, rect.y1 * zoom],
                        outline="blue",
                        width=4,
                    )

                title = f"{document_title} - Page {page_index + 1} [via text match]"
                break  # only the first page with a match is shown
            else:
                print(f"[WARNING] Text not found in {document_title}: {needle!r}")
                return

        else:
            print("[WARNING] Neither location_metadata nor content_text was provided.")
            return
    finally:
        doc.close()

    # Display the result
    if img is None:
        print("[ERROR] No image was generated.")
        return

    plt.figure(figsize=(10, 10))
    plt.imshow(img)
    plt.axis('off')
    plt.title(title)
    plt.show()


### Calling GPT Engine For Summarisation

In [None]:
# System prompt for the grounded-answer chat call. It is a plain string (not
# an f-string) because nothing is interpolated, so the braces in the example
# structure need no escaping.
system_prompt = """You are meant to behave as a RAG chatbot that derives its context from a database stored in Azure AI Search Solution.
please answer strictly from the context from the database provided and if you dont have an answer please politely say so. dont include any extra 
information that is not in the context and dont include links as well.
the context passed to you will be in the form of a pythonic list with each object in the list having the following structure:

{
    "document_title": "the title of the document",
    "chunk": "the chunk of text from the document",
    "score": "the score of the match based on cosine similarity",
    "locationMetadata": "the location metadata if available, else None",
    "image_document_id": "the image document id if available, else None",
}

the pythonic list contains best 10 matches to the user query based on cosine similarity of the embeddings of the user query and the document chunks.
please structure your answers in a very professional manner and in such a way that the user does not get to know that its RAG working under the hood
and its as if they are talking to a human. """

# The retrieved hits are embedded as the Python repr of the `context` list,
# which is the format the system prompt describes.
user_prompt = f""" the user query is: {user_query}
the context is : {context}"""

chat_completions_response = azure_openai_client.chat.completions.create(
    model=os.getenv("AZURE_OPENAI_CHAT_COMPLETIONS_MODEL"),
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    temperature=0.7,
)

print(chat_completions_response.choices[0].message.content)



### Displaying the Context in PDF Docs

In [None]:
# The blob folder URL is the same for every hit, so build it once up front
# rather than on each loop iteration.
storage_account_url = (
    f"https://{os.getenv('STORAGE_ACCOUNT_NAME')}.blob.core.windows.net/"
    f"{os.getenv('STORAGE_ACCOUNT_CONTAINER_NAME')}/"
    f"{os.getenv('STORAGE_ACCOUNT_FOLDER_NAME')}/"
)

# Show the source page for each retrieved hit with the chunk highlighted.
for context_item in context:
    document_title = context_item.get("document_title")
    content_text = context_item.get("chunk")
    location_metadata = context_item.get("locationMetadata")

    highlight_pdf_chunk(
        document_title=document_title,
        storage_account_url=storage_account_url,
        content_text=content_text,
        location_metadata=location_metadata,
    )
 