# Data ingestion for multimodal RAG with presentation slides 

### Install libraries

In [None]:
# Install the packages this notebook imports: PyMuPDF (imported below as `fitz`),
# the OpenAI SDK, and the Azure AI Search SDK (azure-search-documents).
%pip install PyMuPDF openai azure-identity azure-search-documents

### Credentials

In [None]:
import os

# Each setting is read from an environment variable of the same name so that real
# secrets do not have to be saved inside the notebook. When the variable is not
# set, the placeholder below is used and can still be edited in place.
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "<your-azure-openai-endpoint>")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "<your-azure-openai-api-key>")
AZURE_AI_SEARCH_ENDPOINT = os.environ.get("AZURE_AI_SEARCH_ENDPOINT", "<your-azure-ai-search-endpoint>")
AZURE_AI_SEARCH_ADMIN_KEY = os.environ.get("AZURE_AI_SEARCH_ADMIN_KEY", "<your-azure-ai-search-admin-key>")

### Initialize Azure OpenAI client

In [None]:
from openai import AzureOpenAI

# Single Azure OpenAI client shared by the description and embedding helpers below.
aoai_client = AzureOpenAI(
    api_version="2024-10-01-preview",
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

### Generate description and embedding for each slide

In [None]:
import base64

# Function to encode the image
def encode_image(image_bytes):
    """Return ``image_bytes`` base64-encoded as a text string.

    The result is a ``str`` (not ``bytes``) so it can be embedded directly in a
    ``data:`` URL or stored in a JSON document.
    """
    encoded = base64.b64encode(image_bytes)
    return encoded.decode("utf-8")

def generate_description(base64_image, model="gpt-4o", temperature=0.1):
    """Ask a vision-capable chat model to describe one presentation slide.

    Args:
        base64_image: Base64 text of the slide image. It is sent as a
            ``data:image/png`` URL, so the bytes are expected to be PNG.
        model: Name passed as ``model`` to the chat completions call
            (defaults to ``"gpt-4o"``).
        temperature: Sampling temperature passed to the chat completions call
            (defaults to ``0.1``).

    Returns:
        The message content of the first choice in the response.
    """
    system_message = """You are an expert in analyzing and describing technical presentation slides for search and retrieval.
    You are given a screenshot from a presentation slide.
    Analyze the content of this presentation slide and generate a concise, clear summary.
    Ensure the summary captures the core idea in a way that is useful for search and retrieval.
    Identify key points, technical concepts, and relationships between different elements.
    Use precise technical language appropriate for a knowledgeable audience.
    Identify title and summarize the main topic and any subheadings.
    Transcribe and explain text elements, bullet points, and messages.
    If the slide contains images, diagrams, charts, or graphs, describe their purpose and what insights they convey including color, labels, trends, relationships and data insights.
    Maintain clarity and accuracy in the description.
    Do not mention page number or data or any information in the footer.
    """

    # Plain string: the prompt has no placeholders, so no f-string is needed.
    prompt = """Analyze the given screenshot of a presentation slide and provide a detailed description that can be used for search and retrieval purposes."""

    # The user turn carries both the text instruction and the image itself.
    user_content = [
        {
            "type": "text",
            "text": prompt,
        },
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
        },
    ]

    response = aoai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_content},
        ],
        temperature=temperature,
    )

    return response.choices[0].message.content

def generate_embedding(text):
    """Return the embedding vector for ``text``.

    The embedding is requested from the ``text-embedding-3-large`` model through
    the shared ``aoai_client``; the vector of the first result item is returned.
    """
    result = aoai_client.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    )
    first_item = result.data[0]
    return first_item.embedding

In [None]:
import fitz

file_path = "/lakehouse/default/Files/sample-slides.pdf"

# One search document per slide: rendered image, generated description, and the
# embedding of that description.
docs = []

# The context manager closes the PDF once every page has been processed.
with fitz.open(file_path) as document:
    # enumerate() supplies the 1-based document key, replacing a hand-maintained
    # counter named `id` that shadowed the builtin for the rest of the notebook.
    for doc_id, page in enumerate(document, start=1):
        pix = page.get_pixmap()  # render page to an image
        # PNG is requested explicitly because generate_description() sends the
        # image as a data:image/png URL.
        image_bytes = pix.tobytes("png")
        base64_image = encode_image(image_bytes)  # encode image to base64
        description = generate_description(base64_image)  # generate description
        embedding = generate_embedding(description)  # generate embedding
        docs.append(
            {
                "id": str(doc_id),
                "page": page.number + 1,  # page.number is 0-based
                "base64_image": base64_image,
                "content": description,
                "content_vector": embedding,
            }
        )

In [None]:
from IPython.display import display, Image, Markdown

# Preview each slide image followed by its generated description so the
# descriptions can be checked by eye before indexing.
separator = "-" * 50
for doc in docs:
    slide_png = base64.b64decode(doc["base64_image"])
    display(Image(data=slide_png))
    display(Markdown(doc["content"]))
    print(separator)

### Initialize Azure AI Search index client

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient

# Client used to manage index definitions; authenticated with the admin key.
admin_credential = AzureKeyCredential(key=AZURE_AI_SEARCH_ADMIN_KEY)
search_index_client = SearchIndexClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=admin_credential,
)

### Create search index

In [None]:
from azure.search.documents.indexes.models import (
    SemanticSearch,
    SearchField,
    SimpleField,
    SearchableField,
    SearchFieldDataType,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    SearchIndex,
)


# Vector field holding the embedding of each slide description. Its dimension
# must match the embedding model (3072 for text-embedding-3-large), and it is
# bound to the HNSW profile defined in the vector search configuration.
content_vector_field = SearchField(
    name="content_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=3072,
    vector_search_profile_name="my-hnsw-profile",
)

# Index schema: "id" is the document key, "content" is full-text searchable,
# and "page" / "base64_image" are stored as simple fields.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="page", type=SearchFieldDataType.Int64),
    SimpleField(name="base64_image", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    content_vector_field,
]

# Approximate nearest-neighbour search: HNSW (Hierarchical Navigable Small
# World) with cosine distance. This is the algorithm the "content_vector"
# field's profile points at.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="my-hnsw-config",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=1000,
        ef_search=1000,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Exhaustive KNN with the same cosine metric, registered alongside HNSW under
# its own profile.
eknn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="my-eknn-config",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(metric=VectorSearchAlgorithmMetric.COSINE),
)

# Each profile gives an algorithm configuration a name that fields can refer to.
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, eknn_algorithm],
    profiles=[
        VectorSearchProfile(
            name="my-hnsw-profile",
            algorithm_configuration_name="my-hnsw-config",
        ),
        VectorSearchProfile(
            name="my-eknn-profile",
            algorithm_configuration_name="my-eknn-config",
        ),
    ],
)

# Semantic ranking reads the slide description, so "content" is the only
# prioritized content field.
prioritized_fields = SemanticPrioritizedFields(
    content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="default",
    prioritized_fields=prioritized_fields,
)

# Wrap the single configuration in the index-level semantic settings.
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition from the schema, semantic, and vector settings.
index_name = "docs"
search_index = SearchIndex(
    name=index_name,
    fields=fields,
    semantic_search=semantic_search,
    vector_search=vector_search,
)

# create_or_update_index keeps this cell re-runnable: a plain create_index call
# fails when an index with this name already exists.
search_index_client.create_or_update_index(search_index)

### Upload documents to search index

In [None]:
from azure.search.documents import SearchClient

# Client bound to the index created above; used to push documents into it.
search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    index_name=index_name,
    credential=AzureKeyCredential(key=AZURE_AI_SEARCH_ADMIN_KEY),
)

# Every document carries a base64 image and a 3072-float vector, so the slides
# are uploaded in small batches rather than as one large request.
UPLOAD_BATCH_SIZE = 20

# upload_documents returns one result per document; a document can be rejected
# without the call raising, so failed keys are collected and reported.
failed_keys = []
for start in range(0, len(docs), UPLOAD_BATCH_SIZE):
    batch = docs[start:start + UPLOAD_BATCH_SIZE]
    results = search_client.upload_documents(batch)
    failed_keys.extend(result.key for result in results if not result.succeeded)

if failed_keys:
    raise RuntimeError(f"Failed to index documents with ids: {failed_keys}")