## Azure OpenAI - Retrieval Augmented Generation Sample
Sample notebook showcasing how to create a vector search index in Azure Cognitive Search (using PDF documents sourced from an Azure Blob Storage account) and how to ask questions of that data using LLMs through the Azure OpenAI Service. Note: This sample has been assembled to demonstrate the core steps in the workflow listed below - there are significant optimizations that can be made to afford increased parallelism during document creation/indexing. It has been adapted from the scripts provided as part of this [Microsoft Cognitive Search + Azure OpenAI accelerator](https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/scripts/prepdocs.py).

#### Workflow
- Download all files from Azure Blob Storage and list them
- Create Vector Search Index (Az Cog Search)
- Iterate over all files
- Split into individual pages
- Save pages to target container
- Get text and tables
- Create documents for cog search (embeddings)
- Add documents to index
- Ask questions of data using LLMs in Azure OpenAI

### Import required packages

Note: The following python packages need to be installed in your cluster environment and can be sourced from PyPI.
- azure-ai-formrecognizer
- azure-identity
- azure-search-documents==11.4.0a20230509004 (Index URL: https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/)
- azure-storage-blob
- openai
- pypdf
- python-dotenv (provides the `dotenv` module used below to load a `.env` file)

In [None]:
! pip install azure-search-documents==11.4.0a20230509004 --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
! pip install azure-ai-formrecognizer azure-identity azure-storage-blob openai pypdf

In [None]:
import os
import json
import io
import re
import html
import base64
import time
import hashlib

import openai
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import AzureDeveloperCliCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.storage.blob import BlobServiceClient
from pypdf import PdfReader, PdfWriter
from IPython.display import display, HTML 

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import Vector 
from azure.search.documents.indexes.models import (
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    SearchableField,
    SearchField,
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration, 
)
from IPython.display import display, HTML  

### Set/retrieve environment variables

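The following environment variables are expected. You could load these manually or from a standalone .env file. The code below reads each key in upper case (for example `STORAGE_ACCOUNT_NAME`) and stores the value in the lower-case Python variable named in the table.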

| Key                       | Value                                                        |  
| ------------------------- | ------------------------------------------------------------ |  
| storage_account_name      | The name of your Azure Storage account                       |  
| docs_container            | The name of the container in your Storage account for documents |  
| pages_container           | The name of the container in your Storage account for pages  |  
| storage_account_key       | The access key of your Azure Storage account                 |  
| cog_search_endpoint       | The endpoint for your Azure Cognitive Search service         |  
| cog_search_key            | The key for your Azure Cognitive Search service              |  
| cog_search_index_name     | The name of the index in your Azure Cognitive Search service |  
| afr_endpoint              | The endpoint for your Azure Form Recognizer service          |  
| afr_key                   | The key for your Azure Form Recognizer service               |  
| aoai_key                  | The key for your Azure OpenAI service                        |  
| aoai_endpoint             | The endpoint for your Azure OpenAI service                   |  
| aoai_embeddings_model     | The model used for embeddings in your Azure OpenAI service (recommend `text-embedding-ada-002`)   |  
| aoai_chat_model           | The model used for chat in your Azure OpenAI service (recommend `gpt-35-turbo-16k`)        |

In [None]:
import os
import dotenv

# Load variables from a .env file into the process environment
dotenv.load_dotenv()

# Azure Blob Storage: account, access key, and the containers for documents and pages
storage_account_name = os.environ.get("STORAGE_ACCOUNT_NAME")
docs_container = os.environ.get("DOCS_CONTAINER")
pages_container = os.environ.get("PAGES_CONTAINER")
storage_account_key = os.environ.get("STORAGE_ACCOUNT_KEY")

# Azure Cognitive Search: endpoint, key and target index name
cog_search_endpoint = os.environ.get("COG_SEARCH_ENDPOINT")
cog_search_key = os.environ.get("COG_SEARCH_KEY")
cog_search_index_name = os.environ.get("COG_SEARCH_INDEX_NAME")

# Azure Form Recognizer: endpoint and key
afr_endpoint = os.environ.get("AFR_ENDPOINT")
afr_key = os.environ.get("AFR_KEY")

# Azure OpenAI: key, endpoint and the two model names (embeddings and chat)
aoai_key = os.environ.get("AOAI_KEY")
aoai_endpoint = os.environ.get("AOAI_ENDPOINT")
aoai_embeddings_model = os.environ.get("AOAI_EMBEDDINGS_MODEL")
aoai_chat_model = os.environ.get("AOAI_CHAT_MODEL")

# Point the openai module at the Azure OpenAI resource configured above
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = aoai_endpoint
openai.api_key = aoai_key

### Download local copy of all files in Azure Blob Storage

In [None]:
# Make local directories for the downloaded documents and for the per-page PDFs
os.makedirs(docs_container, exist_ok=True)
os.makedirs(pages_container, exist_ok=True)

# Download all docs from the docs container
blob_service_client = BlobServiceClient(account_url=f"https://{storage_account_name}.blob.core.windows.net", credential=storage_account_key)
container_client = blob_service_client.get_container_client(docs_container)
blob_list = container_client.list_blobs()
for blob in blob_list:
    blob_client = blob_service_client.get_blob_client(container=docs_container, blob=blob.name)
    local_path = os.path.join(docs_container, blob.name)
    # A blob name may contain "/" (virtual folders); create the parent directory so open() succeeds
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, "wb") as my_blob:
        blob_data = blob_client.download_blob()
        blob_data.readinto(my_blob)

# Names of the PDF files at the top level of the local docs directory.
# The check is on the ".pdf" extension (case-insensitive), matching the extension test
# applied when the files are split into pages.
files_list = [x.name for x in os.scandir(docs_container) if x.is_file() and x.name.lower().endswith('.pdf')]

# Print the name of each PDF file found
for file in files_list:
    print(file)

files_list

### Create Vector Search Index (Az Cog Search)

In [None]:
def create_vector_index(endpoint, key, index_name):
    """
    Create (or update) a search index that carries a vector field.

    The index has a string key `id`, a searchable `content` field, filterable
    `page_number`, `file_name` and `source_document` fields, and a 1536-dimension
    `content_vector` field tied to the HNSW configuration named "vector-config".

    Args:
    endpoint (str): The endpoint of the Azure Cognitive Search service.
    key (str): The admin key of the Azure Cognitive Search service.
    index_name (str): The name of the search index to create.

    Returns:
    Whatever SearchIndexClient.create_or_update_index returns for the index.
    """
    index_client = SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    # Schema: scalar metadata fields plus the embedding vector
    index_fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="content", type=SearchFieldDataType.String, searchable=True),
        SimpleField(name="page_number", type=SearchFieldDataType.Int32, filterable=True),
        SimpleField(name="file_name", type=SearchFieldDataType.String, filterable=True),
        SimpleField(name="source_document", type=SearchFieldDataType.String, filterable=True),
        SearchField(
            name="content_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            dimensions=1536,
            vector_search_configuration="vector-config",
        ),
    ]

    # HNSW algorithm settings referenced by the content_vector field
    hnsw_config = VectorSearchAlgorithmConfiguration(
        name="vector-config",
        kind="hnsw",
        hnsw_parameters={
            "m": 4,
            "efConstruction": 400,
            "efSearch": 1000,
            "metric": "cosine",
        },
    )

    index_definition = SearchIndex(
        name=index_name,
        fields=index_fields,
        vector_search=VectorSearch(algorithm_configurations=[hnsw_config]),
    )
    return index_client.create_or_update_index(index_definition)

# Best-effort index creation: any failure is printed and the notebook carries on
try:
    create_vector_index(cog_search_endpoint, cog_search_key, cog_search_index_name)
except Exception as err:
    print(err)


### Iterate over all files, split into individual pages, and save to target container

In [None]:
def blob_name_from_file_page(filename, page=0):
    """
    Build the per-page file name for a document.

    For a PDF (extension compared case-insensitively) the result is
    "<stem>-<page>.pdf"; for any other file it is the base name unchanged.

    Parameters:
    filename (str): The name (or path) of the file.
    page (int): The page number.

    Returns:
    str: The name of the blob.
    """
    base_name = os.path.basename(filename)
    stem, extension = os.path.splitext(base_name)
    if extension.lower() != ".pdf":
        return base_name
    return f"{stem}-{page}.pdf"

def upload_blobs(filepath, filename):
    """
    Splits a PDF into single-page PDF files written to the local disk.

    Despite the name, nothing is sent to Blob Storage here: each page is
    written with PdfWriter to a path derived from `filepath` by swapping the
    file name for the page name and replacing 'docs/' with 'pages/'.

    Parameters:
    filepath (str): The path to the file.
    filename (str): The name of the file.

    Returns:
    list: Paths of the page files written. Empty when the file is not a PDF,
    so callers can always concatenate the result.
    """
    saved_pages = []

    # Only PDFs are split into pages; anything else produces no pages
    if os.path.splitext(filename)[1].lower() != ".pdf":
        return saved_pages

    reader = PdfReader(filepath)
    for i, page in enumerate(reader.pages):
        blob_name = blob_name_from_file_page(filename, i)
        print(f"\tUploading blob for page {i} -> {blob_name}")
        page_path = filepath.replace(filename, blob_name).replace('docs/', 'pages/')
        # One writer per page so every output file holds exactly one page
        writer = PdfWriter()
        writer.add_page(page)
        writer.write(page_path)
        saved_pages.append(page_path)
    return saved_pages

# Split every listed document into page files and collect the resulting paths
saved_pages = []
for file in files_list:
    saved_pages.extend(upload_blobs(f"docs/{file}", file))
    print(file)

In [None]:
# Keep only the first ten page files; the cells below process nothing beyond these
saved_pages = saved_pages[:10]

### Extract text/tables from all pages using Azure Form Recognizer

In [None]:
def analyze_document_with_afr(filename, afr_endpoint, afr_key, actual_page_num, source_filename):
    """
    Analyzes a document using Azure Form Recognizer and extracts text and tables from it.

    The file is sent to the "prebuilt-layout" model. For every page in the result, the
    characters covered by a table span are replaced by a single HTML rendering of that
    table (via table_to_html); all other characters are copied through unchanged.

    Args:
    - filename (str): The path to the file to be analyzed.
    - afr_endpoint (str): The endpoint of the Azure Form Recognizer service.
    - afr_key (str): The API key for the Azure Form Recognizer service.
    - actual_page_num (int): The actual page number of the page being analyzed.
    - source_filename (str): The name of the source file.

    Returns:
    - page_map (list): A list of tuples, one per analyzed page, each holding
    (actual_page_num, page text with tables as HTML, filename, source_filename).
    """
    print(f"\tExtracting text and tables from {filename}")
    page_map = []
    afr_client = DocumentAnalysisClient(endpoint=afr_endpoint, credential=AzureKeyCredential(afr_key))
    with open(filename, 'rb') as f:
        poller = afr_client.begin_analyze_document("prebuilt-layout", document=f)
    form_recognizer_results = poller.result()
    content = form_recognizer_results.content

    for page_num, page in enumerate(form_recognizer_results.pages):
        # Form Recognizer page numbers are 1-based, enumerate() is 0-based
        tables_on_page = [
            table for table in form_recognizer_results.tables
            if table.bounding_regions[0].page_number == page_num + 1
        ]

        # mark all positions of the table spans in the page:
        # table_chars[i] is -1 for plain text, otherwise the index of the owning table
        page_offset = page.spans[0].offset
        page_length = page.spans[0].length
        table_chars = [-1] * page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                for i in range(span.length):
                    idx = span.offset - page_offset + i
                    if 0 <= idx < page_length:
                        table_chars[idx] = table_id

        # build page text by replacing characters in table spans with table html;
        # each table is emitted once, at the first character it covers
        parts = []
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                parts.append(content[page_offset + idx])
            elif table_id not in added_tables:
                parts.append(table_to_html(tables_on_page[table_id]))
                added_tables.add(table_id)
        parts.append(" ")
        page_text = "".join(parts)

        # The replace() only strips a '/dbfs/mnt/pages/' prefix; other paths pass through unchanged
        page_map.append((actual_page_num, page_text, filename.replace('/dbfs/mnt/pages/', ''), source_filename))

    return page_map

def table_to_html(table):
    """
    Renders a table object as an HTML <table> string.

    Rows 0..row_count-1 are emitted in order, and the cells of each row are
    ordered by column index. Header cells (column or row headers) become <th>,
    all others <td>; spans greater than one are written as colSpan/rowSpan
    attributes and the cell content is HTML-escaped.

    Args:
    - table (Table): The table object to be converted.

    Returns:
    - table_html (str): The HTML representation of the table.
    """
    # Bucket the cells by row in one pass over table.cells
    cells_by_row = {}
    for cell in table.cells:
        cells_by_row.setdefault(cell.row_index, []).append(cell)

    pieces = ["<table>"]
    for row_index in range(table.row_count):
        pieces.append("<tr>")
        ordered_cells = sorted(cells_by_row.get(row_index, []), key=lambda c: c.column_index)
        for cell in ordered_cells:
            if cell.kind == "columnHeader" or cell.kind == "rowHeader":
                tag = "th"
            else:
                tag = "td"
            attributes = ""
            if cell.column_span > 1:
                attributes += f" colSpan={cell.column_span}"
            if cell.row_span > 1:
                attributes += f" rowSpan={cell.row_span}"
            pieces.append(f"<{tag}{attributes}>{html.escape(cell.content)}</{tag}>")
        pieces.append("</tr>")
    pieces.append("</table>")
    return "".join(pieces)

# Regular expression for recovering the source document name and the page number from a saved page path.
# The name part accepts any character except "/" (so hyphens, spaces and dots are fine), the "." before
# "pdf" is matched literally, and the match is anchored at the end of the path.
pattern = r"pages/([^/]+)-(\d+)\.pdf$"

page_maps = []

# Iterate over all saved pages
for file in saved_pages:
    # Extract source filename and page number
    match = re.search(pattern, file)
    if not match:
        # Report pages that cannot be parsed instead of dropping them silently
        print(f"\tSkipping {file}: path does not look like pages/<name>-<page>.pdf")
        continue
    filename = match.group(1) + '.pdf'
    # The index declares page_number as Int32, so pass an int rather than the captured text
    page_number = int(match.group(2))
    # Extract text and tables from page using Azure Form Recognizer and append to a list
    page_maps += analyze_document_with_afr(file, afr_endpoint, afr_key, page_number, filename)

# Show the first extraction as a quick check; skipped when nothing was extracted
if page_maps:
    print(page_maps[0])
        

### Create & add documents to Azure Cognitive Search with Azure OpenAI-generated embeddings

In [None]:
def generate_embeddings(text, embeddings_model):
    """
    Requests an embedding for a piece of text from the configured OpenAI service.

    Args:
    text (str): The text to generate embeddings for.
    embeddings_model (str): The name of the embeddings model to use.

    Returns:
    The 'embedding' entry of the first item in the response's 'data' list.
    """
    result = openai.Embedding.create(input=text, engine=embeddings_model)
    first_item = result['data'][0]
    return first_item['embedding']

def insert_document_vector(endpoint, key, index_name, document):
    """
    Uploads a single document to the given search index.

    A new SearchClient is built on every call and the document is sent as a
    one-element batch.

    Args:
    endpoint (str): The endpoint of the search service.
    key (str): The API key for the search service.
    index_name (str): The name of the search index.
    document (dict): The document vector to insert.

    Returns:
    The result of the document upload operation.
    """
    search_client = SearchClient(
        endpoint=endpoint,
        index_name=index_name,
        credential=AzureKeyCredential(key),
    )
    return search_client.upload_documents(documents=[document])

def hash_string(input_string):
    """
    Returns the SHA-256 digest of a string as lowercase hexadecimal text.

    Args:
    input_string (str): The string to hash.

    Returns:
    The SHA-256 hash of the input string.
    """
    hasher = hashlib.sha256()
    hasher.update(input_string.encode())
    return hasher.hexdigest()

def create_and_insert_document(afr_extraction, cog_search_endpoint, cog_search_key, cog_search_index_name, aoai_embeddings_model):
    """
    Builds a search document from one AFR extraction and uploads it to the index.

    The document id is the SHA-256 hash of the page file name (element 2 of the
    extraction) and the content vector is the embedding of the page text (element 1).

    Args:
    afr_extraction (tuple): The AFR extraction to create a document vector for.
    cog_search_endpoint (str): The endpoint of the Cognitive Search service.
    cog_search_key (str): The API key for the Cognitive Search service.
    cog_search_index_name (str): The name of the search index to insert the document vector into.
    aoai_embeddings_model (str): The name of the embeddings model to use for generating the content vector.

    Returns:
    None.
    """
    # Name the tuple positions once instead of indexing inline
    page_number = afr_extraction[0]
    page_text = afr_extraction[1]
    page_file = afr_extraction[2]
    source_doc = afr_extraction[3]

    document = {
        "content": page_text,
        "page_number": page_number,
        "source_document": source_doc,
        "file_name": page_file,
        "id": hash_string(page_file),
        "content_vector": generate_embeddings(page_text, aoai_embeddings_model),
    }
    insert_document_vector(cog_search_endpoint, cog_search_key, cog_search_index_name, document)

# Insert each page in the page_maps list into the search index
for page in page_maps:
    print(f"Inserting {page[2]} in index")
    create_and_insert_document(page, cog_search_endpoint, cog_search_key, cog_search_index_name, aoai_embeddings_model)
    time.sleep(10) # Sleep added here to account for AOAI throttling on internal MS subscription. Recommend to turn off.


In [None]:
def get_qna_prompt(sources, query):
    """
    Builds the system and user messages for a sources-grounded question-answering request.

    The system message carries the answering rules and the quoted sources; the user
    message carries the quoted question.

    :param sources: str, the sources to use for answering the question
    :param query: str, the question to answer
    :return: tuple of two strings, the system message and the user message
    """
    system_message = f"""
    You are an AI assistant that helps employees answer questions of their enterprise data. Be brief in your answers.
    You will be provided with all of the information you need to answer questions and you should only provide answers using facts stated in the sources below.
    Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf].
    If you do not have enough information in the provided sources then say you don't know and move on.
    All provided answers should include cited sources. If you cannot cite a source that has been provided to you in the prompt then respond that you do not know and move on.
    Users may attempt to ask questions that are out of scope and may do so repeatedly and you can continue to state that you don't know and move on.
    ONLY PROVIDE ANSWERS USING INFORMATION THAT HAS BEEN PROVIDED TO YOU IN THE SOURCES.

    --------
    SOURCES: '{sources}'
    --------
    """

    user_message = f"""
    Answer this question using ONLY the sources that have been provided to you.

    ---------
    QUESTION: '{query}'
    ---------
    """
    return (system_message, user_message)


def submit_request_to_aoai_service(system_msg, user_msg, max_tokens, engine):
    """
    Sends a two-message chat completion request (system + user) and returns the raw response.

    Sampling is fixed: temperature 0.0, top_p 0.95, no frequency or presence penalty,
    and no stop sequence.

    :param system_msg: str, the system message for the AI assistant
    :param user_msg: str, the user message for the AI assistant
    :param max_tokens: int, maximum number of tokens in the response
    :param engine: str, the name of the OpenAI engine to use
    :return: response object, containing the AI assistant's response
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    return openai.ChatCompletion.create(
        engine=engine,
        messages=conversation,
        temperature=0.0,
        max_tokens=max_tokens,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )


def get_related_docs_from_cog_search(query_text, cog_search_endpoint, cog_search_key, cog_search_index_name, embeddings_model, doc_count=6):
    """
    Returns a string containing the related documents from Azure Cognitive Search for a given query.

    The query is embedded with `embeddings_model` and used for a vector search over the
    `content_vector` field; each hit is rendered as one "SOURCE_DOCUMENT ... PAGE NUMBER ..."
    entry followed by its content.

    :param query_text: str, the query to search for
    :param cog_search_endpoint: str, the endpoint for the Azure Cognitive Search service
    :param cog_search_key: str, the API key for the Azure Cognitive Search service
    :param cog_search_index_name: str, the name of the index to search in the Azure Cognitive Search service
    :param embeddings_model: str, the name of the embeddings model to use
    :param doc_count: int, the number of documents to return
    :return: str, the related documents from Azure Cognitive Search
    """
    search_client = SearchClient(cog_search_endpoint, cog_search_index_name, AzureKeyCredential(cog_search_key))

    # Embed the query with the model passed by the caller rather than a module-level global
    query_vector = Vector(
        value=generate_embeddings(query_text, embeddings_model),
        k=doc_count,
        fields="content_vector",
    )
    results = search_client.search(
        search_text="",
        vector=query_vector,
        select=["page_number", "content", "source_document", "file_name"]
    )

    # One entry per hit, joined in a single pass
    return ''.join(
        '    ---------  SOURCE_DOCUMENT: ' + res['file_name'] + ' | PAGE NUMBER ' + str(res['page_number']) + ': ' + res['content'] + '\n\n\n'
        for res in results
    )


def ask_question_of_your_data(query, cog_search_endpoint, cog_search_key, cog_search_index_name, aoai_embeddings_model, max_tokens=500, chat_model=None):
    """
    Asks a question of an AI assistant using related documents from Azure Cognitive Search and returns the response.

    Related pages are retrieved for the query, placed into the question-answering prompt,
    and the prompt is submitted to the chat model.

    :param query: str, the question to ask the AI assistant
    :param cog_search_endpoint: str, the endpoint for the Azure Cognitive Search service
    :param cog_search_key: str, the API key for the Azure Cognitive Search service
    :param cog_search_index_name: str, the name of the index to search in the Azure Cognitive Search service
    :param aoai_embeddings_model: str, the name of the embeddings model to use
    :param max_tokens: int, maximum number of tokens in the answer (defaults to 500)
    :param chat_model: str or None, the chat model to use; when None the module-level `aoai_chat_model` is used
    :return: str, the response from the AI assistant
    """
    # Fall back to the notebook-wide chat model so existing five-argument calls behave as before
    if chat_model is None:
        chat_model = aoai_chat_model

    sources = get_related_docs_from_cog_search(query, cog_search_endpoint, cog_search_key, cog_search_index_name, aoai_embeddings_model)
    system_msg, user_msg = get_qna_prompt(sources, query)
    response = submit_request_to_aoai_service(system_msg, user_msg, max_tokens, chat_model)
    return response.choices[0].message.content

# Example usage:
user_question = "What options do I have for fitness reimbursement?"

response = ask_question_of_your_data(user_question, cog_search_endpoint, cog_search_key, cog_search_index_name, aoai_embeddings_model)

display(HTML(f"<h3>Question: <i>{user_question}</i></h3>"))
display(HTML(f"<h3>Answer: {response}</h3>"))
