# Table of Contents

1. [Process documents and convert to Markdown](#1.-Convert-document-to-Markdown-with-images-and-tables)
2. [Process images in documents and update MD](#2.-Extract-images-from-document-and-save-locally-as-figure_xx.png)
3. [Inference](#3.-Inference)
4. [Define the helper function to concatenate function info](#Define-the-helper-function-to-concatenate-function-info)
5. [Combine all components into a single function](#Combine-all-components-into-a-single-function)
6. [Final test](#Final-test)

In [None]:
%pip install -qU openai
%pip install -qU pyyaml
%pip install -qU pdf2docx
%pip install -qU tenacity
%pip install -qU matplotlib
%pip install -qU python-dotenv
%pip install -qU tiktoken==0.7.0
%pip install -qU langchain==0.1.0
%pip install -qU azure-core==1.29.5
%pip install -qU azure-storage-blob==12.19.0
%pip install -qU azure-search-documents==11.4.0
%pip install -qU azure-ai-documentintelligence==1.0.0b3

In [None]:
"""
Phase 1: Knowledge Ingestion
 1. Convert document to Markdown with images and tables
 2. Extract images from document and save locally as figure_xx.png
 3. Loop: Generate descriptions for images and insert into original Markdown
 4. Upload local images to cloud storage
 5. Generate chunks, ensuring content within HTML tags remains intact

Phase 2: Knowledge Retrieval
 1. Check retrieved chunk for "blob://" URLs
 2. Parse "blob://" URL, extract storage information, and generate SAS URL
 3. Replace "blob://" with SAS URL
 4. Generate answer
"""

In [None]:
import azure.ai.documentintelligence

print(azure.ai.documentintelligence.__version__)

In [None]:
import os
from dotenv import load_dotenv

load_dotenv("../../.env")
print(os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"))

# 1. Convert document to Markdown with images and tables

In [5]:
# import libraries
import os
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat, AnalyzeResult
# from azure.ai.documentintelligence.models import ContentFormat, AnalyzeOutputOption, AnalyzeResult


# Endpoint and key of the Document Intelligence resource are taken from the
# process environment.
doc_intelligence_endpoint = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Single client instance shared by the analysis helper below.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=doc_intelligence_endpoint,
    credential=AzureKeyCredential(doc_intelligence_key),
)

def analyze_document(file_path):
    """Analyze a local file with the ``prebuilt-layout`` model.

    The file is opened in binary mode and submitted through the module-level
    ``document_intelligence_client``, requesting Markdown as the output
    content format.

    Args:
        file_path: Path of the document to analyze.

    Returns:
        The ``AnalyzeResult`` returned by the poller once the operation ends.
    """
    with open(file_path, "rb") as document_stream:
        analysis_poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout",
            analyze_request=document_stream,
            content_type="application/octet-stream",
            polling_interval=3,
            output_content_format=ContentFormat.MARKDOWN,
        )

    # Block until the long-running operation has produced its result.
    analysis: AnalyzeResult = analysis_poller.result()
    return analysis

In [6]:
# Usage
result = analyze_document("../00_data/US_11820651_B2.pdf")
# result = analyze_document("../00_data/刑法的私塾.pdf")
# operation_id, result = analyze_document(
#     "../00_data/Overview of LLMs_short.pdf")

In [None]:
# 1 when the analysis result carries at least one figure, otherwise 0.
a = 1 if (result.figures and len(result.figures) > 0) else 0

a

In [None]:
from IPython.display import HTML, Markdown, display

markdown_content = result.content

# display(Markdown(markdown_content))
display(Markdown(markdown_content))

In [None]:
from process_images import include_figures_in_md

# Source PDF plus the container name and folder handed to the helper.
input_file_path = "../00_data/US_11820651_B2.pdf"
containerName = "rag-test"
folder = "US_11820651_B2"
# NOTE(review): presumably crops the figures into output_folder, uploads them
# and returns Markdown that references them - confirm in process_images.
new_md_content = await include_figures_in_md(input_file_path, result, containerName,
                                             folder, output_folder="data/cropped")
# Render the returned Markdown inline.
display(Markdown(new_md_content))

## Phase 2: Process retrieved chunks

In [6]:
import re
from datetime import datetime, timedelta, timezone
from azure.storage.blob.aio import BlobServiceClient
from azure.storage.blob import BlobSasPermissions, generate_blob_sas

# Maps "account/container/blob" -> (sas_token, expiry_time) so that tokens are
# reused until they expire.
sas_token_cache = {}


def generate_sas_url(blob_client):
    """Return a read-only SAS URL for the blob addressed by ``blob_client``.

    Tokens are cached in the module-level ``sas_token_cache`` and reused while
    their expiry time is still in the future; a newly signed token is valid
    for one hour.

    Args:
        blob_client: Object exposing ``account_name``, ``container_name`` and
            ``blob_name``. ``credential.account_key`` is additionally read
            when a new token has to be signed.

    Returns:
        ``https://<account>.blob.core.windows.net/<container>/<blob>?<sas>``
        with the blob path percent-encoded.
    """
    # Function-scope import keeps the cell self-contained.
    from urllib.parse import quote

    account = blob_client.account_name
    container = blob_client.container_name
    blob_name = blob_client.blob_name
    blob_key = f"{account}/{container}/{blob_name}"

    cached = sas_token_cache.get(blob_key)
    if cached is not None and cached[1] > datetime.now(timezone.utc):
        sas_token = cached[0]
    else:
        expiry_time = datetime.now(timezone.utc) + timedelta(hours=1)
        sas_token = generate_blob_sas(
            account,
            container,
            blob_name,
            account_key=blob_client.credential.account_key,
            permission=BlobSasPermissions(read=True),
            expiry=expiry_time
        )
        sas_token_cache[blob_key] = (sas_token, expiry_time)

    # The token is signed over the raw blob name, but the URL path must be
    # percent-encoded; "/" is kept so virtual folders stay intact.
    encoded_blob_name = quote(blob_name, safe="/")
    return f"https://{account}.blob.core.windows.net/{container}/{encoded_blob_name}?{sas_token}"


def extract_blob_url(input_str):
    """Return every ``blob://...png`` link found in ``input_str``.

    A link runs from ``blob://`` up to a ``.png`` suffix and may not contain
    whitespace or a closing parenthesis.

    Args:
        input_str: Text to scan.

    Returns:
        List of matched links in order of appearance (empty when none).
    """
    blob_png_pattern = re.compile(r'blob://[^\s\)]+\.png')
    return blob_png_pattern.findall(input_str)


def extract_elements_from_blob_url(blob_url):
    """Split a ``blob://account/container/folder/name`` link into its parts.

    Args:
        blob_url: Link of the form ``blob://<a>/<b>/<c>/<d>[/<more>...]``.

    Returns:
        A 4-tuple with the four path components that follow ``blob://``, or
        ``None`` when fewer than four are present. Any deeper path segments
        are kept inside the last element (joined by ``/``) instead of being
        dropped, so nested blob paths survive the round trip.
    """
    # maxsplit=5 yields at most: 'blob:', '', and four components, the last of
    # which retains any remaining '/'-separated segments.
    parts = blob_url.split('/', 5)
    if len(parts) < 6:
        return None
    return tuple(parts[2:6])


async def process(md_content: str):
    """Replace every ``blob://`` image link in ``md_content`` with a SAS URL.

    Links are found with ``extract_blob_url``; each one is split with
    ``extract_elements_from_blob_url`` and resolved through a blob client
    created from the ``AzureWebJobsStorage`` connection string.

    Args:
        md_content: Markdown text that may contain ``blob://`` links.

    Returns:
        The Markdown text with each resolvable link replaced by its SAS URL.
    """
    found_links = extract_blob_url(md_content)
    link_to_sas = {}

    service = BlobServiceClient.from_connection_string(os.environ["AzureWebJobsStorage"])
    async with service as storage_client:
        for link in found_links:
            parts = extract_elements_from_blob_url(link)
            if not parts:
                # Links that cannot be split are left untouched.
                continue
            container_name = parts[1]
            blob_name = '/'.join(parts[2:])
            blob_client = storage_client.get_blob_client(
                container=container_name, blob=blob_name)
            link_to_sas[link] = generate_sas_url(blob_client)

    # Substitute the resolved URLs into the text.
    for link, sas_url in link_to_sas.items():
        md_content = md_content.replace(link, sas_url)

    return md_content

In [None]:
new_md_content = """# 1\\. Introduction  \nLanguage plays a fundamental role in facilitating commu- nication and self-expression for humans, and their interaction with machines. The need for generalized models stems from the growing demand for machines to handle complex language tasks, including translation, summarization, information re- trieval, conversational interactions, etc. Recently, significant breakthroughs have been witnessed in language models, pri- marily attributed to transformers [1], increased computational capabilities, and the availability of large-scale training data. These developments have brought about a revolutionary trans- formation by enabling the creation of LLMs that can approxi- mate human-level performance on various tasks [2, 3]. Large  \n<figure>\n\n![](https://xxxxxxx)<!-- FigureContent=\"The image is a bar chart titled \"Papers Released over Years,\" depicting the trend of academic papers with keywords related to \"Large Language Model\" (LLMs), \"LLMs + Fine-Tuning,\" and \"LLMs + Alignment\" from 2018 to 2023. The chart uses three colors: blue for \"LLMs,\" red for \"LLMs + Fine-Tuning,\" and green for \"LLMs + Alignment.\"\n\n- **2018**: 42 papers on LLMs, 3 on Fine-Tuning, and 12 on Alignment.\n- **2019**: 60 papers on LLMs, 32 on Fine-Tuning, and 17 on Alignment.\n- **2020**: 114 papers on LLMs, 66 on Fine-Tuning, and 26 on Alignment.\n- **2021**: 260 papers on LLMs, 153 on Fine-Tuning, and 58 on Alignment.\n- **2022**: 1,210 papers on LLMs, 582 on Fine-Tuning, and 238 on Alignment.\n- **2023**: 20,900 papers on LLMs, 7,260 on Fine-Tuning, and 4,740 on Alignment.\n\nThe chart shows a significant increase in papers over the years, especially in 2023.\" --></figure>  \n<!-- Footnote=\"\\*Equal contribution\" -->  \n<!-- Footnote=\"Email addresses: humza\\_naveed@yahoo. com (Humza Naveed),\" -->  \n<!-- Footnote=\"aukhanee@gmail. com (Asad Ullah Khan), shiqiu@cse.cuhk.edu.hk (Shi Qiu), muhammad. saqib@data61.csiro. 
au (Muhammad Saqib),\" -->  \n<!-- Footnote=\"saeed. anwar@kfupm. edu. sa (Saeed Anwar), muhammad. usman@kfupm. edu. sa (Muhammad Usman), naveed. akhtar1@unimelb. edu. au (Naveed Akhtar), nick. barnes@anu. edu. au (Nick Barnes), ajmal.mian@uwa.edu.au (Ajmal Mian)\" -->  \n<!-- PageFooter=\"Preprint submitted to Elsevier\" -->  \n<!-- PageFooter=\"April 11, 2024\" -->\n:selected: :selected: :unselected: :unselected: :unselected: :selected: :unselected: :unselected: :unselected: :unselected: :unselected: :selected: :unselected: :unselected: :selected: :unselected: :unselected: :selected: :unselected: :unselected: :selected: :selected: :unselected: :unselected:<figure>\n\n![](https://xxxxxx)<!-- FigureContent=\"The image is a timeline chart illustrating the release of various large language models (LLMs) from 2019 to 2024. It uses blue and orange cards to distinguish between 'pre-trained' and 'instruction-tuned' models, respectively. Models on the upper half of the timeline are open-source, while those on the lower half are closed-source.\n\nKey highlights include:\n\n- **2019-2020**: Introduction of models like T5 and mT5 (both pre-trained and open-source).\n- **2021**: Notable releases such as GPT-3 (pre-trained, closed-source) and T0 (instruction-tuned, open-source).\n- **2022**: Many models released, including Codex and WebGPT (pre-trained, closed-source) and OPT-IML (instruction-tuned, open-source).\n- **2023**: Models like LLaMA and Code Llama (pre-trained, open-source) and ChatGPT (instruction-tuned, closed-source) are released.\n- **2024**: Expected releases include PanGu-Σ, BloombergGPT, and Gemini (all pre-trained, closed-source).\n\nThe chart emphasizes the growing trend toward instruction-tuned and open-source models in natural language processing research.\" --></figure>  \nLanguage Models (LLMs) have emerged as cutting-edge arti- ficial intelligence systems that can process and generate text with coherent communication [4], and generalize to multiple 
tasks [5, 6]."""
# List the blob:// image links found in the sample Markdown.
blob_urls = extract_blob_url(new_md_content)
# process() swaps each blob:// link it can resolve for a SAS URL.
processed_md_content = await process(new_md_content)
# Show the processed Markdown as the cell output.
processed_md_content

In [None]:
processed_md_content

In [None]:
display(Markdown(processed_md_content))

# 2. Chunking

In [None]:
import re
from langchain.schema import Document
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Swap every complete <tag>...</tag> element for an indexed placeholder
def preserve_tags(text, tag):
    """Replace complete ``<tag ...>...</tag>`` elements with placeholders.

    Args:
        text: Text to scan.
        tag: Tag name, inserted verbatim into the search pattern.

    Returns:
        ``(new_text, elements)`` where ``elements`` lists the matched elements
        in order and element ``i`` is replaced by ``[[PLACEHOLDER_i]]``.
    """
    element_pattern = re.compile(fr"(<{tag}.*?>.*?</{tag}>)", re.DOTALL)
    elements = element_pattern.findall(text)
    for index, element in enumerate(elements):
        placeholder = f"[[PLACEHOLDER_{index}]]"
        text = text.replace(element, placeholder)
    return text, elements

# Put the original elements back in place of their placeholders
def restore_tags(text, matches):
    """Replace each ``[[PLACEHOLDER_i]]`` in ``text`` with ``matches[i]``.

    Args:
        text: Text containing placeholders.
        matches: Elements in placeholder-index order.

    Returns:
        The text with every placeholder that occurs in it expanded.
    """
    restored = text
    for position, original in enumerate(matches):
        marker = f"[[PLACEHOLDER_{position}]]"
        restored = restored.replace(marker, original)
    return restored


# Markdown header prefixes and the metadata key recorded for each level.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
    ("#####", "Header 5"),
    ("######", "Header 6"),
    ("#######", "Header 7"),
    ("########", "Header 8")
]

md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False)

# Hide <figure> elements behind placeholders before splitting so that a
# figure is never cut in the middle.
replaced_md_content, matches = preserve_tags(new_md_content, "figure")
md_header_splits = md_splitter.split_text(replaced_md_content)

chunk_size = 512
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Split the header sections into token-bounded documents.
splits = text_splitter.split_documents(md_header_splits)

# Restore the figures in each split. The header metadata is carried over;
# without it the 'Header 1' / 'Header 2' lookups done when the index documents
# are built would always fall back to the file name.
restored_splits = [
    Document(page_content=restore_tags(split.page_content, matches),
             metadata=split.metadata)
    for split in splits
]

# Preview one chunk before and after restoration. The index is clamped so
# that short documents (fewer than four chunks) do not raise IndexError.
if splits:
    preview_index = min(3, len(splits) - 1)
    print(splits[preview_index].page_content)
    print("------------------------------")
    print(restored_splits[preview_index].page_content)

In [13]:
import tiktoken
import matplotlib.pyplot as plt


encoding = tiktoken.encoding_for_model('gpt-4o')
encoding


def gen_token_num_plot(splits):
    """Plot how many splits fall into each of five token-count ranges.

    Token counts are measured with the tiktoken encoding for ``gpt-4``. The
    ranges are five equal-width buckets between 0 and the largest count.

    Args:
        splits: Iterable of objects exposing a ``page_content`` string.

    Returns:
        None. The bar chart is shown with ``plt.show()``.
    """
    import tiktoken
    import matplotlib.pyplot as plt

    encoding = tiktoken.encoding_for_model('gpt-4')
    bucket_count = 5

    # Tokenize every split exactly once and reuse the counts below.
    token_counts = [len(encoding.encode(split.page_content)) for split in splits]
    max_token = max(token_counts, default=0)

    # Width of one bucket; 0 when there are no splits or all are empty.
    range_size = max_token / bucket_count

    range_counts = [0] * bucket_count
    for num_token in token_counts:
        if range_size:
            # The maximum lands exactly on the upper edge, so clamp it into
            # the last bucket.
            range_index = min(int(num_token / range_size), bucket_count - 1)
        else:
            # Every split has zero tokens: avoid dividing by zero.
            range_index = 0
        range_counts[range_index] += 1

    # Set the figure size
    fig, ax = plt.subplots(figsize=(8, 4))

    # Plot the bar chart
    positions = range(1, bucket_count + 1)
    ax.bar(positions, range_counts)
    ax.set_xlabel('Count Scope')
    ax.set_ylabel('Number of Splits')
    ax.set_title('Number of Splits in Each Range')

    # Label each bar with the token range it covers.
    xtick_labels = [f'{int(i * range_size)}-{int((i+1) * range_size)}' for i in range(bucket_count)]
    ax.set_xticks(positions)
    ax.set_xticklabels(xtick_labels)

    plt.show()

In [None]:
print(f"{len(restored_splits)} in total")
gen_token_num_plot(restored_splits)

# 4. Create index

In [40]:
# Import required libraries
import os
import json
import openai
from openai import AzureOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryType,
    VectorizedQuery,
)
from azure.search.documents.indexes.models import (
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSearch,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    ExhaustiveKnnParameters,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SearchField,
    VectorSearch,
    HnswParameters,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)

# Configure environment variables
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
service_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")

credential = AzureKeyCredential(key)

In [None]:
index_name = "index-multimodal"

index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)

try:
    result = index_client.delete_index(index_name)
    print ('Index', index_name, 'Deleted')
except Exception as ex:
    print (ex)

In [None]:
# Field schema of the search index: a string key, searchable text fields,
# a filterable source field and the vector field holding the embeddings.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True,
                sortable=True, filterable=True, facetable=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String,
                    filterable=True),
    SimpleField(name="source", type=SearchFieldDataType.String,
                filterable=True),
    # NOTE(review): 1536 must equal the output size of the embedding
    # deployment used later in the notebook - confirm.
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
]

# Vector search setup: two algorithm configurations (HNSW and exhaustive KNN,
# both with cosine metric) and one profile per algorithm. The contentVector
# field above references "myHnswProfile".
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
        )
    ]
)

# Semantic configuration: title, keyword and content fields to prioritize.
# Its name is the one passed as semantic_configuration_name in the queries.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        keywords_fields=[SemanticField(field_name="category")],
        content_fields=[SemanticField(field_name="content")]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)

# Create the index, or update it when one with this name already exists.
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

## Create embeddings
Read your data, generate OpenAI embeddings, and export them in a format that can be inserted into your Azure AI Search index:

In [18]:
import uuid

embedding_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)
model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(texts, model=model):
    """Request embeddings for ``texts`` and return the response's ``data``.

    The call goes through the module-level ``embedding_client`` and is retried
    with random exponential back-off for up to six attempts.

    Args:
        texts: Input passed unchanged as ``input`` to the embeddings API.
        model: Embedding model/deployment name.

    Returns:
        The ``data`` attribute of the embeddings response.
    """
    response = embedding_client.embeddings.create(input=texts, model=model)
    return response.data


In [19]:
file_prefix = "Overview of LLMs.pdf"

In [None]:
print(restored_splits[3].page_content)

In [None]:
model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# One embedding input per restored chunk.
batch_inputs = [split.page_content for split in restored_splits]
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

embeddings = []
for batch_start in range(0, len(batch_inputs), BATCH_SIZE):
    batch = batch_inputs[batch_start:batch_start + BATCH_SIZE]
    response = client.embeddings.create(input=batch, model=model)
    # Order by the index reported in the response so that embeddings line up
    # with their inputs (an assert would be stripped under ``python -O``).
    ordered = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered)

# zip() below would silently truncate on a mismatch, so fail loudly instead.
if len(embeddings) != len(restored_splits):
    raise RuntimeError(
        f"Expected {len(restored_splits)} embeddings, got {len(embeddings)}")

# Strip only the final extension so that names containing dots stay intact.
file_name = os.path.splitext(file_prefix)[0]

# One index document per chunk; title/category come from the header metadata
# and fall back to the file name when a header level is absent.
documents = [
    {
        'id': str(uuid.uuid4()),
        'title': item.metadata.get('Header 1', file_name),
        'content': item.page_content,
        'category': item.metadata.get('Header 2', file_name),
        'source': file_prefix,
        'contentVector': embedding,
    }
    for item, embedding in zip(restored_splits, embeddings)
]

print(f'{len(documents)} documents generated')


In [None]:
os.getenv("AZURE_SEARCH_ENDPOINT2")

In [None]:
endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(admin_key)
client = SearchClient(endpoint=endpoint,
                      index_name=index_name,
                      credential=credential,)

# upload_documents returns one indexing result per document; inspect them
# instead of reporting success unconditionally.
upload_results = client.upload_documents(documents)
failed_keys = [item.key for item in upload_results if not item.succeeded]
print(f"Uploaded {len(upload_results) - len(failed_keys)} documents in total")
if failed_keys:
    print(f"Failed to upload {len(failed_keys)} documents: {failed_keys}")

## Retrieval

In [24]:
embedding_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)
model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(query, model=model):
    """Return the embedding vector of a single query string.

    The call goes through the module-level ``embedding_client`` and is retried
    with random exponential back-off for up to six attempts.

    Args:
        query: Text to embed; sent as a one-element input list.
        model: Embedding model/deployment name.

    Returns:
        The ``embedding`` of the first (only) item in the response data.
    """
    response = embedding_client.embeddings.create(input=[query], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Semantic Hybrid Search: full-text query plus vector query, re-ranked by the
# semantic configuration, with extractive captions and answers requested.
query = "How is the trend of LLM releases?"

search_client = SearchClient(
    service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizedQuery(vector=generate_embeddings(query),
                               k_nearest_neighbors=5,
                               fields="contentVector")

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["title", "content", "source"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=5
)

# get_answers() may return None when the service produced no semantic
# answers; fall back to an empty list so the loop below does not raise.
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    # Prefer the highlighted form of the answer when it is available.
    print(f"Semantic Answer: {answer.highlights or answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

# Collect the content of every hit while printing its details.
output = []
for result in results:
    print(f"Title: {result['title']}")
    print(f"result: {result}")
    # print(f"source: {result['source']}")
    print(f"Content: {result['content']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")

    output.append(result['content'])
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        # Prefer the highlighted form of the caption when it is available.
        print(f"Caption: {caption.highlights or caption.text}\n")

In [27]:
import re
from datetime import datetime, timedelta, timezone
from azure.storage.blob.aio import BlobServiceClient
from azure.storage.blob import BlobSasPermissions, generate_blob_sas

# Maps "account/container/blob" -> (sas_token, expiry_time) so that tokens are
# reused until they expire.
sas_token_cache = {}


def generate_sas_url(blob_client):
    """Return a read-only SAS URL for the blob addressed by ``blob_client``.

    Tokens are cached in the module-level ``sas_token_cache`` and reused while
    their expiry time is still in the future; a newly signed token is valid
    for one hour.

    Args:
        blob_client: Object exposing ``account_name``, ``container_name`` and
            ``blob_name``. ``credential.account_key`` is additionally read
            when a new token has to be signed.

    Returns:
        ``https://<account>.blob.core.windows.net/<container>/<blob>?<sas>``
        with the blob path percent-encoded.
    """
    # Function-scope import keeps the cell self-contained.
    from urllib.parse import quote

    account = blob_client.account_name
    container = blob_client.container_name
    blob_name = blob_client.blob_name
    blob_key = f"{account}/{container}/{blob_name}"

    cached = sas_token_cache.get(blob_key)
    if cached is not None and cached[1] > datetime.now(timezone.utc):
        sas_token = cached[0]
    else:
        expiry_time = datetime.now(timezone.utc) + timedelta(hours=1)
        sas_token = generate_blob_sas(
            account,
            container,
            blob_name,
            account_key=blob_client.credential.account_key,
            permission=BlobSasPermissions(read=True),
            expiry=expiry_time
        )
        sas_token_cache[blob_key] = (sas_token, expiry_time)

    # The token is signed over the raw blob name, but the URL path must be
    # percent-encoded; "/" is kept so virtual folders stay intact.
    encoded_blob_name = quote(blob_name, safe="/")
    return f"https://{account}.blob.core.windows.net/{container}/{encoded_blob_name}?{sas_token}"


def extract_blob_url(input_str):
    """Return every ``blob://...png`` link found in ``input_str``.

    A link runs from ``blob://`` up to a ``.png`` suffix and may not contain
    whitespace or a closing parenthesis.

    Args:
        input_str: Text to scan.

    Returns:
        List of matched links in order of appearance (empty when none).
    """
    blob_png_pattern = re.compile(r'blob://[^\s\)]+\.png')
    return blob_png_pattern.findall(input_str)


def extract_elements_from_blob_url(blob_url):
    """Split a ``blob://account/container/folder/name`` link into its parts.

    Args:
        blob_url: Link of the form ``blob://<a>/<b>/<c>/<d>[/<more>...]``.

    Returns:
        A 4-tuple with the four path components that follow ``blob://``, or
        ``None`` when fewer than four are present. Any deeper path segments
        are kept inside the last element (joined by ``/``) instead of being
        dropped, so nested blob paths survive the round trip.
    """
    # maxsplit=5 yields at most: 'blob:', '', and four components, the last of
    # which retains any remaining '/'-separated segments.
    parts = blob_url.split('/', 5)
    if len(parts) < 6:
        return None
    return tuple(parts[2:6])


async def process(md_content: str):
    """Replace every ``blob://`` image link in ``md_content`` with a SAS URL.

    Links are found with ``extract_blob_url``; each one is split with
    ``extract_elements_from_blob_url`` and resolved through a blob client
    created from the ``AzureWebJobsStorage`` connection string.

    Args:
        md_content: Markdown text that may contain ``blob://`` links.

    Returns:
        The Markdown text with each resolvable link replaced by its SAS URL.
    """
    found_links = extract_blob_url(md_content)
    link_to_sas = {}

    service = BlobServiceClient.from_connection_string(os.environ["AzureWebJobsStorage"])
    async with service as storage_client:
        for link in found_links:
            parts = extract_elements_from_blob_url(link)
            if not parts:
                # Links that cannot be split are left untouched.
                continue
            container_name = parts[1]
            blob_name = '/'.join(parts[2:])
            blob_client = storage_client.get_blob_client(
                container=container_name, blob=blob_name)
            link_to_sas[link] = generate_sas_url(blob_client)

    # Substitute the resolved URLs into the text.
    for link, sas_url in link_to_sas.items():
        md_content = md_content.replace(link, sas_url)

    return md_content

In [None]:
os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")

In [None]:
embedding_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)
model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")

In [32]:
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
from IPython.display import Markdown
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryType,
    VectorizedQuery,
)

load_dotenv("../../.env")

# Search service settings for inference.
# NOTE(review): these read the "...2" variables, whereas the index-creation
# and upload cells read AZURE_SEARCH_ENDPOINT / AZURE_SEARCH_ADMIN_KEY -
# confirm both point at the service that holds the index.
key = os.getenv("AZURE_SEARCH_ADMIN_KEY2")
service_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT2")
credential = AzureKeyCredential(key)

# Chat model settings.
# NOTE(review): "MODEl" has a lowercase "l" - verify it matches the name used
# in the .env file.
api_key = os.getenv("AZURE_OPENAI_API_KEY_GPT_4o")
api_base = os.getenv("AZURE_OPENAI_ENDPOINT_GPT_4o")
deployment_name = os.getenv("AZURE_OPENAI_MODEl_GPT_4o")
# NOTE(review): api_version is read here but the client below is created with
# a hard-coded version string - confirm which one is intended.
api_version = os.getenv("AZURE_OPENAI_API_VERSION_GPT_4o")

# Client used for chat completions.
gpt_client = AzureOpenAI(
    api_key=api_key,
    azure_endpoint=api_base,
    api_version="2024-05-01-preview",
)

# Separate client (own key, endpoint and API version) used for embeddings.
embedding_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)
model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")


@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(query, model=model):
    """Return the embedding vector of a single query string.

    The call goes through the module-level ``embedding_client`` and is retried
    with random exponential back-off for up to six attempts.

    Args:
        query: Text to embed; sent as a one-element input list.
        model: Embedding model/deployment name.

    Returns:
        The ``embedding`` of the first (only) item in the response data.
    """
    response = embedding_client.embeddings.create(input=[query], model=model)
    first_item = response.data[0]
    return first_item.embedding


async def infer(query: str, index_name: str):
    """Answer ``query`` from the chunks retrieved out of ``index_name``.

    Runs a semantic hybrid search (text plus vector query, top 5), passes the
    content of every hit through ``process`` so that ``blob://`` links become
    SAS URLs, and sends the question together with the joined chunks to the
    chat model behind ``gpt_client``.

    Args:
        query: The user's question.
        index_name: Name of the search index to query.

    Returns:
        The chat completion response object.
    """
    search_client = SearchClient(
        service_endpoint, index_name, AzureKeyCredential(key))
    vector_query = VectorizedQuery(
        vector=generate_embeddings(query),
        k_nearest_neighbors=5,
        fields="contentVector",
    )

    hits = search_client.search(
        search_text=query,
        vector_queries=[vector_query],
        select=["title", "content", "source"],
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name='my-semantic-config',
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=5,
    )

    # Resolve the image links of each retrieved chunk, one hit at a time.
    processed_chunks = [await process(hit['content']) for hit in hits]
    context = '\n'.join(processed_chunks)

    prompt_template = """
        {{question}}

        Sources:
        {{context}}
    """
    final_prompt = prompt_template.replace("{{question}}", query)
    final_prompt = final_prompt.replace("{{context}}", context)

    system_message = "You are a helpful assistant. Try to answer user's question by referencing the following related background information.\n  If there is not enough information to answer user's question, just say not enough information.\n  If there are relevant images, please embed the image in the response. DON'T modify the image URL."
    messages = [
        {'role': 'system', 'content': system_message},
        {"role": "user", "content": final_prompt},
    ]

    return gpt_client.chat.completions.create(
        model=deployment_name,
        messages=messages,
        temperature=0.7,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

In [None]:
query = "How is the trend of LLM releases?"
index_name = "index-multimodal"
response = await infer(query, index_name)
Markdown(response.choices[0].message.content)

In [None]:
query = "What is PEFT? How it works?"
index_name = "index-multimodal"
response = await infer(query, index_name)
Markdown(response.choices[0].message.content)

In [None]:
query = "Show me the diagram of Parameter Efficient Fine-Tuning."
index_name = "index-multimodal"
response = await infer(query, index_name)
Markdown(response.choices[0].message.content)

In [None]:
query = "How parameter-efficient fine-tuning works in detail?"
index_name = "index-multimodal"
response = await infer(query, index_name)
Markdown(response.choices[0].message.content)

In [None]:
query = "How do tool augmented LLMs work?"
index_name = "index-multimodal"
response = await infer(query, index_name)
Markdown(response.choices[0].message.content)

In [None]:
query = "How does the BLOOM architecture look like?"
index_name = "index-multimodal"
response = await infer(query, index_name)
Markdown(response.choices[0].message.content)

In [None]:
query = "Show me an example of attention patterns"
index_name = "index-multimodal"
response = await infer(query, index_name)
Markdown(response.choices[0].message.content)