## 📋 Table of Contents

This notebook guides you through the following sections:

1. [**Optical Character Recognition (OCR) with Azure AI Document Intelligence**](#optical-character-recognition-ocr-with-azure-ai-document-intelligence): Overview of Azure's Document Analysis Client and its pre-trained models for document analysis.

2. [**Understanding Data Extracted from the Layout Model**](#understanding-data-extracted-from-the-layout-model): Insights into the data extracted from the layout model.
    - [**Custom Logic for Processing Extracted Information**](#custom-logic-for-processing-extracted-information): Discusses the need for custom logic to process the extracted information based on specific use cases and requirements.
    - [**Leveraging LangChain Integration**](#leveraging-langchain-integration): Explanation of how Retrieval-Augmented Generation (RAG) works with a pretrained Large Language Model (LLM) and an external data retrieval system for dynamic interaction with documents and content generation.

In [1]:
import os

# Define the target directory
target_directory = r"C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence"  # change your directory here

# Check if the directory exists
if os.path.exists(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to C:\Users\pablosal\Desktop\gbbai-azure-ai-document-intelligence


In [2]:
from src.azure_search_ai.indexer import AzureIndexerManager
az_indexer_client = AzureIndexerManager()

## Create Index

In [3]:
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    HnswAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    HnswParameters,
    SearchField,
    VectorSearch,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticField,
    SemanticSearch,
    SemanticPrioritizedFields
)

In [5]:
# define field types for index 
images_index_fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SimpleField(name="url", type=SearchFieldDataType.String),
    SimpleField(name="categoryEnriched", type=SearchFieldDataType.String),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="summary", type=SearchFieldDataType.String),
    SearchField(
        name="summaryVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    )
]

# Configure the vector search configuration
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=5,
                ef_construction=300,
                ef_search=400,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
        ),
    ]
)

semantic_config_images = SemanticConfiguration(
    name="images-index-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        keywords_fields=[SemanticField(field_name="categoryEnriched")],
        content_fields=[SemanticField(field_name="summary")],
    ),
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(
    configurations=[semantic_config_images]
)

In [6]:
az_indexer_client.create_index(index_name="image-ocr-index",
                              fields=images_index_fields, 
                              vector_search=vector_search,
                              semantic_search=semantic_search)

2024-01-21 23:04:19,366 - micro - MainProcess - INFO     Index image-ocr-index created (core.py:create_index:132)


## Creating DataSource

In [7]:
az_indexer_client.create_data_source_connection(name="ocr-image-blob-datasource", 
                                                description='''Data source for OCR images extracted from PDFs 
                                                for multimodal search purposes''', 
                                                container_name="ocrimages", 
                                                type="azureblob")

2024-01-21 23:04:20,769 - micro - MainProcess - INFO     Data source 'ocr-image-blob-datasource' created or updated (core.py:create_data_source_connection:164)


## Adding Skills 

In [23]:
az_indexer_client.skills = []

In [24]:
inputs= [
        {
          "name": "summaryText",
          "source": "/document/summary/*"
        }
      ]

outputs = [
        {
          "name": "embedding",
          "targetName": "summaryVector"
        }
      ]


In [25]:
az_indexer_client.skills

[]

In [26]:
az_indexer_client.add_skill(odata_type="EmbeddingSkill",
                                name="image-ocr-embedding-skill",
                                description="Vectorize summaries extracted from images using ada",
                                context="/document/summary/*",
                                inputs=inputs,
                                outputs=outputs)

## Creating skillsets 

In [27]:
az_indexer_client.create_skillset(name="image-ocr-skillset-4",
                                  description="Skillset for OCR images extracted from PDFs")

2024-01-21 23:07:41,999 - micro - MainProcess - INFO     Skillset: 
{
    "name": "image-ocr-skillset-4",
    "description": "Skillset for OCR images extracted from PDFs",
    "skills": [
        {
            "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
            "name": "image-ocr-embedding-skill",
            "description": "Vectorize summaries extracted from images using ada",
            "context": "/document/summary/*",
            "resourceUri": "https://ml-workspace-dev-canadaeast-001-aoai.openai.azure.com",
            "apiKey": "d0fabc0b5f9c4b57a864761dd0eb146d",
            "deploymentId": "foundational-canadaeast-ada",
            "inputs": [
                {
                    "name": "summaryText",
                    "source": "/document/summary/*"
                }
            ],
            "outputs": [
                {
                    "name": "embedding",
                    "targetName": "summaryVector"
                }
            ]
 