In [3]:
import os

# Project root that the rest of the notebook's relative paths and `src.*`
# imports are resolved against.
target_directory = r"/Users/marcjimz/Documents/Development/gbb-ai-hls-factory-prior-auth"  # change your directory here

# Only chdir when the target is an actual directory: os.path.exists() is also
# true for a regular file, in which case os.chdir() would raise.
if os.path.isdir(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
elif os.path.exists(target_directory):
    print(f"{target_directory} exists but is not a directory.")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to /Users/marcjimz/Documents/Development/gbb-ai-hls-factory-prior-auth


## Instantiate the PolicyIndexingPipeline Class

Settings for running the pipeline are in `src/pipeline/policyIndexer/settings.yaml`.

The PolicyIndexingPipeline automates the process of indexing policy documents into Azure AI Search.

In [4]:
from src.pipeline.policyIndexer.run import PolicyIndexingPipeline

In [11]:
# Build the pipeline object; no arguments are passed here (the markdown above
# points to settings.yaml as the place where pipeline settings live).
indexer = PolicyIndexingPipeline()

## Upload Document to Landing Zone Blob Storage

In [12]:
# TODO: allow key-based authentication

In [5]:
# Upload the files under the local policies folder (path is relative to the
# working directory set in the first cell). In the recorded run each PDF was
# logged as uploaded under the `policies_ocr/` prefix.
indexer.upload_documents(local_path="utils/data/cases/policies")

2024-12-10 21:14:15,404 - micro - MainProcess - INFO     Uploaded utils/data/cases/policies/001.pdf to policies_ocr/001.pdf (run.py:upload_documents:161)
2024-12-10 21:14:15,606 - micro - MainProcess - INFO     Uploaded utils/data/cases/policies/003.pdf to policies_ocr/003.pdf (run.py:upload_documents:161)
2024-12-10 21:14:15,818 - micro - MainProcess - INFO     Uploaded utils/data/cases/policies/002.pdf to policies_ocr/002.pdf (run.py:upload_documents:161)
2024-12-10 21:14:16,157 - micro - MainProcess - INFO     Uploaded utils/data/cases/policies/005.pdf to policies_ocr/005.pdf (run.py:upload_documents:161)
2024-12-10 21:14:16,705 - micro - MainProcess - INFO     Uploaded utils/data/cases/policies/004.pdf to policies_ocr/004.pdf (run.py:upload_documents:161)


## Create Data Source (Connect Blob)

In [13]:
# Create or update the search data source; the recorded run logged it under
# the name 'ai-policies-blob'.
indexer.create_data_source()

2024-12-10 21:21:26,080 - micro - MainProcess - INFO     Data source 'ai-policies-blob' created or updated (run.py:create_data_source:186)


## Create Index 

In [14]:
# Create or update the search index; the recorded run logged it under the
# name 'ai-policies-index'.
indexer.create_index()

2024-12-10 21:21:31,338 - micro - MainProcess - INFO     Index 'ai-policies-index' created or updated successfully. (run.py:create_index:312)


## Create Skillset

In [15]:
# Create or update the skillset; the recorded run logged it under the name
# 'ai-policies-skillset'.
indexer.create_skillset()

2024-12-10 21:21:33,937 - micro - MainProcess - INFO     Skillset 'ai-policies-skillset' created or updated (run.py:create_skillset:530)


## Create Indexer

In [16]:
# Create or update the indexer; the recorded run logged it under the name
# 'ai-policies-indexer'.
indexer.create_indexer()

2024-12-10 21:21:46,767 - micro - MainProcess - INFO     Indexer 'ai-policies-indexer' created or updated (run.py:create_indexer:564)


## Run and Monitor the Indexer

In [17]:
from src.pipeline.policyIndexer.run import IndexerRunner

In [18]:
# Runner bound to the indexer created above (same name as in the create_indexer log).
# NOTE(review): this rebinds `indexer`, which earlier held the
# PolicyIndexingPipeline instance; re-running any create_* cell after this one
# would call it on the IndexerRunner instead. Consider a distinct name such as
# `indexer_runner` (the monitoring cell would need the same rename).
indexer = IndexerRunner(indexer_name="ai-policies-indexer")

In [23]:
# Report the indexer's state. In the recorded output this logged the indexer
# status, last run time and execution status, preceded by a "has been started"
# line emitted from run_indexer.
# NOTE(review): confirm whether monitor_indexer_status() starts the run itself
# or whether a separate run cell was executed and later removed.
indexer.monitor_indexer_status()

2024-12-10 21:24:21,866 - micro - MainProcess - INFO     Indexer 'ai-policies-indexer' has been started. (run.py:run_indexer:636)
2024-12-10 21:24:22,079 - micro - MainProcess - INFO     Indexer Status: running (run.py:monitor_indexer_status:680)
2024-12-10 21:24:22,079 - micro - MainProcess - INFO     Last Run Time: 2024-12-11 04:23:03.859000+00:00 (run.py:monitor_indexer_status:681)
2024-12-10 21:24:22,080 - micro - MainProcess - INFO     Execution Status: success (run.py:monitor_indexer_status:682)
2024-12-10 21:24:22,080 - micro - MainProcess - INFO     Indexer 'ai-policies-indexer' completed successfully. (run.py:monitor_indexer_status:690)


## Test Search 

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential

# Prefer key-based auth when an admin key is configured; otherwise fall back to
# DefaultAzureCredential so the notebook also works without a key.
credential = (
    AzureKeyCredential(os.getenv("AZURE_AI_SEARCH_ADMIN_KEY"))
    if os.getenv("AZURE_AI_SEARCH_ADMIN_KEY")
    else DefaultAzureCredential()
)
# Index to query; defaults to the index name created earlier in this notebook.
index_name = os.getenv("AZURE_AI_SEARCH_INDEX_NAME", "ai-policies-index")


search_client = SearchClient(
    # The endpoint is mandatory: a missing variable fails fast with KeyError.
    endpoint=os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"],
    index_name=index_name,
    # Use the credential selected above. Re-reading the admin key from
    # os.environ here would raise KeyError whenever the key is unset and make
    # the DefaultAzureCredential fallback unreachable.
    credential=credential,
)

In [25]:
# Query text reused for both the keyword search and the vector query below.
# NOTE(review): "afiniitor" looks like a misspelling of "Afinitor" (the drug
# named in the recorded results) - confirm whether the typo is intentional,
# e.g. to exercise typo-tolerant retrieval.
SEARCH_QUERY = "afiniitor therapy"

In [26]:
# Vector query built from the raw query text (no embedding is computed in this
# notebook), targeting the index field named "vector" and asking for 5 nearest
# neighbours.
# weight=0.5 presumably scales this vector query's contribution relative to the
# text query - TODO confirm against the SDK documentation.
vector_query = VectorizableTextQuery(
    text=SEARCH_QUERY, k_nearest_neighbors=5, fields="vector", weight=0.5
)

In [27]:
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType

# Send the query text together with the vector query, with semantic query type
# and extractive captions/answers requested; at most 5 results come back.
results = search_client.search(
    search_text=SEARCH_QUERY,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name="my-semantic-config",
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=5,
)

# Print one delimited record per hit: id, reranker score, source path, a
# preview of the chunk text, and the first caption when one is present.
separator = "=" * 40
for result in results:
    print(separator)
    print(f"ID: {result['chunk_id']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Source_doc_path: {result['parent_path']}")

    # Truncate long chunks to their first 500 characters plus an ellipsis.
    chunk_text = result["chunk"]
    if len(chunk_text) > 500:
        content = chunk_text[:500] + "..."
    else:
        content = chunk_text
    print(f"Content: {content}")

    captions = result.get("@search.captions", [])
    if captions:
        caption = captions[0]
        # Show the highlighted caption when available, else the plain text.
        print(f"Caption: {caption.highlights or caption.text}")
    print(separator)

ID: 0b3177cacab8_aHR0cHM6Ly9zdG9yYWdlcHJpb3JhdXRoamd1MnAybi5ibG9iLmNvcmUud2luZG93cy5uZXQvcHJlLWF1dGgtcG9saWNpZXMvcG9saWNpZXNfb2NyLzAwNS5wZGY1_normalized_images_10_pages_0
Reranker Score: 2.7419040203094482
Source_doc_path: https://storagepriorauthjgu2p2n.blob.core.windows.net/pre-auth-policies/policies_ocr/005.pdf
Content: UnitedHealthcare® (2) Presence of phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha (PIK3CA) mutation Authorization will be issued for 12 months. 2. Reauthorization a. Afinitor will be approved based on the following criterion: (1) Patient does not show evidence of progressive disease while on Afinitor therapy Authorization will be issued for 12 months. Q. Gastrointestingal Stromal Tumor (GIST) 1. Initial Authorization a. Afinitor will be approved based on all of the followi...
Caption: <em>Afinitor will be approved </em>based on the following criterion: (1)<em> Patient does not show evidence of progressive disease </em>while on<em> Afinitor ther