In [None]:
import os

# Directory the notebook should run from. Relative paths used later in the
# notebook (e.g. the local_path passed to upload_documents) resolve against it.
target_directory = os.getcwd()  # change your directory here

# Guard with isdir rather than exists: a path that exists but is a regular
# file would pass an exists() check and then make os.chdir raise
# NotADirectoryError.
if os.path.isdir(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

## Instantiate the PolicyIndexingPipeline Class

Settings for running the pipeline are in `src/pipeline/policyIndexer/settings.yaml`.

The PolicyIndexingPipeline automates the process of indexing policy documents into Azure AI Search.

In [4]:
from src.pipeline.policyIndexer.run import PolicyIndexingPipeline

In [11]:
# Build the pipeline object used by the upload/create_* cells that follow.
# The notebook text says its settings live in the policyIndexer settings.yaml;
# presumably they are loaded here at construction time — TODO confirm.
indexer = PolicyIndexingPipeline()

## Upload Document to Landing Zone Blob Storage

In [12]:
## TODO: ALLOW KEY BASED AUTHENTICATION

In [None]:
# local_path is relative, so it is resolved against the current working
# directory (the one selected in the first cell of this notebook).
indexer.upload_documents(local_path="utils/data/cases/policies")

## Create Data Source (Connect Blob)

In [None]:
# Per the section heading, this connects the blob storage to the search
# service as a data source; presumably the connection details come from the
# pipeline settings — TODO confirm.
indexer.create_data_source()

## Create Index 

In [None]:
# Creates the search index. NOTE(review): the test-search cells at the end of
# this notebook read the fields chunk_id, chunk, parent_path and vector, so
# the index schema is expected to define them — verify against the pipeline.
indexer.create_index()

## Create Skillset

In [None]:
# Creates the skillset. Run before create_indexer(), which follows in the
# next section; presumably the indexer references this skillset — TODO confirm.
indexer.create_skillset()

## Create Indexer

In [None]:
# Creates the indexer itself; this is the last of the create_* steps and comes
# after the data source, index and skillset have been created.
indexer.create_indexer()

## Run Indexer

In [17]:
from src.pipeline.policyIndexer.run import IndexerRunner

In [18]:
# NOTE(review): this rebinds `indexer`, replacing the PolicyIndexingPipeline
# instance created earlier in the notebook. Re-running any of the earlier
# upload/create_* cells after this one will call those methods on the
# IndexerRunner object instead.
indexer = IndexerRunner(indexer_name="ai-policies-indexer")

In [None]:
# `indexer` is the IndexerRunner here, not the pipeline object.
# Presumably reports the status of the indexer run — TODO confirm whether this
# call blocks until the run finishes.
indexer.monitor_indexer_status()

## Test Search 

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential

# Pick the credential once and use it for the client: key-based auth when an
# admin key is configured, otherwise fall back to DefaultAzureCredential.
# Reading the key a single time keeps the check and the use consistent.
admin_key = os.getenv("AZURE_AI_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(admin_key) if admin_key else DefaultAzureCredential()

# Index to query; defaults to "ai-policies-index" when the variable is unset.
index_name = os.getenv("AZURE_AI_SEARCH_INDEX_NAME", "ai-policies-index")

# The endpoint is mandatory, so a missing variable fails fast with KeyError.
# The credential chosen above is passed through, so the client no longer
# raises KeyError when AZURE_AI_SEARCH_ADMIN_KEY is not set.
search_client = SearchClient(
    endpoint=os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"],
    index_name=index_name,
    credential=credential,
)

In [25]:
# Query text used for both the keyword search and the vector query below.
# NOTE(review): "afiniitor" looks like a misspelling (possibly of the drug
# name "Afinitor") — confirm whether the typo is intentional, e.g. to exercise
# vector/semantic matching on misspelled input.
SEARCH_QUERY = "afiniitor therapy"

In [26]:
# Vector half of the hybrid search: the raw query text is handed over as-is
# (VectorizableTextQuery), to be matched against the "vector" field.
# Presumably the search service embeds the text itself — TODO confirm the
# index has a vectorizer configured.
vector_query = VectorizableTextQuery(
    text=SEARCH_QUERY,
    k_nearest_neighbors=5,
    fields="vector",
    weight=0.5,
)

In [None]:
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType

# Hybrid query: keyword text plus the vector query, with semantic query type
# and extractive captions/answers requested; at most 5 hits are returned.
results = search_client.search(
    search_text=SEARCH_QUERY,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name="my-semantic-config",
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=5,
)

# Print one delimited summary per hit.
for result in results:
    print("=" * 40)
    print(f"ID: {result['chunk_id']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Source_doc_path: {result['parent_path']}")

    # Show at most the first 500 characters of the chunk, marking truncation.
    if len(result["chunk"]) > 500:
        content = result["chunk"][:500] + "..."
    else:
        content = result["chunk"]
    print(f"Content: {content}")

    # Only the first caption is shown; prefer its highlighted form and fall
    # back to the plain caption text when there are no highlights.
    captions = result.get("@search.captions", [])
    if captions:
        caption = captions[0]
        print(f"Caption: {caption.highlights or caption.text}")
    print("=" * 40)