In [None]:
import os, json
from pathlib import Path
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    HnswParameters,
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile
)
import utils
from dotenv import load_dotenv
# Load settings from the local ".env" file; the configuration cell below reads
# its endpoints and keys from os.environ.
load_dotenv(".env")

### SET VARIABLES

In [2]:
# Authentication switch. Only API-key authentication is implemented in this
# notebook; leave the flag False unless you add an Azure Active Directory path.
use_azure_active_directory = False
if not use_azure_active_directory:
    # Azure OpenAI connection settings, all taken from the environment.
    aoai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
    aoai_api_key = os.environ["AZURE_OPENAI_API_KEY"]
    aoai_api_version = os.environ["AZURE_OPENAI_API_VERSION"]

    # Client handed to utils.generate_embeddings by the later cells.
    client = openai.AzureOpenAI(
        azure_endpoint=aoai_endpoint,
        api_key=aoai_api_key,
        api_version=aoai_api_version
    )

    # Model name handed to utils.generate_embeddings; the index below declares
    # 1536-dimensional vector fields, so the model must produce that size.
    embedding_model: str = "text-embedding-ada-002"

    # Azure AI Search connection settings.
    service_endpoint = os.environ["SEARCH_ENDPOINT"]
    index_name = os.environ["SEARCH_INDEX_NAME"]
    key = os.environ["SEARCH_KEY"]
    credential = AzureKeyCredential(key)
else:
    # Fail here with a clear message instead of letting later cells die with a
    # NameError on `client`, `credential`, etc.
    raise NotImplementedError(
        "Azure Active Directory authentication is not implemented in this notebook; "
        "set use_azure_active_directory = False and provide API keys via .env"
    )


### CREATE A JSON INPUT FILE FOR EMBEDDINGS


├── pdf_dir  
│&emsp; &emsp; ├── text_dir  
│&emsp; &emsp; └── json_dir  
│&emsp; &emsp; &emsp; &emsp; ├── docVectors.json(output1)  
│&emsp; &emsp; &emsp; &emsp; └── docVectors2.json(output2)  
└── this notebook  

In [3]:
# Directory whose files are read as the source documents.
text_dir = "./pdf/text"
# Directory that receives the generated JSON files (created if missing).
json_dir = "./pdf/json"
# Output 1: text-only records (id, title, content, category).
json_file = "docVectors.json"
# Value stored in the "category" field of every record.
category = "manual"
# Output 2: the same records with titleVector / contentVector added.
embedded_file = "docVectors2.json"

### Create a JSON file for Text input

In [4]:
# Build one record per file found directly inside text_dir and save the list
# as a JSON array in json_dir/json_file.
file_contents = []

# Sort the paths so that ids are assigned in a reproducible order; directory
# listing order is filesystem-dependent. A missing text_dir now raises
# FileNotFoundError instead of a bare StopIteration.
text_paths = sorted(p for p in Path(text_dir).iterdir() if p.is_file())

for i, fpath in enumerate(text_paths):
    # Read as bytes and decode explicitly so line endings are left untouched.
    data = fpath.read_bytes().decode('utf-8')

    file_contents.append(
        {
            "id": str(i),          # index key, must be a string
            "title": fpath.stem,   # file name without extension
            "content": data,
            "category": category
        }
    )

os.makedirs(json_dir, exist_ok=True)
with open(os.path.join(json_dir, json_file), "w", encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(file_contents, f, ensure_ascii=False)

### Create a JSON file for Text and Embeddings input

In [5]:
def remove_page_format(text: str, max_length: int = 7200) -> str:
    """Flatten a per-page JSON document into a single line of text.

    ``text`` is parsed as a JSON object whose values are page strings. While
    the JSON serialisation of the object is longer than ``max_length``
    characters, the most recently inserted page is dropped. The remaining page
    strings are concatenated in order and newlines are replaced by spaces.

    Args:
        text: JSON text of an object mapping page keys to page text.
        max_length: Maximum allowed length (in characters) of the JSON
            serialisation of the retained pages.

    Returns:
        The retained pages joined into one string with ``'\\n'`` replaced
        by ``' '``.

    Raises:
        ValueError: If every page had to be removed to satisfy ``max_length``.
        json.JSONDecodeError: If ``text`` is not valid JSON.
    """
    content = json.loads(text)
    content_str = json.dumps(content, ensure_ascii=False)

    while len(content_str) > max_length:
        # dict.popitem() removes the last inserted entry, i.e. the last page.
        k, _ = content.popitem()
        print(f"Removed page {k}")
        content_str = json.dumps(content, ensure_ascii=False)
        print(f"New length: {len(content_str)}")
        if not content:
            # Raise instead of calling exit(): terminating the interpreter (or
            # the notebook kernel) from a helper hides the cause from callers.
            raise ValueError(
                f"Content is empty: no page fits within max_length={max_length}"
            )

    return ''.join(content.values()).replace('\n', ' ')

In [None]:
# Generate title and content embeddings for every record.
# Load the text-only records (json_file) written by the previous step.
input_path = os.path.join(json_dir, json_file)
with open(input_path, 'r', encoding='utf-8') as file:
    input_data = json.load(file)

# Embed the title as-is and the content after flattening/trimming its pages,
# then attach both vectors to the record.
for item in input_data:
    title_embeddings = utils.generate_embeddings(item['title'], embedding_model, client)
    content_embeddings = utils.generate_embeddings(
        remove_page_format(item['content']), embedding_model, client
    )
    item.update(titleVector=title_embeddings, contentVector=content_embeddings)

# Save the enriched records to embedded_file.
output_path = os.path.join(json_dir, embedded_file)
with open(output_path, "w", encoding='utf-8') as f:
    json.dump(input_data, f, ensure_ascii=False)

### DEFINE INDEX FIELDS

In [7]:
# Client used to create or update index definitions on the search service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: a string key, three Japanese-analysed text fields, and two
# 1536-dimensional vector fields bound to the HNSW profile.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="title", type=SearchFieldDataType.String, analyzer_name="ja.microsoft"),
    SearchableField(name="content", type=SearchFieldDataType.String, analyzer_name="ja.microsoft"),
    SearchableField(
        name="category",
        type=SearchFieldDataType.String,
        analyzer_name="ja.microsoft",
        filterable=True,
    ),
    *[
        SearchField(
            name=vector_field_name,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="myHnswProfile",
        )
        for vector_field_name in ("titleVector", "contentVector")
    ],
]

# Vector search: one HNSW configuration and one exhaustive KNN configuration,
# both using cosine similarity, each exposed through its own profile.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(metric=VectorSearchAlgorithmMetric.COSINE),
)
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[
        VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw"),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
        ),
    ],
)

# Semantic configuration "default": title / category / content are the
# title, keyword and content fields respectively.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="title"),
    keywords_fields=[SemanticField(field_name="category")],
    content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(name="default", prioritized_fields=prioritized_fields)

### CREATE AN INDEX

In [None]:
# Wrap the semantic configuration in the index-level semantic settings.
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition and create it (or update it if it exists).
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

### INSERT TEXT AND VECTOR DATA

In [None]:
# Load the records (text + vectors) written by the embedding step.
with open(os.path.join(json_dir, embedded_file), 'r', encoding='utf-8') as file:
    documents = json.load(file)

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one IndexingResult per document; count the real
# successes instead of assuming that every document was accepted.
failed_results = [indexing_result for indexing_result in result if not indexing_result.succeeded]
print(f"Uploaded {len(result) - len(failed_results)} of {len(documents)} documents")
for indexing_result in failed_results:
    print(f"Failed to upload document {indexing_result.key}: {indexing_result.error_message}")

### PERFORM A HYBRID SEARCH

In [None]:
# Semantic hybrid search: keyword search + vector search on contentVector,
# re-ranked with the "default" semantic configuration.
query = "ペットボトルの投棄方法は 1 から 9 番のどれですか？"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizedQuery(
    vector=utils.generate_embeddings(query, embedding_model, client),
    k_nearest_neighbors=3,
    fields="contentVector",
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["title", "content", "category"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name="default",
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3
)

# get_answers() may return None when the service sends no answers, so fall
# back to an empty list instead of failing on iteration.
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    # Prefer the highlighted form of the answer when it is available.
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['content']}")
    print(f"Category: {result['category']}")

    # Show only the first caption, highlighted if possible.
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")

## Error Handling for Partial Content (OPTIONAL)

In [None]:
from azure.core.exceptions import HttpResponseError


def print_search_hits(hits, score_key: str, score_label: str) -> None:
    """Print title, score, content, category and first caption of each hit.

    Args:
        hits: Iterable of search result mappings.
        score_key: Result key holding the score to display.
        score_label: Label printed in front of that score.
    """
    for hit in hits:
        print(f"Title: {hit['title']}")
        print(f"{score_label}: {hit[score_key]}")
        print(f"Content: {hit['content']}")
        print(f"Category: {hit['category']}")

        # Captions are only produced by semantic queries; tolerate their absence.
        captions = hit.get("@search.captions")
        if captions:
            caption = captions[0]
            if caption.highlights:
                print(f"Caption: {caption.highlights}\n")
            else:
                print(f"Caption: {caption.text}\n")


query = "<query>"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizedQuery(
    vector=utils.generate_embeddings(query, embedding_model, client),
    k_nearest_neighbors=3,
    fields="contentVector",
)

try:
    # First attempt: semantic hybrid search with captions and answers.
    results = search_client.search(
        search_text=query,
        vector_queries=[vector_query],
        select=["title", "content", "category"],
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name="default",
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=3
    )

    # get_answers() may return None; treat that as "no answers".
    semantic_answers = results.get_answers() or []
    for answer in semantic_answers:
        if answer.highlights:
            print(f"Semantic Answer: {answer.highlights}")
        else:
            print(f"Semantic Answer: {answer.text}")
        print(f"Semantic Answer Score: {answer.score}\n")

    print_search_hits(results, "@search.reranker_score", "Reranker Score")

except HttpResponseError as e:
    if "Partial Content" in str(e):
        # Semantic ranking could not be completed: rerun as a plain
        # keyword + vector hybrid query without the semantic options.
        print("Received partial content. Fall back to vector/text hybrid...\n")

        results = search_client.search(
            search_text=query,
            vector_queries=[vector_query],
            select=["title", "content", "category"],
            top=3
        )

        # This query is not semantically re-ranked, so the value shown is the
        # ordinary search score and is labelled as such.
        print_search_hits(results, "@search.score", "Score")

    else:
        # Handle other HTTP errors
        print(f"An HTTP error occurred: {e}")

### END OF TEST

### PERFORM A HYBRID MULTI-VECTOR SEARCH

### タイトルと CONTENT をそれぞれベクトルに変換して検索する

In [None]:
# Semantic hybrid multi-vector search: the query is matched against both the
# title vectors and the content vectors, plus keyword search and re-ranking.
query = "<query>"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Embed the query once and reuse the vector for both fields; the original
# issued two identical embedding requests.
query_vector = utils.generate_embeddings(query, embedding_model, client)
vector_query_1 = VectorizedQuery(vector=query_vector, k_nearest_neighbors=3, fields="titleVector")
vector_query_2 = VectorizedQuery(vector=query_vector, k_nearest_neighbors=3, fields="contentVector")

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query_1, vector_query_2],
    select=["title", "content", "category"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name="default",
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3
)

# get_answers() may return None when the service sends no answers.
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['content']}")
    print(f"Category: {result['category']}")

    # Show only the first caption, highlighted if possible.
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")

### END OF SCRIPT