**This notebook updates the search index**
- **Connect to the Azure AI Search index**
- **Configure vector search and update the index**

In [None]:
# %pip install azure-search-documents azure-search azure-core openai==0.28

In [None]:
import os

import openai
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField, SearchFieldDataType, SearchField,
    VectorSearch, HnswAlgorithmConfiguration, VectorSearchProfile,
    SemanticConfiguration, SemanticPrioritizedFields, SemanticField, SemanticSearch
)

# Configuration
# Endpoints and keys are read from environment variables when they are set, so
# real secrets never have to be saved inside the notebook. The literals are
# placeholder fallbacks only.
openai.api_type = "azure"
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://xxxxx.openai.azure.com/")
# NOTE(review): "2023-11-01" looks like an Azure AI Search API version rather
# than an Azure OpenAI one. No OpenAI request is made in this cell - confirm.
openai.api_version = "2023-11-01"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY", "xxxxxx")

search_service_name = os.environ.get("AZURE_SEARCH_SERVICE_NAME", "xxxxxx")
search_index_name = "books-index"
admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "xxxxxxx")
endpoint = f"https://{search_service_name}.search.windows.net"

# Create a search index client
index_client = SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(admin_key))

# Retrieve the existing index
existing_index = index_client.get_index(search_index_name)

# Define new fields if necessary: two 1536-dimensional vector fields bound to
# the "myHnswProfile" profile, plus the text field the first vector is built from.
new_fields = [
    SearchField(name="Embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
    SearchField(name="searchContent", type=SearchFieldDataType.String, searchable=True),
    SearchField(name="DescriptionEmbedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
]

# Add only the fields the index does not have yet, so re-running the cell is a
# no-op for the field list. The names are collected once instead of per iteration.
fields = existing_index.fields
existing_field_names = {field.name for field in fields}
for new_field in new_fields:
    if new_field.name not in existing_field_names:
        fields.append(new_field)
        existing_field_names.add(new_field.name)

# Configure the vector search: one HNSW algorithm configuration and one profile
# that points at it. The profile name must match the one used by the vector fields.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters={
                "m": 8,
                "efConstruction": 800,
                "efSearch": 800,
                "metric": "cosine"
            }
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        )
    ]
)

# Define semantic configuration
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="Title"),
        keywords_fields=[SemanticField(field_name="Genres")],
        content_fields=[SemanticField(field_name="searchContent"), SemanticField(field_name="Description")]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Update the search index with the new fields and configurations.
# The vector and semantic settings assigned here replace whatever the
# retrieved index definition carried in those two attributes.
existing_index.fields = fields
existing_index.vector_search = vector_search
existing_index.semantic_search = semantic_search

result = index_client.create_or_update_index(existing_index)
print(f'Index {result.name} updated successfully')


- **Create embeddings (with retry and backoff on rate limits)**
- **Get Embeddings as JSON**
- **Upload to Azure AI Search**
- **Batch - Parallel**

In [None]:
import openai
import json
import time
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchIndexingBufferedSender
import concurrent.futures
import os

# Configuration
# Endpoints and keys are read from environment variables when they are set, so
# real secrets never have to be saved inside the notebook. The literals are
# placeholder fallbacks only.
openai.api_type = "azure"
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://xxxxx.openai.azure.com/")
openai.api_version = "2024-05-01-preview"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY", "xxxxxxx")
deployment_id = "text-embedding-ada-002"

search_service_name = os.environ.get("AZURE_SEARCH_SERVICE_NAME", "xxxxxx")
search_index_name = "books-index"
admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "xxxxxxxx")
endpoint = f"https://{search_service_name}.search.windows.net"

# Initialize the search client
search_client = SearchClient(endpoint=endpoint, index_name=search_index_name, credential=AzureKeyCredential(admin_key))

# Fetch all documents from the search index.
# Each search result carries "@search.*" entries (score, highlights, captions)
# next to the index fields. They are search metadata, not document fields, so
# they are dropped here to keep them out of the saved JSON and the later upload.
results = search_client.search(search_text="*", include_total_count=True)
documents = [
    {key: value for key, value in doc.items() if not key.startswith("@search.")}
    for doc in results
]

def generate_embeddings_batch(texts, max_retries=7, backoff_factor=2):
    """Embed each text with the configured deployment, one request per text.

    Args:
        texts: Iterable of strings to embed.
        max_retries: Maximum number of attempts per text (must be >= 1).
        backoff_factor: Base of the exponential wait after a rate-limit error;
            attempt ``k`` (0-based) waits ``backoff_factor * 2 ** k`` seconds.

    Returns:
        A list with one embedding per input text, in input order.

    Raises:
        ValueError: If ``max_retries`` is below 1. Without this check the retry
            loop would never run and the result would silently come back
            shorter than ``texts``.
        openai.error.RateLimitError: If a text is still rate limited on its
            last attempt.
        Exception: Any other error from the embedding call is printed and
            re-raised unchanged.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    embeddings = []
    for text in texts:
        for attempt in range(max_retries):
            try:
                response = openai.Embedding.create(input=[text], deployment_id=deployment_id)
                embeddings.append(response['data'][0]['embedding'])
                break
            except openai.error.RateLimitError:
                if attempt < max_retries - 1:
                    wait_time = backoff_factor * (2 ** attempt)
                    print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print("Max retries exceeded. Please try again later.")
                    # Bare raise keeps the original traceback intact.
                    raise
            except Exception as e:
                print(f"Unexpected error: {e}")
                raise
        time.sleep(1)  # Add a delay between individual requests to reduce aggressiveness
    return embeddings

def _field_text(doc, key):
    """Return ``doc[key]`` as text, mapping a missing or null value to ''."""
    value = doc.get(key)
    return "" if value is None else str(value)


def process_documents(documents, batch_size=5, max_workers=8):
    """Attach embeddings and their source text to ``documents`` in place.

    For every document two texts are built: a combined "searchContent" string
    (title, author, genres, rating) and the description. Each batch submits one
    embedding job per field to a thread pool, and the results are written back as:

    * ``Embedding`` + ``searchContent``
    * ``DescriptionEmbedding`` + ``Description``

    Null field values are rendered as an empty string instead of the literal
    "None". Blank texts are not sent to the embedding call, because they carry
    nothing to embed and one bad input would fail the whole batch; those
    documents are left without that embedding.

    Args:
        documents: List of dict-like documents; mutated in place.
        batch_size: Number of documents per embedding job.
        max_workers: Size of the thread pool.

    Returns:
        None. A failed job is reported with its batch index and field name and
        skipped (best effort); the other jobs are still applied.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = {}
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]

            # Create searchContent text
            search_texts = [
                f"{_field_text(doc, 'Title')} {_field_text(doc, 'Author')} "
                f"{_field_text(doc, 'Genres')} Rating: {_field_text(doc, 'Rating')}"
                for doc in batch
            ]
            description_texts = [_field_text(doc, 'Description') for doc in batch]

            # Generate embeddings for both searchContent and Description in parallel
            jobs = (
                ('Embedding', 'searchContent', search_texts),
                ('DescriptionEmbedding', 'Description', description_texts),
            )
            for field_name, text_key, texts in jobs:
                # Keep each text paired with its document so results can be
                # written back even though blank texts are filtered out.
                to_embed = [(doc, text) for doc, text in zip(batch, texts) if text.strip()]
                if not to_embed:
                    continue
                future = executor.submit(generate_embeddings_batch, [text for _, text in to_embed])
                pending[future] = (field_name, text_key, to_embed, start)

        # Process the completed futures
        for future in concurrent.futures.as_completed(pending):
            field_name, text_key, to_embed, start = pending[future]
            try:
                embeddings = future.result()
            except Exception as e:
                print(f"Error processing batch starting at index {start} for {field_name}: {e}")
                continue

            print(f"Processing batch starting at index {start} for {field_name}")
            for (doc, text), embedding in zip(to_embed, embeddings):
                # Store the embedding together with the exact text it was built from
                doc[field_name] = embedding
                doc[text_key] = text
  

# Process documents to generate embeddings (mutates `documents` in place)
process_documents(documents)

# Ensure the output directory exists
output_dir = "/lakehouse/default/Files/embeddings"
os.makedirs(output_dir, exist_ok=True)

# Save the documents with embeddings to a JSON file in the lakehouse
output_file = os.path.join(output_dir, "bookVectors.json")
with open(output_file, 'w') as file:
    json.dump(documents, file, indent=2)

print(f"Documents with embeddings saved to {output_file}")

# Reload the saved file so the upload sends exactly what was persisted. The
# file is read through `output_file` (no second copy of the path) and closed
# before any network call starts.
with open(output_file, 'r') as file:
    documents = json.load(file)

# Collect the actions the sender gives up on, so the summary below reports
# real numbers instead of assuming every document was indexed.
failed_actions = []

# Using SearchIndexingBufferedSender to upload the documents in batches optimized for indexing
# NOTE(review): the "upload" action replaces whole documents; fields that were
# not returned by the earlier search would be reset - confirm that all index
# fields are retrievable.
with SearchIndexingBufferedSender(
    endpoint=endpoint,
    index_name=search_index_name,
    credential=AzureKeyCredential(admin_key),
    on_error=failed_actions.append,
) as batch_client:
    # Add upload actions for all documents; leaving the block flushes the queue
    batch_client.upload_documents(documents=documents)

if failed_actions:
    print(f"Failed to upload {len(failed_actions)} of {len(documents)} documents")
print(f"Uploaded {len(documents) - len(failed_actions)} documents in total")
