### Users Notebook
- This Notebook creates new AI Search Index
- Creates Vector Profile
- Gets the user JSON and transforms as needed
- Uploads the Users to the Index
- Generates embeddings and stores them to the Index as well


In [None]:
import json

# Define input and output file paths
input_file = '/lakehouse/default/Files/userdata.json'
output_file = '/lakehouse/default/Files/udata.json'

# Read the JSON file
with open(input_file, 'r') as file:
    user_data = json.load(file)

# Transform the Genres field from a JSON string to an actual list
for user in user_data:
    try:
        user['Genres'] = json.loads(user['Genres'])
    except json.JSONDecodeError as e:
        print(f"Invalid JSON format for Genres in user ID {user.get('id', 'unknown')}: {user['Genres']}. Error: {e}")
        user['Genres'] = []  # Set to an empty list or handle it as per your requirement

# Write the transformed data to a new JSON file
with open(output_file, 'w') as file:
    json.dump(user_data, file, indent=4)

print(f"Transformed data saved to {output_file}")


In [1]:
import openai
import json
import time
import os
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField, SearchFieldDataType, SearchField, SearchIndex,
    VectorSearch, HnswAlgorithmConfiguration, VectorSearchProfile,
    SemanticConfiguration, SemanticPrioritizedFields, SemanticField, SemanticSearch
)
from azure.core.credentials import AzureKeyCredential
import concurrent.futures
from azure.search.documents import SearchIndexingBufferedSender

# Initialize Azure OpenAI and Search clients
openai.api_type = "azure"
openai.api_base = "https://dev-oai-kpass.openai.azure.com/"
openai.api_version = "2024-02-01"
openai.api_key = "18a8d8f39a794c169130055ac2c2ff7d"

search_service_name = "azaivztqx"
search_index_name = "users-index"
admin_key = "UvNc9RS47BkkZi0Hz7XPdSkpvi9QXDuqbg6rrejGw5AzSeBxWhxe"
endpoint = f"https://{search_service_name}.search.windows.net"

# Initialize the search client
search_client = SearchClient(endpoint=endpoint, index_name=search_index_name, credential=AzureKeyCredential(admin_key))

# Define the initial user schema without the Embedding field
user_fields = [
    SimpleField(name="UserId", type=SearchFieldDataType.String, key=True, retrievable=True, filterable=True),
    SimpleField(name="Age", type=SearchFieldDataType.Int32, retrievable=True, filterable=True),
    SearchField(name="Genres", type=SearchFieldDataType.Collection(SearchFieldDataType.String), retrievable=True, filterable=True, facetable=True, searchable=True)
]

# Define vector search configurations
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="userHnsw",
            parameters={
                "m": 8,
                "efConstruction": 800,
                "efSearch": 800,
                "metric": "cosine"
            }
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="userHnswProfile",
            algorithm_configuration_name="userHnsw",
        )
    ]
)

# Define semantic configuration
semantic_configurations = SemanticConfiguration(
    name="users-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name=""),
        keywords_fields=[SemanticField(field_name="Genres")],
        content_fields=[SemanticField(field_name="searchContent")]
    )
)

# Create the initial index schema
index = SearchIndex(
    name=search_index_name,
    fields=user_fields,
    vector_search=vector_search,
    semantic_configurations=[semantic_configurations]
)

# Create a search index client
index_client = SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(admin_key))

# Create or update the index
result = index_client.create_or_update_index(index)
print(f'Index {result.name} created or updated successfully')

# Add the Embedding and searchContent fields to the existing index
additional_fields = [
    SearchField(name="Embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), retrievable=True, searchable=True, vector_search_dimensions=1536, vector_search_profile_name="userHnswProfile"),
    SearchField(name="searchContent", type=SearchFieldDataType.String, searchable=True)
]

# Retrieve the existing index
existing_index = index_client.get_index(search_index_name)
existing_index.fields.extend(additional_fields)

# Update the index with the new fields
result = index_client.create_or_update_index(existing_index)
print(f'Index {result.name} updated with additional fields')

# Load user data from JSON file
input_file = '/lakehouse/default/Files/udata.json'
with open(input_file, 'r') as file:
    user_data = json.load(file)

# Lists to hold valid and invalid documents
valid_documents = []
invalid_documents = []

# Validate each document
for doc in user_data:
    valid = True

    if 'UserId' in doc:
        try:
            doc['UserId'] = str(doc['UserId'])
        except ValueError:
            print(f"Invalid value for UserId in document ID {doc['UserId']}: {doc['UserId']}")
            invalid_documents.append(doc)
            valid = False
    
    # Validate Genres is a list of strings
    if 'Genres' in doc:
        if isinstance(doc['Genres'], str):
            try:
                doc['Genres'] = json.loads(doc['Genres'])
            except json.JSONDecodeError:
                print(f"Invalid JSON format for Genres in document ID {doc['UserId']}: {doc['Genres']}")
                invalid_documents.append(doc)
                valid = False
        elif isinstance(doc['Genres'], list):
            if not all(isinstance(genre, str) for genre in doc['Genres']):
                print(f"Unexpected format for Genres in document ID {doc['UserId']}: {doc['Genres']}")
                invalid_documents.append(doc)
                valid = False
        else:
            print(f"Unexpected format for Genres in document ID {doc['UserId']}: {doc['Genres']}")
            invalid_documents.append(doc)
            valid = False
    
    if valid:
        valid_documents.append(doc)

# Log the number of valid and invalid documents
print(f"Valid documents: {len(valid_documents)}")
print(f"Invalid documents: {len(invalid_documents)}")

# Upload valid documents to the Azure Search index
if valid_documents:
    result = search_client.upload_documents(documents=valid_documents)
    print(f"Uploaded {len(valid_documents)} documents to the Azure Search index. Results: {result}")
else:
    print("No valid documents to upload.")

# Function to generate embeddings for a batch of texts
def generate_embeddings_batch(texts, max_retries=7, backoff_factor=2):
    embeddings = []
    for text in texts:
        for attempt in range(max_retries):
            try:
                response = openai.Embedding.create(input=text, engine="text-embedding-ada-002")  # Use the correct deployment ID
                embeddings.append(response['data'][0]['embedding'])
                break
            except openai.error.RateLimitError as e:
                if attempt < max_retries - 1:
                    wait_time = backoff_factor * (2 ** attempt)
                    print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print("Max retries exceeded. Please try again later.")
                    raise e
        time.sleep(1)  # Add a delay between individual requests to reduce aggressiveness
    return embeddings

# Function to process documents in parallel
def process_documents(documents, batch_size=5, max_workers=8):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for i in range(0, len(documents), batch_size):
            batch = documents[i:i + batch_size]
            texts = [f"{' '.join(doc['Genres'])} {doc['Age']}" for doc in batch]
            future = executor.submit(generate_embeddings_batch, texts)
            futures[future] = (batch, texts, i)

        for future in concurrent.futures.as_completed(futures):
            embeddings = future.result()
            batch, texts, start_index = futures[future]
            for j, embedding in enumerate(embeddings):
                documents[start_index + j]['Embedding'] = embedding
                documents[start_index + j]['searchContent'] = texts[j]

# Generate embeddings for documents
process_documents(valid_documents)

# Ensure the output directory exists
output_dir = "/lakehouse/default/Files/embeddings"
os.makedirs(output_dir, exist_ok=True)

# Save the documents with embeddings to a JSON file in the lakehouse
output_file = os.path.join(output_dir, "userVectors.json")
with open(output_file, 'w') as file:
    json.dump(valid_documents, file)

print(f"Documents with embeddings saved to {output_file}")

# Upload the documents with embeddings to the index
try:
    with SearchIndexingBufferedSender(
        endpoint=endpoint,
        index_name=search_index_name,
        credential=AzureKeyCredential(admin_key),
    ) as batch_client:
        with open(output_file, 'r') as file:
            documents = json.load(file)
            batch_client.upload_documents(documents=documents)

    print(f"Uploaded {len(documents)} documents in total")
except Exception as e:
    print(f"Error uploading documents: {e}")


StatementMeta(, f33da179-175f-4cec-b466-d235b2f2978e, 5, Finished, Available, Finished)

Index users-index created or updated successfully
Index users-index updated with additional fields
Valid documents: 5
Invalid documents: 0
Uploaded 5 documents to the Azure Search index. Results: [<azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7bdc74dc3670>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7bdc74dc38b0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7bdc74dc3910>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7bdc74dc3970>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7bdc74dc39d0>]
Documents with embeddings saved to /lakehouse/default/Files/embeddings/userVectors.json
Uploaded 5 documents in total
