### New Users Update notebook
- This notebook updates the Users index whenever a new user registers and the Users table in Azure SQL is updated
- The notebook is run by the data pipeline, which is started by a CDC trigger and fetches the Users table
- The notebook compares the fetched users against the current Users index, uploads only the new or changed users, and creates and saves their embeddings

In [None]:
import json

# Define input and output file paths
input_file = '/lakehouse/default/Files/userdata.json'
output_file = '/lakehouse/default/Files/udata.json'

# Read the JSON file
with open(input_file, 'r') as file:
    user_data = json.load(file)

# Transform the Genres field from a JSON string to an actual list
for user in user_data:
    raw_genres = user.get('Genres')
    if isinstance(raw_genres, list):
        # Already decoded (e.g. the export was re-run on transformed data);
        # json.loads would raise TypeError on a list, so leave it untouched.
        continue
    try:
        user['Genres'] = json.loads(raw_genres)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a missing or null Genres value, which json.loads
        # rejects before it ever attempts to parse.
        # The rest of the notebook keys users on 'UserId'; 'id' is kept as a
        # fallback so the message stays useful for either schema.
        user_id = user.get('UserId', user.get('id', 'unknown'))
        print(f"Invalid JSON format for Genres in user ID {user_id}: {raw_genres}. Error: {e}")
        user['Genres'] = []  # Set to an empty list or handle it as per your requirement

# Write the transformed data to a new JSON file
with open(output_file, 'w') as file:
    json.dump(user_data, file, indent=4)

print(f"Transformed data saved to {output_file}")

In [None]:
import openai
import json
import time
import os
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
import concurrent.futures
from azure.search.documents import SearchIndexingBufferedSender

# Configuration
# Credentials and endpoints are taken from environment variables when they
# are set, so real keys never have to be written into the notebook. The
# placeholder literals remain as the fallback values.
openai.api_type = "azure"
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://xxxxx.openai.azure.com/")
openai.api_version = "2024-02-01"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY", "xxxxxxxx")
deployment_id = "text-embedding-ada-002"

search_service_name = os.environ.get("AZURE_SEARCH_SERVICE_NAME", "xxxxx")
search_index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME", "xxxxx-index")
admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "xxxxxxxx")
endpoint = f"https://{search_service_name}.search.windows.net"

# Initialize the search client
search_client = SearchClient(endpoint=endpoint, index_name=search_index_name, credential=AzureKeyCredential(admin_key))

# Step 1: Fetch existing user IDs and their data from the search index
def get_existing_users():
    """Return the indexed users keyed by their ``UserId``.

    Runs a match-all ("*") query against the module-level ``search_client``,
    selecting only the ``UserId``, ``Genres`` and ``Age`` fields.

    Returns:
        dict mapping each result's ``UserId`` to the result itself. If the
        same ``UserId`` is returned more than once, the last result wins.
    """
    hits = search_client.search("*", select=["UserId", "Genres", "Age"], include_total_count=True)
    return {hit["UserId"]: hit for hit in hits}

# Snapshot the index first so the incoming users can be compared against it.
existing_users = get_existing_users()
print(f"Fetched {len(existing_users)} existing users from the index.")

# Step 2: Read the user export that the Genres-transform cell wrote out.
input_file = '/lakehouse/default/Files/udata.json'
with open(input_file, 'r') as users_fh:
    user_data = json.loads(users_fh.read())

# Step 3: Validate each document and separate valid and invalid documents
valid_documents = []
invalid_documents = []


def validate_documents(user_data):
    """Sort documents into ``valid_documents`` / ``invalid_documents``.

    Each document is normalised in place: ``UserId`` (when present) is
    converted to ``str`` and a JSON-encoded ``Genres`` string is decoded.
    A document is invalid when ``Genres`` is present but is not, after any
    decoding, a list of strings. Documents without a ``Genres`` key are
    accepted unchanged.

    Args:
        user_data: iterable of dict documents; the dicts are mutated in place.

    Returns:
        None. Results are appended to the module-level ``valid_documents``
        and ``invalid_documents`` lists; each document lands in exactly one.
    """
    for doc in user_data:
        if 'UserId' in doc:
            # str() cannot fail for values that came out of JSON, so no
            # error handling is needed around the conversion.
            doc['UserId'] = str(doc['UserId'])
        # Use a placeholder in messages so a missing UserId cannot raise.
        doc_id = doc.get('UserId', 'unknown')

        problem = None
        if 'Genres' in doc:
            genres = doc['Genres']
            if isinstance(genres, str):
                try:
                    genres = json.loads(genres)
                except json.JSONDecodeError:
                    problem = "Invalid JSON format"
                else:
                    doc['Genres'] = genres
            # Decoded JSON can be any type (dict, number, ...), so the
            # list-of-strings check applies to decoded values as well.
            if problem is None and not (
                isinstance(genres, list)
                and all(isinstance(genre, str) for genre in genres)
            ):
                problem = "Unexpected format"

        if problem is None:
            valid_documents.append(doc)
        else:
            print(f"{problem} for Genres in document ID {doc_id}: {doc['Genres']}")
            invalid_documents.append(doc)

# Validate the loaded users, then report how many landed in each bucket.
validate_documents(user_data)

print(
    f"Valid documents: {len(valid_documents)}",
    f"Invalid documents: {len(invalid_documents)}",
    sep="\n",
)

# Step 4: Filter out users who already exist and have the same data
def filter_new_or_updated_users(existing_users, valid_documents):
    """Return the documents that are new or differ from the indexed copy.

    A document is skipped only when its ``UserId`` is already a key of
    ``existing_users`` and both ``Genres`` and ``Age`` compare equal to the
    indexed values. Missing ``Genres`` / ``Age`` keys are read as ``None``
    instead of raising ``KeyError``, so a document lacking a field that the
    index holds is treated as changed.

    Args:
        existing_users: mapping of UserId -> indexed document (dict-like).
        valid_documents: iterable of candidate dict documents.

    Returns:
        list of the documents to (re)process, in their original order.
    """
    new_or_updated_users = []
    for doc in valid_documents:
        user_id = doc.get('UserId')
        if user_id in existing_users:
            existing_doc = existing_users[user_id]
            unchanged = (
                doc.get('Genres') == existing_doc.get('Genres')
                and doc.get('Age') == existing_doc.get('Age')
            )
            if unchanged:
                continue  # Skip users whose data hasn't changed
        new_or_updated_users.append(doc)
    return new_or_updated_users

# Keep only the users that are absent from the index or whose data changed.
new_users = filter_new_or_updated_users(
    existing_users=existing_users,
    valid_documents=valid_documents,
)
print(f"Found {len(new_users)} new or updated users to process.")

# Step 5: Upload the new or updated users to the Azure Search index
def upload_documents_to_index(documents):
    """Upload ``documents`` to the search index and report the outcome.

    Best-effort: any exception raised by the upload is printed rather than
    propagated. The per-document results returned by the service are
    inspected so that rejected documents are reported instead of being
    counted as uploaded.

    Args:
        documents: list of dict documents; an empty list is a no-op.

    Returns:
        None.
    """
    if not documents:
        print("No new or updated documents to upload.")
        return

    try:
        results = search_client.upload_documents(documents=documents)
        # A batch can be accepted while individual documents are rejected,
        # so count successes from the per-document results.
        failed = [item for item in results if not item.succeeded]
        print(f"Uploaded {len(results) - len(failed)} documents to the Azure Search index.")
        for item in failed:
            print(f"Failed to upload document {item.key}: {item.error_message}")
    except Exception as e:
        print(f"Error uploading documents: {e}")

# NOTE(review): this uploads the documents before any embeddings exist; the
# same documents are uploaded again, with embeddings, at the end of this cell.
# If embedding generation fails in between, the index would hold the new
# Genres/Age without an embedding, and the next run's filter compares only
# Genres and Age, so it may skip those users as unchanged. Confirm whether
# this early upload is actually needed.
upload_documents_to_index(new_users)



# Step 4: Generate embeddings for new or updated users
def generate_embeddings_batch(texts, max_retries=7, backoff_factor=2):
    """Generate one embedding per text, retrying on rate limits.

    Each text is sent to the Azure OpenAI deployment named by the
    module-level ``deployment_id``. A rate-limit error triggers an
    exponential back-off of ``backoff_factor * 2**attempt`` seconds before
    the next attempt.

    Args:
        texts: iterable of strings to embed.
        max_retries: maximum number of attempts per text.
        backoff_factor: base multiplier, in seconds, for the back-off delay.

    Returns:
        list of embeddings, exactly one per input text and in the same order.

    Raises:
        RuntimeError: if a text is still rate-limited after ``max_retries``
            attempts. Previously such a text was silently skipped, which
            left the result shorter than, and misaligned with, ``texts``.
        Exception: any other error from the embedding call is printed and
            re-raised unchanged.
    """
    embeddings = []
    for text in texts:
        for attempt in range(max_retries):
            try:
                response = openai.Embedding.create(input=text, engine=deployment_id)
                embeddings.append(response['data'][0]['embedding'])
                break
            except openai.error.RateLimitError:
                wait_time = backoff_factor * (2 ** attempt)
                print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            except Exception as e:
                print(f"Error generating embedding: {e}")
                raise
        else:
            # The loop ended without a break: every attempt was rate-limited.
            raise RuntimeError(
                f"Rate limit retries exhausted after {max_retries} attempts; "
                f"no embedding generated for text: {text!r}"
            )
        # Short pause between texts to stay under the service's rate limit.
        time.sleep(0.5)
    return embeddings

def process_documents(documents, batch_size=5, max_workers=8):
    """Attach an embedding and its source text to every document, in place.

    Documents are split into batches of ``batch_size``. Each batch's texts
    ("<genres joined by spaces> <age>") are embedded on a thread pool via
    ``generate_embeddings_batch``, and the results are written back to the
    documents as ``Embedding`` and ``searchContent``.

    Args:
        documents: list of dict documents with ``Genres`` (list of str) and
            ``Age`` keys; mutated in place.
        batch_size: number of documents per embedding task.
        max_workers: size of the thread pool.

    Returns:
        None.

    Raises:
        ValueError: if a batch returns a different number of embeddings than
            it has documents. Pairing them anyway would attach embeddings to
            the wrong documents.
        Exception: anything raised inside an embedding task is re-raised
            here by ``future.result()``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            texts = [f"{' '.join(doc['Genres'])} {doc['Age']}" for doc in batch]
            future = executor.submit(generate_embeddings_batch, texts)
            futures[future] = (batch, texts)

        for future in concurrent.futures.as_completed(futures):
            embeddings = future.result()
            batch, texts = futures[future]
            if len(embeddings) != len(batch):
                raise ValueError(
                    f"Expected {len(batch)} embeddings for the batch but got {len(embeddings)}"
                )
            for doc, text, embedding in zip(batch, texts, embeddings):
                # Store embeddings in the document
                doc['Embedding'] = embedding
                doc['searchContent'] = text

# Generate embeddings only when the filter left something to process.
if new_users:
    process_documents(new_users)


# Persist the enriched users under the embeddings folder, creating the
# folder first if it does not exist yet.
output_dir = "/lakehouse/default/Files/embeddings"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "updated_users.json")

with open(output_file, 'w') as out_fh:
    json.dump(new_users, out_fh, indent=2)

print(f"Updated users with embeddings saved to {output_file}")


# Push the saved documents (read back from the file just written) to the
# index through a buffered sender; any failure is printed, not raised.
try:
    sender = SearchIndexingBufferedSender(
        endpoint=endpoint,
        index_name=search_index_name,
        credential=AzureKeyCredential(admin_key),
    )
    with sender as batch_client:
        with open(output_file, 'r') as in_fh:
            documents = json.load(in_fh)
        batch_client.upload_documents(documents=documents)

    print(f"Uploaded {len(documents)} documents in total")
except Exception as e:
    print(f"Error uploading documents: {e}")
