In [None]:
# Prepare user data, generate text embeddings with Azure OpenAI,
# and upload the resulting documents to an Azure Search index.


In [None]:
import json

# Define input and output file paths
input_file = '/lakehouse/default/Files/userdata.json'
output_file = '/lakehouse/default/Files/udata.json'

# Read the JSON file. The encoding is explicit so decoding does not depend on
# the platform's default locale.
with open(input_file, 'r', encoding='utf-8') as file:
    user_data = json.load(file)

# Transform the Genres field from a JSON string to an actual list.
# Values that are not strings (for example a list that was already decoded)
# are left untouched instead of making json.loads raise a TypeError.
for user in user_data:
    genres = user['Genres']
    if isinstance(genres, str):
        user['Genres'] = json.loads(genres)

# Write the transformed data to a new JSON file
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(user_data, file, indent=4)

print(f"Transformed data saved to {output_file}")


In [None]:
import openai
import os
import json
import time
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
import concurrent.futures

# Initialize Azure OpenAI and Search clients.
# Secrets are read from environment variables so that keys are never stored in
# the notebook itself; non-secret identifiers keep their previous values as
# defaults and can be overridden the same way.
def _require_env(name):
    """Return the value of environment variable `name`, failing fast if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set before running this cell")
    return value


openai.api_type = "azure"
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://xxxxxxxxxxxxxxxxxx.openai.azure.com/")
openai.api_version = "2024-02-01"
openai.api_key = _require_env("AZURE_OPENAI_API_KEY")

search_service_name = os.environ.get("AZURE_SEARCH_SERVICE_NAME", "aiskzzgq")
search_index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME", "xxx2222-index")
admin_key = _require_env("AZURE_SEARCH_ADMIN_KEY")
endpoint = f"https://{search_service_name}.search.windows.net"

# Initialize the search client
search_client = SearchClient(endpoint=endpoint, index_name=search_index_name, credential=AzureKeyCredential(admin_key))

# Load user data from JSON file (explicit encoding so decoding does not depend
# on the platform default)
input_file = '/lakehouse/default/Files/udata.json'
with open(input_file, 'r', encoding='utf-8') as file:
    user_data = json.load(file)

# Function to generate embeddings for a batch of texts
def generate_embeddings_batch(texts, max_retries=7, backoff_factor=2,
                              engine="text-embedding-ada-002", request_delay=1):
    """Return one embedding per entry of ``texts``, in the same order.

    Each text is sent in its own ``openai.Embedding.create`` request. A request
    that fails with ``openai.error.RateLimitError`` is retried after waiting
    ``backoff_factor * 2 ** attempt`` seconds.

    Args:
        texts: Iterable of strings to embed.
        max_retries: Maximum number of attempts per text. Values below 1 are
            treated as 1 so every text is attempted at least once and the
            result always lines up with ``texts``.
        backoff_factor: Multiplier, in seconds, for the exponential wait
            between retries.
        engine: Deployment name passed to ``openai.Embedding.create``.
        request_delay: Seconds to sleep after each text to pace requests.

    Returns:
        list: ``response['data'][0]['embedding']`` for every text.

    Raises:
        openai.error.RateLimitError: If a text is still rate limited on its
            final attempt.
    """
    # Guard against max_retries <= 0, which would otherwise skip every request
    # and silently return fewer embeddings than texts.
    attempts = max(1, max_retries)
    embeddings = []
    for text in texts:
        for attempt in range(attempts):
            try:
                response = openai.Embedding.create(input=text, engine=engine)
            except openai.error.RateLimitError:
                if attempt == attempts - 1:
                    print("Max retries exceeded. Please try again later.")
                    raise  # bare raise keeps the original traceback
                wait_time = backoff_factor * (2 ** attempt)
                print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                embeddings.append(response['data'][0]['embedding'])
                break
        time.sleep(request_delay)  # Add a delay between individual requests to reduce aggressiveness
    return embeddings

# Function to process documents in parallel
def process_documents(documents, batch_size=5, max_workers=8):
    """Attach an embedding and its source text to every document, in place.

    The documents are cut into consecutive slices of ``batch_size``. For each
    slice, one text per document is built from its ``Genres`` entries joined
    by spaces followed by its ``Age``, and the slice's texts are handed to
    ``generate_embeddings_batch`` on a thread pool of ``max_workers`` threads.
    As each slice finishes, the returned vectors are stored under
    ``contentVector`` and the matching texts under ``searchContent``.

    Args:
        documents: List of dicts that each provide ``Genres`` and ``Age``.
        batch_size: Number of documents submitted per worker task.
        max_workers: Size of the thread pool.

    Returns:
        None. ``documents`` is modified in place.
    """
    pending = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit one task per slice, remembering where the slice starts.
        for offset in range(0, len(documents), batch_size):
            chunk_texts = [
                "{} {}".format(" ".join(entry['Genres']), entry['Age'])
                for entry in documents[offset:offset + batch_size]
            ]
            task = pool.submit(generate_embeddings_batch, chunk_texts)
            pending[task] = (offset, chunk_texts)

        # Write results back in whatever order the tasks complete.
        for task in concurrent.futures.as_completed(pending):
            vectors = task.result()
            offset, chunk_texts = pending[task]
            for position, vector in enumerate(vectors):
                target = documents[offset + position]
                target['contentVector'] = vector
                target['searchContent'] = chunk_texts[position]

# Generate embeddings for documents
process_documents(user_data)

# Ensure the output directory exists
output_dir = "/lakehouse/default/Files/embeddings"
os.makedirs(output_dir, exist_ok=True)

# Save the documents with embeddings to a JSON file in the lakehouse
output_file = os.path.join(output_dir, "userVectors.json")
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(user_data, file)

print(f"Documents with embeddings saved to {output_file}")

# Upload the documents with embeddings to the index.
# Using SearchIndexingBufferedSender to upload the documents in batches optimized for indexing
from azure.search.documents import SearchIndexingBufferedSender

# Read back the file that was just written so the upload matches what was persisted.
with open(output_file, 'r', encoding='utf-8') as file:
    documents = json.load(file)

# The buffered sender reports per-document indexing failures through the
# on_error callback; collect them so the final message reflects what was indexed.
failed_actions = []

try:
    with SearchIndexingBufferedSender(
        endpoint=endpoint,
        index_name=search_index_name,
        credential=AzureKeyCredential(admin_key),
        on_error=failed_actions.append,
    ) as batch_client:
        batch_client.upload_documents(documents=documents)
except Exception as e:
    print(f"Error uploading documents: {e}")
    # Re-raise so a failed upload stops the notebook instead of looking like success.
    raise

if failed_actions:
    print(f"Uploaded {len(documents) - len(failed_actions)} of {len(documents)} documents; {len(failed_actions)} failed")
else:
    print(f"Uploaded {len(documents)} documents in total")
