In [None]:
#! pip install numpy
! pip install python-dotenv
! pip install azure-core
! pip install azure-cosmos
! pip install openai
! pip install azure-identity

In [6]:
import json
import re
import random
import string
import uuid
from dotenv import dotenv_values

# Azure Identity imports
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Cosmos DB imports
from azure.cosmos import CosmosClient
from azure.cosmos.aio import CosmosClient as CosmosClientAsync
from azure.cosmos import PartitionKey

# Azure OpenAI imports
from openai import AzureOpenAI


In [7]:

# Name of the .env file that holds every setting used by this notebook
env_name = "my-config.env"  #"config.env"
config = dotenv_values(env_name)

# Fail fast with a single, readable error that lists every setting that is
# absent, instead of a bare KeyError on whichever lookup happens to run first
# (which is also what a wrong or missing .env file would otherwise look like).
_required_settings = (
    'openai_endpoint',
    'openai_api_version',
    'openai_key',
    'openai_completions_model',
    'openai_completions_deployment',
    'openai_embeddings_model',
    'openai_embeddings_deployment',
    'openai_embeddings_dimensions',
    'cosmos_uri',
    'cosmos_key',
    'cosmos_database',
    'cosmos_customer_container',
    'cosmos_customer_partition_key_property',
    'cosmos_customer_vector_property',
    'output_customer_file_name',
)
_missing_settings = [name for name in _required_settings if name not in config]
if _missing_settings:
    raise KeyError(
        f"Settings missing from '{env_name}' (does the file exist?): "
        + ", ".join(_missing_settings)
    )

# OpenAI configuration
OPENAI_API_ENDPOINT = config['openai_endpoint']
OPENAI_API_VERSION = config['openai_api_version'] # at the time of authoring, the api version is 2024-02-01
OPENAI_KEY = config['openai_key']
COMPLETIONS_MODEL = config['openai_completions_model']
COMPLETIONS_MODEL_DEPLOYMENT = config['openai_completions_deployment']
EMBEDDING_MODEL = config['openai_embeddings_model']
EMBEDDING_MODEL_DEPLOYMENT = config['openai_embeddings_deployment']
# Stored as text in the .env file; converted once here so later cells can use it as a number
EMBEDDING_DIMENSIONS = int(config['openai_embeddings_dimensions'])

# Azure Cosmos DB configuration
COSMOS_ENDPOINT = config['cosmos_uri']
COSMOS_KEY = config['cosmos_key']
COSMOS_DATABASE = config['cosmos_database']
COSMOS_CUSTOMER_CONTAINER = config['cosmos_customer_container']
COSMOS_CUSTOMER_PARTITION_KEY_PROPERTY = config['cosmos_customer_partition_key_property']
COSMOS_CUSTOMER_VECTOR_PROPERTY = config['cosmos_customer_vector_property']

# Output json file
OUTPUT_CUSTOMER_FILE_NAME = config['output_customer_file_name']


In [8]:
# Azure OpenAI client, authenticated with the API key read from the configuration
AOAI_client = AzureOpenAI(
    api_key=OPENAI_KEY,
    api_version=OPENAI_API_VERSION,
    azure_endpoint=OPENAI_API_ENDPOINT,
)


In [9]:
def generate_embeddings(text):
    """
    Turn a string of text into an embedding vector.

    The text is sent to the embeddings deployment named by
    EMBEDDING_MODEL_DEPLOYMENT, requesting EMBEDDING_DIMENSIONS dimensions.

    Parameters:
        text: the value passed as the `input` of the embeddings request.

    Returns:
        The `embedding` value of the first entry in the response's `data` list.
    """
    embedding_response = AOAI_client.embeddings.create(
        model=EMBEDDING_MODEL_DEPLOYMENT,
        input=text,
        dimensions=EMBEDDING_DIMENSIONS,
    )

    # Work on the plain-dict form of the response and pick out the first result
    first_result = embedding_response.model_dump()['data'][0]
    return first_result['embedding']

In [None]:
def generate_completion(user_prompt, max_tokens=100):
    """
    Send a user prompt to the chat completions deployment and return the reply text.

    A fixed system prompt (Cosmic Works Bike Company product manager) is placed
    in front of the user prompt.

    Parameters:
        user_prompt: text sent as the `user` message.
        max_tokens: upper bound on generated tokens, forwarded to the API (default 100).

    Returns:
        The `content` of the first choice's message in the response.
    """
    system_prompt = '''
    You are a product manager for the Cosmic Works Bike Company, a bike retailer. 
    Your job is to create sales orders to support an online retail store.'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = AOAI_client.chat.completions.create(
        model=COMPLETIONS_MODEL_DEPLOYMENT,
        messages=messages,
        max_tokens=max_tokens,
    )

    # Read the reply straight from the response object. The earlier version
    # printed the entire response on every call (debug leftover) and serialized
    # it to JSON only to parse it back again before reading this same field.
    return response.choices[0].message.content

In [33]:
# This function generates a customer profile for the Cosmic Works Bike Company

def generate_customer():
    """
    Ask the completion model for one fake customer profile.

    The prompt describes the required fields and shows the exact JSON shape the
    reply must have. The JSON template includes a "phone" key because the code
    that vectorizes the profile reads customer["phone"]; without it in the
    template, a model that follows the template faithfully omits the field.

    Returns:
        The text returned by generate_completion (requested to be a JSON
        object; it is not parsed or validated here).
    """
    # Define the prompt for generating a customer profile
    prompt = """
    Create a fake customer profile for an e-commerce website. The profile should include:
    - id: A unique identifier for the customer. Should be a UUID.
    - Customer Id: The same id value above.
    - First Name: A realistic firs name.
    - Last Name: A realistic last name.
    - Email Address: A realistic email address.
    - Phone Number: A real phone number with the correct country code.
    - An array of addresses: One Billing Address and one or more shipping addresses with names of family members. Each address should include:
    - Name: A realistic name of the person associated with the address.
    - Address Type: Either Billing or Shipping.
    - Address Line 1: A realistic street address.
    - Address Line 2: Optional, could be an apartment number or suite (or empty).
    - City: A real city name that matches the state and country.
    - State: A real state or province that matches the city and country.
    - Zip Code: A real postal or zip code that matches the city, state, and country.
    - Country: A real country name.

    The profile must be returned as a valid JSON object with the following structure:
    {
        "id": "a UUID",
        "customerId": "the same UUID as above",
        "firstName": "John",
        "lastName": "Doe",
        "email": "johndoe@hotmail.com",
        "phone": "+1 212-555-0123",
        "addresses": [
            {
                "name": "John Doe",
                "type": "Billing",
                "address1": "123 Main Street",
                "address2": "Apt 4B",
                "city": "New York",
                "state": "NY",
                "zip": "10001",
                "country": "United States"
            },
            {
                "name": "Jane Doe",
                "type": "Shipping",
                "address1": "456 Elm Street",
                "address2": "",
                "city": "New York",
                "state": "NY",
                "zip": "10001",
                "country": "United States"
            }
        ]
    }
    Ensure that all address fields are consistent and realistic.
    
    Only return valid json for the customer profile. Do not include ```json or any other code in the response.
    """

    # A full profile is far longer than the default completion budget
    return generate_completion(prompt, max_tokens=2000)

In [46]:
# Generate a single customer profile
def generate_customer_profile():
    """
    Generate a single customer profile and return it as a parsed object.

    The model is asked for bare JSON, but it may still wrap the answer in a
    Markdown code fence (```json ... ```). Such a wrapper is removed before
    parsing so that one fenced reply does not abort a long generation run.

    Returns:
        The object produced by json.loads on the model's reply.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON even after
            removing a surrounding code fence.
    """
    # Create a customer profile (raw text from the model)
    customerJson = generate_customer()

    # Drop a surrounding ```...``` / ```json...``` fence, if there is one
    cleaned = customerJson.strip()
    fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", cleaned, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        cleaned = fenced.group(1)

    # Convert the JSON text to an object
    return json.loads(cleaned)

# Debugging
#generate_customer_profile()

In [None]:
# Generate customer profiles, vectorize them, and return the customers
def generate_customer_profiles(customers=1):
    """
    Generate customer profiles and attach an embedding vector to each one.

    For every profile, the name, contact and address fields are serialized to
    JSON and embedded; the resulting vector is stored on the profile under the
    "vectors" key.

    Parameters:
        customers: how many profiles to generate (default 1).

    Returns:
        A list with one profile per requested customer, each including "vectors".
    """
    # Fields that make up the text that gets embedded
    embedded_fields = ("firstName", "lastName", "email", "phone", "addresses")

    all_customers = []
    for _ in range(customers):
        customer = generate_customer_profile()

        # Use only the fields the model actually returned. Indexing each key
        # directly raised KeyError (and lost the whole batch) whenever the
        # model left one out, e.g. "phone".
        customer_vector = {
            field: customer[field]
            for field in embedded_fields
            if field in customer
        }

        # Generate embeddings for the customer data
        # NOTE(review): the property name "vectors" is fixed here, while the
        # container's vector path comes from configuration - confirm they match.
        customer['vectors'] = generate_embeddings(json.dumps(customer_vector, ensure_ascii=False))

        all_customers.append(customer)

    return all_customers

In [None]:
# Smoke test: generate one customer profile before generating the entire customer list,
# and keep it as a JSON string so it can be printed for inspection
customer = json.dumps(generate_customer_profiles(customers=1), ensure_ascii=False)
print(customer)

In [42]:
# Build the full data set: 100 vectorized customer profiles
customers = generate_customer_profiles(customers=100)

# Persist the customers to the configured JSON file (4-space indented)
with open(OUTPUT_CUSTOMER_FILE_NAME, 'w') as f:
    f.write(json.dumps(customers, indent = 4))

# Done!
print(f"Customers generated and saved to {OUTPUT_CUSTOMER_FILE_NAME}")

Customers generated and saved to customers.json


In [44]:
# Create (if they do not exist yet) the Cosmos DB database and the
# vector-enabled container for the customer profiles
cosmos_client = CosmosClient(url = COSMOS_ENDPOINT, credential = COSMOS_KEY)

db = cosmos_client.create_database_if_not_exists(
    id = COSMOS_DATABASE
)

# Declares which document path holds the embedding and how it is compared
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": COSMOS_CUSTOMER_VECTOR_PROPERTY,
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": EMBEDDING_DIMENSIONS
        }
    ]
}

# The vector index and the excluded path must refer to the same path as the
# embedding policy above. They were previously hard-coded to "/vector", which
# only lines up when the configured property happens to be exactly "/vector".
indexing_policy = {
    "includedPaths": [
        { "path": "/*" }
    ],
    "excludedPaths": [
        { "path": "/\"_etag\"/?" },
        # keep the raw vector out of the regular index; it is served by the vector index
        { "path": f"{COSMOS_CUSTOMER_VECTOR_PROPERTY}/*" }
    ],
    "vectorIndexes": [
        {"path": COSMOS_CUSTOMER_VECTOR_PROPERTY, "type": "quantizedFlat" } # "diskAnn"
    ]
}

container = db.create_container_if_not_exists(
    id = COSMOS_CUSTOMER_CONTAINER,
    partition_key = PartitionKey( path = COSMOS_CUSTOMER_PARTITION_KEY_PROPERTY, kind = 'Hash' ),
    indexing_policy = indexing_policy,
    vector_embedding_policy = vector_embedding_policy
    )

In [45]:
# Read the generated customers back from the JSON file ...
with open(OUTPUT_CUSTOMER_FILE_NAME, 'r') as f:
    customers = json.load(f)

# ... and upsert them one by one into the Cosmos DB container
for customer in customers:
    container.upsert_item(body=customer)

print(f"Customers written to {COSMOS_CUSTOMER_CONTAINER} in {COSMOS_DATABASE} in Cosmos DB")

Customers written to customers in database in Cosmos DB
