In [None]:
#! pip install numpy
! pip install python-dotenv
! pip install azure-core
! pip install azure-cosmos
! pip install openai
! pip install azure-identity

In [2]:
import json
import re
import random
import string
import uuid
from dotenv import dotenv_values

# Azure Identity imports
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Cosmos DB imports
from azure.cosmos import CosmosClient
# The async package exposes its client under the name `CosmosClient`; there is no
# `CosmosClientAsync` symbol to import. Alias it so the `CosmosClientAsync` name is still
# bound and does not shadow the synchronous client imported above.
from azure.cosmos.aio import CosmosClient as CosmosClientAsync
from azure.cosmos import PartitionKey

# Azure OpenAI imports
from openai import AzureOpenAI



In [12]:

# Name of the .env file that holds every setting read below
env_name = "my-config.env"  # alternative: "config.env"
config = dotenv_values(env_name)

# --- Azure OpenAI settings ---
OPENAI_API_ENDPOINT = config["openai_endpoint"]
OPENAI_API_VERSION = config["openai_api_version"]  # 2024-02-01 at the time of authoring
OPENAI_KEY = config["openai_key"]
COMPLETIONS_MODEL = config["openai_completions_model"]
COMPLETIONS_MODEL_DEPLOYMENT = config["openai_completions_deployment"]
EMBEDDING_MODEL = config["openai_embeddings_model"]
EMBEDDING_MODEL_DEPLOYMENT = config["openai_embeddings_deployment"]
# Values from the .env file are strings; the dimension count is used as a number later on
EMBEDDING_DIMENSIONS = int(config["openai_embeddings_dimensions"])

# --- Azure Cosmos DB settings ---
COSMOS_ENDPOINT = config["cosmos_uri"]
COSMOS_KEY = config["cosmos_key"]
COSMOS_DATABASE_NAME = config["cosmos_database_name"]
COSMOS_CONTAINER_NAME = config["cosmos_container_name"]
COSMOS_PARTITION_KEY_PROPERTY = config["cosmos_partition_key_property"]
COSMOS_VECTOR_PROPERTY = config["cosmos_vector_property"]

# --- JSON file the generated catalog is written to ---
OUTPUT_FILE_NAME = config["output_file_name"]


In [13]:
# Build the Azure OpenAI client, authenticating with the API key read from the config

AOAI_client = AzureOpenAI(
    api_key=OPENAI_KEY,
    api_version=OPENAI_API_VERSION,
    azure_endpoint=OPENAI_API_ENDPOINT,
)


In [6]:
# Create Azure OpenAI client using Azure Identity (optional)
#
# `azure_ad_token` expects an already-issued token string, not a credential object, so the
# credential is wrapped in a token provider and handed to `azure_ad_token_provider` instead.
# NOTE: running this cell rebinds AOAI_client, replacing any client created earlier in the session.

AOAI_client = AzureOpenAI(
    azure_endpoint = OPENAI_API_ENDPOINT,
    api_version = OPENAI_API_VERSION,
    azure_ad_token_provider = get_bearer_token_provider(
        DefaultAzureCredential(),
        "https://cognitiveservices.azure.com/.default"
    )
)

In [14]:
def generate_embeddings(text):
    '''
    Vectorize a string of text with the Azure OpenAI embeddings deployment.

    The request is sent through AOAI_client to the deployment named by
    EMBEDDING_MODEL_DEPLOYMENT, asking for EMBEDDING_DIMENSIONS dimensions.
    Returns the 'embedding' value of the first entry in the response's 'data' list.
    '''
    result = AOAI_client.embeddings.create(
        input = text,
        dimensions = EMBEDDING_DIMENSIONS,
        model = EMBEDDING_MODEL_DEPLOYMENT)

    # Work on the plain-dict form of the response and pick out the first embedding
    first_item = result.model_dump()['data'][0]
    return first_item['embedding']

In [36]:
def generate_completion(user_prompt, max_tokens=100):
    '''
    Send `user_prompt` to the chat completions deployment and return the reply text.

    A fixed system prompt (product manager for the Cosmic Works Bike Company) is placed
    ahead of the user prompt. `max_tokens` is forwarded to the API call unchanged.
    Returns the message content of the first choice in the response.
    '''
    system_prompt = (
        "\n    You are a product manager for the Cosmic Works Bike Company, a bike retailer. "
        "\n    Your job is to create a new product catalog that will be used by the company's website. "
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    completion = AOAI_client.chat.completions.create(
        model = COMPLETIONS_MODEL_DEPLOYMENT,
        messages = conversation,
        max_tokens = max_tokens
    )

    # Serialize the response to JSON and parse it back into plain dicts and lists
    payload = json.loads(completion.model_dump_json(indent=2))

    # The reply text is the message content of the first choice
    return payload['choices'][0]['message']['content']

In [41]:
# These functions are used to generate product data for the Cosmic Works Bike Company

def generate_productName(product_category):
    '''
    Ask the completions model for a product name that fits `product_category`.

    Parameters:
        product_category: category name inserted into the prompt.

    Returns whatever generate_completion returns for the prompt (up to 100 tokens).
    '''
    # The two instructions are separate sentences; without the ". " separator the second
    # one ran straight into the closing quote of the category name.
    prompt = (
        f"Generate a detailed and engaging product name for a product with a product category of '{product_category}'. "
        "Return only the text of the product name."
    )
    return generate_completion(prompt, max_tokens=100)

def generate_description(product_name):
    '''
    Ask the completions model for a product description for `product_name`.

    Parameters:
        product_name: product name inserted into the prompt.

    Returns the completion text (up to 200 tokens) cut back to its last period, so a
    sentence truncated by the token limit is dropped. When the text contains no period
    at all, it is kept whole (whitespace-trimmed) and a period is appended. An empty
    completion yields an empty string.
    '''
    # Prompt fragments are joined with explicit spaces so the sentences do not run together
    prompt = (
        f"Generate an engaging product description for a product named '{product_name}' "
        "that includes concise product attributes normally found for a product of this type. "
        "Return only the text of the product description. Only return alphanumeric characters and spaces."
    )

    description = generate_completion(prompt, max_tokens=200)

    last_period = description.rfind('.')
    if last_period == -1:
        # rfind returned -1: slicing with it would silently drop the final character,
        # so keep the whole text instead.
        trimmed = description.strip()
        return trimmed + "." if trimmed else trimmed

    # Strip off anything past the last period so no partial sentence is returned
    return description[:last_period + 1]

def generate_price(product_name, product_category, product_description):
    '''
    Ask the completions model for a price and return it as a float.

    Parameters:
        product_name: product name inserted into the prompt.
        product_category: category name inserted into the prompt.
        product_description: description inserted into the prompt.

    Returns the first number found in the completion text (up to 20 tokens), with
    thousands separators removed.

    Raises:
        ValueError: if the completion contains no number.
    '''
    # Separate the context sentence from the instructions so they do not run together
    prompt = (
        f"Create a price for this product in dollars and cents that is appropriate for a product named '{product_name}' in a product category of '{product_category}' and description of '{product_description}'. "
        "Return only the price as a number value. Do not return any currency symbols. Do not return any other characters other than the number value."
    )
    raw_price = generate_completion(prompt, max_tokens=20)

    # Pull out the first number instead of deleting every non-numeric character:
    # deleting turns "The price is 49.99." into "49.99." and "19.99 - 24.99" into
    # "19.9924.99", neither of which float() accepts.
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', raw_price.replace(',', ''))
    if match is None:
        raise ValueError(f"Could not parse a price from completion: {raw_price!r}")
    return float(match.group())

def generate_customer_name():
    '''
    Ask the completions model for a person's first and last name.

    Returns whatever generate_completion returns for the prompt (up to 100 tokens).
    '''
    # Keep a space between the sentences; they were previously joined as "female.Return"
    prompt = (
        "Create a first and last name of a person. Can be male or female. "
        "Return only the first and last name with a space between them."
    )
    return generate_completion(prompt, max_tokens=100)

def generate_review(product_name, product_description, product_price):
    '''
    Ask the completions model for a customer review of a product.

    Parameters:
        product_name: product name inserted into the prompt.
        product_description: description inserted into the prompt.
        product_price: price inserted into the prompt.

    Returns whatever generate_completion returns for the prompt (up to 200 tokens).
    '''
    # End the context sentence before the instruction; previously the instruction was
    # appended directly after the closing quote of the price.
    prompt = (
        f"Write a customer product review for a product named '{product_name}' and description of '{product_description}' with a price of '{product_price}'. "
        "Return only the text of the product review."
    )
    return generate_completion(prompt, max_tokens=200)


In [38]:
# Generate a single product
def generate_product(category, tags):
    '''
    Build one catalog product for `category`, carrying the given `tags`.

    Parameters:
        category: dict with 'id' and 'name' keys.
        tags: value stored as-is under the product's "tags" key.

    Returns a dict with the keys id, categoryId, categoryName, sku, name, description,
    price, tags and reviews.
    '''
    category_name = category['name']

    # Each generation step feeds the next prompt: name -> description -> price
    name = generate_productName(category_name)
    blurb = generate_description(name)
    cost = generate_price(name, category_name, blurb)

    # Between 1 and 5 reviews, each with a generated reviewer name, a random 1-5 rating
    # and a generated review text
    review_count = random.randint(1, 5)
    customer_reviews = []
    for _ in range(review_count):
        reviewer = generate_customer_name()
        stars = random.randint(1, 5)
        review_text = generate_review(name, blurb, cost)
        customer_reviews.append({
            "customer": reviewer,
            "rating": stars,
            "review": review_text
        })

    # Random document id and a 10-character alphanumeric SKU
    product_id = str(uuid.uuid4())
    sku_alphabet = string.ascii_letters + string.digits
    sku = ''.join(random.choices(sku_alphabet, k=10))

    return {
        "id": product_id,
        "categoryId": category['id'],
        "categoryName": category_name,
        "sku": sku,
        "name": name,
        "description": blurb,
        "price": cost,
        "tags": tags,
        "reviews": customer_reviews
    }



In [43]:
# Generate an entire product catalog, vectorize it, and return the products
def generate_product_catalog(products_per_category=10):
    '''
    Generate a full product catalog and attach an embedding to every product.

    Parameters:
        products_per_category: how many products to generate for each category (default 10).

    Returns a list of product dicts in category order. Each product is what generate_product
    returned, plus a 'vectors' key holding the embedding of its name, category name,
    description, price, tag names and reviews.
    '''
    # Category names are "<group>, <item>"; build them from the groups in catalog order
    category_groups = {
        "Accessories": [
            "Bike Racks", "Bike Stands", "Bottles and Cages", "Cleaners", "Fenders", "Helmets",
            "Hydration Packs", "Lights", "Locks", "Panniers", "Pumps", "Tires and Tubes"
        ],
        "Bikes": ["Mountain Bikes", "Road Bikes", "Touring Bikes"],
        "Clothing": [
            "Bib-Shorts", "Caps", "Gloves", "Jerseys", "Shorts", "Socks", "Tights", "Vests"
        ],
        "Components": [
            "Bottom Brackets", "Brakes", "Chains", "Cranksets", "Derailleurs", "Forks", "Handlebars",
            "Headsets", "Mountain Frames", "Pedals", "Road Frames", "Saddles", "Touring Frames", "Wheels"
        ],
    }
    category_names = [
        f"{group}, {item}"
        for group, items in category_groups.items()
        for item in items
    ]

    # Every category gets its own generated id
    categories = [{"id": str(uuid.uuid4()), "name": label} for label in category_names]

    tag_names = [
        'New', 'Sale', 'Popular', 'Limited Edition', 'Exclusive', 'Best Seller', 'Trending',
        'Hot', 'Discounted', 'Clearance', 'Featured', 'Top Rated', 'Recommended',
        'Special Offer', 'Deal of the Day', 'Flash Sale', 'Back in Stock', 'Pre-Order',
        'Online Only', 'Eco-Friendly', 'Handmade', 'Luxury', 'Budget', 'Premium',
        'Collectors Item', 'Rare', 'Modern', 'Classic', 'Essential', 'Must-Have',
        'Limited Stock', 'Seasonal', 'Holiday Special', 'Anniversary Edition',
        'Collectors Edition', 'Special Edition', 'Exclusive Release', 'Limited Release',
        'Exclusive Offer'
    ]

    # Every tag gets its own generated id
    tags = [{"id": str(uuid.uuid4()), "name": label} for label in tag_names]

    all_products = []
    for category in categories:
        for _ in range(products_per_category):

            # Pick 1-5 distinct tags at random for this product
            tag_count = random.randint(1, 5)
            product_tags = random.sample(tags, tag_count)

            product = generate_product(category, product_tags)

            # Only these fields are embedded; tags are reduced to their names so the
            # tag ids do not end up in the vectorized text
            vector_product = {
                "name": product['name'],
                "categoryName": product['categoryName'],
                "description": product['description'],
                "price": product['price'],
                "tags": [tag['name'] for tag in product['tags']],
                "reviews": product['reviews']
            }
            embedding_input = json.dumps(vector_product, ensure_ascii=False)
            product['vectors'] = generate_embeddings(embedding_input)

            # Progress line: one per generated product
            print(f"New Product: Category: {category['name']}, Product: {product['name']}")

            all_products.append(product)

    return all_products

In [None]:
# Smoke test: generate and vectorize ONE product before running the next cell,
# which generates the entire product catalog

category_names = [
    "Accessories, Bike Racks", "Accessories, Bike Stands", "Accessories, Bottles and Cages",
    "Accessories, Cleaners", "Accessories, Fenders", "Accessories, Helmets",
    "Accessories, Hydration Packs", "Accessories, Lights", "Accessories, Locks",
    "Accessories, Panniers", "Accessories, Pumps", "Accessories, Tires and Tubes",
    "Bikes, Mountain Bikes", "Bikes, Road Bikes", "Bikes, Touring Bikes",
    "Clothing, Bib-Shorts", "Clothing, Caps", "Clothing, Gloves", "Clothing, Jerseys",
    "Clothing, Shorts", "Clothing, Socks", "Clothing, Tights", "Clothing, Vests",
    "Components, Bottom Brackets", "Components, Brakes", "Components, Chains",
    "Components, Cranksets", "Components, Derailleurs", "Components, Forks",
    "Components, Handlebars", "Components, Headsets", "Components, Mountain Frames",
    "Components, Pedals", "Components, Road Frames", "Components, Saddles",
    "Components, Touring Frames", "Components, Wheels"
]

# Category objects: a generated id plus the name
categories = [{"id": str(uuid.uuid4()), "name": category_name} for category_name in category_names]

tag_names = [
    'New', 'Sale', 'Popular', 'Limited Edition', 'Exclusive', 'Best Seller', 'Trending',
    'Hot', 'Discounted', 'Clearance', 'Featured', 'Top Rated', 'Recommended',
    'Special Offer', 'Deal of the Day', 'Flash Sale', 'Back in Stock', 'Pre-Order',
    'Online Only', 'Eco-Friendly', 'Handmade', 'Luxury', 'Budget', 'Premium',
    'Collectors Item', 'Rare', 'Modern', 'Classic', 'Essential', 'Must-Have',
    'Limited Stock', 'Seasonal', 'Holiday Special', 'Anniversary Edition',
    'Collectors Edition', 'Special Edition', 'Exclusive Release', 'Limited Release',
    'Exclusive Offer'
]

# Tag objects: a generated id plus the name
tags = [{"id": str(uuid.uuid4()), "name": tag_name} for tag_name in tag_names]

# Use the "Bikes, Mountain Bikes" category for the test product
category = next(c for c in categories if c['name'] == "Bikes, Mountain Bikes")

# Pick 1-5 distinct tags at random
product_tags = random.sample(tags, random.randint(1, 5))

# Generate the product from the chosen category and tags
product = generate_product(category=category, tags=product_tags)

# Fields that get vectorized; tags are reduced to their names so the tag ids are not embedded
vector_product = {
    "name": product['name'],
    "categoryName": product['categoryName'],
    "description": product['description'],
    "price": product['price'],
    "tags": [tag['name'] for tag in product['tags']],
    "reviews": product['reviews']
}

# Serialize those fields to a JSON string, embed it, and store the embedding on the product
product['vectors'] = generate_embeddings(json.dumps(vector_product, ensure_ascii=False))

# Print the finished product as JSON for inspection
# (note: `product` is rebound to the JSON string here)
product = json.dumps(product, ensure_ascii=False)
print(product)



In [None]:
# Build the whole catalog; every product comes back with its embedding attached
products = generate_product_catalog()

# Persist the catalog to the configured output file as indented JSON
with open(OUTPUT_FILE_NAME, 'w') as f:
    f.write(json.dumps(products, indent = 4))

# Done!
print(f"Products generated and saved to {OUTPUT_FILE_NAME}")

In [25]:
# Connect to Cosmos DB with the account key and make sure the database exists
cosmos_client = CosmosClient(url = COSMOS_ENDPOINT, credential = COSMOS_KEY)

db = cosmos_client.create_database_if_not_exists(
    id = COSMOS_DATABASE_NAME
)

# Describes the embedding stored at COSMOS_VECTOR_PROPERTY: float32 values compared with
# cosine distance, with as many dimensions as the embeddings were generated with
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": COSMOS_VECTOR_PROPERTY,
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": EMBEDDING_DIMENSIONS
        }
    ]
}

# The vector index and the excluded path are derived from COSMOS_VECTOR_PROPERTY so they
# always refer to the same path as the embedding policy above, instead of repeating a
# literal "/vector" that can drift from the configured value.
# NOTE(review): the generated products store their embedding under the "vectors" key, so
# COSMOS_VECTOR_PROPERTY is presumably "/vectors" - confirm against the .env file.
indexing_policy = {
    "includedPaths": [
        { "path": "/*" }
    ],
    "excludedPaths": [
        { "path": "/\"_etag\"/?" },
        # Keep the raw vector values out of the regular index
        { "path": f"{COSMOS_VECTOR_PROPERTY}/*" }
    ],
    "vectorIndexes": [
        {"path": COSMOS_VECTOR_PROPERTY, "type": "quantizedFlat" } # or "diskAnn"
    ]
}

# Create the container (hash-partitioned on the configured property) with both policies
container = db.create_container_if_not_exists(
    id = COSMOS_CONTAINER_NAME,
    partition_key = PartitionKey( path = COSMOS_PARTITION_KEY_PROPERTY, kind = 'Hash' ),
    indexing_policy = indexing_policy,
    vector_embedding_policy = vector_embedding_policy
    )

In [None]:
# Read the generated catalog back from the output file
with open(OUTPUT_FILE_NAME, 'r') as f:
    products = json.load(f)

# Upsert every product into the Cosmos DB container, one document at a time
for product in products:
    container.upsert_item(body=product)

print(f"Products written to {COSMOS_CONTAINER_NAME} in {COSMOS_DATABASE_NAME} in Cosmos DB")