### Generate data



In [1]:
from openai import AzureOpenAI
from datetime import datetime  
import os
from pathlib import Path  
import json
import random
import uuid
from tenacity import retry, wait_random_exponential, stop_after_attempt  
import pandas as pd
from dotenv import load_dotenv
env_path = Path('../app') / '.env'
load_dotenv(dotenv_path=env_path)


# Variables not used here do not need to be updated in your .env file
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
key_credential = os.environ["AZURE_SEARCH_ADMIN_KEY"] 
# index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
index_name= "skin-care"
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT_2"]
azure_openai_key = os.environ["AZURE_OPENAI_API_KEY_2"] 
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMB_DEPLOYMENT_2"]
chat_engine = os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_1"]
embedding_model_name = azure_openai_embedding_deployment
azure_openai_api_version = "2023-12-01-preview"

credential = key_credential 



In [8]:
chat_client = AzureOpenAI(
  api_key=os.environ.get("AZURE_OPENAI_API_KEY_1"),  
  api_version="2023-12-01-preview",
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT_1")
)
@retry(wait=wait_random_exponential(multiplier=1, max=10), stop=stop_after_attempt(5))
def generate_text_data():
    response = chat_client.chat.completions.create(
        model=chat_engine,
        messages=[{"role":"system", "content":"you are helpful AI assistant with knowlege about skin care products and can speak English, Japanese, Vietnamese and Chinese"},
                  {"role":"user", "content":"Generate name, description and usage of a skin care product in English, Japanese, Vietnamese, Spanish and Chinese. Output should be in JSON format with following keys: 'name_en', description_en', 'best_for_en', 'name_ja', 'description_ja', 'best_for_ja', 'name_vi','description_vi', 'best_for_vi', 'name_es', 'description_es', 'best_for_es', 'name_es', 'description_zh', 'best_for_zh'. name should sounds like a real product name. The best_for should give descrition of the skin type, gender, age the product is best suitable for as well as when to use it. Try to vary skin type, age, and gender, do not just say good for all"}],
    response_format={ "type": "json_object" },
    )
    
    response_message = json.loads(response.choices[0].message.content)
    assert response_message.keys() == {'name_en', 'description_en', 'best_for_en', 'name_ja', 'description_ja', 'best_for_ja', 'name_vi', 'description_vi', 'best_for_vi','name_es', 'description_es', 'best_for_es', 'name_zh', 'description_zh', 'best_for_zh'}

    return response_message
# write code to run generate_text_data() for 100 times and save the output to a json file
output = []
for i in range(200):
    output.append(generate_text_data())


In [13]:
with open('text_output.json', 'w') as f:
    json.dump(output, f)

In [15]:
#load the json file and convert it to a pandas dataframe
df = pd.read_json('text_output.json')
#add column for the id (pd_001 to pd_100), price (from 500 to 6000), size (30ml, 40ml, 50ml, 80ml and 100ml)
df['id'] = ['pd_'+str(i).zfill(3) for i in range(1, 201)]
df['price'] = [random.randint(500, 6000) for i in range(200)]
df['size'] = [random.choice(['30ml', '40ml', '50ml', '80ml', '100ml']) for i in range(200)]
print(df.head(5))



                  name_en                                     description_en  \
0  Radiance Renewal Serum  Revitalize your skin with our Radiance Renewal...   
1   Radiant Renewal Cream  Reveal brighter and smoother skin with our Rad...   
2       RadiantGlow Serum  Experience the power of RadiantGlow Serum, a l...   
3     Radiant Youth Serum  A powerful serum enriched with antioxidants an...   
4     Radiant Youth Serum  This serum is packed with antioxidants and hya...   

                                         best_for_en       name_ja  \
0  Ideal for individuals with combination skin, a...   輝きリニューアルセラム   
1  Ideal for mature women with dry skin, looking ...      輝く再生クリーム   
2  Best for normal to combination skin types, sui...  ラディアントグロウセラム   
3  Ideal for mature skin, both male and female, a...      輝く若さのセラム   
4  Ideal for individuals with dry or combination ...       輝く若さセラム   

                                      description_ja  \
0          自然な輝きを引き出し、年齢による肌の老化サインを軽減する、輝き

In [17]:
# Note: DALL-E 3 requires version 1.0.0 of the openai-python library or later
import os
from openai import AzureOpenAI
import json
import requests
@retry(wait=wait_random_exponential(multiplier=1, max=10), stop=stop_after_attempt(5))
def generate_image(description, file_path):
    client = AzureOpenAI(
        api_version="2024-02-15-preview",
        azure_endpoint="https://openai002.openai.azure.com/",
        api_key=os.environ["AZURE_OPENAI_API_KEY_2"],
    )

    result = client.images.generate(
        model="Dalle3", # the name of your DALL-E 3 deployment
        prompt=description,
        size = "1024x1024",
        n=1
    )

    image_url = json.loads(result.model_dump_json())['data'][0]['url']

    response = requests.get(image_url)
    with open(file_path, 'wb') as file:
        file.write(response.content)


In [19]:
#for each row in the dataframe, call generate_image() using the description_en and save the image to a file using the id as the file name and .jpg as the extension to the data/images folder
os.makedirs('images', exist_ok=True)
for i, row in df.iterrows():
    if i ==0: continue
    description = row['description_en']
    file_path = f'images/{row["id"]}.jpg'
    generate_image(description, file_path)


In [22]:
from openai import AzureOpenAI
import json

client = AzureOpenAI(
    azure_deployment=azure_openai_embedding_deployment,
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,

)
input_data= df.to_dict(orient='records')
    # 'url', 'product_name', 'description', 'brand', 'spf', 'price', 'size',
    #    'release_date', 'rev_sum', 'rev_point', 'amount', 'category'

descriptions = [item['description_en'] for item in input_data]
description_response = client.embeddings.create(input=descriptions, model=embedding_model_name)
description_embeddings = [item.embedding for item in description_response.data]

best_for = [item['best_for_en'] for item in input_data]
best_for_response = client.embeddings.create(input=best_for, model=embedding_model_name)
best_for_embeddings = [item.embedding for item in description_response.data]

# Generate embeddings for title and content fields
for i, item in enumerate(input_data):
    item['descriptionVector'] = description_embeddings[i]
    item['bestforVector'] = best_for_embeddings[i]

# Output embeddings to docVectors.json file
output_path = 'docVectors.json'
with open(output_path, "w", encoding='utf-8') as f:
    json.dump(input_data, f, ensure_ascii=False)


In [26]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex
)

from azure.core.credentials import AzureKeyCredential

# Create a search index
#  {'name_en', 'description_en', 'best_for_en', 'name_ja', 'description_ja', 'best_for_ja', 'name_vi', 'description_vi', 'best_for_vi','name_es', 'description_es', 'best_for_es', 'name_zh', 'description_zh', 'best_for_zh'}

index_client = SearchIndexClient(
    endpoint=endpoint, credential=AzureKeyCredential(credential))
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="name_en", type=SearchFieldDataType.String),
    SearchableField(name="description_en", type=SearchFieldDataType.String),
    SearchableField(name="best_for_en", type=SearchFieldDataType.String),

    SearchableField(name="name_ja", type=SearchFieldDataType.String),
    SearchableField(name="description_ja", type=SearchFieldDataType.String),
    SearchableField(name="best_for_ja", type=SearchFieldDataType.String),

    SearchableField(name="name_vi", type=SearchFieldDataType.String),
    SearchableField(name="description_vi", type=SearchFieldDataType.String),
    SearchableField(name="best_for_vi", type=SearchFieldDataType.String),

    SearchableField(name="name_es", type=SearchFieldDataType.String),
    SearchableField(name="description_es", type=SearchFieldDataType.String),
    SearchableField(name="best_for_es", type=SearchFieldDataType.String),

    SearchableField(name="name_zh", type=SearchFieldDataType.String),
    SearchableField(name="description_zh", type=SearchFieldDataType.String),
    SearchableField(name="best_for_zh", type=SearchFieldDataType.String),


    SimpleField(name="price", type=SearchFieldDataType.Int32, sortable=True, filterable=True),
    SimpleField(name="size", type=SearchFieldDataType.String, sortable=True, filterable=True),


    SearchField(name="descriptionVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
    SearchField(name="bestforVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
]


# Configure the vector search configuration  
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        )
    ]
)



semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="name_es"),
        # keywords_fields=[SemanticField(field_name="category")],
        content_fields=[SemanticField(field_name="description_es")]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)
try:
    index_client.delete_index(index_name)   
except Exception as e:
    print(e)
    pass    
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 skin-care created


Insert data into vector store

In [27]:
from azure.search.documents import SearchClient

# Upload some documents to the index
input_path = 'docVectors.json'
with open(input_path, 'r') as file:  
    documents = json.load(file)  
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(credential))
result = search_client.upload_documents(documents)
print(f"Uploaded {len(documents)} documents") 


Uploaded 200 documents


In [10]:
import redis
#read data from the docVectors.json file into a pandas dataframe df
with open('docVectors.json', 'r') as f:
    data = json.load(f)
df = pd.DataFrame(data)
r = redis.StrictRedis(host="redis002.redis.cache.windows.net",
        port=6380, db=0, password="YbS219KZXbAvbrhVbOvMJK7xz4tzukc0YAzCaBcNUiY=", ssl=True)
#now upload the images to the redis cache, using the id as the key and the image as the value
for i, row in df.iterrows():
    if i ==0: continue
    file_path = f'images/{row["id"]}.jpg'
    with open(file_path, "rb") as image_file:
        image = image_file.read()
        r.set(row['id'], image)


Test search

In [36]:
from azure.search.documents.models import VectorizedQuery

# Pure Vector Search
query = "Product for old men with dry skin and wrinkles."  
  
embedding = client.embeddings.create(input=query, model=embedding_model_name).data[0].embedding
desc_vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=3, fields="descriptionVector")
usage_vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=3, fields="bestforVector")

results = search_client.search(  
    search_text=None,  
    vector_queries= [ usage_vector_query],
    select=["name_en","description_en", "best_for_en", "name_vi","price","size"],
)  
  
for result in results:  
    print(f"product_name: {result['name_en']}")  
    print(f"description: {result['description_en']}")  

    print(f"Score: {result['@search.score']}")  
    print(f"best_for_en: {result['best_for_en']}")  
    print(f"name_vi: {result['name_vi']}\n")  

product_name: Youthful Glow Serum
description: A luxurious serum enriched with potent antioxidants and vitamins to help reduce fine lines and wrinkles, leaving your skin looking radiant and youthful.
Score: 0.67888933
best_for_en: Ideal for mature skin, both men and women in their 40s and 50s. Best used in the evening before bed as part of your nighttime skincare routine.
name_vi: Serum Rạng Rỡ Thanh Xuân

product_name: Radiant Glow Youth Serum
description: Achieve a youthful radiance with this luxurious serum that hydrates and revitalizes the skin. Formulated with potent anti-aging ingredients to diminish fine lines and improve skin texture.
Score: 0.6709109
best_for_en: Ideal for individuals with aging concerns, suitable for all genders, best used in the evening for overnight skin repair.
name_vi: Serum Trẻ Hóa Da Rạng Ngời

product_name: Radiant Youth Serum
description: A luxurious serum that renews and rejuvenates the skin for a youthful and radiant glow. Enriched with potent anti-