# Data Preparation + Workflow Notebook


### Installing libraries and packages


In [None]:
%pip install svgwrite
%pip install openai
%pip install aspose-words
%pip install azure-storage-blob
%pip install azure-identity


In [None]:
# Shared accumulator: chat_completions_API() appends one model reply per icon,
# and a later cell parses these replies into data.json.
json_output = []

In [None]:
import requests
import subprocess
import aspose.words as aw
import base64
from mimetypes import guess_type

### Function to convert SVG image to PNG format

In [None]:

def convert_image_to_PNG(image_url):
    """Download the SVG at ``image_url`` and render it to ``./output.png``.

    The SVG bytes are saved to ``./temp.svg``, inserted into a scratch
    Aspose.Words document, and the inserted shape is rendered as a PNG.

    Args:
        image_url: URL of the SVG file to download.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    svg_file = "./temp.svg"
    # Lower-case extension so this is the same path that
    # local_image_data_to_url() opens; "./output.PNG" is a different file on
    # case-sensitive filesystems.
    png_file = "./output.png"

    response = requests.get(image_url, timeout=30)
    # Fail loudly: skipping the render silently would leave the previously
    # rendered icon on disk, which would then be labelled under this URL.
    response.raise_for_status()

    with open(svg_file, "wb") as file:
        file.write(response.content)
    print("SVG downloaded successfully.")

    doc = aw.Document()
    builder = aw.DocumentBuilder(doc)
    shape = builder.insert_image(svg_file)
    shape.get_shape_renderer().save(png_file, aw.saving.ImageSaveOptions(aw.SaveFormat.PNG))
    
    


   

### Generating a base64 data URL for the local image

In [None]:


def local_image_data_to_url(image_path="./output.png"):
    """Encode a local image file as a ``data:`` URL.

    Args:
        image_path: path of the image to encode. Defaults to ``./output.png``,
            the path this function has always read.

    Returns:
        A string of the form ``data:<mime type>;base64,<encoded bytes>``.

    Raises:
        OSError: if ``image_path`` cannot be opened for reading.
    """
    # Guess the MIME type from the file extension; fall back to a generic
    # binary type when the extension is not recognised.
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'

    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    return f"data:{mime_type};base64,{base64_encoded_data}"

### Creating an Azure OpenAI Client

In [None]:
import os 
from openai import AzureOpenAI
from dotenv import load_dotenv

# Load settings via python-dotenv, then read the Azure OpenAI connection
# details and model names from the process environment. Each value is None
# when the corresponding variable is not set.
load_dotenv()

openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai_key = os.environ.get("AZURE_OPENAI_KEY")
gpt_model = os.environ.get("GPT_MODEL")
embedding_engine = os.environ.get("EMBEDDING_ENGINE")





In [None]:
# Shared Azure OpenAI client used by the chat-completion and embedding helpers.
client = AzureOpenAI(
    api_version="2024-02-15-preview",
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
)

### Defining the ChatCompletionAPI function for labelling each image and creating data for Database

In [None]:
def chat_completions_API(local_image_url, blob_image_url):
    """Ask the chat model to label one icon and record its reply.

    Sends the system prompt below, the image (``local_image_url``) and the
    blob URL to ``client.chat.completions.create``. The reply text is printed,
    appended to the module-level ``json_output`` list and returned.

    Args:
        local_image_url: URL handed to the model as ``image_url`` content
            (generate_complete_data passes a base64 data URL).
        blob_image_url: storage URL of the icon; it is quoted in the user
            message so the model can echo it in the ``url`` field.

    Returns:
        The reply text, or ``None`` when the model returned no text content.
        In that case nothing is appended to ``json_output``.
    """
    # Plain string (no placeholders are interpolated). The sample output has a
    # comma after every field except the last, so a reply that follows it
    # "strictly" is valid JSON once wrapped in braces.
    system_prompt = """You are a helpful AI assitant meant to assist me in my work. I am trying to build a solution that lets people search for 
  azure icons in their svg format. I will be passing you an image for which you need to generate information in the following manner (these images are
  nothing but Azure icons, so use appropriate knowledge):

  "name":"<suitable name for the image>"
  "description":"<description of what the icon is; you can maybe include the Azure service name and the description of the service; if lets say
  the image is about Azure Active Directory then you can include what Azure Active Directory is>" 
  "similar_words": "<similar names or words that relate to this Azure service that the icon is about>" 

  Sample Output:
    
    "name": "Azure Backup",
    "description": "Azure Backup is a service that provides simple, secure, and cost-effective solutions to back up your data and recover it from the Microsoft Azure cloud.",
    "similar_words": "cloud backup, data recovery, Azure cloud storage, secure backup solutions",
    "url":"https://azureiconskuljot.blob.core.windows.net/azureicons/00017-icon-service-Recovery-Services-Vaults.svg"
    
  the output format should not look like:
  ```json
  {
  "name": "App Service Certificates",
  "description": "App Service Certificates is a service in Azure that provides a simple and user-friendly way to purchase, manage, and deploy SSL/TLS certificates for your Azure App Services.",
  "similar_words": "SSL certificates, TLS certificates, Azure App Services, secure web applications",
  "url": "https://azureiconskuljot.blob.core.windows.net/pngicons/00049-icon-service-App-Service-Certificates.png"
  }
  ```

  strictly adhere to this sample output format

  I will be storing all this information in Azure CosmosDB with vector embeddings. the vector embeddings will be generated for the name, description and 
  similar_words fields so that I can perform a similarity search and efficiently help user in retrieving the best suitable Azure Service icon; so make
  sure that you fill in the fields accordingly in a manner that best helps to fit into my use-case/side project that I am building"""

    # NOTE(review): GPT_MODEL is read from the environment earlier in the
    # notebook, but the model name is hard-coded here - confirm which is intended.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "analyse and generate information for this picture according to the system prompt fed to you; note that the url of the image in blog storage container is:" + str(blob_image_url)
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": local_image_url
                        }
                    }
                ]
            }
        ],
        temperature=0
    )

    content = response.choices[0].message.content
    if content is None:
        # A missing reply cannot be parsed later; report it instead of storing None.
        print(f"No text content returned for {blob_image_url}; skipping.")
        return None

    print(content)
    json_output.append(content)
    return content


### Generating the compiled function for data preparation

In [None]:
def generate_complete_data(image_url):
    """Run the labelling pipeline for a single icon URL.

    Renders the SVG at ``image_url`` to a local PNG, encodes that PNG as a
    data URL, and sends both the data URL and the original URL to the chat
    model.
    """
    convert_image_to_PNG(image_url)
    chat_completions_API(
        local_image_url=local_image_data_to_url(),
        blob_image_url=image_url,
    )

### Pulling images from storage account and creating the dataset

In [None]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

containerName = os.getenv("STORAGE_CONTAINER_NAME")
storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)

container_client = blob_service_client.get_container_client(container=containerName)

print(container_client)

blob_list = container_client.list_blobs()

print("accessing blobs from storage account")
print("-------------------------------------------------")

# Start from an empty list so that re-running this cell does not append a
# second copy of every icon's reply to json_output.
json_output.clear()

# (blob name, error message) for every blob whose processing raised.
failed_blobs = []

for blob in blob_list:
    blob_client = blob_service_client.get_blob_client(container=containerName, blob=blob.name)
    blob_url = blob_client.url
    try:
        generate_complete_data(blob_url)
    except Exception as error:
        # One bad icon must not discard the replies already collected for the
        # others; record the failure and report it after the loop.
        failed_blobs.append((blob.name, str(error)))
        print(f"FAILED {blob.name}: {error}")

if failed_blobs:
    print(f"{len(failed_blobs)} blob(s) could not be processed:")
    for failed_name, failed_reason in failed_blobs:
        print(f"  {failed_name}: {failed_reason}")
    
    

In [None]:
# Show the raw model replies accumulated in json_output.
print(json_output)

In [None]:
import json

# Unused in the cells visible here; kept in case another cell refers to it.
dict_data = dict()


def _parse_model_reply(raw_reply):
    """Turn one model reply into a dict.

    Replies are expected as bare ``"key": value`` lines without braces, but a
    reply wrapped in a Markdown code fence and/or already enclosed in braces
    is accepted as well.
    """
    text = raw_reply.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()
    if not text.startswith("{"):
        text = "{" + text + "}"
    return json.loads(text)


# Parse everything before opening the file: opening with "w" truncates it, so
# a parse error must not be able to leave an empty data.json behind.
data_dicts = [_parse_model_reply(item) for item in json_output]

with open("./data.json", 'w') as json_file:
    json.dump(data_dicts, json_file, indent=4)

           

### Defining a Vector Embeddings Generator Function

In [None]:
def generate_embeddings(item):
    """Return the embedding vector for ``item``.

    Calls the embeddings endpoint of the module-level ``client`` with the
    ``text-embedding-ada-002`` model and extracts the first result's vector
    from the dumped response.
    """
    response_as_dict = client.embeddings.create(
        input=item,
        model="text-embedding-ada-002",
    ).model_dump()

    first_result = response_as_dict['data'][0]
    return first_result['embedding']

### Populating dataset with Vector Embeddings

In [None]:
import json

# Load the labelled icons written by the previous step.
with open("data.json", "r") as json_file:
    file_data = json.load(json_file)

for item in file_data:
    # One "field: value" line per searchable field. Joining plain lines keeps
    # stray quote characters and indentation whitespace out of the text that
    # is sent to the embedding model.
    data_for_embeddings_engine = "\n".join([
        f"name: {item.get('name')}",
        f"description: {item.get('description')}",
        f"similar_words: {item.get('similar_words')}",
    ])

    print(data_for_embeddings_engine)

    item['vector'] = generate_embeddings(data_for_embeddings_engine)

# Write the records back with their new 'vector' field.
with open("data.json", "w") as json_file:
    json.dump(file_data, json_file, indent=4)

    


### Installing the Python SDK for Azure Cosmos DB

In [None]:
%pip install azure-cosmos

### Creating a Database Client


In [None]:
from azure.cosmos import CosmosClient, PartitionKey, exceptions
# load_dotenv() was already called when the OpenAI settings were read; it is
# called again here before reading the Cosmos DB settings.
load_dotenv()
cosmosdb_connection_string = os.getenv("COSMOSDB_CONNECTION_STRING")

# Client built from the connection string taken from the environment.
cosmos_client = CosmosClient.from_connection_string(cosmosdb_connection_string)
database_name = os.getenv("DATABASE_NAME")

# Reuse the database named by DATABASE_NAME, creating it if it does not exist.
database = cosmos_client.create_database_if_not_exists(id=database_name)

### Defining a Vector Embedding Policy

In [None]:
# Partition key path used when the container is created: the "name" field.
pk = "/name"

# Describes the vector field of each document. "/vector" is the field the
# embedding cell writes (item['vector']).
# NOTE(review): 1536 is presumably the output size of text-embedding-ada-002,
# the model used by generate_embeddings - confirm if the model changes.
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path":"/vector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":1536
        }
    ]
}

### Defining a Vector Indexing Policy


In [None]:
# Indexing policy passed at container creation: requests a "diskANN" vector
# index on the same "/vector" path that the embedding policy describes.
indexing_policy = {
    "vectorIndexes": [
        {
            "path":"/vector",
            "type":"diskANN"
        }

    ]
}

### Creating the container


In [None]:
container_name = os.getenv("COSMOSDB_CONTAINER_NAME")

try:
    # Reuse the container if it exists; otherwise create it with the partition
    # key, vector index and vector embedding policy defined above.
    container = database.create_container_if_not_exists(
        id=container_name,
        partition_key=PartitionKey(path=pk),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy,
    )
except Exception as e:
    # Report the failure, then re-raise: continuing would leave `container`
    # undefined and the cells below would fail with an unrelated NameError.
    print(f"Could not create or open container {container_name!r}: {e}")
    raise

### Upserting Data into our container

In [None]:
import uuid

with open("data.json", "r") as json_file:
    data = json.load(json_file)

for obj in data:
    # Derive the id from a stable field so that re-running this cell updates
    # the existing document. A fresh random id on every run would turn each
    # upsert into an insert and duplicate the whole dataset.
    stable_key = obj.get('url') or obj.get('name')
    if stable_key:
        guid = str(uuid.uuid5(uuid.NAMESPACE_URL, str(stable_key)))
    else:
        # Nothing stable to derive from; fall back to a random id.
        guid = str(uuid.uuid4())
    obj['id'] = guid
    container.upsert_item(obj)


    

### Vectorising the user query

In [None]:
# Example search phrase, embedded with the same helper used for the dataset so
# it can be compared against the stored vectors.
user_query = "azure cosmosdb"
user_query_embeddings = generate_embeddings(user_query)
# Prints the full embedding list returned by generate_embeddings.
print(user_query_embeddings)

### Retrieving best results for the user query

In [None]:
# The embedding is passed as a query parameter rather than formatted into the
# SQL text, which keeps the query short and avoids string-built SQL.
queryText = """SELECT TOP 5 c.name, c.description, c.similar_words, c.url, VectorDistance(c.vector, @embedding) AS SimilarityScore
FROM c
ORDER BY VectorDistance(c.vector, @embedding)"""

query_results = container.query_items(
    query=queryText,
    parameters=[{"name": "@embedding", "value": user_query_embeddings}],
    enable_cross_partition_query=True,
)

# Collect the matches returned for the query, printing each one as it arrives.
images = []

for item in query_results:
    print(item)
    images.append(item)