# Load Image File vectors from Storage Account into CogSearch 
### The images from the Storage Account are loaded into Azure Cognitive Search by following the steps below
- Establish a connection with Storage Account using the Python SDK.
- Retrieve the required image files from Storage Account container using the file stream download method.
- Use Azure AI Vision to vectorize the Image files from Storage Account.
- Index the vector chunks into Azure Cognitive Search.
- Repeat the process for all the required files.

#### Using the Azure Storage Python SDK to fetch the file stream, Azure AI Vision to embed the images in memory, and Azure Cognitive Search to create a vector index
Inspired by the repositories below:
- Azure Python SDK https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/storage/azure-storage-blob
- Refer to https://github.com/MSUSAzureAccelerators/Azure-Cognitive-Search-Azure-OpenAI-Accelerator/blob/main/04-Complex-Docs.ipynb
- Refer to https://github.com/Azure/azureml-examples

In [None]:
pip install -r requirements.txt

### Import required libraries and environment variables

In [1]:
# Import required libraries  
import os  
import json  
import requests
import http.client, urllib.parse
from tenacity import retry, stop_after_attempt, wait_fixed
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import (
    RawVectorQuery,
)
from azure.search.documents.indexes.models import (  
 
    ExhaustiveKnnParameters,  
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    HnswParameters,  
    HnswVectorSearchAlgorithmConfiguration,
    SimpleField,
    SearchField,  
    SearchFieldDataType,  
    SearchIndex,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchProfile,  
)
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

from IPython.display import Image, display
from tqdm import tqdm
import requests
  
# Load settings through python-dotenv before reading them from the environment.
load_dotenv()

# Azure Cognitive Search settings.
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
api_version = os.environ.get("AZURE_SEARCH_API_VERSION")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# Azure AI Vision settings.
aiVisionApiKey = os.environ.get("AZURE_AI_VISION_API_KEY")
aiVisionRegion = os.environ.get("AZURE_AI_VISION_REGION")

# Credential handed to the search index client.
credential = DefaultAzureCredential()

# Header and query-string payloads reused by the REST calls to the search service.
headers = {
    'Content-Type': 'application/json',
    'api-key': key,
}
params = {
    'api-version': api_version,
}

### Get image vectors using AI Vision

In [2]:
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))
def get_image_vector(image_path, key, region):
    """Return the embedding vector Azure AI Vision computes for one image.

    The call is wrapped by ``@retry``, configured for up to 5 attempts with a
    fixed 1-second wait between them.

    Args:
        image_path: An ``http://`` / ``https://`` URL (sent to the service as
            a JSON body) or a local file path (read and sent as raw bytes).
        key: Azure AI Vision subscription key.
        region: Azure region used to build the service host name.

    Returns:
        The ``"vector"`` field of the JSON response, or ``None`` when the
        response has no such field.

    Raises:
        OSError, http.client.HTTPException: on connection errors or timeouts.
        Exception: when the service answers with a non-200 status or with a
            body that is not a JSON object.
    """
    # Local names deliberately differ from the module-level ``headers`` /
    # ``params`` used for the search-service REST calls.
    request_headers = {
        'Ocp-Apim-Subscription-Key': key,
    }
    query = urllib.parse.urlencode({
        'api-version': '2023-04-01-preview',
        'model-version': '2023-04-15',
    })

    if image_path.startswith(('http://', 'https://')):
        request_headers['Content-Type'] = 'application/json'
        body = json.dumps({"url": image_path})
    else:
        request_headers['Content-Type'] = 'application/octet-stream'
        with open(image_path, "rb") as image_file:
            body = image_file.read()

    conn = http.client.HTTPSConnection(f'{region}.api.cognitive.microsoft.com', timeout=3)
    try:
        conn.request("POST", f"/computervision/retrieval:vectorizeImage?{query}", body, request_headers)
        response = conn.getresponse()
        status = response.status
        raw_body = response.read()
    except (OSError, http.client.HTTPException):
        # http.client reports socket timeouts as OSError subclasses, not as
        # requests exceptions.
        print(f"Timeout/Error for {image_path}. Retrying...")
        raise
    finally:
        # Always release the socket, including when the request fails.
        conn.close()

    try:
        data = json.loads(raw_body)
    except ValueError:
        # Body was empty or not JSON; reported through the error path below.
        data = None

    if status != 200 or not isinstance(data, dict):
        detail = ''
        if isinstance(data, dict):
            detail = data.get('message', '')
            # NOTE(review): error payloads may nest the message under an
            # "error" object; checked as a fallback -- confirm against the API.
            nested = data.get('error')
            if not detail and isinstance(nested, dict):
                detail = nested.get('message', '')
        detail = detail or f"HTTP {status}"
        raise Exception(f"Error processing image {image_path}: {detail}")

    return data.get("vector")


### Read the metadata JSON blob and vectorize the images it lists

In [3]:
from azure.storage.blob import BlobClient
import json

# Define a function that takes filename as input
def read_json_blob(filename):
    """Download a JSON metadata blob and vectorize the images it lists.

    The blob is read from the container configured through the
    ``AZURE_STORAGE_CONNECTION_STRING`` and ``AZURE_STORAGE_CONTAINER_NAME``
    environment variables and parsed as JSON. Iterating the parsed document
    is expected to yield mappings carrying ``image_path`` and ``description``.

    Args:
        filename: Name of the blob to download.

    Returns:
        A list of dicts with the keys ``id`` (position of the entry in the
        blob), ``image_vector`` and ``description``. Entries that have no
        image path, no description, or whose vectorization fails or returns
        nothing are left out.
    """
    blob_client = BlobClient.from_connection_string(
        conn_str=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
        container_name=os.getenv("AZURE_STORAGE_CONTAINER_NAME"),
        blob_name=filename,
    )

    # Download the whole blob and parse its bytes as JSON.
    json_data = json.loads(blob_client.download_blob().readall())

    results = []
    for idx, image_data in enumerate(json_data):
        image_path = image_data.get('image_path')
        description = image_data.get('description', '')

        # Both fields are required for a result, so skip early and avoid a
        # vectorization call whose output would be discarded anyway.
        if not image_path or not description:
            continue

        try:
            vector = get_image_vector(image_path, aiVisionApiKey, aiVisionRegion)
        except Exception as e:
            print(f"Error processing image at index {idx}: {e}")
            continue

        if not vector:
            continue

        # NOTE(review): idx restarts at 0 for every blob, and the upload cell
        # uses str(id) as the document key, so entries from different blobs
        # can share a key -- confirm whether that overwrite is intended.
        results.append({
            "id": idx,
            "image_vector": vector,
            "description": description,
        })
    return results


### List the blobs in your container using the ContainerClient

In [4]:
# List holding one dict per blob: the listing cell appends "file_name" and
# "file_url" entries, and the vectorization cell adds an "image_map" key to each.
files_to_index = []

In [None]:
# Connect to the blob container named in the environment configuration.
storage_container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
container = ContainerClient.from_connection_string(
    conn_str=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
    container_name=storage_container_name,
)

# Record the name and URL of every blob found in the container.
blob_list = container.list_blobs()
for blob in blob_list:
    print(blob)
    blob_url = os.getenv("AZURE_STORAGE_BASE_URL") + storage_container_name + "/" + blob.name
    files_to_index.append({"file_name": blob.name, "file_url": blob_url})

### Get the file content stream for each blob and use Azure AI Vision to create vectors

In [6]:
# Attach the vectorized image records of each listed blob to its entry.
for item in files_to_index:
    item.update(image_map=read_json_blob(filename=item["file_name"]))
    
    

In [None]:
#for item in files_to_index:
    #print(item)

### Create an index

Create your search index schema and vector search configuration:

In [None]:
# Client used to create or update index definitions on the search service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: a string key, the image description, and the image embedding.
id_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)
description_field = SearchField(
    name="description",
    type=SearchFieldDataType.String,
    sortable=True,
    filterable=True,
    facetable=True,
)
image_vector_field = SearchField(
    name="image_vector",
    hidden=True,
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1024,
    vector_search_profile="myHnswProfile",
)
fields = [id_field, description_field, image_vector_field]

# Vector search algorithms, both using the cosine metric.
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=1000,
        metric="cosine",
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnVectorSearchAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(
        metric="cosine",
    ),
)

# Profiles bind a profile name (referenced by the vector field) to an algorithm.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm="myHnsw",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm="myExhaustiveKnn",
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
)

# Create (or update) the index with the schema and vector search configuration.
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

### Push Image vector data to the index

In [None]:
%%time
for item in files_to_index:
    print("Uploading chunks from",item["file_name"])
    for img in tqdm(item['image_map']):
        try: 
            upload_payload = {
                "value": [
                    {
                        "id": str(img["id"]),
                        "image_vector": img["image_vector"],
                        "description": img["description"],
                        "@search.action": "upload"
                    },
                ]
            }
            #print(upload_payload)
            r = requests.post(os.environ['AZURE_SEARCH_SERVICE_ENDPOINT'] + "/indexes/" + index_name + "/docs/index",
                                 data=json.dumps(upload_payload), headers=headers, params=params)
            if r.status_code != 200:
                print(r.status_code)
                print(r.text)
        except Exception as e:
            print("Exception:",e)
            #print(content)