In [2]:
import base64
from google.cloud import aiplatform
from google.protobuf import struct_pb2
import typing
import numpy as np

# Prepare dataset

In [3]:
class EmbeddingResponse(typing.NamedTuple):
    """Embeddings returned for one multimodal embedding request.

    A field is ``None`` when the corresponding input (text or image) was
    not part of the request, so both fields are declared optional.
    """
    # Embedding of the text input, or None when no text was sent.
    text_embedding: typing.Optional[typing.Sequence[float]]
    # Embedding of the image input, or None when no image was sent.
    image_embedding: typing.Optional[typing.Sequence[float]]

class EmbeddingPredictionClient:
    """Wrapper around the Prediction Service client for the
    ``multimodalembedding@001`` publisher model."""

    def __init__(self, project: str,
                 location: str = "us-central1",
                 api_regional_endpoint: typing.Optional[str] = None):
        """Create the underlying prediction client.

        Args:
            project: Project ID used in the model resource path.
            location: Region used in the model resource path.
            api_regional_endpoint: API host to connect to. When omitted it is
                derived from ``location`` so the host and the resource path
                always refer to the same region (the default location still
                yields ``us-central1-aiplatform.googleapis.com``).
        """
        if api_regional_endpoint is None:
            api_regional_endpoint = f"{location}-aiplatform.googleapis.com"
        client_options = {"api_endpoint": api_regional_endpoint}
        # Initialize client that will be used to create and send requests.
        # This client only needs to be created once, and can be reused for multiple requests.
        self.client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)
        self.location = location
        self.project = project

    def get_embedding(self, text: typing.Optional[str] = None,
                      image_bytes: typing.Optional[bytes] = None):
        """Request an embedding for a text, an image, or both.

        Args:
            text: Text to embed. Empty or ``None`` means "no text".
            image_bytes: Raw image bytes to embed; they are base64-encoded
                before being sent. Empty or ``None`` means "no image".

        Returns:
            An ``EmbeddingResponse`` whose ``text_embedding`` /
            ``image_embedding`` is a list for each input that was supplied
            and ``None`` for each input that was not.

        Raises:
            ValueError: If neither ``text`` nor ``image_bytes`` is supplied.
        """
        if not text and not image_bytes:
            raise ValueError('At least one of text or image_bytes must be specified.')

        instance = struct_pb2.Struct()
        if text:
            instance.fields['text'].string_value = text

        if image_bytes:
            encoded_content = base64.b64encode(image_bytes).decode("utf-8")
            image_struct = instance.fields['image'].struct_value
            image_struct.fields['bytesBase64Encoded'].string_value = encoded_content

        endpoint = (f"projects/{self.project}/locations/{self.location}"
                    "/publishers/google/models/multimodalembedding@001")
        response = self.client.predict(endpoint=endpoint, instances=[instance])

        # Exactly one instance is sent, so only the first prediction is read.
        prediction = response.predictions[0]
        text_embedding = list(prediction['textEmbedding']) if text else None
        image_embedding = list(prediction['imageEmbedding']) if image_bytes else None

        return EmbeddingResponse(
            text_embedding=text_embedding,
            image_embedding=image_embedding)

In [4]:
def reduce_embedding_dimesion(
        vector_text: typing.Optional[list] = None,
        vector_image: typing.Optional[list] = None,
):
    """Merge a text embedding and an image embedding into one vector.

    When both vectors are non-empty they are combined by element-wise sum
    (the output has the same length as each input). When only one is
    non-empty it is returned unchanged; when neither is, the result is an
    empty list.

    Args:
        vector_text: Text embedding, or ``None``/empty when absent.
        vector_image: Image embedding, or ``None``/empty when absent.

    Returns:
        A list of built-in Python floats.

    Raises:
        ValueError: If both vectors are given but differ in length.
    """
    if vector_image and vector_text:
        if len(vector_text) != len(vector_image):
            raise ValueError(
                "vector_text and vector_image must have the same length, "
                f"got {len(vector_text)} and {len(vector_image)}.")
        combined = np.sum(np.array([vector_text, vector_image]), axis=0)
    else:
        # The trailing `or []` keeps np.array() from receiving None.
        combined = np.array(vector_text or vector_image or [])

    # tolist() converts NumPy scalars to built-in floats.
    return combined.tolist()

In [5]:
# Shared client instance used by the embedding cells below.
embeddings_client = EmbeddingPredictionClient(project="rl-llm-dev")

In [6]:
def image_text_to_embedding(text: str = "", image_path: str = "") -> list:
    """Embed a text and/or an image file and return one combined vector.

    Uses the notebook-level ``embeddings_client`` for the request and
    ``reduce_embedding_dimesion`` to combine the two embeddings.

    Args:
        text: Text to embed; empty string means "no text".
        image_path: Path of an image file to embed; empty string means
            "no image".

    Returns:
        The combined embedding as a list.
    """
    # None, not "", is the "no image" sentinel for the bytes argument.
    image_contents = None
    if image_path:
        with open(image_path, "rb") as f:
            image_contents = f.read()

    response = embeddings_client.get_embedding(
        text=text,
        image_bytes=image_contents)

    return reduce_embedding_dimesion(
        vector_image=response.image_embedding,
        vector_text=response.text_embedding)

# Prepare data for index

In [5]:
import pandas as pd
# Product spreadsheet; the next cell reads its `title`, `id` and `link` columns.
input_file = pd.read_excel("csm-dataset-update.xlsx")

In [9]:
# Build one index datapoint per product (rows 0-9 and image_00.jpg .. image_09.jpg).
metadata = []

for i in range(10):
    title = input_file.title[i]
    # str() keeps the record JSON-serializable even when pandas hands back
    # NumPy scalars; `datapoint_id` also avoids shadowing the builtin `id`.
    datapoint_id = str(input_file.id[i])
    link = str(input_file.link[i])

    with open(f"image_0{i}.jpg", "rb") as f:
        image_contents = f.read()

    # The file is already closed here; the request only needs the bytes.
    response = embeddings_client.get_embedding(
        text=title,
        image_bytes=image_contents)

    reduced_vector = reduce_embedding_dimesion(
        vector_image=response.image_embedding,
        vector_text=response.text_embedding
    )

    metadata.append(
        {
            "id": datapoint_id,
            "embedding": reduced_vector,
            # The id and link are stored as restricts; the query cell reads
            # them back from the returned datapoint.
            "restricts": [
                {
                    "namespace": "id",
                    "allow": [datapoint_id]
                },
                {
                    "namespace": "link",
                    "allow": [link]
                },
            ]
        }
    )

In [11]:
import json

# Serialize the datapoints as JSON Lines: one JSON object per line.
with open("metadata.json", "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in metadata)

In [13]:
# Copy the local metadata.json into the Cloud Storage folder used for the index data.
! gsutil cp metadata.json gs://csm-dataset/vertex-vector-search/metadata.json

Copying file://metadata.json [Content-Type=application/json]...
/ [1 files][244.8 KiB/244.8 KiB]                                                
Operation completed over 1 objects/244.8 KiB.                                    


# Create Search Index

In [None]:
from google.cloud import aiplatform_v1

In [None]:
# Project and region
PROJECT_ID = "rl-llm-dev"
REGION = "us-central1"
LOCATION = REGION

# Index identity
DISPLAY_NAME = "test_sdk"
DESCRIPTION = "CSM Multimodal Search"

# Embedding data location and vector size
EMBEDDINGS_INITIAL_URI = "gs://csm-dataset/vertex-vector-search/"
DIMENSIONS = 1408

# Staging bucket
BUCKET_URI = "gs://csm-dataset/vertex-staging-data"

In [None]:
# Connect to the regional API host for LOCATION, the same way
# EmbeddingPredictionClient does, so the host matches the region used in
# the request's parent path.
index_client = aiplatform_v1.IndexServiceClient(
    client_options={"api_endpoint": f"{LOCATION}-aiplatform.googleapis.com"}
)

In [None]:
# Schema that the index metadata below is declared against.
metadata_schema_uri = "gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml"

# Index metadata: where the embeddings live plus the search configuration.
metadata = dict(
    contentsDeltaUri=EMBEDDINGS_INITIAL_URI,
    config=dict(
        dimensions=DIMENSIONS,
        approximateNeighborsCount=150,
        distanceMeasureType="DOT_PRODUCT_DISTANCE",
        featureNormType="UNIT_L2_NORM",
        algorithmConfig=dict(
            treeAhConfig=dict(
                leafNodeEmbeddingCount=10000,
                leafNodesToSearchPercent=5,
            ),
        ),
    ),
)

# Index resource description.
index = aiplatform_v1.Index(
    display_name=DISPLAY_NAME,
    description=DESCRIPTION,
    metadata_schema_uri=metadata_schema_uri,
    metadata=metadata,
    index_update_method="STREAM_UPDATE",
)

# Request that creates the index under the project/location parent.
create_index_request = aiplatform_v1.CreateIndexRequest(
    parent="projects/{}/locations/{}".format(PROJECT_ID, LOCATION),
    index=index,
)

In [None]:
# Submit the index-creation request built in the previous cell; the returned
# value is displayed as the cell output.
index_client.create_index(request=create_index_request)

# Using cURL

In [14]:
# Settings for the cURL-based variant of index creation.
DISPLAY_NAME = "csm-multimodal-search"
DESCRIPTION = "CSM Multimodal Search"
LOCATION = "us-central1"

# Cloud Storage URIs need the gs:// scheme (consistent with the SDK config
# cell and BUCKET_URI below).
EMBEDDINGS_INITIAL_URI = "gs://csm-dataset/vertex-vector-search/"
DIMENSIONS = 1408
PROJECT_ID = "rl-llm-dev"
REGION = "us-central1"
BUCKET_URI = "gs://csm-dataset/vertex-staging-data"

metadata_schema_uri = "gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml"

### Create the Index with streaming insert

In [15]:
%%writefile index_request.json
{
    "displayName": "csm-multimodal-vector-search",
    "description": "CSM Multimodal Vector Search",
    "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml",
    "metadata": {
        "contentsDeltaUri": "gs://csm-solution-dataset/metadata/vector_metadata.json",
        "config":{
            "dimensions": 1408,
            "approximateNeighborsCount": 150,
            "distanceMeasureType": "DOT_PRODUCT_DISTANCE",
            "featureNormType": "UNIT_L2_NORM",
            "algorithmConfig": {
                "treeAhConfig": {
                    "leafNodeEmbeddingCount": 10000, 
                    "leafNodesToSearchPercent": 10
                }
            }
        }
    },
    "indexUpdateMethod": "STREAM_UPDATE"
}

Writing index_request.json


In [16]:
# POST index_request.json to the indexes collection of project rl-llm-dev in
# us-central1, authenticating with a gcloud access token.
! curl -X POST \
-H "Authorization: Bearer $(gcloud auth print-access-token)" \
-H "Content-Type: application/json; charset=utf-8" \
"https://us-central1-aiplatform.googleapis.com/v1/projects/rl-llm-dev/locations/us-central1/indexes" \
-d @index_request.json

{
  "name": "projects/244831775715/locations/us-central1/indexes/7799676002698788864/operations/6470551643707408384",
  "metadata": {
    "@type": "type.googleapis.com/google.cloud.aiplatform.v1.CreateIndexOperationMetadata",
    "genericMetadata": {
      "createTime": "2023-10-20T17:37:51.072438Z",
      "updateTime": "2023-10-20T17:37:51.072438Z"
    }
  }
}


# List Indexes

In [17]:
# GET the indexes collection of project rl-llm-dev in us-central1.
! curl -X GET \
    -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    "https://us-central1-aiplatform.googleapis.com/v1/projects/rl-llm-dev/locations/us-central1/indexes"

{
  "indexes": [
    {
      "name": "projects/244831775715/locations/us-central1/indexes/7799676002698788864",
      "displayName": "csm-multimodal-search",
      "description": "CSM Multimodal Search",
      "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml",
      "metadata": {
        "config": {
          "dimensions": 1408,
          "approximateNeighborsCount": 150,
          "distanceMeasureType": "DOT_PRODUCT_DISTANCE",
          "featureNormType": "UNIT_L2_NORM",
          "algorithmConfig": {
            "treeAhConfig": {
              "leafNodeEmbeddingCount": "10000",
              "leafNodesToSearchPercent": 10
            }
          },
          "shardSize": "SHARD_SIZE_MEDIUM"
        }
      },
      "deployedIndexes": [
        {
          "indexEndpoint": "projects/244831775715/locations/us-central1/indexEndpoints/6972983996099592192",
          "deployedIndexId": "csm_deploy_1697828519567"
        }

In [18]:
# GET the indexEndpoints collection of project rl-llm-dev in us-central1.
! curl -X GET \
    -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    "https://us-central1-aiplatform.googleapis.com/v1/projects/rl-llm-dev/locations/us-central1/indexEndpoints"

{
  "indexEndpoints": [
    {
      "name": "projects/244831775715/locations/us-central1/indexEndpoints/6972983996099592192",
      "displayName": "csm-endpoint",
      "deployedIndexes": [
        {
          "id": "csm_deploy_1697828519567",
          "index": "projects/244831775715/locations/us-central1/indexes/7799676002698788864",
          "displayName": "csm-deploy",
          "createTime": "2023-10-20T19:02:09.689210Z",
          "indexSyncTime": "2023-10-20T19:44:24.578774Z",
          "deploymentGroup": "default",
          "dedicatedResources": {
            "machineSpec": {
              "machineType": "e2-standard-16"
            },
            "minReplicaCount": 2,
            "maxReplicaCount": 2
          }
        }
      ],
      "etag": "AMEw9yOVIlylxjsZLj39YbyzbrHRhe6xszSKmChzkXFNvnuGgfZQCBUqR2tFpq4j4t9c",
      "createTime": "2023-10-20T19:01:40.824700Z",
      "updateTime": "2023-10-20T19:01:41.533211Z",
      "publicEndpointDomainName": "1236044824.us-central1-24

# Query

In [1]:
from google.cloud import aiplatform_v1

# Identifiers of the index endpoint and of the index deployed on it.
deployed_index_id = "csm_deploy_1697828519567"
index_endpoint_id = "6972983996099592192"

# The match client talks to the endpoint's own host name.
client_options = dict(
    api_endpoint="1236044824.us-central1-244831775715.vdb.vertexai.goog",
)
match_client = aiplatform_v1.MatchServiceClient(client_options=client_options)

In [7]:
def find_neighbor(
    project_id: str,
    index_endpoint_id: str,
    deployed_index_id: str,
    feature_vector: list,
    datapoint_id: str = "0",
    neighbor_count: int = 10,
    location: str = "us-central1",
):
    """Run a single nearest-neighbor query against a deployed index.

    Uses the notebook-level ``match_client`` to send the request.

    Args:
        project_id: Project that owns the index endpoint.
        index_endpoint_id: ID of the index endpoint to query.
        deployed_index_id: ID of the deployed index on that endpoint.
        feature_vector: Query embedding.
        datapoint_id: ID attached to the query datapoint.
        neighbor_count: Number of neighbors requested for the query.
        location: Region used in the endpoint resource path (previously
            hard-coded to ``us-central1``, which remains the default).

    Returns:
        The response of ``match_client.find_neighbors``.
    """
    query = aiplatform_v1.FindNeighborsRequest.Query(
        datapoint=aiplatform_v1.IndexDatapoint(
            datapoint_id=datapoint_id,
            feature_vector=feature_vector
        ),
        neighbor_count=neighbor_count
    )

    request = aiplatform_v1.FindNeighborsRequest(
        index_endpoint=f"projects/{project_id}/locations/{location}/"
                       f"indexEndpoints/{index_endpoint_id}",
        deployed_index_id=deployed_index_id,
        queries=[query],
        # Ask for full datapoints so the neighbors carry their restricts.
        return_full_datapoint=True
    )

    return match_client.find_neighbors(request)

In [12]:
# Embed a single query image; the text input is currently commented out.
feature_vector = image_text_to_embedding(
    # text="keyboard",
    image_path="image_02.jpg"
)

In [13]:
# Query the deployed index with the embedding computed in the previous cell.
response = find_neighbor(
    project_id="rl-llm-dev",
    index_endpoint_id=index_endpoint_id,
    deployed_index_id=deployed_index_id,
    feature_vector=feature_vector
)

In [14]:
# Number of neighbors returned for the first (and only) query.
len(response.nearest_neighbors[0].neighbors)

0

In [15]:
# Print the id/link of the first neighbor only; nothing is printed when the
# query returned no neighbors.
first_neighbor = next(iter(response.nearest_neighbors[0].neighbors), None)
if first_neighbor is not None:
    restricts = first_neighbor.datapoint.restricts
    result = {
        "id": restricts[0].allow_list[0],
        "link": restricts[1].allow_list[0]}
    print(result)

# Delete Datapoints

In [None]:
import pandas as pd
# Product spreadsheet; the next cell reads its `title`, `id` and `link` columns.
input_file = pd.read_excel("csm-dataset-update.xlsx")

In [None]:
# Build one index datapoint per product (rows 0-9 and image_00.jpg .. image_09.jpg).
metadata = []

for i in range(10):
    title = input_file.title[i]
    # str() keeps the record JSON-serializable even when pandas hands back
    # NumPy scalars; `datapoint_id` also avoids shadowing the builtin `id`.
    datapoint_id = str(input_file.id[i])
    link = str(input_file.link[i])

    with open(f"image_0{i}.jpg", "rb") as f:
        image_contents = f.read()

    # The file is already closed here; the request only needs the bytes.
    response = embeddings_client.get_embedding(
        text=title,
        image_bytes=image_contents)

    reduced_vector = reduce_embedding_dimesion(
        vector_image=response.image_embedding,
        vector_text=response.text_embedding
    )

    metadata.append(
        {
            "id": datapoint_id,
            "embedding": reduced_vector,
            # The id and link are stored as restricts alongside the vector.
            "restricts": [
                {
                    "namespace": "id",
                    "allow": [datapoint_id]
                },
                {
                    "namespace": "link",
                    "allow": [link]
                },
            ]
        }
    )

In [None]:
import json

# Serialize the datapoints as JSON Lines: one JSON object per line.
with open("metadata.json", "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in metadata)

In [None]:
# Call :removeDatapoints on an index with datapoint IDs "0" through "9".
# NOTE(review): the index ID in this URL differs from the one returned by the
# create/list cells earlier in the notebook - confirm it is the intended index.
!curl -X POST\
-H "Content-Type: application/json" \
-H "Authorization: Bearer `gcloud auth print-access-token`" \
"https://us-central1-aiplatform.googleapis.com/v1/projects/rl-llm-dev/locations/us-central1/indexes/2965906227646693376:removeDatapoints" \
-d '{datapoint_ids: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]}'

# List Indexes

In [None]:
# GET the indexes collection of project rl-llm-dev in us-central1.
! curl -X GET \
    -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    "https://us-central1-aiplatform.googleapis.com/v1/projects/rl-llm-dev/locations/us-central1/indexes"

# Upsert elements

In [None]:
import pandas as pd
# Product spreadsheet; the next cell reads its `title` column.
input_file = pd.read_excel("csm-dataset-update.xlsx")

In [None]:
# Datapoints for upsert: only an id (the row number as a string) and the vector.
metadata = []

for i in range(10):
    title = input_file.title[i]
    with open(f"image_0{i}.jpg", "rb") as f:
        image_contents = f.read()

    response = embeddings_client.get_embedding(text=title, image_bytes=image_contents)
    reduced_vector = reduce_embedding_dimesion(
        vector_text=response.text_embedding,
        vector_image=response.image_embedding,
    )

    metadata.append(dict(id=str(i), embedding=list(reduced_vector)))

In [None]:
import json

# Serialize the datapoints as JSON Lines: one JSON object per line.
with open("metadata.json", "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in metadata)

In [95]:
from google.cloud import firestore
# Firestore client created with no explicit arguments.
db = firestore.Client()

In [111]:
# Fetch one document from the "website_search" collection by its ID.
document = db.collection("website_search").document("0HAhrsZVJDLpy1rAMLaL").get()

In [112]:
# Work on a copy of the "conversation" value rather than the object held by the to_dict() result.
doc_list = document.to_dict()["conversation"].copy()

In [113]:
# Display the copied conversation.
doc_list

[{'message': 'gaming', 'author': 'user'},
 {'message': 'There are many different types of gaming chairs available, including chairs with footrests, chairs with speakers, and chairs with adjustable heights. One popular option is the GTRACING Gaming Chair with Footrest Speakers Video Game Chair Bluetooth Music Heavy Duty Ergonomic Computer Office Desk Chair Red, which is available for $159.99. This chair features two Bluetooth speakers, a strong metal frame, and a comfortable seat cushion.',
  'author': 'system'},
 'test']

In [108]:
# Append a placeholder entry to the local copy.
doc_list.append("test")

In [109]:
# Display the conversation after the append.
doc_list

[{'message': 'gaming', 'author': 'user'},
 {'message': 'There are many different types of gaming chairs available, including chairs with footrests, chairs with speakers, and chairs with adjustable heights. One popular option is the GTRACING Gaming Chair with Footrest Speakers Video Game Chair Bluetooth Music Heavy Duty Ergonomic Computer Office Desk Chair Red, which is available for $159.99. This chair features two Bluetooth speakers, a strong metal frame, and a comfortable seat cushion.',
  'author': 'system'},
 'test']

In [110]:
# Write the modified list back to the same document's "conversation" field,
# passing merge=True to set().
db.collection("website_search").document("0HAhrsZVJDLpy1rAMLaL").set(
    {"conversation":doc_list}, merge=True
)

update_time {
  seconds: 1697845017
  nanos: 13510000
}