In [1]:
! pip3 install --upgrade google-cloud-aiplatform \
                        google-cloud-storage \
                        'google-cloud-bigquery[pandas]' \
                        tqdm

Collecting google-cloud-aiplatform
  Obtaining dependency information for google-cloud-aiplatform from https://files.pythonhosted.org/packages/b4/ab/339400d48a1751f689d13e68ecf21428176eb41a821032d47eaa312f404c/google_cloud_aiplatform-1.36.0-py2.py3-none-any.whl.metadata
  Using cached google_cloud_aiplatform-1.36.0-py2.py3-none-any.whl.metadata (27 kB)
Collecting google-cloud-storage
  Obtaining dependency information for google-cloud-storage from https://files.pythonhosted.org/packages/04/72/71b1b531cefa1daff8f6a2a70b4d4fa18dd4da851b5486d53578811b0838/google_cloud_storage-2.13.0-py2.py3-none-any.whl.metadata
  Downloading google_cloud_storage-2.13.0-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting google-cloud-bigquery[pandas]
  Obtaining dependency information for google-cloud-bigquery[pandas] from https://files.pythonhosted.org/packages/51/8c/bf168c5450431734d67ed4db3e62e2c81fbf2c7d8c0ff3153808e9ab480f/google_cloud_bigquery-3.13.0-py2.py3-none-any.whl.metadata
  Downloading google_

In [2]:
import math
from typing import Any, Generator

import pandas as pd
from google.cloud import bigquery
from typing import List, Optional

# Load the "Vertex AI Embeddings for Text" model
from vertexai.preview.language_models import TextEmbeddingModel
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple

import numpy as np
from tqdm.auto import tqdm
import random

PROJECT_ID = "viki-dev-app-wsky"
REGION = "us-east4"
BUCKET_URI = "gs://viki-ai-provisional-dev/icd10-search-balki/"
# The number of dimensions for the tensorflow universal sentence encoder.
# If other embedder is used, the dimensions would probably need to change.
DIMENSIONS = 512
DISPLAY_NAME = "balki-icd10-index"
DESCRIPTION = "icd10-index"
EMBEDDING_DIR = BUCKET_URI + "/" + "icd10-index"
DEPLOYED_INDEX_ID = "balki-icd10-index"

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

2023-11-07 10:07:25.806174: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    for i in range(0, len(sentences), batch_size):
        yield sentences[i : i + batch_size]


def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 10, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:

    embeddings_list: List[List[float]] = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(
                executor.submit(functools.partial(encode_texts_to_embeddings), batch)
            )
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for sentence, embedding in zip(sentences, embeddings_list)
    ]
    embeddings_list_successful = np.squeeze(
        np.stack([embedding for embedding in embeddings_list if embedding is not None])
    )
    return is_successful, embeddings_list_successful

# Define an embedding method that uses the model
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        return [None for _ in range(len(sentences))]

def query_bigquery_chunks(
    max_rows: int, rows_per_chunk: int, start_chunk: int = 0
) -> Generator[pd.DataFrame, Any, None]:
    for offset in range(start_chunk, max_rows, rows_per_chunk):
        query = QUERY_TEMPLATE.format(limit=rows_per_chunk, offset=offset)
        query_job = client.query(query)
        rows = query_job.result()
        df = rows.to_dataframe()
        df["code_with_desc"] = df.code + "\n" + df.desc
        yield df


In [3]:
client = bigquery.Client(project=PROJECT_ID)

QUERY_TEMPLATE = """
        SELECT distinct q.code, q.desc
        FROM (SELECT * FROM `viki-dev-app-wsky.genai.icd10_codes`) AS q 
        LIMIT 200 OFFSET 0;
        """
df = next(query_bigquery_chunks(max_rows=1000, rows_per_chunk=1000))

# Examine the data
df.head()

# Encode a subset of questions for validation
codes = df.code_with_desc.tolist()[:500]
is_successful, code_embeddings = encode_text_to_embedding_batched(
    sentences=df.code.tolist()[:500]
)

# Filter for successfully embedded sentences
codes = np.array(codes)[is_successful]

DIMENSIONS = len(code_embeddings[0])

print(DIMENSIONS)

code_index = random.randint(0, 99)

print(f"Query code = {codes[code_index]}")


  0%|          | 0/40 [00:00<?, ?it/s]

768
Query code = A0472
  Enterocolitis due to Clostridium difficile, not specified as recurrent


In [4]:
# Get similarity scores for each embedding by using dot-product.
#scores = np.dot(code_embeddings[code_index], code_embeddings.T)
query_embedding = encode_texts_to_embeddings(sentences=["intestinal infections"])
scores = np.dot(query_embedding[0], code_embeddings.T)

# Print top 20 matches
for index, (code, score) in enumerate(
    sorted(zip(codes, scores), key=lambda x: x[1], reverse=True)[:20]
):
    print(f"\t{index}: {code}: {score}")

TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'

In [5]:
import tempfile
from pathlib import Path

# Create temporary file to write embeddings to
embeddings_file_path = Path(tempfile.mkdtemp())

print(f"Embeddings directory: {embeddings_file_path}")

Embeddings directory: /var/folders/v9/f8ypf65s18v2n9q58yfvl20h0000gq/T/tmp8h7ofuuj


In [6]:
import gc
import json

BQ_NUM_ROWS = 10
BQ_CHUNK_SIZE = 1000
BQ_NUM_CHUNKS = math.ceil(BQ_NUM_ROWS / BQ_CHUNK_SIZE)

START_CHUNK = 0

# Create a rate limit of 300 requests per minute. Adjust this depending on your quota.
API_CALLS_PER_SECOND = 300 / 60
# According to the docs, each request can process 5 instances per request
ITEMS_PER_REQUEST = 5

# Loop through each generated dataframe, convert
for i, df in tqdm(
    enumerate(
        query_bigquery_chunks(
            max_rows=BQ_NUM_ROWS, rows_per_chunk=BQ_CHUNK_SIZE, start_chunk=START_CHUNK
        )
    ),
    total=BQ_NUM_CHUNKS - START_CHUNK,
    position=-1,
    desc="Chunk of rows from BigQuery",
):
    # Create a unique output file for each chunk
    chunk_path = embeddings_file_path.joinpath(
        f"{embeddings_file_path.stem}_{i+START_CHUNK}.json"
    )
    with open(chunk_path, "a") as f:
        id_chunk = df.code

        # Convert batch to embeddings
        is_successful, question_chunk_embeddings = encode_text_to_embedding_batched(
            sentences=df.code_with_desc,
            api_calls_per_second=API_CALLS_PER_SECOND,
            batch_size=ITEMS_PER_REQUEST,
        )

        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [str(value) for value in embedding],
                }
            )
            + "\n"
            for id, embedding in zip(id_chunk[is_successful], question_chunk_embeddings)
        ]
        f.writelines(embeddings_formatted)

        # Delete the DataFrame and any other large data structures
        del df
        gc.collect()

Chunk of rows from BigQuery:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

In [7]:
! gcloud auth login
#! gcloud auth application-default login
remote_folder = f"{BUCKET_URI}/{embeddings_file_path.stem}/"
print(remote_folder)
print(embeddings_file_path)
! gsutil -m cp -r {embeddings_file_path}/* {remote_folder} 

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=1rh803CZJirAbn6wtm0ntFhVbQGM4I&access_type=offline&code_challenge=7VEZeVztwgp_TFDyYJ1xtF-WoI74QsCQGVQ4Wbk-Xm4&code_challenge_method=S256


You are now logged in as [Balki.Nakshatrala@wellsky.com].
Your current project is [viki-dev-app-wsky].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID
gs://viki-ai-provisional-dev/icd10-search-balki//tmp8h7ofuuj/
/var/folders/v9/f8ypf65s18v2n9q58yfvl20h0000gq/T/tmp8h7o

In [8]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [None]:
tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=remote_folder,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=80,
    description=DESCRIPTION,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/145042810266/locations/us-east4/indexes/7916290205940711424/operations/989297681719361536


In [None]:
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME

In [None]:
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

In [18]:
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DISPLAY_NAME,
    public_endpoint_enabled=True,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/145042810266/locations/us-east4/indexEndpoints/8048126048157564928/operations/8633243255506468864
MatchingEngineIndexEndpoint created. Resource name: projects/145042810266/locations/us-east4/indexEndpoints/8048126048157564928
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/145042810266/locations/us-east4/indexEndpoints/8048126048157564928')


In [4]:
pip install db-dtypes

Collecting db-dtypes
  Downloading db_dtypes-1.1.1-py2.py3-none-any.whl (14 kB)
Installing collected packages: db-dtypes
Successfully installed db-dtypes-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
from google.cloud import aiplatform_v1
from google.cloud import aiplatform
REGION = "us-east4"

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)
ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)

tree_ah_index1  = aiplatform.MatchingEngineIndex(
    index_name='2569954498298511360'
)
INDEX_RESOURCE_NAME = tree_ah_index1.resource_name

index_client = aiplatform_v1.IndexServiceClient(
    client_options=dict(api_endpoint=ENDPOINT)
)

embed = encode_texts_to_embeddings(["hello world"])

insert_datapoints_payload = aiplatform_v1.IndexDatapoint(
    datapoint_id="101",
    feature_vector=embed[0],
    restricts=[{"namespace": "class", "allow_list": ["101"]}],
    crowding_tag=aiplatform_v1.IndexDatapoint.CrowdingTag(crowding_attribute="b"),
)

upsert_request = aiplatform_v1.UpsertDatapointsRequest(
    index=INDEX_RESOURCE_NAME, datapoints=[insert_datapoints_payload]
)

index_client.upsert_datapoints(request=upsert_request)




In [23]:
import os
import sys

import numpy as np
from google.cloud.aiplatform.matching_engine import MatchingEngineIndexEndpoint
from google.cloud import aiplatform_v1beta1 as vertexai

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf

tf.keras.utils.disable_interactive_logging()


# 指定されたパスの画像ファイルを EfficientNetB0 でベクトル化する
def file_to_embedding(model: tf.keras.Model, path: str) -> list[float]:
    raw = tf.io.read_file(path)
    image = tf.image.decode_jpeg(raw, channels=3)
    prediction = model.predict(np.array([image.numpy()]))
    return prediction[0].tolist()


class Matcher:
    def __init__(self, index_endpoint_name: str, deployed_index_id: str):
        self._index_endpoint_name = index_endpoint_name
        self._deployed_index_id = deployed_index_id

        self._client = vertexai.MatchServiceClient(
            client_options={"api_endpoint": self._public_endpoint()}
        )

    # Matching Engine にリクエストして、
    # 与えられたエンベディングの近似最近傍探索を行う
    def find_neighbors(self, embedding: list[float], neighbor_count: int):
        datapoint = vertexai.IndexDatapoint(
            datapoint_id="dummy-id",
            feature_vector=embedding
        )
        query = vertexai.FindNeighborsRequest.Query(datapoint=datapoint)
        request = vertexai.FindNeighborsRequest(
            index_endpoint=self._index_endpoint_name,
            deployed_index_id=self._deployed_index_id,
            queries=[query],
        )

        resp = self._client.find_neighbors(request)

        return resp.nearest_neighbors[0].neighbors

    # IndexEndpoint の public endpoint を取得する
    def _public_endpoint(self) -> str:
        endpoint = MatchingEngineIndexEndpoint(
            index_endpoint_name=self._index_endpoint_name
        )
        return endpoint.gca_resource.public_endpoint_domain_name


def search_for_text(index_endpoint_name: str,
                             deployed_index_id: str,
                             text: str) -> None:
    embedding = encode_texts_to_embeddings([text])

    matcher = Matcher(index_endpoint_name, deployed_index_id)
    neighbors = matcher.find_neighbors(embedding[0], 10)

    for neighbor in neighbors:
        datapoint_id = neighbor.datapoint.datapoint_id
        distance = neighbor.distance
        print(f"{datapoint_id}\tdistance={distance}")


In [27]:
index_endpoint_name = 'projects/145042810266/locations/us-east4/indexEndpoints/7298276710200377344'
deployed_index_id = 'balki_search_1699469904710'

search_for_text(index_endpoint_name=index_endpoint_name,
                         deployed_index_id=deployed_index_id,
                         text="Hello")

101	distance=0.8356250524520874
A320	distance=0.603960394859314
A073	distance=0.6031581163406372
A074	distance=0.6029283404350281
A33	distance=0.6026632785797119
A217	distance=0.5976741313934326
A070	distance=0.592897891998291
A0104	distance=0.592700183391571
A067	distance=0.589692234992981
A071	distance=0.5884808897972107
