# Prerequisites

Install the necessary libraries

In [None]:
!pip install psycopg2
!pip install matplotlib
!pip install google-cloud-storage
!pip install Pillow

Define your project ID, location, and bucket name

In [None]:
# Define project information
# Replace each bracketed placeholder with your own value before running.
# The trailing `# @param {type:"string"}` markers are Colab form annotations.
# LOCATION is used both for the Vertex AI client and as the bucket location,
# so it must be valid for both.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "[your-location]"  # @param {type:"string"}
BUCKET_NAME = "[your-bucket-name]" # @param {type:"string"}

# Define AlloyDB connection parameters
# These are passed unchanged to psycopg2.connect() in the database cells.
# NOTE(review): PASSWORD is kept in plain text in the notebook — avoid sharing
# or committing a filled-in copy.
HOST = "[your-alloydb-host-ip]" # @param {type:"string"}
DATABASE_NAME = "[your-database-name]" # @param {type:"string"}
USER = "[your-user]" # @param {type:"string"}
PASSWORD = "[your-password]" # @param {type:"string"}

Create local authentication credentials for your user account

In [None]:
# Shell out to gcloud to create local Application Default Credentials for your
# user account; the Google Cloud client libraries used later rely on them.
!gcloud auth application-default login

Validate the google_ml_integration extension in AlloyDB (version 1.4.4 or later)

In [None]:
import psycopg2

# Connection parameters
conn_params = {
    'host': HOST,
    'database': DATABASE_NAME,
    'user': USER,
    'password': PASSWORD
}

# Look up the installed extension and its version.
# psycopg2's `with connection` block does not close the connection, so it is
# closed explicitly in `finally` to avoid leaking it when the query fails.
conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        cursor.execute(
            """
            SELECT extname, extversion
                FROM pg_extension
                WHERE extname = %s;
            """,
            ("google_ml_integration",),
        )
        # None when the extension is not installed in this database
        result = cursor.fetchone()
finally:
    conn.close()

if result is None:
    print("Extension 'google_ml_integration' is not installed in this database")
else:
    print(f"Extension validation result: {result}")
    print(f"Installed version: {result[1]} (version 1.4.4 or later is required)")

Integrate AlloyDB with Vertex AI (https://cloud.google.com/alloydb/docs/ai/configure-vertex-ai)

# Generate synthetic data

Generate sample images for the demo

In [None]:
from google import genai
from tqdm import tqdm

# Gen AI SDK client in Vertex AI mode, bound to the project and location
# defined in the configuration cell.
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
)

In [None]:
import os
# Create images directory if it doesn't exist
os.makedirs("images", exist_ok=True)

# Prompts for the catalog images: cars, dogs, bottles, suitcases, hats,
# mobile phones and tables, plus a few scenes combining them with a setting.
prompts = [
    "red car",
    "blue car",
    "green car",
    "yellow car",
    "black car",
    "white car",
    "sports car",
    "vintage car",
    "convertible car",
    "luxury car",
    "small dog",
    "big dog",
    "fluffy dog",
    "happy dog",
    "sad dog",
    "running dog",
    "sleeping dog",
    "coke bottle on a table",
    "coke bottle in the snow",
    "coke bottle on the beach",
    "open suitcase",
    "closed suitcase",
    "leather suitcase",
    "red suitcase",
    "black suitcase",
    "straw hat",
    "baseball hat",
    "winter hat",
    "fancy hat",
    "red hat",
    "blue mobile phone",
    "black mobile phone",
    "broken mobile phone",
    "old mobile phone",
    "new mobile phone",
    "wooden table",
    "metal table",
    "glass table",
    "round table",
    "square table",
    "red table",
    "blue table",
    "green table",
    "yellow table",
    "black and white table",
    "car in a city",
    "dog in a park",
    "coke bottles on a shelf",
    "suitcase on a conveyor belt",
    "hat on a head",
    "mobile phone on a desk",
    "table with food on it"
]

# Generate output filenames (item_1.png ... item_N.png, in prompt order)
output_files = [f"images/item_{i+1}.png" for i in range(len(prompts))]

# Generate and save one image per prompt, reporting each file as it is written.
for prompt, output_file in zip(prompts, output_files):
    response = client.models.generate_images(
        model="imagen-4.0-generate-preview-05-20",
        prompt=prompt,
    )
    # A response may carry no images (for example if the output was filtered);
    # skip that prompt rather than aborting the whole batch.
    if not response.generated_images:
        print(f"No image returned for prompt {prompt!r}; skipping {output_file}")
        continue
    generated = response.generated_images[0].image
    generated.save(output_file)
    print(f"Created output image {output_file} using {len(generated.image_bytes)} bytes")

Created output image using 1532822 bytes


Generate input images for the demo

In [None]:
import os

# Create input_images directory if it doesn't exist
os.makedirs("input_images", exist_ok=True)

# Prompts for the query images used later in the similarity searches
prompts = [
    "A bottle of coke",
    "wooden table",
    "a dog playing with a ball"
]

# Generate output filenames (input_item_1.png ... input_item_N.png, in prompt order)
output_files = [f"input_images/input_item_{i+1}.png" for i in range(len(prompts))]

# Generate and save one image per prompt, reporting each file as it is written.
for prompt, output_file in zip(prompts, output_files):
    response = client.models.generate_images(
        model="imagen-4.0-generate-preview-05-20",
        prompt=prompt,
    )
    # A response may carry no images (for example if the output was filtered);
    # skip that prompt rather than aborting the whole batch.
    if not response.generated_images:
        print(f"No image returned for prompt {prompt!r}; skipping {output_file}")
        continue
    generated = response.generated_images[0].image
    generated.save(output_file)
    print(f"Created output image {output_file} using {len(generated.image_bytes)} bytes")

Created output image using 1371299 bytes


Create a GCS Bucket to store the catalog of images

In [None]:
# Create a new GCS bucket
import os
import subprocess

# Build the gsutil invocation as an argument list so the configured values are
# passed verbatim instead of being re-parsed by a shell.
bucket_creation_command = [
    "gsutil", "mb", "-p", PROJECT_ID, "-l", LOCATION, f"gs://{BUCKET_NAME}",
]

try:
    # Capture gsutil's output so the failure reason can be shown in the notebook
    result = subprocess.run(bucket_creation_command, capture_output=True, text=True)
except OSError as e:
    # Raised when gsutil itself cannot be started (e.g. it is not on PATH)
    print(f"Error creating bucket: {e}")
else:
    if result.returncode == 0:
        print(f"Successfully created bucket: gs://{BUCKET_NAME}")
    else:
        print("Failed to create bucket")
        print(result.stderr.strip())



Upload the sample images to the bucket

In [None]:
# Upload images from local directory to GCS bucket
import glob
import os
import subprocess

# Get list of images in the images directory (sorted for a stable upload order)
image_files = sorted(glob.glob("images/*"))
if not image_files:
    print("No files found in 'images/'; nothing to upload")

# Upload each image to GCS bucket
for image_file in image_files:
    # Get just the filename without path
    filename = os.path.basename(image_file)

    # Construct the GCS destination path
    destination = f"gs://{BUCKET_NAME}/images/{filename}"

    # Argument list (no shell) so paths are passed to gsutil verbatim
    upload_command = ["gsutil", "cp", image_file, destination]

    try:
        result = subprocess.run(upload_command, capture_output=True, text=True)
    except OSError as e:
        # gsutil could not be started at all; later files would fail the same way
        print(f"Error uploading {filename}: {e}")
        break

    if result.returncode == 0:
        print(f"Successfully uploaded {filename} to {destination}")
    else:
        print(f"Failed to upload {filename}")
        print(result.stderr.strip())



Upload the input images to the bucket

In [None]:
# Upload input images from local directory to GCS bucket
import glob
import os
import subprocess

# Get list of images in the input_images directory (sorted for a stable upload order)
image_files = sorted(glob.glob("input_images/*"))
if not image_files:
    print("No files found in 'input_images/'; nothing to upload")

# Upload each image to GCS bucket
for image_file in image_files:
    # Get just the filename without path
    filename = os.path.basename(image_file)

    # Construct the GCS destination path
    destination = f"gs://{BUCKET_NAME}/input_images/{filename}"

    # Argument list (no shell) so paths are passed to gsutil verbatim
    upload_command = ["gsutil", "cp", image_file, destination]

    try:
        result = subprocess.run(upload_command, capture_output=True, text=True)
    except OSError as e:
        # gsutil could not be started at all; later files would fail the same way
        print(f"Error uploading {filename}: {e}")
        break

    if result.returncode == 0:
        print(f"Successfully uploaded {filename} to {destination}")
    else:
        print(f"Failed to upload {filename}")
        print(result.stderr.strip())

Create a table in AlloyDB to store the embeddings along with the metadata

In [None]:
# Connect to AlloyDB and create table for storing image embeddings
import psycopg2

# Connection parameters
conn_params = {
    'host': HOST,
    'database': DATABASE_NAME,
    'user': USER,
    'password': PASSWORD
}

# One row per catalog image: its GCS URI and its 1408-dimensional embedding
create_table_query = """
CREATE TABLE IF NOT EXISTS images (
    image_id SERIAL PRIMARY KEY,
    image_uri TEXT NOT NULL,
    embedding vector(1408)
);
"""

conn = None
try:
    # Establish connection
    conn = psycopg2.connect(**conn_params)
    with conn.cursor() as cursor:
        # Create vector extension if it doesn't exist
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")

        # Create table for storing image embeddings
        cursor.execute(create_table_query)

    # Commit the changes
    conn.commit()

    print("Successfully created table 'images'")

except psycopg2.Error as e:
    print(f"Error creating table: {e}")

finally:
    # Close the connection on both the success and the error path
    if conn is not None:
        conn.close()


# Generate the image embeddings and load them into AlloyDB

Load the images table with the embeddings and metadata

In [None]:
# Generate an embedding for every image in the bucket and store it in AlloyDB
import psycopg2
from google.cloud import storage

conn_params = {
    'host': HOST,
    'database': DATABASE_NAME,
    'user': USER,
    'password': PASSWORD
}

# List images in bucket
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)
blobs = list(bucket.list_blobs(prefix='images/'))

# ai.image_embedding is given both the image URI and its MIME type
embedding_query = """
SELECT ai.image_embedding(
    model_id => 'multimodalembedding@001',
    image => %s,
    mimetype => %s
);
"""

# Upsert on the primary key: image_id is the blob's position in the listing,
# so re-running this cell refreshes existing rows instead of failing with a
# duplicate-key error.
insert_query = """
INSERT INTO images (image_id, image_uri, embedding)
VALUES (%s, %s, %s)
ON CONFLICT (image_id) DO UPDATE
    SET image_uri = EXCLUDED.image_uri,
        embedding = EXCLUDED.embedding;
"""

loaded_count = 0
conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        # Insert embeddings for each image
        for i, blob in enumerate(blobs, 1):
            # Ensure we only process files with a valid content type to avoid errors
            if not blob.content_type or not blob.content_type.startswith('image/'):
                print(f"Skipping non-image file: {blob.name}")
                continue

            image_uri = f'gs://{BUCKET_NAME}/{blob.name}'

            # Pass both the URI and the blob's content type
            cursor.execute(embedding_query, (image_uri, blob.content_type))
            embedding = cursor.fetchone()[0]

            # Insert into images table
            cursor.execute(insert_query, (i, image_uri, embedding))
            loaded_count += 1

    conn.commit()
finally:
    # Closing without a commit discards a partially loaded batch
    conn.close()

# Report only the rows actually written; skipped blobs are not counted
print(f"Successfully loaded embeddings for {loaded_count} images")

Create a ScaNN index on the multimodal embedding column

In [None]:
# Create ScaNN index on the embedding column, using cosine distance
# NOTE(review): num_leaves = 5 is presumably sized for this small demo
# catalog — confirm and tune for real data.
create_index_query = """
CREATE INDEX IF NOT EXISTS image_scann_idx
ON images
USING scann (embedding cosine)
WITH (num_leaves = 5);
"""

# Connect to AlloyDB
conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        # Create extension alloydb_scann if it doesn't exist
        cursor.execute("CREATE EXTENSION IF NOT EXISTS alloydb_scann;")
        cursor.execute(create_index_query)
    conn.commit()
finally:
    # Close the connection even when index creation fails
    conn.close()

print("Successfully created ScaNN index on embedding column")


# Perform Similarity Search

Perform a similarity search based on input images:

Input Image 1:

In [None]:
# Query image for this search
query_image_uri = f"gs://{BUCKET_NAME}/input_images/input_item_1.png"

# Generate embedding and perform similarity search in one query.
# The image URI is a bound parameter rather than text spliced into the SQL.
# `<=>` is the distance the rows are ordered by; the reported score is 1 - distance.
similarity_query = """
WITH query_embedding AS (
    SELECT ai.image_embedding(
        model_id => 'multimodalembedding@001',
        image => %s,
        mimetype => 'image/png'
    )::vector(1408) AS embedding
)
SELECT
    image_id,
    image_uri,
    1 - (images.embedding <=> query_embedding.embedding) AS similarity_score
FROM
    images,
    query_embedding
ORDER BY
    images.embedding <=> query_embedding.embedding ASC
LIMIT 5;
"""

conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        cursor.execute(similarity_query, (query_image_uri,))
        # Rows of (image_id, image_uri, similarity_score), used by the display cell
        results = cursor.fetchall()
finally:
    # Close the connection even when the query fails
    conn.close()

print("\nTop 5 similar images:")
for row in results:
    print(f"ID: {row[0]}, Image URI: {row[1]}, Similarity Score: {row[2]:.4f}")

Display the top 3 results:

In [None]:
# Display the top 3 results
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from google.cloud import storage
import io
from PIL import Image

# Initialize storage client
storage_client = storage.Client()

# Create a figure with subplots for the top 3 results
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Top 3 Similar Images', fontsize=16)

# Switch every panel off first: if the search returned fewer than 3 rows, the
# unused panels stay blank instead of showing empty axes with tick marks.
for ax in axes:
    ax.axis('off')

if not results:
    print("No results to display")

for i, row in enumerate(results[:3]):
    image_uri = row[1]
    similarity_score = row[2]

    # Split gs://bucket-name/path/to/file into bucket name and object path
    bucket_name, blob_name = image_uri.replace('gs://', '').split('/', 1)

    # Download the image bytes from GCS and decode them with PIL
    image_data = storage_client.bucket(bucket_name).blob(blob_name).download_as_bytes()
    image = Image.open(io.BytesIO(image_data))

    # Display the image
    axes[i].imshow(image)
    axes[i].set_title(f'Rank {i+1}\nSimilarity: {similarity_score:.4f}')

plt.tight_layout()
plt.show()

# Also show the query image for reference
print("\nQuery image:")
query_uri = f'gs://{BUCKET_NAME}/input_images/input_item_1.png'
bucket_name, blob_name = query_uri.replace('gs://', '').split('/', 1)

query_image_data = storage_client.bucket(bucket_name).blob(blob_name).download_as_bytes()
query_image = Image.open(io.BytesIO(query_image_data))

plt.figure(figsize=(5, 5))
plt.imshow(query_image)
plt.title('Query Image')
plt.axis('off')
plt.show()

Input Image 2:

In [None]:
# Query image for this search
query_image_uri = f"gs://{BUCKET_NAME}/input_images/input_item_2.png"

# Generate embedding and perform similarity search in one query.
# The image URI is a bound parameter rather than text spliced into the SQL.
# `<=>` is the distance the rows are ordered by; the reported score is 1 - distance.
similarity_query = """
WITH query_embedding AS (
    SELECT ai.image_embedding(
        model_id => 'multimodalembedding@001',
        image => %s,
        mimetype => 'image/png'
    )::vector(1408) AS embedding
)
SELECT
    image_id,
    image_uri,
    1 - (images.embedding <=> query_embedding.embedding) AS similarity_score
FROM
    images,
    query_embedding
ORDER BY
    images.embedding <=> query_embedding.embedding ASC
LIMIT 5;
"""

conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        cursor.execute(similarity_query, (query_image_uri,))
        # Rows of (image_id, image_uri, similarity_score), used by the display cell
        results = cursor.fetchall()
finally:
    # Close the connection even when the query fails
    conn.close()

print("\nTop 5 similar images:")
for row in results:
    print(f"ID: {row[0]}, Image URI: {row[1]}, Similarity Score: {row[2]:.4f}")

Display the top 3 results:

In [None]:
# Display the top 3 results
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from google.cloud import storage
import io
from PIL import Image

# Initialize storage client
storage_client = storage.Client()

# Create a figure with subplots for the top 3 results
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Top 3 Similar Images', fontsize=16)

# Switch every panel off first: if the search returned fewer than 3 rows, the
# unused panels stay blank instead of showing empty axes with tick marks.
for ax in axes:
    ax.axis('off')

if not results:
    print("No results to display")

for i, row in enumerate(results[:3]):
    image_uri = row[1]
    similarity_score = row[2]

    # Split gs://bucket-name/path/to/file into bucket name and object path
    bucket_name, blob_name = image_uri.replace('gs://', '').split('/', 1)

    # Download the image bytes from GCS and decode them with PIL
    image_data = storage_client.bucket(bucket_name).blob(blob_name).download_as_bytes()
    image = Image.open(io.BytesIO(image_data))

    # Display the image
    axes[i].imshow(image)
    axes[i].set_title(f'Rank {i+1}\nSimilarity: {similarity_score:.4f}')

plt.tight_layout()
plt.show()

# Also show the query image for reference
print("\nQuery image:")
query_uri = f'gs://{BUCKET_NAME}/input_images/input_item_2.png'
bucket_name, blob_name = query_uri.replace('gs://', '').split('/', 1)

query_image_data = storage_client.bucket(bucket_name).blob(blob_name).download_as_bytes()
query_image = Image.open(io.BytesIO(query_image_data))

plt.figure(figsize=(5, 5))
plt.imshow(query_image)
plt.title('Query Image')
plt.axis('off')
plt.show()

Input Image 3:

In [None]:
# Query image for this search
query_image_uri = f"gs://{BUCKET_NAME}/input_images/input_item_3.png"

# Generate embedding and perform similarity search in one query.
# The image URI is a bound parameter rather than text spliced into the SQL.
# `<=>` is the distance the rows are ordered by; the reported score is 1 - distance.
similarity_query = """
WITH query_embedding AS (
    SELECT ai.image_embedding(
        model_id => 'multimodalembedding@001',
        image => %s,
        mimetype => 'image/png'
    )::vector(1408) AS embedding
)
SELECT
    image_id,
    image_uri,
    1 - (images.embedding <=> query_embedding.embedding) AS similarity_score
FROM
    images,
    query_embedding
ORDER BY
    images.embedding <=> query_embedding.embedding ASC
LIMIT 5;
"""

conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        cursor.execute(similarity_query, (query_image_uri,))
        # Rows of (image_id, image_uri, similarity_score), used by the display cell
        results = cursor.fetchall()
finally:
    # Close the connection even when the query fails
    conn.close()

print("\nTop 5 similar images:")
for row in results:
    print(f"ID: {row[0]}, Image URI: {row[1]}, Similarity Score: {row[2]:.4f}")

Display the top 3 results:

In [None]:
# Display the top 3 results
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from google.cloud import storage
import io
from PIL import Image

# Initialize storage client
storage_client = storage.Client()

# Create a figure with subplots for the top 3 results
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Top 3 Similar Images', fontsize=16)

# Switch every panel off first: if the search returned fewer than 3 rows, the
# unused panels stay blank instead of showing empty axes with tick marks.
for ax in axes:
    ax.axis('off')

if not results:
    print("No results to display")

for i, row in enumerate(results[:3]):
    image_uri = row[1]
    similarity_score = row[2]

    # Split gs://bucket-name/path/to/file into bucket name and object path
    bucket_name, blob_name = image_uri.replace('gs://', '').split('/', 1)

    # Download the image bytes from GCS and decode them with PIL
    image_data = storage_client.bucket(bucket_name).blob(blob_name).download_as_bytes()
    image = Image.open(io.BytesIO(image_data))

    # Display the image
    axes[i].imshow(image)
    axes[i].set_title(f'Rank {i+1}\nSimilarity: {similarity_score:.4f}')

plt.tight_layout()
plt.show()

# Also show the query image for reference
print("\nQuery image:")
query_uri = f'gs://{BUCKET_NAME}/input_images/input_item_3.png'
bucket_name, blob_name = query_uri.replace('gs://', '').split('/', 1)

query_image_data = storage_client.bucket(bucket_name).blob(blob_name).download_as_bytes()
query_image = Image.open(io.BytesIO(query_image_data))

plt.figure(figsize=(5, 5))
plt.imshow(query_image)
plt.title('Query Image')
plt.axis('off')
plt.show()

Perform a similarity search based on text:

In [None]:
# Free-text description to search the image catalog with
query_text = 'A dog running in a park'

# Generate embedding and perform similarity search in one query.
# The query text is a bound parameter rather than text written into the SQL.
# `<=>` is the distance the rows are ordered by; the reported score is 1 - distance.
similarity_query = """
WITH query_embedding AS (
    SELECT ai.text_embedding(
        model_id => 'multimodalembedding@001',
        content => %s
    )::vector(1408) AS embedding
)
SELECT
    image_id,
    image_uri,
    1 - (images.embedding <=> query_embedding.embedding) AS similarity_score
FROM
    images,
    query_embedding
ORDER BY
    images.embedding <=> query_embedding.embedding ASC
LIMIT 5;
"""

conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        cursor.execute(similarity_query, (query_text,))
        # Rows of (image_id, image_uri, similarity_score), used by the display cell
        results = cursor.fetchall()
finally:
    # Close the connection even when the query fails
    conn.close()

print("\nTop 5 similar images:")
for row in results:
    print(f"ID: {row[0]}, Image URI: {row[1]}, Similarity Score: {row[2]:.4f}")

Display the top 3 results:

In [None]:
# Display the top 3 results
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from google.cloud import storage
import io
from PIL import Image

# Initialize storage client
storage_client = storage.Client()

# Create a figure with subplots for the top 3 results
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Top 3 Similar Images', fontsize=16)

# Switch every panel off first: if the search returned fewer than 3 rows, the
# unused panels stay blank instead of showing empty axes with tick marks.
for ax in axes:
    ax.axis('off')

if not results:
    print("No results to display")

for i, row in enumerate(results[:3]):
    image_uri = row[1]
    similarity_score = row[2]

    # Split gs://bucket-name/path/to/file into bucket name and object path
    bucket_name, blob_name = image_uri.replace('gs://', '').split('/', 1)

    # Download the image bytes from GCS and decode them with PIL
    image_data = storage_client.bucket(bucket_name).blob(blob_name).download_as_bytes()
    image = Image.open(io.BytesIO(image_data))

    # Display the image
    axes[i].imshow(image)
    axes[i].set_title(f'Rank {i+1}\nSimilarity: {similarity_score:.4f}')

plt.tight_layout()
plt.show()

# Also show the query text for reference
print("\nQuery text:")
query_text = 'A dog running in a park'
print(query_text)