# Multimodal Embedding Models

## Setup

In [5]:
import io
import base64
import boto3
import json
import os
from PIL import Image
from typing import List

### Define an AWS Bedrock client for making API calls

In [6]:
# Create a boto3 session with no explicit arguments.
session = boto3.Session()

# Only the service name is passed here: the rest of the client configuration
# comes from environment variables set up in this notebook environment.
client = session.client("bedrock-runtime")

## Define helper methods for calling AWS Bedrock

### adapted from: [Classification with Image Embedding with AWS Bedrock Titan Multimodal and Vector DB](https://medium.com/@chikim79/classification-with-image-embedding-with-aws-bedrock-titan-multimodal-and-vector-db-5966cc456582)

In [7]:
def readFileAsBase64(file_path):
    """Load an image and return it as a base64-encoded JPEG string.

    The image at ``file_path`` is converted to RGB, resized to 200x200 pixels
    (kept small so the API call does not fail), written to an in-memory
    buffer as JPEG, and the buffer contents are base64-encoded.

    Args:
        file_path: Path to the image file to open.

    Returns:
        The base64 text of the JPEG bytes, decoded as UTF-8.
    """
    jpeg_buffer = io.BytesIO()
    with Image.open(file_path) as source:
        # Convert to RGB first, then shrink; the result is saved as JPEG.
        thumbnail = source.convert("RGB").resize((200, 200))
        thumbnail.save(jpeg_buffer, format="JPEG")
    encoded = base64.b64encode(jpeg_buffer.getvalue())
    return encoded.decode("utf8")

def construct_bedrock_body(base64_string: str, output_embedding_length: int = 1024) -> str:
    """Construct the JSON request body for the Bedrock image-embedding call.

    Args:
        base64_string: Base64-encoded image data, sent as ``inputImage``.
        output_embedding_length: Requested length of the returned vector,
            sent as ``embeddingConfig.outputEmbeddingLength``. Defaults to
            1024, which keeps existing one-argument callers unchanged.
            NOTE(review): the Titan Multimodal Embeddings docs list 256, 384
            and 1024 as accepted values -- confirm before using other sizes.

    Returns:
        The request body serialized as a JSON string.
    """
    payload = {
        "inputImage": base64_string,
        "embeddingConfig": {"outputEmbeddingLength": output_embedding_length},
    }
    return json.dumps(payload)

def get_image_embedding(file_path: str) -> List[float]:
    """Get the embedding vector for an image from the Bedrock API.

    The image is encoded by ``readFileAsBase64``, wrapped in a request body by
    ``construct_bedrock_body``, and sent to the ``amazon.titan-embed-image-v1``
    model through the module-level ``client``.

    Args:
        file_path: Path to the image file to embed.

    Returns:
        The value stored under the ``"embedding"`` key of the JSON response.

    Raises:
        ValueError: If invoking the model or decoding its response fails. The
            underlying exception is attached as ``__cause__``.
    """
    base64_string = readFileAsBase64(file_path)
    body = construct_bedrock_body(base64_string)

    try:
        response = client.invoke_model(
            body=body,
            modelId="amazon.titan-embed-image-v1",
            accept="application/json",
            contentType="application/json",
        )

        # The response body is a stream; read it fully before parsing the JSON.
        response_body = json.loads(response.get("body").read())
        return response_body["embedding"]
    except Exception as e:
        # Chain explicitly so the original traceback is kept as the cause.
        raise ValueError(f"Error raised by image embedding endpoint: {e}") from e

## Generate the embedding from a sample image

In [8]:
# Embed a sample image, referenced by a relative path.
embedding = get_image_embedding("../../images/mteb_leaderboard.png")

In [9]:
# The length should match the outputEmbeddingLength requested in construct_bedrock_body.
print(len(embedding))

1024


In [10]:
# Echo the raw vector; as the last expression in the cell, the notebook displays it.
embedding

[0.020839129,
 -0.014455973,
 0.00495164,
 -0.0362338,
 0.031540304,
 0.026283586,
 -0.030601604,
 0.02008817,
 -0.049938817,
 0.029099686,
 0.029475166,
 0.00905845,
 0.00971554,
 0.011311329,
 0.0025579561,
 0.06007677,
 -0.032479003,
 -0.0040598754,
 -0.030977085,
 -0.017553682,
 -0.0066647665,
 -0.0017013929,
 0.12541026,
 0.003215046,
 0.066084445,
 -0.01839851,
 0.0362338,
 0.0053975224,
 0.026283586,
 -0.035295103,
 0.052191693,
 -0.014362102,
 -0.0071341163,
 0.019243341,
 0.016239502,
 -0.0104665,
 0.0010795045,
 -0.0077912062,
 0.034168664,
 0.041678257,
 0.048061416,
 -0.03360544,
 0.032291263,
 0.027785506,
 -0.037923463,
 0.026659066,
 0.02290427,
 -0.035670582,
 0.026659066,
 0.0409273,
 0.0102787595,
 0.0040598754,
 0.032479003,
 -0.013047923,
 -0.026095847,
 -0.011921484,
 -0.029287426,
 -0.025532627,
 0.0020768726,
 -0.014549843,
 -0.05632197,
 -0.04261696,
 0.023373619,
 -0.034544144,
 0.0077442713,
 -0.0009093652,
 -0.08035268,
 0.009386995,
 0.020745259,
 -0.0638315

## Exercises

- Take what you've learned from `embeddings/01_comparing_embeddings` and experiment with comparing embeddings of images and/or text inputs.

### Discussion Questions

- Images and text "living" in the same semantic space is powerful! What are some of the implications of adding multimodal capability to an embedding model?
- Search around the Internet for other modalities that people are talking about. Do any other modalities look intriguing for your collections or materials?