# Multimodal Embedding Models

## Setup

### Define a multimodal embedding model

In [None]:
from typing import List

import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, CLIPModel

# Hugging Face Hub checkpoint id shared by the three loaders below.
model_name = "openai/clip-vit-large-patch14"

# The processor is what get_image_embedding feeds images through, and the
# tokenizer is what get_text_embeddings feeds strings through; the model
# turns either kind of prepared input into feature vectors.
processor = AutoProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

In [None]:
def get_image_embedding(img: Image.Image) -> torch.Tensor:
    """Embed a single image with the module-level CLIP model.

    Args:
        img: The PIL image to embed. It is preprocessed by the module-level
            ``processor`` before being passed to the model.

    Returns:
        The first entry of ``model.get_image_features(...)``, i.e. the
        feature tensor for ``img`` (the only image in the batch).
    """
    inputs = processor(images=img, return_tensors="pt")
    # Inference only: without no_grad the returned tensor carries an autograd
    # graph that nothing here uses, which just holds on to extra memory.
    with torch.no_grad():
        return model.get_image_features(**inputs)[0]

def get_text_embeddings(texts: List[str]) -> torch.Tensor:
    """Embed a batch of strings with the module-level CLIP model.

    Args:
        texts: The strings to embed. They are tokenized together and padded
            to a common length.

    Returns:
        The tensor returned by ``model.get_text_features(...)``; iterating
        it yields one feature tensor per input string, in input order.
    """
    # truncation=True clips over-long strings to the tokenizer's configured
    # maximum length instead of letting the model call fail on them; CLIP's
    # text encoder only accepts a fixed, fairly short context.
    inputs = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    )
    # Inference only: skip building an autograd graph nobody will use.
    with torch.no_grad():
        return model.get_text_features(**inputs)

## Generate embeddings

### Generate an embedding from a sample image

In [None]:
file_path = "../../images/coffee.png"
# Open the file in a context manager so its handle is closed as soon as the
# pixel data has been copied; convert() and resize() each return a new image
# that no longer depends on the open file.
with Image.open(file_path) as source_image:
    img = source_image.convert('RGB').resize((200, 200))
image_embedding = get_image_embedding(img)
# Last expression of the cell: the notebook renders the image inline.
img

### Generate embeddings from text strings

In [None]:
# Candidate captions to score against the sample image.
texts = [
    "a cup of black coffee",
    "a laptop computer",
    "a caffe latte",
    "a caffe latte on a plate in front of a laptop",
    "a laptop showing code",
    "a laptop showing a movie",
    "a laptop on a wooden table",
    "a laptop on an airplane tray table",
    "Godzilla riding a roller coaster",
]

# One embedding per caption, in the same order as `texts`.
text_embeddings = get_text_embeddings(texts)

## Comparing image and text embeddings

In [None]:
from langchain_community.utils.math import cosine_similarity

# detach() drops any autograd graph so the tensors can be viewed as NumPy arrays.
image_vector = image_embedding.detach().numpy()
# One row per entry in `texts`, in the same order.
text_vectors = text_embeddings.detach().numpy()

# Score the image against every text in a single batched call rather than
# one call per text. The first argument holds a single vector, so the result
# has a single row; that row lines up with `texts`.
similarities = cosine_similarity([image_vector], text_vectors)[0]

# Pair each caption with its score; zip avoids indexing into a parallel list.
results = [
    {'text': text, 'similarity': similarity}
    for text, similarity in zip(texts, similarities)
]

### Sort results with higher similarity first

In [None]:
# Reorder `results` in place so the best-matching caption comes first.
results.sort(
    key=lambda entry: entry['similarity'],
    reverse=True,
)

In [None]:
# Report one line per caption, in the current order of `results`.
for entry in results:
    caption, score = entry["text"], entry["similarity"]
    print(f'Similarity between image and "{caption}": {score}')

## Exercises

- Take what you've learned from `embeddings/01_comparing_embeddings` and experiment with comparing embeddings of images and/or text inputs.

### Discussion Questions

- Images and text "living" in the same semantic space is powerful! What are some of the implications of adding multimodal capability to an embedding model?
- Search around the Internet for other modalities that people are talking about. Do any other modalities look intriguing for your collections or materials?