In [6]:
from pathlib import Path
from model import CLIPModel
from PIL import Image
from preprocessing.tokenizer import CLIPTokenizer
from preprocessing.image_processor import CLIPImageProcessor

MODEL: str = "openai/clip-vit-base-patch32"
CONVERTED_CKPT_PATH: str = f"weights/mlx/{MODEL}"

# Build the checkpoint directory path once; the model, the tokenizer and the
# image processor are all loaded from this same location.
ckpt_dir = Path(CONVERTED_CKPT_PATH)

# Load pretrained MLX CLIPModel
mlx_clip = CLIPModel.from_pretrained(ckpt_dir)
# Load input tokenizer and image (pre)processor
tokenizer = CLIPTokenizer.from_pretrained(ckpt_dir)
img_processor = CLIPImageProcessor.from_pretrained(ckpt_dir)
# Preprocess the input. PIL opens image files lazily, so the images are
# consumed inside a `with` block that closes the underlying files as soon as
# the processor is done with them.
with Image.open("cats.jpeg") as cats_img, Image.open("dog.jpeg") as dog_img:
    clip_input = {
        "input_ids": tokenizer(["a photo of a cat", "a photo of a dog"]),
        "pixel_values": img_processor([cats_img, dog_img]),
    }
# Compute the output; return_loss=True is passed so the result also carries
# the `loss` value printed below.
mlx_out = mlx_clip(
    **clip_input,
    return_loss=True,
)
# Print some embeddings and the CLIP loss
print("text embeddings:")
print(mlx_out.text_embeds)
print("image embeddings:")
print(mlx_out.image_embeds)
print(f"CLIP loss: {mlx_out.loss}")

text embeddings:
array([[0.0148391, 0.0069961, -0.0233705, ..., -0.0508463, -0.0437914, 0.00330403],
       [0.00870739, 0.0258293, -0.0386577, ..., -0.0546769, -0.0241999, 0.0111514]], dtype=float32)
image embeddings:
array([[-0.00978788, 0.0127698, -0.0274189, ..., 0.0802634, -0.00135005, 0.0237339],
       [0.017399, 0.0232256, -0.0505955, ..., 0.0478406, 0.0470153, 0.00132057]], dtype=float32)
CLIP loss: array(0.00763702, dtype=float32)
