In [None]:
import torch
import coremltools as ct
import clip
import numpy as np
from PIL import Image

# 1. Export TextEncoder

In [None]:
from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast

# Checkpoint that provides both the text encoder weights and its tokenizer.
model_id = "openai/clip-vit-base-patch32"

# return_dict=False is passed so the forward pass yields a plain tuple of
# tensors, which is what gets recorded as the traced outputs below.
model = CLIPTextModelWithProjection.from_pretrained(model_id, return_dict=False).eval()
tokenizer = CLIPTokenizerFast.from_pretrained(model_id)

# Only the token ids are handed to the model while tracing; the prompt is
# tokenized without padding here.
example_input = tokenizer("a photo of a cat", return_tensors="pt").data['input_ids']

traced_model = torch.jit.trace(model, example_input)

In [None]:
# Fixed prompt length of the exported model. The original model uses 77, but
# with 77 the validation fails (see the details at the end of the notebook);
# 76 works fine with the app.
max_seq_length = 76

# Convert the traced text encoder to an ML Program with one int32 token-id
# input ("prompt") and two float32 outputs, then write the package to disk.
text_encoder_model = ct.convert(
    traced_model,
    inputs=[
        ct.TensorType(name="prompt", shape=[1, max_seq_length], dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="embOutput", dtype=np.float32),
        ct.TensorType(name="embOutput2", dtype=np.float32),
    ],
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
)
text_encoder_model.save("TextEncoder_float32_test.mlpackage")

## Validate export precision

In [None]:
# Load the exported package from disk so the check exercises the saved artifact.
# It gets its own name instead of rebinding `model`, so the PyTorch model
# loaded earlier stays available and re-running cells cannot silently pick up
# the Core ML object in its place.
text_encoder = ct.models.MLModel('TextEncoder_float32_test.mlpackage')

# Choose a tokenizer, here we use the clip tokenizer.
# The tokenized prompt is cut down to the sequence length the export accepts.
text = clip.tokenize("a photo of a cat")
text = text[:, :max_seq_length]

# # Or use CLIPTokenizerFast
# text = tokenizer("a photo of a cat", return_tensors="pt", padding="max_length", max_length=max_seq_length)
# text = text.data['input_ids'].to(torch.int32)

# The Core ML input was declared as int32, so hand it an explicit int32 NumPy
# array rather than relying on implicit tensor conversion inside predict().
predictions = text_encoder.predict({'prompt': text.numpy().astype(np.int32)})

# Reference output from the traced PyTorch model; gradients are not needed for
# a pure inference comparison.
with torch.no_grad():
    out = traced_model(text)

In [None]:
# Show the first ten components of the first output from each runtime so the
# two can be compared by eye.
torch_preview = out[0][0, :10]
coreml_preview = predictions['embOutput'][0, :10]

print('PyTorch TextEncoder ckpt out for "a photo of a cat":\n>>>', torch_preview)
print('\nCoreML TextEncoder ckpt out for "a photo of a cat":\n>>>', coreml_preview)

You can see that there is some loss in precision, but it is still acceptable.

# 2. Export ImageEncoder

In [None]:
from transformers import CLIPVisionModelWithProjection, CLIPProcessor

# Same checkpoint as the text encoder; here the vision tower and the matching
# preprocessing pipeline are loaded.
model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPVisionModelWithProjection.from_pretrained(model_id, return_dict=False).eval()

# Run the sample picture through the processor and keep only the pixel tensor,
# which serves as the example input for tracing.
img = Image.open("love-letters-and-hearts.jpg")
example_input = processor(images=img, return_tensors="pt")['pixel_values']
traced_model = torch.jit.trace(model, example_input)

In [None]:
class NormalizedImageEncoder(torch.nn.Module):
    """Apply per-channel mean/std normalization, then run the image encoder.

    Core ML's ImageType preprocessing only supports a single scalar ``scale``
    for all channels, so the per-channel standard deviations cannot be
    expressed there exactly. Folding the normalization into the traced graph
    keeps the exported model's preprocessing identical to the PyTorch one.

    Args:
        encoder: module that maps normalized pixel values to the encoder outputs.
        mean: per-channel means (one value per colour channel).
        std: per-channel standard deviations (one value per colour channel).
    """

    def __init__(self, encoder, mean, std):
        super().__init__()
        self.encoder = encoder
        # Shaped (1, C, 1, 1) so they broadcast over a channel-first batch.
        self.register_buffer("mean", torch.tensor(mean, dtype=torch.float32).view(1, -1, 1, 1))
        self.register_buffer("std", torch.tensor(std, dtype=torch.float32).view(1, -1, 1, 1))

    def forward(self, pixel_values):
        """Normalize ``pixel_values`` (expected in [0, 1]) and encode them."""
        return self.encoder((pixel_values - self.mean) / self.std)


normalized_encoder = NormalizedImageEncoder(
    model,
    processor.image_processor.image_mean,
    processor.image_processor.image_std,
).eval()

# Any tensor with the right shape and a [0, 1] value range works as a tracing
# example; the graph does not depend on the actual pixel values.
traced_normalized_encoder = torch.jit.trace(normalized_encoder, torch.rand_like(example_input))

# Core ML now only has to map 8-bit pixel values to [0, 1]; mean/std handling
# happens inside the model itself, so no bias is needed.
scale = 1.0 / 255.0

image_input_scale = ct.ImageType(name="colorImage",
                           color_layout=ct.colorlayout.RGB,
                           shape=example_input.shape,
                           scale=scale,
                           channel_first=True,)

image_encoder_model = ct.convert(
            traced_normalized_encoder,
            convert_to="mlprogram",
            minimum_deployment_target=ct.target.iOS16,
            inputs=[image_input_scale],
            outputs=[ct.TensorType(name="embOutput", dtype=np.float32),
                     ct.TensorType(name="embOutput2", dtype=np.float32)],
        )

image_encoder_model.save("ImageEncoder_float32.mlpackage")

## Validate export

In [None]:
import torchvision.transforms as transforms

image_encoder = ct.models.MLModel('ImageEncoder_float32.mlpackage')

# Force three RGB channels: a grayscale, palette, RGBA or CMYK file would
# otherwise break the HWC -> CHW transpose and the three-channel Normalize
# below, and would not match the RGB layout of the Core ML input.
imgPIL = Image.open("love-letters-and-hearts.jpg").convert("RGB")
imgPIL = imgPIL.resize((224, 224), Image.BICUBIC)

# Build the PyTorch reference input from the very same resized picture.
img_np = np.asarray(imgPIL).astype(np.float32) # (224, 224, 3)
img_np = img_np[np.newaxis, :, :, :] # (1, 224, 224, 3)
img_np = np.transpose(img_np, [0, 3, 1, 2]) # (1, 3, 224, 224)
img_np = img_np / 255.0
torch_tensor_input = torch.from_numpy(img_np)
transform_model = torch.nn.Sequential(
        transforms.Normalize(mean=processor.image_processor.image_mean,
                             std=processor.image_processor.image_std),
)

# The Core ML model receives the PIL image directly; its scaling/normalization
# is part of the exported model.
predictions = image_encoder.predict({'colorImage': imgPIL})

# Reference forward pass; gradients are not needed for an inference comparison.
with torch.no_grad():
    out = traced_model(transform_model(torch_tensor_input))
print("PyTorch ImageEncoder ckpt out for jpg:\n>>>", out[0][0, :10])
print("\nCoreML ImageEncoder ckpt out for jpg:\n>>>", predictions['embOutput'][0, :10])

## Test result for max_seq_length = 77

In [None]:
from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast

# Rebuild the text encoder from scratch so this experiment starts from the
# same traced model as the export at the top of the notebook.
model_id = "openai/clip-vit-base-patch32"
model = CLIPTextModelWithProjection.from_pretrained(model_id, return_dict=False).eval()
tokenizer = CLIPTokenizerFast.from_pretrained(model_id)

example_input = tokenizer("a photo of a cat", return_tensors="pt").data['input_ids']
traced_model = torch.jit.trace(model, example_input)

# If max_seq_length is 77 as in the original model, the validation fails, see
# details below. Setting max_seq_length to 76 works fine with the app.
max_seq_length = 77
text_encoder_model = ct.convert(
    traced_model,
    inputs=[
        ct.TensorType(name="prompt", shape=[1, max_seq_length], dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="embOutput", dtype=np.float32),
        ct.TensorType(name="embOutput2", dtype=np.float32),
    ],
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
)

# Choose a tokenizer, here we use the clip tokenizer
text = clip.tokenize("a photo of a cat")[:, :max_seq_length]

# Run the same prompt through the freshly converted model and the traced
# PyTorch model.
predictions = text_encoder_model.predict({'prompt': text})
out = traced_model(text)

print('PyTorch TextEncoder ckpt out for "a photo of a cat":\n>>>', out[0][0, :10])
print('\nCoreML TextEncoder ckpt out for "a photo of a cat":\n>>>', predictions['embOutput'][0, :10])