In [1]:
import torch
import coremltools as ct
import clip
import numpy as np
from PIL import Image

scikit-learn version 1.2.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.


# 1. Export TextEncoder

In [2]:
from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast


model_id = "openai/clip-vit-base-patch32"
model = CLIPTextModelWithProjection.from_pretrained(model_id, return_dict=False)
tokenizer = CLIPTokenizerFast.from_pretrained(model_id)
model.eval()

example_input = tokenizer("a photo of a cat", return_tensors="pt")
example_input = example_input.data['input_ids']

traced_model = torch.jit.trace(model, example_input)

  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):


In [3]:
max_seq_length = 76 # if max_seq_length is 77 as in the original model, the validation fails, see details at the end of the notebook. Set max_seq_length to 76 works fine with the app.
text_encoder_model = ct.convert(
            traced_model,
            convert_to="mlprogram",
            minimum_deployment_target=ct.target.iOS16,
            inputs=[ct.TensorType(name="prompt",
                                 shape=[1,max_seq_length],
                                 dtype=np.int32)],
            outputs=[ct.TensorType(name="embOutput", dtype=np.float32),
                     ct.TensorType(name="embOutput2", dtype=np.float32)],
        )
text_encoder_model.save("TextEncoder_float32_test.mlpackage")

Tuple detected at graph output. This will be flattened in the converted model.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/830 [00:00<?, ? ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  98%|█████████▊| 811/830 [00:00<00:00, 2660.33 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 828/830 [00:00<00:00, 2589.67 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 104.58 passes/s]
Running MIL default pipeline: 100%|██████████| 66/66 [00:09<00:00,  7.24 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 11/11 [00:00<00:00, 147.47 passes/s]


## Validate export  precision

In [4]:
# Load the model
model = ct.models.MLModel('TextEncoder_float32_test.mlpackage')

# Choose a tokenizer, here we use the clip tokenizer
text = clip.tokenize("a photo of a cat")
text = text[:,:max_seq_length]

# # Or use CLIPTokenizerFast
# text = tokenizer("a photo of a cat", return_tensors="pt", padding="max_length", max_length=max_seq_length)
# text = text.data['input_ids'].to(torch.int32)

predictions = model.predict({'prompt': text})
out = traced_model(text)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
print("PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n>>>", out[0][0, :10])
print("\nCoreML TextEncoder ckpt out for \"a photo of a cat\":\n>>>", predictions['embOutput'][0, :10])

PyTorch TextEncoder ckpt out for "a photo of a cat":
>>> tensor([ 0.1555,  0.0733, -0.2448, -0.2212, -0.1934,  0.2052, -0.3175, -0.7824,
        -0.1816,  0.1943], grad_fn=<SliceBackward0>)

CoreML TextEncoder ckpt out for "a photo of a cat":
>>> [ 0.15560171  0.0732335  -0.24512495 -0.22117633 -0.19336982  0.20523793
 -0.3182205  -0.78206545 -0.18144566  0.19457956]


You can see that there is some loss in precision, but it is still acceptable.

# 2. Export ImageEncoder

In [7]:
from transformers import CLIPVisionModelWithProjection, CLIPProcessor

model_id = "openai/clip-vit-base-patch32"
model = CLIPVisionModelWithProjection.from_pretrained(model_id, return_dict=False)
processor = CLIPProcessor.from_pretrained(model_id)
model.eval()

img = Image.open("love-letters-and-hearts.jpg")
example_input = processor(images=img, return_tensors="pt")
example_input = example_input['pixel_values']
traced_model = torch.jit.trace(model, example_input)

  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):


In [8]:
bias = [-processor.image_processor.image_mean[i]/processor.image_processor.image_std[i] for i in range(3)]
scale = 1.0 / (processor.image_processor.image_std[0] * 255.0)

image_input_scale = ct.ImageType(name="colorImage",
                           color_layout=ct.colorlayout.RGB,
                           shape=example_input.shape,
                           scale=scale, bias=bias,
                           channel_first=True,)

image_encoder_model = ct.convert(
            traced_model,
            convert_to="mlprogram",
            minimum_deployment_target=ct.target.iOS16,
            inputs=[image_input_scale],
            outputs=[ct.TensorType(name="embOutput", dtype=np.float32), 
                     ct.TensorType(name="embOutput2", dtype=np.float32)],
        )

image_encoder_model.save("ImageEncoder_float32.mlpackage")

Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 698/700 [00:00<00:00, 1354.45 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 74.55 passes/s]
Running MIL default pipeline: 100%|██████████| 66/66 [00:14<00:00,  4.54 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 11/11 [00:00<00:00, 220.59 passes/s]


## Validate export

In [9]:
import torchvision.transforms as transforms

image_encoder = ct.models.MLModel('ImageEncoder_float32.mlpackage')
imgPIL = Image.open("love-letters-and-hearts.jpg")
imgPIL = imgPIL.resize((224, 224), Image.BICUBIC)

img_np = np.asarray(imgPIL).astype(np.float32) # (224, 224, 3)
img_np = img_np[np.newaxis, :, :, :] # (1, 224, 224, 3)
img_np = np.transpose(img_np, [0, 3, 1, 2]) # (1, 3, 224, 224)
img_np = img_np / 255.0
torch_tensor_input = torch.from_numpy(img_np)
transform_model = torch.nn.Sequential(
        transforms.Normalize(mean=processor.image_processor.image_mean,
                             std=processor.image_processor.image_std),
)

predictions = image_encoder.predict({'colorImage': imgPIL})
out = traced_model(transform_model(torch_tensor_input))
print("PyTorch ImageEncoder ckpt out for jpg:\n>>>", out[0][0, :10])
print("\nCoreML ImageEncoder ckpt out for jpg:\n>>>", predictions['embOutput'][0, :10])

PyTorch ImageEncoder ckpt out for jpg:
>>> tensor([ 0.2489, -0.0874, -0.2821, -0.1859, -0.4129, -0.4474,  0.3093,  0.3759,
         1.0730,  0.1773], grad_fn=<SliceBackward0>)

CoreML ImageEncoder ckpt out for jpg:
>>> [ 0.24853516 -0.07476807 -0.30615234 -0.20996094 -0.4177246  -0.42163086
  0.34545898  0.47338867  1.1083984   0.1899414 ]


## Test result for max_length = 77

In [10]:
from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast


model_id = "openai/clip-vit-base-patch32"
model = CLIPTextModelWithProjection.from_pretrained(model_id, return_dict=False)
tokenizer = CLIPTokenizerFast.from_pretrained(model_id)
model.eval()

example_input = tokenizer("a photo of a cat", return_tensors="pt")
example_input = example_input.data['input_ids']

traced_model = torch.jit.trace(model, example_input)

max_seq_length = 77 # if max_seq_length is 77 as in the original model, the validation fails, see details below. Set max_seq_length to 76 works fine with the app.
text_encoder_model = ct.convert(
            traced_model,
            convert_to="mlprogram",
            minimum_deployment_target=ct.target.iOS16,
            inputs=[ct.TensorType(name="prompt",
                                 shape=[1,max_seq_length],
                                 dtype=np.int32)],
            outputs=[ct.TensorType(name="embOutput", dtype=np.float32),
                     ct.TensorType(name="embOutput2", dtype=np.float32)],
        )

# Choose a tokenizer, here we use the clip tokenizer
text = clip.tokenize("a photo of a cat")
text = text[:,:max_seq_length]

predictions = text_encoder_model.predict({'prompt': text})
out = traced_model(text)

print("PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n>>>", out[0][0, :10])
print("\nCoreML TextEncoder ckpt out for \"a photo of a cat\":\n>>>", predictions['embOutput'][0, :10])

  if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/830 [00:00<?, ? ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  88%|████████▊ | 732/830 [00:00<00:00, 1993.68 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 828/830 [00:00<00:00, 2065.24 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 129.31 passes/s]
Running MIL default pipeline: 100%|██████████| 66/66 [00:10<00:00,  6.39 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 11/11 [00:00<00:00, 182.44 passes/s]


PyTorch TextEncoder ckpt out for "a photo of a cat":
>>> tensor([ 0.1555,  0.0733, -0.2448, -0.2212, -0.1934,  0.2052, -0.3175, -0.7824,
        -0.1816,  0.1943], grad_fn=<SliceBackward0>)

CoreML TextEncoder ckpt out for "a photo of a cat":
>>> [-0.066312    0.17878246  0.40718645 -0.08806399  0.26841    -0.22685118
  0.2679821  -1.7103907  -0.33836532  0.28941655]
