In [1]:
import sys
import os
# Put the parent of the current working directory on the import path
# (presumably the repository root that holds the `src` package — confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry every time.
_parent_dir = os.path.dirname(os.path.abspath(os.getcwd()))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
import os

# Enable verbose dynamic-shape logging in PyTorch. This is set in a cell that
# runs before `import torch`, presumably so the setting is picked up when the
# library initialises its logging — confirm if the cell order changes.
os.environ.update({"TORCH_LOGS": "+dynamic"})

In [3]:
import torch
import coremltools as ct

from src.mslm.utils.setup_train import build_model
from src.mslm.utils.config_loader import ConfigLoader

scikit-learn version 1.7.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.5.1. Disabling scikit-learn conversion API.
Torch version 2.7.1+cu128 has not been tested with coremltools. You may run into unexpected errors. Torch 2.5.0 is the most recent version that has been tested.
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Start from the hyper-parameters stored in the TOML config, then override the
# input/output sizes for this export run.
model_parameters = ConfigLoader("../config/model/config.toml").load_config()
model_parameters.update(
    input_size=133 * 2,  # matches the trailing (133, 2) dims of the example keypoints input
    output_size=3072,
    # use_checkpoint=False,
)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# These three values select the checkpoint directory and the export directory:
# .../{version}/{checkpoint}/{epoch}/
version, checkpoint, epoch = 106, 1, 9

# Display the effective parameters as the cell output.
model_parameters

{'output_size': 3072,
 'hidden_size': 1792,
 'nhead': 16,
 'ff_dim': 2816,
 'n_layers': 10,
 'encoder_dropout': 0.45,
 'multihead_dropout': 0.4,
 'sequential_dropout': 0.6,
 'pool_dim': 256,
 'input_size': 266}

In [None]:
def load_model(map_location="cpu"):
    """Build the model from ``model_parameters`` and load the selected checkpoint.

    The checkpoint path is derived from the notebook-level ``version``,
    ``checkpoint`` and ``epoch`` variables.

    Args:
        map_location: Forwarded to ``torch.load``. Defaults to ``"cpu"`` so a
            checkpoint written from a GPU run can be read on a machine without
            CUDA, and so the checkpoint tensors are not staged on the GPU when
            the caller only wants a CPU model. The weights are then copied
            into the freshly built model by ``load_state_dict``; callers still
            choose the final device with ``.to(...)``.

    Returns:
        The model returned by ``build_model`` with the checkpoint's
        ``"model_state"`` weights loaded.

    Raises:
        FileNotFoundError: If no checkpoint file exists at the derived path.
        KeyError: If the loaded checkpoint has no ``"model_state"`` entry.
    """
    model_location = f"../../outputs/checkpoints/{version}/{checkpoint}/{epoch}/checkpoint.pth"
    if not os.path.exists(model_location):
        raise FileNotFoundError(
            f"Model not found {model_location}")

    model = build_model(**model_parameters)

    # SECURITY: weights_only=False unpickles arbitrary Python objects. Only
    # load checkpoints produced by a trusted training run.
    state_dict = torch.load(
        model_location,
        map_location=map_location,
        weights_only=False,
    )

    model.load_state_dict(state_dict["model_state"])

    return model

In [13]:
# Load the checkpointed model onto the selected device, then switch it to
# evaluation mode. `eval()` returns the module, so it is also the cell output.
model_test = load_model().to(device)
model_test.eval()

Model Parameters:  {'input_size': 266, 'hidden_size': 1792, 'output_size': 3072, 'nhead': 16, 'ff_dim': 2816, 'n_layers': 10, 'max_seq_length': 301, 'pool_dim': 256, 'encoder_dropout': 0.45, 'multihead_dropout': 0.4, 'sequential_dropout': 0.6}
MHARoPE kwargs {'device': None, 'dtype': None}
dim: 1792 num_heads: 16 dim rope 112




Imitator(
  (linear_feat): Sequential(
    (0): Linear(in_features=266, out_features=1792, bias=True)
    (1): GELU(approximate='none')
    (2): LayerNorm((1792,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=1792, out_features=896, bias=True)
    (4): GELU(approximate='none')
    (5): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
  )
  (conv1): Conv1d(896, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (act1): GELU(approximate='none')
  (conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
  (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (act2): GELU(approximate='none')
  (linear_hidden): Linear(in_features=256, out_features=1792, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-9): 10 x TransformerEncoderLayerRoPE(
        (self_attn): MultiheadAttentionRoPE(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1792, out_feat

TypeError: 'OptimizedModule' object is not subscriptable

In [13]:
# Dummy batch used for the smoke-test forward pass and as the example for the
# exporters: 1 sample, 179 frames, 133 x 2 values per frame.
# The values come from a locally seeded generator so the outputs shown below
# are reproducible across kernel restarts without touching the global RNG.
_example_rng = torch.Generator().manual_seed(0)
example_input = torch.randn(1, 179, 133, 2, generator=_example_rng).to(device)

# All-False boolean mask over the 179 frames
# (presumably False means "not padding" — confirm against the model's forward).
example_input_mask = torch.zeros((1, 179), dtype=torch.bool).to(device)

In [14]:
# Smoke-test forward pass on the dummy batch; gradient tracking is disabled
# because only the output values are needed here.
with torch.no_grad():
    output = model_test(example_input, example_input_mask)

In [15]:
# Bring the result back to host memory and show it as the cell output.
output_cpu = output.to("cpu")
output_cpu

tensor([[[-0.0317,  0.1019, -0.1475,  ..., -0.0188, -0.1541,  0.0189],
         [-0.2179, -0.2696,  2.0419,  ..., -0.8005,  0.0937,  0.9494],
         [-0.2375, -0.5725,  1.0683,  ..., -0.0072, -1.1862,  0.6337],
         ...,
         [-0.1980,  0.4222,  0.0597,  ..., -0.8337, -0.1530, -0.3015],
         [ 0.0068,  0.0682,  0.4659,  ..., -0.0224, -0.2580, -0.1173],
         [ 0.2847,  0.5836,  0.6099,  ..., -0.6654, -0.8022,  1.0683]]])

In [16]:
# Shape of the model output for the 179-frame dummy batch.
output_cpu.shape

torch.Size([1, 30, 3072])

In [17]:
# NumPy view of the CPU output tensor, displayed as the cell output.
output_np = output_cpu.numpy()
output_np

array([[[-0.03165762,  0.10191493, -0.14751945, ..., -0.01876208,
         -0.15405497,  0.01890181],
        [-0.21793239, -0.26959455,  2.0419297 , ..., -0.8005126 ,
          0.09369794,  0.9493704 ],
        [-0.23749627, -0.5724557 ,  1.068256  , ..., -0.00720272,
         -1.1861858 ,  0.63368315],
        ...,
        [-0.19800192,  0.42222905,  0.05972207, ..., -0.83373713,
         -0.1530048 , -0.30145833],
        [ 0.00675472,  0.06821718,  0.46587744, ..., -0.0224178 ,
         -0.25804543, -0.11734705],
        [ 0.28465277,  0.5835949 ,  0.60991955, ..., -0.66544235,
         -0.8021887 ,  1.0683386 ]]], shape=(1, 30, 3072), dtype=float32)

In [18]:
# Load the saved Gemma embedding matrix from disk and display its shape.
gemma_embeddings_path = "../../local_models/gemma_W_embeds/gemma_embedding_matrix.pt"
gemma_model = torch.load(gemma_embeddings_path)
gemma_model.shape

torch.Size([262400, 2048])

In [19]:
# Exported artifacts go into a directory keyed by version/checkpoint/epoch.
output_location = f"../../outputs/model_exports/{version}/{checkpoint}/{epoch}"
os.makedirs(output_location, exist_ok=True)

# A separate CPU copy of the model, in evaluation mode, is used for export.
model_iphone = load_model().to("cpu").eval()

# The example tensors are moved to CPU to match the export model.
# NOTE: this rebinds `example_input` / `example_input_mask`; re-running the
# earlier forward-pass cell afterwards would feed CPU tensors to `model_test`,
# which may live on a CUDA device.
example_input, example_input_mask = example_input.cpu(), example_input_mask.cpu()

Model Parameters:  {'input_size': 266, 'hidden_size': 1792, 'output_size': 3072, 'nhead': 16, 'ff_dim': 2304, 'n_layers': 8, 'max_seq_length': 30, 'pool_dim': 512, 'encoder_dropout': 0.1, 'multihead_dropout': 0.35, 'sequential_dropout': 0.2}
MHARoPE kwargs {'device': None, 'dtype': None}
dim: 1792 num_heads: 16 dim rope 112
Imitator(
  (linear_feat): Sequential(
    (0): Linear(in_features=266, out_features=1792, bias=True)
    (1): GELU(approximate='none')
    (2): LayerNorm((1792,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=1792, out_features=896, bias=True)
    (4): GELU(approximate='none')
    (5): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
  )
  (conv1): Conv1d(896, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (act1): GELU(approximate='none')
  (conv2): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
  (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (act2): GELU(ap

# Convert to ExecuTorch

In [20]:
from torch.export import Dim, export

# Dimension 1 of both inputs is declared dynamic, and both entries use the
# same named dimension, so a single Dim object is defined once and reused.
seq_len = Dim("seq_len", min=17, max=180)
dynamic_shapes = {
    "x": {1: seq_len},
    "frames_padding_mask": {1: seq_len},
}

# NOTE(review): the recorded run of this cell fails with
# "Constraints violated (seq_len)": tracing produced the guard
# `x.size()[1] % 16 != 0`, which does not hold for every value in [17, 180].
# The guard originates in the model's forward pass, so this cell cannot
# succeed until that specialisation is removed or the range is restricted.
exported_program = export(
    model_iphone,
    (example_input, example_input_mask),
    dynamic_shapes=dynamic_shapes,
)


I0728 18:16:50.731000 1607792 site-packages/torch/fx/experimental/symbolic_shapes.py:3334] [0/0] create_env
V0728 18:16:50.750000 1607792 site-packages/torch/fx/experimental/symbolic_shapes.py:6071] [0/0] _update_var_to_range s0 = VR[17, 180] (update)
I0728 18:16:50.750000 1607792 site-packages/torch/fx/experimental/symbolic_shapes.py:4606] [0/0] create_symbol s0 = 179 for L['x'].size()[1] [17, 180] (_dynamo/variables/builder.py:3033 in <lambda>), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="s0" or to suppress this message run with TORCHDYNAMO_EXTENDED_ADVICE="0"
V0728 18:16:50.752000 1607792 site-packages/torch/fx/experimental/symbolic_shapes.py:7018] [0/0] runtime_assert True == True [statically known]
V0728 18:16:50.758000 1607792 site-packages/torch/fx/experimental/symbolic_shapes.py:7018] [0/0] runtime_assert True == True [statically known]
V0728 18:16:50.760000 1607792 site-packages/torch/fx/experimental/symbolic_shapes.py:6787] [0/0] eval size_oblivious(Ne(s0

UserError: Constraints violated (seq_len)! For more information, run with TORCH_LOGS="+dynamic".
  - Not all values of seq_len = L['x'].size()[1] in the specified range 17 <= seq_len <= 180 satisfy the generated guard (L['x'].size()[1] % 16) != 0.


# Convert to Core ML with coremltools (deprecated)

In [None]:

# Trace the CPU model with the example inputs. (A torch.jit.script variant was
# tried previously and is left out.)
traced_model = torch.jit.trace(model_iphone, (example_input, example_input_mask))

# Dimension 1 of each input may vary between 1 and 180 (default 90). Each input
# keeps its own RangeDim instance.
# NOTE(review): the lower bound here is 1, while the torch.export cell in this
# notebook declares min=17 for the same axis — confirm which is intended.
keypoints_frames = ct.RangeDim(lower_bound=1, upper_bound=180, default=90)
inputKeypoints_shape = ct.Shape(shape=(1, keypoints_frames, 133, 2))
mask_frames = ct.RangeDim(lower_bound=1, upper_bound=180, default=90)
inputMask_shape = ct.Shape(shape=(1, mask_frames))

coreml_inputs = [
    ct.TensorType(shape=inputKeypoints_shape, name="keypoints"),
    ct.TensorType(shape=inputMask_shape, name="mask"),
]
coreml_outputs = [ct.TensorType(name="embeddings")]

coreml_model = ct.convert(
    traced_model,
    inputs=coreml_inputs,
    outputs=coreml_outputs,
    convert_to="mlprogram",
)

# Write the converted package next to the other exports for this checkpoint.
coreml_model.save(f"{output_location}/model.mlpackage")