# Compilation du modèle avec ONNX

## 1 - Préparation et compilation du modèle

In [1]:
import torch
import torchvision
from collections import OrderedDict

Modification du modèle pour avoir le bon nombre de sortie dans la dernière couche et
pour calculer le softmax sur les sorties du modèle pour avoir directement les probabilités.

In [3]:
# Load pretrained ViT
num_labels = 11  # Get number of labels (e.g., 8)

model = torchvision.models.vit_b_16(weights="IMAGENET1K_V1")  # Load a pretrained model

model.heads = torch.nn.Sequential(
    OrderedDict(
        [
            ("head", torch.nn.Linear(model.heads.head.in_features, num_labels)),
            (
                "norm",
                torch.nn.Softmax(1),
            ),
        ]
    )
)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model.load_state_dict(
    torch.load(
        "models/tomato_model_2025_02_28_v2.pt",
        map_location=device,
        weights_only=True,
    )
)

model.eval()

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

Exportation du modèle, en incluant un tenseur aléatoire pour fournir la bonne taille de
tenseur en entrée.

In [5]:
torch_input = torch.randn(1, 3, 224, 224)
onnx_program = torch.onnx.dynamo_export(model, torch_input)

  param_schemas = callee.param_schemas()
  param_schemas = callee.param_schemas()
  self.param_schema = self.onnxfunction.param_schemas()


Applied 37 of general pattern rewrite rules.


In [6]:
onnx_program.save("models/tomato_model_2025_02_28_v2.onnx")

## 2 - Validation de l'exécution du modèle avec ONNX runtime

In [8]:
import onnxruntime
import PIL
import numpy as np

onnx_input = [torch_input]
print(f"Input length: {len(onnx_input)}")
# print(f"Sample input: {onnx_input}")

ort_session = onnxruntime.InferenceSession(
    "models/tomato_model_2025_02_28_v2.onnx", providers=["CPUExecutionProvider"]
)

Input length: 1


Pipeline fait avec pytorch...

In [9]:
import torchvision.transforms.v2 as transforms_v2

transformations = [
    # Conversion to float32 image tensor
    transforms_v2.ToImage(),
    transforms_v2.ToDtype(torch.float32, scale=True),
    # Normalization for ViT, valeurs de ImageNet1K
    transforms_v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]

pipeline = transforms_v2.Compose(transformations)

onnx_input = np.array(
    PIL.Image.open(
        "dataset/tomato/88614302-e6d2-4327-a4fb-a3db9c9ea72e___YLCV_NREC_2861.JPG"
    )
    .convert("RGB")
    .resize((224, 224))
)

onnx_input = pipeline(onnx_input).numpy().reshape(1, 3, 224, 224)

Pipeline fait sans pytorch

In [13]:
def single_image_pipeline(
    image_path, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], dtype="float32"
):
    # Load image into numpy float array
    image = (
        np.array(
            PIL.Image.open(image_path).convert("RGB").resize((224, 224)), dtype=dtype
        )
        / 256
    )

    # Move channel axis as first axis instead of last, and create batch size of 1
    image = np.moveaxis(image, -1, 0).reshape(1, 3, 224, 224)

    # Normalize using mean and std
    mean = np.array(mean, dtype=dtype)
    std = np.array(std, dtype=dtype)

    image = (image - mean[None, :, None, None]) / std[None, :, None, None]

    return image

In [14]:
# Exécution du modèle
onnx_input = single_image_pipeline(
    "dataset/tomato/88614302-e6d2-4327-a4fb-a3db9c9ea72e___YLCV_NREC_2861.JPG"
)

onnxruntime_outputs = ort_session.run(None, {"l_x_": onnx_input})
onnxruntime_outputs

[array([[1.4499942e-06, 9.2473765e-06, 2.0949008e-06, 8.7807439e-06,
         1.5942234e-05, 7.3268388e-06, 1.2185769e-06, 9.3832878e-06,
         7.4934546e-06, 1.7394845e-05, 9.9991953e-01]], dtype=float32)]