# Compilation du modèle avec ONNX

## 1 - Préparation et compilation du modèle

In [1]:
import torch
import torchvision
from collections import OrderedDict

In [2]:
# Créé à l'aide de ChatGPT
import torch

class ReshapeToBatchChannelFirst(torch.nn.Module):
    def __init__(self):
        super(ReshapeToBatchChannelFirst, self).__init__()

    def forward(self, x):
        # Ensure the input is of shape (224, 224, 3)
        #assert x.dim() == 3 and x.shape[-1] == 3, "Input must be (224, 224, 3)"
        
        # Permute dimensions from (H, W, C) to (C, H, W)
        x = x.permute(2, 0, 1)
        
        # Add a batch dimension at the beginning: (1, C, H, W)
        x = x.unsqueeze(0)
        return x

# Example usage
x = torch.rand(224, 224, 3)  # Example input
layer = ReshapeToBatchChannelFirst()
output = layer(x)
print(output.shape)  # Expected: torch.Size([1, 3, 224, 224])


torch.Size([1, 3, 224, 224])


In [3]:
# Créé à l'aide de ChatGPT
import torch
import torch.nn as nn

class FixedNormLayer(torch.nn.Module):
    def __init__(self, scale: torch.Tensor, mean: torch.Tensor, std: torch.Tensor):
        """
        Args:
            mean (torch.Tensor): Precomputed mean for normalization.
            std (torch.Tensor): Precomputed standard deviation for normalization.
        """
        super(FixedNormLayer, self).__init__()
        self.register_buffer("mean", mean[:, None, None])
        self.register_buffer("std", std[:, None, None])
        self.register_buffer("scale", scale)

    def forward(self, x):
        return (self.scale * x - self.mean) / self.std

# Example usage
mean = torch.tensor([0.5, 0.5, 0.5])  # Example mean for 3 channels
std = torch.tensor([0.2, 0.2, 0.2])   # Example std for 3 channels
scale = torch.tensor([1 / 256])
layer = FixedNormLayer(scale, mean, std)

# Test with a sample input
x = torch.rand(1, 3, 1, 1)  # Example input
output = layer(x)
print(output)

tensor([[[[-2.4836]],

         [[-2.4908]],

         [[-2.4918]]]])


In [4]:
# Créé à l'aide de ChatGPT
class InferenceModel(torch.nn.Module):
    def __init__(self, model, scale, mean, std):
        super(InferenceModel, self).__init__()
        self.preprocess = nn.Sequential(
            OrderedDict(
                [
                    ("reshape", ReshapeToBatchChannelFirst()),
                    ("normalize", FixedNormLayer(scale, mean, std)),
                ]
            )
        )
        self.model = model  # The main model
        self.postprocess = torch.nn.Softmax(1)

    def forward(self, x):
        x = self.preprocess(x)  # Apply reshaping and normalization
        x = self.model(x)  # Pass to the main model
        return self.postprocess(x)

Modification du modèle pour avoir le bon nombre de sortie dans la dernière couche et
pour calculer le softmax sur les sorties du modèle pour avoir directement les probabilités.

In [23]:
# Load pretrained ViT
num_labels = 11  # Get number of labels (e.g., 8)

model = torchvision.models.vit_b_16(weights="IMAGENET1K_V1")  # Load a pretrained model
model.heads.head = torch.nn.Linear(model.heads.head.in_features, num_labels)

In [24]:
model

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model.load_state_dict(
    torch.load(
        "models/tomato_model_2025_02_28_v2.pt",
        map_location=device,
        weights_only=True,
    )
)

<All keys matched successfully>

In [26]:
# Create inference model
scale = torch.tensor([1 / 256])
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
inference_model = InferenceModel(model, scale, mean, std)
inference_model.eval()

test = torch.randn(224, 224, 3)
inference_model(test)

tensor([[0.0214, 0.6055, 0.2210, 0.0180, 0.0034, 0.0022, 0.0020, 0.0813, 0.0186,
         0.0066, 0.0199]], grad_fn=<SoftmaxBackward0>)

Exportation du modèle, en incluant un tenseur aléatoire pour fournir la bonne taille de
tenseur en entrée.

In [27]:
torch_input = torch.randn(224, 224, 3)
onnx_program = torch.onnx.export(inference_model, torch_input, dynamo=True)

[torch.onnx] Obtain model graph for `InferenceModel([...]` with `torch.export.export`...
[torch.onnx] Obtain model graph for `InferenceModel([...]` with `torch.export.export`... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅


Ces warnings ne sont probablement pas grave, selon cette
[source](https://github.com/pytorch/pytorch/issues/144331).

In [28]:
onnx_program.save("models/tomato_model_2025_02_28_v2.onnx")

Exportation du modèle en format .ort pour l'exécution sur mobile.

In [12]:
import pathlib
from onnxruntime.tools import convert_onnx_models_to_ort as convert_onnx

convert_onnx.convert_onnx_models_to_ort(
    pathlib.Path("models/tomato_model_2025_02_28_v2.onnx"),
    output_dir=pathlib.Path("models"),
    optimization_styles=[convert_onnx.OptimizationStyle.Fixed],
    target_platform="arm",
)

Converting models with optimization style 'Fixed' and level 'all'
Converting optimized ONNX model /home/maxime/Documents/Code/happybud/training/models/tomato_model_2025_02_28_v2.onnx to ORT format model /home/maxime/Documents/Code/happybud/training/models/tomato_model_2025_02_28_v2.ort
Converted 1/1 models successfully.
Generating config file from ORT format models with optimization style 'Fixed' and level 'all'


2025-03-19 10:02:40,959 ort_format_model.utils [INFO] - Created config in /home/maxime/Documents/Code/happybud/training/models/tomato_model_2025_02_28_v2.required_operators.config


## 2 - Validation de l'exécution du modèle avec ONNX runtime

In [11]:
import onnxruntime
import PIL.Image
import numpy as np

ort_session = onnxruntime.InferenceSession(
    "models/tomato_model_2025_02_28_v2.onnx", providers=["CPUExecutionProvider"]
)

Pipeline fait sans pytorch

In [43]:
def single_image_pipeline(image_path, dtype="float32"):
    # Load image into numpy float array
    image = np.array(
        PIL.Image.open(image_path).convert("RGB").resize((224, 224)), dtype=dtype
    )

    return image

In [13]:
# Exécution du modèle
onnx_input = single_image_pipeline(
    "dataset/tomato/88614302-e6d2-4327-a4fb-a3db9c9ea72e___YLCV_NREC_2861.JPG"
)

onnxruntime_outputs = ort_session.run(None, {"l_x_": onnx_input})
onnxruntime_outputs

[array([[1.4499942e-06, 9.2473765e-06, 2.0949008e-06, 8.7807439e-06,
         1.5942234e-05, 7.3268388e-06, 1.2185769e-06, 9.3832878e-06,
         7.4934546e-06, 1.7394845e-05, 9.9991953e-01]], dtype=float32)]

## 2 - Séparation de l'encodeur et du décodeur

In [14]:
class InferenceModelEncoder(torch.nn.Module):
    def __init__(self, model, scale, mean, std):
        super(InferenceModelEncoder, self).__init__()
        self.preprocess = torch.nn.Sequential(
            OrderedDict(
                [
                    ("reshape", ReshapeToBatchChannelFirst()),
                    ("normalize", FixedNormLayer(scale, mean, std)),
                ]
            )
        )
        self.model = model  # The main model

    def forward(self, x):
        x = self.preprocess(x)  # Apply reshaping and normalization
        x = self.model(x)  # Pass to the main model
        return x

In [15]:
# Créé à l'aide de ChatGPT
class InferenceModelDecoder(torch.nn.Module):
    def __init__(self, model):
        super(InferenceModelDecoder, self).__init__()
        self.model = model  # The main model last layer
        self.postprocess = torch.nn.Softmax(1)

    def forward(self, x):
        x = self.model(x)  # Pass to the main model
        return self.postprocess(x)

In [16]:
# Load pretrained ViT
num_labels = 11  # Get number of labels (e.g., 8)

model = torchvision.models.vit_b_16(weights="IMAGENET1K_V1")  # Load a pretrained model
model.heads.head = torch.nn.Linear(model.heads.head.in_features, num_labels)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model.load_state_dict(
    torch.load(
        "models/tomato_model_2025_02_28_v2.pt",
        map_location=device,
        weights_only=True,
    )
)

<All keys matched successfully>

In [18]:
#Extract last layer
last_layer = torch.nn.Linear(model.heads.head.in_features, num_labels)
last_layer.weight = model.heads.head.weight
last_layer.weight

Parameter containing:
tensor([[ 0.0183, -0.0336, -0.0224,  ..., -0.0234, -0.0337,  0.0222],
        [ 0.0133,  0.0046,  0.0376,  ..., -0.0338, -0.0195, -0.0039],
        [ 0.0117,  0.0335, -0.0085,  ...,  0.0160, -0.0332, -0.0282],
        ...,
        [ 0.0254, -0.0356, -0.0124,  ...,  0.0298,  0.0253, -0.0077],
        [ 0.0218,  0.0312, -0.0024,  ..., -0.0325,  0.0179,  0.0299],
        [-0.0239, -0.0136,  0.0181,  ...,  0.0369, -0.0023, -0.0013]],
       requires_grad=True)

In [19]:
# Remove last layer from model
model.heads.head = torch.nn.Identity()

In [20]:
# Define encoder and decoder inference models
scale = torch.tensor([1 / 256])
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])

inference_model_encoder = InferenceModelEncoder(model, scale, mean, std)
inference_model_decoder = InferenceModelDecoder(last_layer)

In [21]:
# Test inference model encoder and decoder
test_input = torch.randn(224, 224, 3)

encoded = inference_model_encoder(test_input)
decoded = inference_model_decoder(encoded)
decoded

tensor([[0.0223, 0.5971, 0.2289, 0.0185, 0.0035, 0.0023, 0.0020, 0.0798, 0.0187,
         0.0065, 0.0203]], grad_fn=<SoftmaxBackward0>)

In [22]:
# Export to ONNX
onnx_encoder = torch.onnx.dynamo_export(inference_model_encoder, test_input)
onnx_encoder.save("models/tomato_model_2025_02_28_v2_encoder.onnx")

onnx_decoder = torch.onnx.dynamo_export(inference_model_decoder, encoded)
onnx_decoder.save("models/tomato_model_2025_02_28_v2_decoder.onnx")

  new_node = self.module.graph.get_attr(normalized_name)


Applied 37 of general pattern rewrite rules.




## 4 - Validation de l'exécution du modèle avec ONNX runtime (encoder decoder)

In [23]:
import onnxruntime
import PIL.Image
import numpy as np

ort_session_encoder = onnxruntime.InferenceSession(
    "models/tomato_model_2025_02_28_v2_encoder.onnx", providers=["CPUExecutionProvider"]
)

ort_session_decoder = onnxruntime.InferenceSession(
    "models/tomato_model_2025_02_28_v2_decoder.onnx", providers=["CPUExecutionProvider"]
)

In [24]:
def single_image_pipeline(image_path, dtype="float32"):
    # Load image into numpy float array
    image = np.array(
        PIL.Image.open(image_path).convert("RGB").resize((224, 224)), dtype=dtype
    )

    return image

In [25]:
# Exécution du modèle
onnx_input = single_image_pipeline(
    "dataset/tomato/88614302-e6d2-4327-a4fb-a3db9c9ea72e___YLCV_NREC_2861.JPG"
)

encoded_image = ort_session_encoder.run(None, {"l_x_": onnx_input})[0]
encoded_image

array([[-9.07714486e-01,  3.03961448e-02,  8.70138049e-01,
        -8.18296313e-01, -2.94012398e-01,  3.38017076e-01,
        -1.66627139e-01,  2.50272781e-01,  9.25461113e-01,
        -1.28882423e-01, -4.14819419e-01,  5.26395440e-01,
         3.60564232e-01, -9.18536007e-01,  1.99042797e-01,
         3.08384933e-03,  4.41153377e-01, -9.58641946e-01,
        -7.41269961e-02, -6.60064936e-01, -8.72934222e-01,
         1.00470936e+00, -4.35145199e-01,  5.68119168e-01,
         4.17836249e-01, -1.07043219e+00,  3.48928981e-02,
         8.65398526e-01, -1.04338002e+00,  7.23374724e-01,
         1.12749267e+00, -8.93600583e-02, -9.41086709e-01,
        -1.05196142e+00,  7.97558546e-01, -3.89004320e-01,
         2.09360883e-01,  2.86927879e-01,  8.63989651e-01,
        -1.43604144e-01, -1.26661256e-01, -1.56991899e-01,
        -6.36127710e-01, -1.38812959e-01,  4.89442796e-01,
         6.29252195e-01,  7.34239161e-01, -7.45797306e-02,
        -1.43365800e+00,  4.67331916e-01, -7.53821850e-0

In [26]:
# Decoding
decoded_output = ort_session_decoder.run(None, {"l_x_": encoded_image})
decoded_output

[array([[1.4769056e-06, 8.9960467e-06, 2.1273770e-06, 8.8322167e-06,
         1.5939237e-05, 7.6271754e-06, 1.2036876e-06, 9.0402264e-06,
         7.3717015e-06, 1.6906304e-05, 9.9992037e-01]], dtype=float32)]

## 5 - Modèle des plantes (MobileNet_v3)

In [29]:
# Load pretrained MobileNet_v3
num_labels = 3  # Get number of labels (e.g., 8)

model = torchvision.models.mobilenet_v3_large(weights="IMAGENET1K_V1")  # Load a pretrained model
model.classifier[3] = nn.Linear(
    model.classifier[3].in_features, num_labels
)  # Modify last layer

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.load_state_dict(
    torch.load(
        "models/plant_model_2025_03_21.pt",
        map_location=device,
        weights_only=True,
    )
)

<All keys matched successfully>

In [31]:
# Create inference model
scale = torch.tensor([1 / 256])
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
inference_model = InferenceModel(model, scale, mean, std)
inference_model.eval()

test = torch.randn(224, 224, 3)
inference_model(test)

tensor([[0.5562, 0.0982, 0.3456]], grad_fn=<SoftmaxBackward0>)

In [36]:
torch_input = torch.randn(224, 224, 3)
onnx_program = torch.onnx.export(inference_model, torch_input, dynamo=True)

[torch.onnx] Obtain model graph for `InferenceModel([...]` with `torch.export.export`...
[torch.onnx] Obtain model graph for `InferenceModel([...]` with `torch.export.export`... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅


In [38]:
onnx_program.save("models/plant_model_2025_03_21.onnx")

In [39]:
import pathlib
from onnxruntime.tools import convert_onnx_models_to_ort as convert_onnx

convert_onnx.convert_onnx_models_to_ort(
    pathlib.Path("models/plant_model_2025_03_21.onnx"),
    output_dir=pathlib.Path("models"),
    optimization_styles=[convert_onnx.OptimizationStyle.Fixed],
    target_platform="arm",
)

[0;93m2025-03-24 11:01:05.392481808 [W:onnxruntime:, graph.cc:109 MergeShapeInfo] Error merging shape info for output. '_native_batch_norm_legit_no_training__1' source:{16} target:{0}. Falling back to lenient merge.[m
[0;93m2025-03-24 11:01:05.392524817 [W:onnxruntime:, graph.cc:109 MergeShapeInfo] Error merging shape info for output. '_native_batch_norm_legit_no_training__2' source:{16} target:{0}. Falling back to lenient merge.[m
[0;93m2025-03-24 11:01:05.392663392 [W:onnxruntime:, graph.cc:109 MergeShapeInfo] Error merging shape info for output. '_native_batch_norm_legit_no_training_1__1' source:{16} target:{0}. Falling back to lenient merge.[m
[0;93m2025-03-24 11:01:05.392676723 [W:onnxruntime:, graph.cc:109 MergeShapeInfo] Error merging shape info for output. '_native_batch_norm_legit_no_training_1__2' source:{16} target:{0}. Falling back to lenient merge.[m
[0;93m2025-03-24 11:01:05.392787645 [W:onnxruntime:, graph.cc:109 MergeShapeInfo] Error merging shape info for outp

Converting models with optimization style 'Fixed' and level 'all'
Converting optimized ONNX model /home/maxime/Documents/Code/happybud/training/models/plant_model_2025_03_21.onnx to ORT format model /home/maxime/Documents/Code/happybud/training/models/plant_model_2025_03_21.ort
Converted 1/1 models successfully.
Generating config file from ORT format models with optimization style 'Fixed' and level 'all'


In [41]:
import onnxruntime
import PIL.Image
import numpy as np

ort_session = onnxruntime.InferenceSession(
    "models/plant_model_2025_03_21.ort", providers=["CPUExecutionProvider"]
)

In [45]:
# Exécution du modèle
onnx_input = single_image_pipeline(
    "dataset/tomato/88614302-e6d2-4327-a4fb-a3db9c9ea72e___YLCV_NREC_2861.JPG"
)

onnxruntime_outputs = ort_session.run(None, {"x": onnx_input})
onnxruntime_outputs

[array([[8.8753900e-08, 9.9999964e-01, 2.4354483e-07]], dtype=float32)]