In [2]:
from polygraphy.backend.onnxrt import OnnxrtRunner, SessionFromOnnx
from polygraphy.backend.trt import TrtRunner, EngineFromNetwork, NetworkFromOnnxPath, Profile
from polygraphy.comparator import Comparator, DataLoader
from polygraphy.backend.trt import CreateConfig as CreateTrtConfig, SaveEngine
import numpy as np
import tensorrt as trt


In [3]:
SAVE_ENGINE = True

In [12]:
model_path = "/home/ubuntu/vlm-vfm-processing-pipeline/models/vfm_fix_outofrange_fp16.onnx"
engine_save_path = "/home/ubuntu/vlm-vfm-processing-pipeline/models/vfm.engine"

In [5]:
profiles = [
    Profile()
    .add('patch_attn_mask', min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032])
    .add('all_pixel_values', min=[30, 3, 14, 14448], opt=[30, 3, 14, 14448], max=[30, 3, 14, 14448])
    ]

create_trt_config = CreateTrtConfig(
    profiles=profiles,
    # hardware_compatibility_level=trt.HardwareCompatibilityLevel.AMPERE_PLUS
    )

[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.


In [None]:
build_onnxrt_session = SessionFromOnnx(model_path)
build_engine = EngineFromNetwork(NetworkFromOnnxPath(model_path), config=create_trt_config)

if SAVE_ENGINE:
    # Save the engine to disk
    # Note: This is a blocking call and will take some time to complete
    engine = build_engine()
    SaveEngine(build_engine, engine_save_path)()
    
runners = [
    OnnxrtRunner(build_onnxrt_session),
    TrtRunner(build_engine),
]

data_loader = [{
    "all_pixel_values": np.zeros((30, 3, 14, 14448), dtype=np.float32),
    "patch_attn_mask": np.zeros((30, 1, 1032), dtype=np.bool_),
}]

[I] Configuring with profiles:[
        Profile 0:
            {patch_attn_mask [min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032]],
             all_pixel_values [min=[30, 3, 14, 14448], opt=[30, 3, 14, 14448], max=[30, 3, 14, 14448]]}
    ]
[38;5;11m[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.[0m
[38;5;14m[I] Building engine with configuration:
    Flags                  | []
    Engine Capability      | EngineCapability.STANDARD
    Memory Pools           | [WORKSPACE: 22515.75 MiB, TACTIC_DRAM: 22515.75 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
    Tactic Sources         | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [PROFILE_SHARING_0806][0m
[38;5;10m[I] Finished engine building in 95.400 seconds[0m
[I] Configuring with profiles:[
        Profile 0:
            {patch_attn_mask [min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032]

In [14]:
# Inspect the engine to verify that it was built correctly
cmd = f"polygraphy inspect model {engine_save_path}"
!{cmd}

[I] Loading bytes from /home/ubuntu/vlm-vfm-processing-pipeline/models/vfm.engine
[38;5;11m[W] hasImplicitBatchDimension is deprecated and always return false.[0m
[I] ==== TensorRT Engine ====
    Name: Unnamed Network 0 | Explicit Batch Engine
    
    ---- 2 Engine Input(s) ----
    {all_pixel_values [dtype=float32, shape=(30, 3, 14, 14448)],
     patch_attn_mask [dtype=bool, shape=(30, 1, 1032)]}
    
    ---- 1 Engine Output(s) ----
    {vision_embedding [dtype=float32, shape=(30, 64, 3584)]}
    
    ---- Memory ----
    Device Memory: 7662036480 bytes
    
    ---- 1 Profile(s) (3 Tensor(s) Each) ----
    - Profile: 0
        Tensor: all_pixel_values          (Input), Index: 0 | Shapes: min=(30, 3, 14, 14448), opt=(30, 3, 14, 14448), max=(30, 3, 14, 14448)
        Tensor: patch_attn_mask           (Input), Index: 1 | Shapes: min=(30, 1, 1032), opt=(30, 1, 1032), max=(30, 1, 1032)
        Tensor: vision_embedding         (Output), Index: 2 | Shape: (30, 64, 3584)
    
    ---- 4

In [22]:
# Run inference with trt 
import torch
from polygraphy.backend.common import BytesFromPath
from polygraphy.backend.trt import EngineFromBytes, TrtRunner
load_engine = EngineFromBytes(BytesFromPath(engine_save_path))

with TrtRunner(load_engine) as runner:
        all_pixel_values = torch.load(
            "/home/ubuntu/vlm-vfm-processing-pipeline/test_data/all_pixel_values.pkl",
            weights_only=True,
            map_location="cuda",
        )
        patch_attn_mask = torch.load(
            "/home/ubuntu/vlm-vfm-processing-pipeline/test_data/patch_attn_mask.pkl",
            weights_only=True,
            map_location="cuda",
        )

        # NOTE: The runner owns the output buffers and is free to reuse them between `infer()` calls.
        # Thus, if you want to store results from multiple inferences, you should use `copy.deepcopy()`.
        outputs = runner.infer(feed_dict={
            "all_pixel_values": all_pixel_values,
            "patch_attn_mask": patch_attn_mask,
            })


[I] Loading bytes from /home/ubuntu/vlm-vfm-processing-pipeline/models/vfm.engine


In [21]:
outputs['vision_embedding'].dtype

torch.float32

In [None]:
# Compare the results from the ONNX Runtime and TensorRT engines
results = Comparator.run(runners, data_loader=data_loader)


[38;5;14m[I] onnxrt-runner-N0-03/26/25-14:46:29  | Activating and starting inference[0m
[38;5;14m[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider'][0m
[I] onnxrt-runner-N0-03/26/25-14:46:29 
    ---- Inference Input(s) ----
    {all_pixel_values [dtype=float32, shape=(30, 3, 14, 14448)],
     patch_attn_mask [dtype=bool, shape=(30, 1, 1032)]}
[I] onnxrt-runner-N0-03/26/25-14:46:29 
    ---- Inference Output(s) ----
    {vision_embedding [dtype=float32, shape=(30, 64, 3584)]}
[38;5;10m[I] onnxrt-runner-N0-03/26/25-14:46:29  | Completed 1 iteration(s) in 1.295e+05 ms | Average inference time: 1.295e+05 ms.[0m
[38;5;14m[I] trt-runner-N0-03/26/25-14:46:29     | Activating and starting inference[0m
[I] Configuring with profiles:[
        Profile 0:
            {patch_attn_mask [min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032]],
             all_pixel_values [min=[30, 3, 14, 14448], opt=[30, 3, 14, 14448], max=[30, 3, 14, 14448]]}
    ]
[38;5

In [13]:
Comparator.compare_accuracy(results)

[38;5;14m[I] Accuracy Comparison | onnxrt-runner-N0-03/26/25-14:46:29 vs. trt-runner-N0-03/26/25-14:46:29[0m
[38;5;14m[I]     Comparing Output: 'vision_embedding' (dtype=float32, shape=(30, 64, 3584)) with 'vision_embedding' (dtype=float32, shape=(30, 64, 3584))[0m
[I]         Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error
[I]         onnxrt-runner-N0-03/26/25-14:46:29: vision_embedding | Stats: mean=-0.00056166, std-dev=0.51486, var=0.26509, median=-0.00073969, min=-6.8264 at (0, 8, 1929), max=9.062 at (0, 63, 2570), avg-magnitude=0.3349, p90=0.49777, p95=0.75957, p99=1.5146
[I]             ---- Histogram ----
                Bin Range        |  Num Elems | Visualization
                (-6.83 , -5.24 ) |        390 | 
                (-5.24 , -3.65 ) |       1800 | 
                (-3.65 , -2.06 ) |      20790 | 
                (-2.06 , -0.471) |     726710 | ####
                (-0.471, 1.12  ) |    5977060 | ########################################
             

OrderedDict([(('onnxrt-runner-N0-03/26/25-14:46:29', 'trt-runner-N0-03/26/25-14:46:29'), [OrderedDict([('vision_embedding', <polygraphy.comparator.compare.OutputCompareResult object at 0x7e4a8410a9e0>)])])])