In [1]:
from polygraphy.backend.onnxrt import OnnxrtRunner, SessionFromOnnx
from polygraphy.backend.trt import TrtRunner, EngineFromNetwork, NetworkFromOnnxPath, Profile
from polygraphy.comparator import Comparator, DataLoader
from polygraphy.backend.trt import CreateConfig as CreateTrtConfig, SaveEngine
import numpy as np
import tensorrt as trt


In [2]:
SAVE_ENGINE = True

In [3]:
model_path = "/home/ubuntu/vlm-vfm-processing-pipeline/models/vfm_fix_outofrange_fp16.onnx"
# engine_save_path = "/home/ubuntu/vlm-vfm-processing-pipeline/models/vfm.engine"
engine_save_path = "/home/ubuntu/vlm-vfm-processing-pipeline/models/model.plen"


In [4]:
profiles = [
    Profile()
    .add('patch_attn_mask', min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032])
    .add('all_pixel_values', min=[30, 3, 14, 14448], opt=[30, 3, 14, 14448], max=[30, 3, 14, 14448])
    ]

create_trt_config = CreateTrtConfig(
    profiles=profiles,
    # hardware_compatibility_level=trt.HardwareCompatibilityLevel.AMPERE_PLUS
    )

[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.


In [5]:
build_onnxrt_session = SessionFromOnnx(model_path)
build_engine = EngineFromNetwork(NetworkFromOnnxPath(model_path), config=create_trt_config)

if SAVE_ENGINE:
    # Save the engine to disk
    # Note: This is a blocking call and will take some time to complete
    engine = build_engine()
    SaveEngine(build_engine, engine_save_path)()
    
runners = [
    OnnxrtRunner(build_onnxrt_session),
    TrtRunner(build_engine),
]

data_loader = [{
    "all_pixel_values": np.zeros((30, 3, 14, 14448), dtype=np.float32),
    "patch_attn_mask": np.zeros((30, 1, 1032), dtype=np.bool_),
}]

[I] Configuring with profiles:[
        Profile 0:
            {patch_attn_mask [min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032]],
             all_pixel_values [min=[30, 3, 14, 14448], opt=[30, 3, 14, 14448], max=[30, 3, 14, 14448]]}
    ]
[38;5;11m[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.[0m
[38;5;14m[I] Building engine with configuration:
    Flags                  | []
    Engine Capability      | EngineCapability.STANDARD
    Memory Pools           | [WORKSPACE: 22515.75 MiB, TACTIC_DRAM: 22515.75 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
    Tactic Sources         | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [PROFILE_SHARING_0806][0m


[38;5;10m[I] Finished engine building in 138.729 seconds[0m


[I] Configuring with profiles:[
        Profile 0:
            {patch_attn_mask [min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032]],
             all_pixel_values [min=[30, 3, 14, 14448], opt=[30, 3, 14, 14448], max=[30, 3, 14, 14448]]}
    ]
[38;5;14m[I] Building engine with configuration:
    Flags                  | []
    Engine Capability      | EngineCapability.STANDARD
    Memory Pools           | [WORKSPACE: 22515.75 MiB, TACTIC_DRAM: 22515.75 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
    Tactic Sources         | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [PROFILE_SHARING_0806][0m


[38;5;10m[I] Finished engine building in 143.899 seconds[0m


[I] Saving engine to /home/ubuntu/vlm-vfm-processing-pipeline/models/model.plen


In [6]:
# Inspect the engine to verify that it was built correctly
cmd = f"polygraphy inspect model {engine_save_path}"
!{cmd}

[38;5;9m[!] Could not automatically determine model type for: /home/ubuntu/vlm-vfm-processing-pipeline/models/model.plen
    Please explicitly specify the type with the --model-type option[0m


In [7]:
# Run inference with trt 
import torch
from polygraphy.backend.common import BytesFromPath
from polygraphy.backend.trt import EngineFromBytes, TrtRunner
load_engine = EngineFromBytes(BytesFromPath(engine_save_path))

with TrtRunner(load_engine) as runner:
        all_pixel_values = torch.load(
            "/home/ubuntu/vlm-vfm-processing-pipeline/test_data/all_pixel_values.pkl",
            weights_only=True,
            map_location="cuda",
        )
        patch_attn_mask = torch.load(
            "/home/ubuntu/vlm-vfm-processing-pipeline/test_data/patch_attn_mask.pkl",
            weights_only=True,
            map_location="cuda",
        )

        # NOTE: The runner owns the output buffers and is free to reuse them between `infer()` calls.
        # Thus, if you want to store results from multiple inferences, you should use `copy.deepcopy()`.
        outputs = runner.infer(feed_dict={
            "all_pixel_values": all_pixel_values,
            "patch_attn_mask": patch_attn_mask,
            })


[I] Loading bytes from /home/ubuntu/vlm-vfm-processing-pipeline/models/model.plen


In [8]:
outputs['vision_embedding'].dtype

torch.float32

In [9]:
# Compare the results from the ONNX Runtime and TensorRT engines
results = Comparator.run(runners, data_loader=data_loader)


[38;5;14m[I] onnxrt-runner-N0-04/03/25-08:20:36  | Activating and starting inference[0m
[38;5;14m[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider'][0m


[I] onnxrt-runner-N0-04/03/25-08:20:36 
    ---- Inference Input(s) ----
    {all_pixel_values [dtype=float32, shape=(30, 3, 14, 14448)],
     patch_attn_mask [dtype=bool, shape=(30, 1, 1032)]}


[I] onnxrt-runner-N0-04/03/25-08:20:36 
    ---- Inference Output(s) ----
    {vision_embedding [dtype=float32, shape=(30, 64, 3584)]}
[38;5;10m[I] onnxrt-runner-N0-04/03/25-08:20:36  | Completed 1 iteration(s) in 1.284e+05 ms | Average inference time: 1.284e+05 ms.[0m


[38;5;14m[I] trt-runner-N0-04/03/25-08:20:36     | Activating and starting inference[0m


[I] Configuring with profiles:[
        Profile 0:
            {patch_attn_mask [min=[30, 1, 1032], opt=[30, 1, 1032], max=[30, 1, 1032]],
             all_pixel_values [min=[30, 3, 14, 14448], opt=[30, 3, 14, 14448], max=[30, 3, 14, 14448]]}
    ]
[38;5;14m[I] Building engine with configuration:
    Flags                  | []
    Engine Capability      | EngineCapability.STANDARD
    Memory Pools           | [WORKSPACE: 22515.75 MiB, TACTIC_DRAM: 22515.75 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
    Tactic Sources         | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [PROFILE_SHARING_0806][0m


[38;5;10m[I] Finished engine building in 144.412 seconds[0m


[I] trt-runner-N0-04/03/25-08:20:36    
    ---- Inference Input(s) ----
    {all_pixel_values [dtype=float32, shape=(30, 3, 14, 14448)],
     patch_attn_mask [dtype=bool, shape=(30, 1, 1032)]}


[I] trt-runner-N0-04/03/25-08:20:36    
    ---- Inference Output(s) ----
    {vision_embedding [dtype=float32, shape=(30, 64, 3584)]}
[38;5;10m[I] trt-runner-N0-04/03/25-08:20:36     | Completed 1 iteration(s) in 1999 ms | Average inference time: 1999 ms.[0m


In [10]:
Comparator.compare_accuracy(results)

[38;5;14m[I] Accuracy Comparison | onnxrt-runner-N0-04/03/25-08:20:36 vs. trt-runner-N0-04/03/25-08:20:36[0m
[38;5;14m[I]     Comparing Output: 'vision_embedding' (dtype=float32, shape=(30, 64, 3584)) with 'vision_embedding' (dtype=float32, shape=(30, 64, 3584))[0m
[I]         Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error


[I]         onnxrt-runner-N0-04/03/25-08:20:36: vision_embedding | Stats: mean=-0.00056166, std-dev=0.51486, var=0.26509, median=-0.00073969, min=-6.8264 at (0, 8, 1929), max=9.062 at (0, 63, 2570), avg-magnitude=0.3349, p90=0.49777, p95=0.75957, p99=1.5146
[I]             ---- Histogram ----
                Bin Range        |  Num Elems | Visualization
                (-6.83 , -5.24 ) |        390 | 
                (-5.24 , -3.65 ) |       1800 | 
                (-3.65 , -2.06 ) |      20790 | 
                (-2.06 , -0.471) |     726710 | ####
                (-0.471, 1.12  ) |    5977060 | ########################################
                (1.12  , 2.71  ) |     145260 | 
                (2.71  , 4.3   ) |       7740 | 
                (4.3   , 5.88  ) |        980 | 
                (5.88  , 7.47  ) |        480 | 
                (7.47  , 9.06  ) |         70 | 


[I]         trt-runner-N0-04/03/25-08:20:36: vision_embedding | Stats: mean=-0.00056189, std-dev=0.51487, var=0.26509, median=-0.00074273, min=-6.8263 at (0, 8, 1929), max=9.0618 at (0, 63, 2570), avg-magnitude=0.33491, p90=0.49779, p95=0.75954, p99=1.5143
[I]             ---- Histogram ----
                Bin Range        |  Num Elems | Visualization
                (-6.83 , -5.24 ) |        390 | 
                (-5.24 , -3.65 ) |       1800 | 
                (-3.65 , -2.06 ) |      20790 | 
                (-2.06 , -0.471) |     726660 | ####
                (-0.471, 1.12  ) |    5977110 | ########################################
                (1.12  , 2.71  ) |     145260 | 
                (2.71  , 4.3   ) |       7740 | 
                (4.3   , 5.88  ) |        980 | 
                (5.88  , 7.47  ) |        480 | 
                (7.47  , 9.06  ) |         70 | 
[I]         Error Metrics: vision_embedding
[I]             Minimum Required Tolerance: elemwise error | [abs=0

[I]             Absolute Difference | Stats: mean=7.1331e-05, std-dev=0.0001252, var=1.5674e-08, median=3.2037e-05, min=0 at (0, 0, 1913), max=0.0030964 at (1, 46, 775), avg-magnitude=7.1331e-05, p90=0.00016517, p95=0.00027137, p99=0.00063539
[I]                 ---- Histogram ----
                    Bin Range            |  Num Elems | Visualization
                    (0       , 0.00031 ) |    6602900 | ########################################
                    (0.00031 , 0.000619) |     205190 | #
                    (0.000619, 0.000929) |      49870 | 
                    (0.000929, 0.00124 ) |      15720 | 
                    (0.00124 , 0.00155 ) |       4760 | 
                    (0.00155 , 0.00186 ) |       1760 | 
                    (0.00186 , 0.00217 ) |        700 | 
                    (0.00217 , 0.00248 ) |        280 | 
                    (0.00248 , 0.00279 ) |         40 | 
                    (0.00279 , 0.0031  ) |         60 | 


[I]             Relative Difference | Stats: mean=0.0017297, std-dev=0.14582, var=0.021263, median=0.00015721, min=0 at (0, 0, 1913), max=77.412 at (1, 5, 2535), avg-magnitude=0.0017297, p90=0.0012476, p95=0.0025453, p99=0.012876
[I]                 ---- Histogram ----
                    Bin Range    |  Num Elems | Visualization
                    (0   , 7.74) |    6881150 | ########################################
                    (7.74, 15.5) |         90 | 
                    (15.5, 23.2) |         20 | 
                    (23.2, 31  ) |          0 | 
                    (31  , 38.7) |          0 | 
                    (38.7, 46.4) |          0 | 
                    (46.4, 54.2) |          0 | 
                    (54.2, 61.9) |          0 | 
                    (61.9, 69.7) |          0 | 
                    (69.7, 77.4) |         20 | 
[38;5;9m[E]         FAILED | Output: 'vision_embedding' | Difference exceeds tolerance (rel=1e-05, abs=1e-05)[0m
[38;5;9m[E]     FAILED

OrderedDict([(('onnxrt-runner-N0-04/03/25-08:20:36', 'trt-runner-N0-04/03/25-08:20:36'), [OrderedDict([('vision_embedding', <polygraphy.comparator.compare.OutputCompareResult object at 0x7fa3b815dff0>)])])])