# Identify Model Requirements and think which of these will do better

## Model Optimisations

### Baseline Inference

In [3]:
import torch
import numpy as np
import time
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

In [6]:
model_name = "gpt2"
device = "mps" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
####Lines to change when you are using custom model
model_path = "models/food11.pth"  
device = torch.device("cpu")
model = torch.load(model_path, map_location=device, weights_only=False)
model.eval()  
summary(model)

In [7]:
################Just Test#######################


# Warm-up
for _ in range(5):
    _ = model(input_ids)

# Benchmark inference
num_trials = 100
latencies = []

for _ in range(num_trials):
    start = time.time()
    _ = model(input_ids)
    latencies.append(time.time() - start)

latencies = np.array(latencies)
model_size = sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / 1e6  # float32

# Results
print(" PyTorch GPT-2 Baseline")
print(f"Model Size: {model_size:.2f} MB (approx)")
print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th Percentile: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"99th Percentile: {np.percentile(latencies, 99)*1000:.2f} ms")
print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/s")

 PyTorch GPT-2 Baseline
Model Size: 497.76 MB (approx)
Median Latency: 17.19 ms
95th Percentile: 17.70 ms
99th Percentile: 19.33 ms
Throughput: 58.12 req/s


In [8]:
# Batch Throughput Benchmark
batch_size = 16
batch_input = input_ids.repeat(batch_size, 1)  # Repeat same input to simulate batch
batch_latencies = []
num_batches = 50

# Warm-up
for _ in range(5):
    _ = model(batch_input)

for _ in range(num_batches):
    start = time.time()
    _ = model(batch_input)
    batch_latencies.append(time.time() - start)

total_tokens = batch_size * num_batches
batch_fps = total_tokens / np.sum(batch_latencies)

print("\n=== Batch Inference ===")
print(f"Batch Size: {batch_size}")
print(f"Total Tokens: {total_tokens}")
print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
print(f"Average Time per Batch: {np.mean(batch_latencies)*1000:.2f} ms")



=== Batch Inference ===
Batch Size: 16
Total Tokens: 800
Batch Throughput: 217.13 tokens/sec
Average Time per Batch: 73.69 ms


#### Compiled Model

In [None]:
###add

model.compile()

##this line to above code and run again

### Convert to ONNX

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

In [10]:
model_name = "gpt2"
output_path = "gpt2.onnx"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [11]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Symptoms: fever and cough\nQuestion: What should I do?\nAnswer:"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

torch.onnx.export(
    model,
    input_ids,
    f=output_path,
    input_names=["input_ids"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}, "logits": {0: "batch"}},
    opset_version=13,
    do_constant_folding=True,
)

print(f"Exported GPT-2 to ONNX format: {output_path}")
print(f"Model Size: {os.path.getsize(output_path)/1e6:.2f} MB")

#______________Just a test to convert ONNX, need to scale to lab level when working on docker______________

Exported GPT-2 to ONNX format: gpt2.onnx
Model Size: 653.66 MB


#### Validate ONNX Model

In [9]:
import onnx
onnx_model = onnx.load("gpt2.onnx")
onnx.checker.check_model(onnx_model)
print("‚úÖ ONNX model is valid.")

### Onnx Inference session on CPU

In [12]:
import onnxruntime as ort
import numpy as np
import time
import os
from transformers import AutoTokenizer

# Load tokenizer (same as PyTorch version)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load ONNX model
model_path = "gpt2.onnx"
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Prepare input
prompt = "Symptoms: fever and cough\nQuestion: What should I do?\nAnswer:"
input_ids = tokenizer.encode(prompt, return_tensors="np").astype(np.int64)

input_name = session.get_inputs()[0].name
num_trials = 100

# Warm-up
for _ in range(5):
    session.run(None, {input_name: input_ids})

# Measure single-sample inference
latencies = []
for _ in range(num_trials):
    start = time.time()
    _ = session.run(None, {input_name: input_ids})
    latencies.append(time.time() - start)

latencies = np.array(latencies)
model_size = os.path.getsize(model_path) / 1e6

print("ONNX Inference (Single Sample)")
print(f"Model Size: {model_size:.2f} MB")
print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th Percentile Latency: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"99th Percentile Latency: {np.percentile(latencies, 99)*1000:.2f} ms")
print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/sec")

# Batch throughput
batch_size = 16
batch_input = np.tile(input_ids, (batch_size, 1))
batch_latencies = []
num_batches = 50

# Warm-up
for _ in range(5):
    session.run(None, {input_name: batch_input})

for _ in range(num_batches):
    start = time.time()
    session.run(None, {input_name: batch_input})
    batch_latencies.append(time.time() - start)

total_tokens = batch_input.shape[0] * num_batches
batch_fps = total_tokens / np.sum(batch_latencies)

print("\nONNX Batch Inference")
print(f"Batch Size: {batch_size}")
print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
print(f"Avg Batch Latency: {np.mean(batch_latencies)*1000:.2f} ms")


ONNX Inference (Single Sample)
Model Size: 653.66 MB
Median Latency: 11.74 ms
95th Percentile Latency: 11.90 ms
99th Percentile Latency: 11.95 ms
Throughput: 84.95 req/sec

ONNX Batch Inference
Batch Size: 16
Batch Throughput: 115.63 tokens/sec
Avg Batch Latency: 138.37 ms


### Applying Graph Optimisations

In [13]:
import onnxruntime as ort

onnx_model_path = "gpt2.onnx"
optimized_model_path = "gpt2_optimized.onnx"

session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
session_options.optimized_model_filepath = optimized_model_path

# Run once to save optimized model
_ = ort.InferenceSession(onnx_model_path, sess_options=session_options, providers=["CPUExecutionProvider"])

print(f"Saved optimized ONNX model to: {optimized_model_path}")


Saved optimized ONNX model to: gpt2_optimized.onnx


#### Inference for Graph Optimised

In [14]:
import onnxruntime as ort
import numpy as np
import time
import os
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load optimized model
model_path = "gpt2_optimized.onnx"
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Prepare input
prompt = "Symptoms: fever and cough\nQuestion: What should I do?\nAnswer:"
input_ids = tokenizer.encode(prompt, return_tensors="np").astype(np.int64)

input_name = session.get_inputs()[0].name
num_trials = 100

# Warm-up
for _ in range(5):
    session.run(None, {input_name: input_ids})

# Single-sample latency
latencies = []
for _ in range(num_trials):
    start = time.time()
    _ = session.run(None, {input_name: input_ids})
    latencies.append(time.time() - start)

latencies = np.array(latencies)
model_size = os.path.getsize(model_path) / 1e6

print("ONNX Optimized Inference (Single Sample)")
print(f"Model Size: {model_size:.2f} MB")
print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th Percentile Latency: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"99th Percentile Latency: {np.percentile(latencies, 99)*1000:.2f} ms")
print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/sec")

# Batch throughput
batch_size = 16
batch_input = np.tile(input_ids, (batch_size, 1))
batch_latencies = []
num_batches = 50

# Warm-up
for _ in range(5):
    session.run(None, {input_name: batch_input})

for _ in range(num_batches):
    start = time.time()
    session.run(None, {input_name: batch_input})
    batch_latencies.append(time.time() - start)

total_tokens = batch_input.shape[0] * num_batches
batch_fps = total_tokens / np.sum(batch_latencies)

print("\nüì¶ ONNX Optimized Batch Inference")
print(f"Batch Size: {batch_size}")
print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
print(f"Avg Batch Latency: {np.mean(batch_latencies)*1000:.2f} ms")


üìä ONNX Optimized Inference (Single Sample)
Model Size: 653.41 MB
Median Latency: 12.31 ms
95th Percentile Latency: 24.24 ms
99th Percentile Latency: 31.38 ms
Throughput: 73.20 req/sec

üì¶ ONNX Optimized Batch Inference
Batch Size: 16
Batch Throughput: 118.04 tokens/sec
Avg Batch Latency: 135.55 ms


### Dynamic Quatization

In [45]:
import os
import onnx
import onnxruntime as ort
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.model.onnx_model import ONNXModel
from transformers import AutoTokenizer
import numpy as np
import time

# Paths
fp32_model_path = "gpt2_optimized.onnx"
quant_model_path = "gpt2_quantized_dynamic_inc.onnx"

# Load the GPT-2 model into INC
fp32_model = ONNXModel(fp32_model_path)

# Configure dynamic quantization with fallback type info
config = PostTrainingQuantConfig(
    approach="dynamic"
)

# Fit quantized model
q_model = quantization.fit(
    model=fp32_model,
    conf=config
)

# Save it
quant_model_path = "models/gpt2_quantized_static.onnx"
os.makedirs(os.path.dirname(quant_model_path), exist_ok=True)
q_model.save_model_to_file(quant_model_path)
print(f"Static quantized model saved: {quant_model_path}")
print(f" Model size: {os.path.getsize(quant_model_path)/1e6:.2f} MB")



2025-05-05 17:48:17 [INFO] Start auto tuning.
2025-05-05 17:48:17 [INFO] Quantize model without tuning!
2025-05-05 17:48:17 [INFO] Quantize the model with default configuration without evaluating the model.                To perform the tuning process, please either provide an eval_func or provide an                    eval_dataloader an eval_metric.
2025-05-05 17:48:17 [INFO] Adaptor has 5 recipes.
2025-05-05 17:48:17 [INFO] 0 recipes specified by user.
2025-05-05 17:48:17 [INFO] 3 recipes require future tuning.
2025-05-05 17:48:17 [INFO] *** Initialize auto tuning
2025-05-05 17:48:17 [INFO] {
2025-05-05 17:48:17 [INFO]     'PostTrainingQuantConfig': {
2025-05-05 17:48:17 [INFO]         'AccuracyCriterion': {
2025-05-05 17:48:17 [INFO]             'criterion': 'relative',
2025-05-05 17:48:17 [INFO]             'higher_is_better': True,
2025-05-05 17:48:17 [INFO]             'tolerable_loss': 0.01,
2025-05-05 17:48:17 [INFO]             'absolute': None,
2025-05-05 17:48:17 [INFO]     

Static quantized model saved: models/gpt2_quantized_static.onnx
 Model size: 165.39 MB
‚úÖ Quantized GPT-2 model saved to: models/gpt2_quantized_static.onnx
üì¶ Model Size: 165.39 MB


In [21]:
import onnxruntime as ort
import numpy as np
import time
import os
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load quantized model
model_path = "gpt2_quantized_dynamic_inc.onnx"
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Prepare input
prompt = "Symptoms: fever and cough\nQuestion: What should I do?\nAnswer:"
input_ids = tokenizer.encode(prompt, return_tensors="np").astype(np.int64)
input_name = session.get_inputs()[0].name

# Warm-up
for _ in range(5):
    session.run(None, {input_name: input_ids})

# Single-sample latency
num_trials = 100
latencies = []

for _ in range(num_trials):
    start = time.time()
    _ = session.run(None, {input_name: input_ids})
    latencies.append(time.time() - start)

latencies = np.array(latencies)
model_size = os.path.getsize(model_path) / 1e6

print("üìä Quantized GPT-2 (Dynamic)")
print(f"Model Size: {model_size:.2f} MB")
print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th Percentile Latency: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"99th Percentile Latency: {np.percentile(latencies, 99)*1000:.2f} ms")
print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/sec")

# Batch throughput
batch_size = 16
batch_input = np.tile(input_ids, (batch_size, 1))
batch_latencies = []
num_batches = 50

# Warm-up
for _ in range(5):
    session.run(None, {input_name: batch_input})

for _ in range(num_batches):
    start = time.time()
    session.run(None, {input_name: batch_input})
    batch_latencies.append(time.time() - start)

total_tokens = batch_input.shape[0] * num_batches
batch_fps = total_tokens / np.sum(batch_latencies)

print("\nüì¶ Quantized Batch Inference")
print(f"Batch Size: {batch_size}")
print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
print(f"Avg Batch Latency: {np.mean(batch_latencies)*1000:.2f} ms")


üìä Quantized GPT-2 (Dynamic)
Model Size: 165.39 MB
Median Latency: 7.50 ms
95th Percentile Latency: 7.81 ms
99th Percentile Latency: 8.97 ms
Throughput: 132.02 req/sec

üì¶ Quantized Batch Inference
Batch Size: 16
Batch Throughput: 152.59 tokens/sec
Avg Batch Latency: 104.85 ms


### Static Quantisation

In [32]:
from transformers import AutoTokenizer
import numpy as np
import torch

class GPT2CalibrationDataLoader:
    def __init__(self, tokenizer, batch_size=8, num_batches=10):
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.prompt = "Symptoms: fever and cough. Question: What should I do?"

    def __len__(self):
        return self.num_batches

    def __iter__(self):
        for _ in range(self.num_batches):
            input_ids = self.tokenizer.encode(self.prompt, return_tensors="np").astype(np.int64)
            batch = np.tile(input_ids, (self.batch_size, 1))  # shape (B, T)
            dummy_labels = np.zeros((self.batch_size,))       # üü° placeholder
            yield (batch, dummy_labels)


In [44]:
import os
import onnx
from transformers import AutoTokenizer
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion
from neural_compressor.model.onnx_model import ONNXModel

# Override: Use float type when type inference fails
os.environ["INC_ONNX_DEFAULT_TENSOR_TYPE"] = str(onnx.TensorProto.FLOAT)

# Prepare calibration loader
tokenizer = AutoTokenizer.from_pretrained("gpt2")
calib_loader = GPT2CalibrationDataLoader(tokenizer=tokenizer)

# Prepare config
config = PostTrainingQuantConfig(
    approach="static",
    calibration_sampling_size=calib_loader.batch_size,
    excluded_precisions=[],
    accuracy_criterion=AccuracyCriterion(tolerable_loss=0.05, higher_is_better=True)
)

# Load model
fp32_model_path = "gpt2_optimized.onnx"
quant_model_path = "models/gpt2_quantized_static.onnx"
fp32_model = ONNXModel(fp32_model_path)

# Quantize
q_model = quantization.fit(
    model=fp32_model,
    conf=config,
    calib_dataloader=calib_loader
)

# Save
q_model.save_model_to_file(quant_model_path)
print(f"Static quantized model saved: {quant_model_path}")
print(f" Model size: {os.path.getsize(quant_model_path)/1e6:.2f} MB")


2025-05-05 17:45:47 [INFO] Start auto tuning.
2025-05-05 17:45:47 [INFO] Quantize model without tuning!
2025-05-05 17:45:47 [INFO] Quantize the model with default configuration without evaluating the model.                To perform the tuning process, please either provide an eval_func or provide an                    eval_dataloader an eval_metric.
2025-05-05 17:45:47 [INFO] Adaptor has 5 recipes.
2025-05-05 17:45:47 [INFO] 0 recipes specified by user.
2025-05-05 17:45:47 [INFO] 3 recipes require future tuning.
2025-05-05 17:45:47 [INFO] *** Initialize auto tuning
2025-05-05 17:45:47 [INFO] {
2025-05-05 17:45:47 [INFO]     'PostTrainingQuantConfig': {
2025-05-05 17:45:47 [INFO]         'AccuracyCriterion': {
2025-05-05 17:45:47 [INFO]             'criterion': 'relative',
2025-05-05 17:45:47 [INFO]             'higher_is_better': True,
2025-05-05 17:45:47 [INFO]             'tolerable_loss': 0.05,
2025-05-05 17:45:47 [INFO]             'absolute': None,
2025-05-05 17:45:47 [INFO]     

FileNotFoundError: [Errno 2] No such file or directory: 'models/gpt2_quantized_static.onnx'

In [34]:
import onnxruntime as ort
import numpy as np
import time
import os
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load optimized model
model_path = "gpt2_quantized_static.onnx"
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Prepare input
prompt = "Symptoms: fever and cough\nQuestion: What should I do?\nAnswer:"
input_ids = tokenizer.encode(prompt, return_tensors="np").astype(np.int64)

input_name = session.get_inputs()[0].name
num_trials = 100

# Warm-up
for _ in range(5):
    session.run(None, {input_name: input_ids})

# Single-sample latency
latencies = []
for _ in range(num_trials):
    start = time.time()
    _ = session.run(None, {input_name: input_ids})
    latencies.append(time.time() - start)

latencies = np.array(latencies)
model_size = os.path.getsize(model_path) / 1e6

print("üìä ONNX Optimized Inference (Single Sample)")
print(f"Model Size: {model_size:.2f} MB")
print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th Percentile Latency: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"99th Percentile Latency: {np.percentile(latencies, 99)*1000:.2f} ms")
print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/sec")

# Batch throughput
batch_size = 16
batch_input = np.tile(input_ids, (batch_size, 1))
batch_latencies = []
num_batches = 50

# Warm-up
for _ in range(5):
    session.run(None, {input_name: batch_input})

for _ in range(num_batches):
    start = time.time()
    session.run(None, {input_name: batch_input})
    batch_latencies.append(time.time() - start)

total_tokens = batch_input.shape[0] * num_batches
batch_fps = total_tokens / np.sum(batch_latencies)

print("\nüì¶ ONNX Optimized Batch Inference")
print(f"Batch Size: {batch_size}")
print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
print(f"Avg Batch Latency: {np.mean(batch_latencies)*1000:.2f} ms")


üìä ONNX Optimized Inference (Single Sample)
Model Size: 165.23 MB
Median Latency: 7.90 ms
95th Percentile Latency: 7.92 ms
99th Percentile Latency: 7.93 ms
Throughput: 126.63 req/sec

üì¶ ONNX Optimized Batch Inference
Batch Size: 16
Batch Throughput: 139.42 tokens/sec
Avg Batch Latency: 114.76 ms


### Different Execution Providers

In [40]:
import onnxruntime as ort
import numpy as np
import time
import os
from transformers import AutoTokenizer

def benchmark_onnx_model(model_path, provider="CPUExecutionProvider", prompt=None, batch_size=16, num_trials=100, num_batches=50):
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    prompt = prompt or "Symptoms: fever and cough\nQuestion: What should I do?\nAnswer:"
    input_ids = tokenizer.encode(prompt, return_tensors="np").astype(np.int64)

    session = ort.InferenceSession(model_path, providers=[provider])
    input_name = session.get_inputs()[0].name

    # Warm-up
    for _ in range(5):
        session.run(None, {input_name: input_ids})

    # Single-sample latency
    latencies = []
    for _ in range(num_trials):
        start = time.time()
        _ = session.run(None, {input_name: input_ids})
        latencies.append(time.time() - start)
    latencies = np.array(latencies)

    # Model size
    model_size = os.path.getsize(model_path) / 1e6

    print(f"\nüìä Benchmark ‚Äî {os.path.basename(model_path)} on {provider}")
    print(f"Model Size: {model_size:.2f} MB")
    print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
    print(f"95th Percentile: {np.percentile(latencies, 95)*1000:.2f} ms")
    print(f"99th Percentile: {np.percentile(latencies, 99)*1000:.2f} ms")
    print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/sec")

    # Batch throughput
    batch_input = np.tile(input_ids, (batch_size, 1))
    for _ in range(5):
        session.run(None, {input_name: batch_input})

    batch_latencies = []
    for _ in range(num_batches):
        start = time.time()
        session.run(None, {input_name: batch_input})
        batch_latencies.append(time.time() - start)

    total_tokens = batch_input.shape[0] * num_batches
    batch_fps = total_tokens / np.sum(batch_latencies)

    print(f"\nüì¶ Batch Size: {batch_size}")
    print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
    print(f"Avg Batch Latency: {np.mean(batch_latencies)*1000:.2f} ms")


In [None]:
import onnxruntime as ort
print("Available Providers:", ort.get_available_providers())

In [7]:
model_path = "gpt2_quantized_static.onnx"
for provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "TensorrtExecutionProvider"]:
    if provider in ort.get_available_providers():
        benchmark_onnx_model(model_path, provider=provider)
    else:
        print(f"‚ùå {provider} not available on this machine.")


ONNX Inference Latency (median): 9.14 ms
Throughput: 108.20 req/s


'CPU'

In [43]:
onnx_model_path = "gpt2.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=['mps'])
benchmark_onnx_model(onnx_model_path, provider='mps')
ort.get_device()

*************** EP Error ***************
EP Error Unknown Provider Type: mps when using ['mps']
Falling back to ['CPUExecutionProvider'] and retrying.
****************************************
*************** EP Error ***************
EP Error Unknown Provider Type: mps when using ['mps']
Falling back to ['CPUExecutionProvider'] and retrying.
****************************************

üìä Benchmark ‚Äî gpt2.onnx on mps
Model Size: 653.66 MB
Median Latency: 11.71 ms
95th Percentile: 12.18 ms
99th Percentile: 14.61 ms
Throughput: 84.47 req/sec

üì¶ Batch Size: 16
Batch Throughput: 117.91 tokens/sec
Avg Batch Latency: 135.70 ms


'CPU'

### Can run these tests of different execution providers on all the Models

In [None]:
model_variants = [
    "gpt2.onnx",
    "gpt2_optimized.onnx",
    "gpt2_quantized_dynamic.onnx",
    "gpt2_quantized_static.onnx"
]

for model in model_variants:
    for provider in ["CPUExecutionProvider", "CUDAExecutionProvider", "TensorrtExecutionProvider"]:
        if provider in ort.get_available_providers():
            benchmark_onnx_model(model, provider=provider)
