### Setup and model loading

In [1]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from transformers import AutoModel, AutoTokenizer

model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [3]:
from time import perf_counter
from functools import wraps
import numpy as np

def measure_func_time(trails: int):

    def decorator(func):

        @wraps(func)
        def wrapper(*args, **kwargs) -> float:

            time_meaures = []
            for i in range(trails):
                start = perf_counter()
                _ = func(*args, **kwargs)
                end = perf_counter()
                elapsed = end - start
                time_meaures.append(elapsed)

            mean_time = np.mean(time_meaures).item()
            print(f"[{func.__name__}] Inference time: {mean_time:.4f} seconds")
            return mean_time

        return wrapper

    return decorator

In [4]:
import torch


@measure_func_time(trails=1000)
@torch.no_grad()
def inference_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output


@measure_func_time(trails=1000)
@torch.inference_mode()
@torch.autocast(device_type="cuda")
def autocast_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output

#### Excercise 4

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

text = "Example sentence for testing onnx and onnxruntime"

inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
inputs_gpu = {k: v.to(device) for k,v in inputs.items()}
model.to(device);

cuda


In [6]:
compiled_default = torch.compile(model)
compiled_cudagraphs = torch.compile(model, mode="max-autotune")
compiled_no_cudagraphs = torch.compile(model, mode="max-autotune-no-cudagraphs")

In [7]:
outputs = compiled_default(**inputs_gpu)
outputs = compiled_cudagraphs(**inputs_gpu)
outputs = compiled_no_cudagraphs(**inputs_gpu)


In [8]:
reference = inference_forward(inputs_gpu, compiled_default);
cudagraphs_time = inference_forward(inputs_gpu, compiled_cudagraphs);
no_cudagraphs_time = inference_forward(inputs_gpu, compiled_no_cudagraphs);

[inference_forward] Inference time: 0.0061 seconds




[inference_forward] Inference time: 0.0092 seconds
[inference_forward] Inference time: 0.0073 seconds


In [9]:
print(f"Cuda graphs speedup: {(reference / cudagraphs_time):.4f}")
print(f"No cuda graphs speedup: {(reference / no_cudagraphs_time):.4f}")

Cuda graphs speedup: 0.6667
No cuda graphs speedup: 0.8438


In [10]:
text = """
In the quiet town of Eldermoor, nestled between
rolling hills and misty forests, life had a rhythm dictated by the seasons.
The townspeople rose with the sun, tending to gardens, markets, and workshops that had existed for generations.
Among them, Maren, a young apothecary’s apprentice, was known not only for her curiosity but also for her relentless
questioning of tradition. While others accepted the prescriptions handed down through centuries, Maren wondered whether
the roots and herbs could yield more potent remedies if prepared differently, or if forgotten knowledge might be hiding
in the old manuscripts stored in the town library’s dusty alcoves.
"""

inputs2= tokenizer(text, padding=True, truncation=True, return_tensors="pt")
inputs_gpu2 = {k: v.to(device) for k,v in inputs2.items()}

In [11]:
reference = inference_forward(inputs_gpu2, compiled_default);
cudagraphs_time = inference_forward(inputs_gpu2, compiled_cudagraphs);
no_cudagraphs_time = inference_forward(inputs_gpu2, compiled_no_cudagraphs);

W1120 00:16:53.343000 5853 torch/_inductor/utils.py:1436] [0/6] Not enough SMs to use max_autotune_gemm mode


[inference_forward] Inference time: 0.0346 seconds


AUTOTUNE addmm(133x768, 133x768, 768x768)
strides: [0, 1], [768, 1], [1, 768]
dtypes: torch.float32, torch.float32, torch.float32
  addmm 0.1351 ms 100.0% 
  bias_addmm 0.1413 ms 95.6% 
SingleProcess AUTOTUNE benchmarking takes 0.0449 seconds and 0.0003 seconds precompiling for 2 choices
W1120 00:18:02.236000 5853 torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
W1120 00:18:02.236000 5853 torch/_dynamo/convert_frame.py:1016] [0/8]    function: 'forward' (/usr/local/lib/python3.12/dist-packages/transformers/models/mpnet/modeling_mpnet.py:449)
W1120 00:18:02.236000 5853 torch/_dynamo/convert_frame.py:1016] [0/8]    last reason: 0/7: 
W1120 00:18:02.236000 5853 torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1120 00:18:02.236000 5853 torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


[inference_forward] Inference time: 0.0477 seconds
[inference_forward] Inference time: 0.0130 seconds


In [12]:
print(f"Cuda graphs speedup: {(reference / cudagraphs_time):.4f}")
print(f"No cuda graphs speedup: {(reference / no_cudagraphs_time):.4f}")

Cuda graphs speedup: 0.7255
No cuda graphs speedup: 2.6620


#### Excercise 5

In [15]:
capability = torch.cuda.get_device_capability()
print(f"CUDA device capability: {capability}")

# Tensor Cores are available on NVidia GPUs with CUDA >= 7 (e.g. Volta, Turing, Ampere, Hopper)
if capability >= (7, 0):
    print("Tensor Cores available: fast float16 supported.")
else:
    print("Tensor Cores not available: float16 may be slow or unsupported.")

CUDA device capability: (7, 5)
Tensor Cores available: fast float16 supported.


In [16]:
from copy import deepcopy

model.to("cuda")
model_half = deepcopy(model)
model_half = model_half.half().to('cuda')


In [17]:
full_precision_time = inference_forward(inputs_gpu, model);
half_precision_time = inference_forward(inputs_gpu, model_half);
autocast_time = autocast_forward(inputs_gpu, model);

[inference_forward] Inference time: 0.0125 seconds
[inference_forward] Inference time: 0.0184 seconds
[autocast_forward] Inference time: 0.0194 seconds


In [18]:
full_precision_time = inference_forward(inputs_gpu2, model);
half_precision_time = inference_forward(inputs_gpu2, model_half);
autocast_time = autocast_forward(inputs_gpu2, model);

[inference_forward] Inference time: 0.0127 seconds
[inference_forward] Inference time: 0.0094 seconds
[autocast_forward] Inference time: 0.0114 seconds


In practice I would choose the fastest one, but before I would have to doublecheck that changing the precision does not affect model accuracy too much.