### Setup

In [1]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### Excercise 1

In [2]:
from transformers import AutoModel, AutoTokenizer

model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Skipping import of cpp extensions due to incompatible torch version 2.9.1 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info
W1120 13:44:32.172000 26245 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [15]:
tokenizer.save_pretrained("models/tokenizer")

('models/tokenizer/tokenizer_config.json',
 'models/tokenizer/special_tokens_map.json',
 'models/tokenizer/vocab.txt',
 'models/tokenizer/added_tokens.json',
 'models/tokenizer/tokenizer.json')

In [3]:
from time import perf_counter
from functools import wraps
import numpy as np

def measure_func_time(trails: int):

    def decorator(func):

        @wraps(func)
        def wrapper(*args, **kwargs) -> float:
            
            time_meaures = []
            for i in range(trails):
                start = perf_counter()
                _ = func(*args, **kwargs)
                end = perf_counter()
                elapsed = end - start
                time_meaures.append(elapsed)

            mean_time = np.mean(time_meaures).item()
            print(f"[{func.__name__}] Average function runtime: {mean_time:.4f} seconds")
            return mean_time
        
        return wrapper
    
    return decorator

In [4]:
import torch

@measure_func_time(trails=100)
def basic_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output

@measure_func_time(trails=100)
@torch.no_grad()
def no_grad_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output

@measure_func_time(trails=100)
@torch.inference_mode()
def inference_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output

In [5]:
text = "How many people live in London?"

encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

In [6]:
model.train()
reference = basic_forward(encoded, model);

model.eval()
eval_mode_time = basic_forward(encoded, model);
no_grad_time = no_grad_forward(encoded, model);
inference_time = inference_forward(encoded, model);

[basic_forward] Average function runtime: 0.0407 seconds
[basic_forward] Average function runtime: 0.0356 seconds
[no_grad_forward] Average function runtime: 0.0345 seconds
[inference_forward] Average function runtime: 0.0331 seconds


In [7]:
print(f"Eval speedup: {(reference / eval_mode_time):.4f}")
print(f"No grad speedup: {(reference / no_grad_time):.4f}")
print(f"Inference speedup: {(reference / inference_time):.4f}")

Eval speedup: 1.1434
No grad speedup: 1.1817
Inference speedup: 1.2285


#### Excercise 2

In [8]:
start = perf_counter()
model.eval()
compiled_model = torch.compile(model)
warmup = compiled_model(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])
elapsed = perf_counter() - start

print(f"Model compilation and first inference took: {elapsed:.4f} seconds")

Model compilation and first inference took: 10.0339 seconds


In [9]:
compiled_model_time = inference_forward(encoded, compiled_model)
print(f"Compilation speedup: {(reference / compiled_model_time):.4f}")

[inference_forward] Average function runtime: 0.0760 seconds
Compilation speedup: 0.5355


#### Excercise 3

In [10]:
from copy import deepcopy
import torchao
from torchao.quantization.quant_api import Int8DynamicActivationInt8WeightConfig

compiled_model = compiled_model.to("cpu")
quantized_model = deepcopy(compiled_model)

torchao.quantization.quantize_(
    quantized_model,
    Int8DynamicActivationInt8WeightConfig()
)

In [11]:
torch.save(compiled_model.state_dict(), "compiled_model.pt")
torch.save(quantized_model.state_dict(), "quantized_model.pt")

In [12]:
!du -sh *.pt

418M	compiled_model.pt
173M	quantized_model.pt


In [13]:
inference_forward(encoded, compiled_model);
inference_forward(encoded, quantized_model);

[inference_forward] Average function runtime: 0.0643 seconds
[inference_forward] Average function runtime: 0.1503 seconds


#### Excercise 6

1.

In [None]:
import torch
import torch.onnx


model_cpu = model.eval().cpu()
sample_input = tokenizer(
    "This is a sample input text for ONNX export.",
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Export to ONNX format
torch.onnx.export(
    model_cpu,
    (sample_input["input_ids"], sample_input["attention_mask"]),
    "models/model.onnx",
    opset_version=17,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
)

In [17]:
sample_input = tokenizer(
    "This is a sample input text for ONNX inference.",
    padding=True,
    truncation=True,
    return_tensors="np",
)

inputs_onnx = {
    "input_ids": sample_input["input_ids"],
    "attention_mask": sample_input["attention_mask"],
}

In [50]:
import onnxruntime as ort

def create_basic_session():
    return ort.InferenceSession("models/model.onnx")

@measure_func_time(trails=100)
def cold_start_online():
    ort_session = create_basic_session()
    ort_session.run(None, inputs_onnx)

cold_start_online();

[cold_start_online] Inference time: 0.1539 seconds


In [51]:
import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
sess_options.optimized_model_filepath = "models/model_optimized.onnx"
ort.InferenceSession("models/model.onnx", sess_options)

<onnxruntime.capi.onnxruntime_inference_collection.InferenceSession at 0x37bb95d90>

In [52]:
def create_optimized_session():
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL

    ort_session_optimized = ort.InferenceSession(
        "models/model_optimized.onnx", 
        sess_options=sess_options, 
        providers=['CPUExecutionProvider']
    )
    return ort_session_optimized

@measure_func_time(trails=100)
def cold_start_offline():
    session = create_optimized_session()
    session.run(None, inputs_onnx)

cold_start_offline();

[cold_start_offline] Inference time: 0.1430 seconds


2.

In [55]:
@measure_func_time(trails=10_000)
def onnx_inference(inputs, session):
    outputs = session.run(None, inputs)
    return outputs

In [85]:
onnx_inference(inputs_onnx, create_basic_session())
onnx_inference(inputs_onnx, create_optimized_session());

[onnx_inference] Inference time: 0.0064 seconds
[onnx_inference] Inference time: 0.0064 seconds


3.

In [51]:
!docker images

REPOSITORY       TAG       IMAGE ID       CREATED         SIZE
apis-torch_api   latest    47ec3cdabe5c   2 minutes ago   2.47GB
apis-onnx_api    latest    709d91e87347   4 minutes ago   1.54GB


4.

In [60]:
import requests

@measure_func_time(trails=100)
def test_api(address: str, inputs: dict):
    requests.post(address, json=inputs)


base_address = "http://localhost:{}/inference"
onnx_port = 8000
torch_port = 8001

onnx_address = base_address.format(onnx_port)
torch_address = base_address.format(torch_port)

inputs_dict = {
    "input_ids": sample_input["input_ids"].tolist(),
    "attention_mask": sample_input["attention_mask"].tolist(),
}

test_api(onnx_address, {"text": "This is a text sentence, to check how the latency of the api"})
test_api(torch_address, {"text": "This is a text sentence, to check how the latency of the api"});

[test_api] Average function runtime: 0.0310 seconds
[test_api] Average function runtime: 0.0944 seconds
