### Setup

In [1]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### Excercise 1

In [2]:
from transformers import AutoModel, AutoTokenizer

model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Skipping import of cpp extensions due to incompatible torch version 2.9.1 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info
W1119 19:21:20.959000 48575 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [3]:
from time import perf_counter
from functools import wraps
import numpy as np

def measure_func_time(trails: int):

    def decorator(func):

        @wraps(func)
        def wrapper(*args, **kwargs) -> float:
            
            time_meaures = []
            for i in range(trails):
                start = perf_counter()
                _ = func(*args, **kwargs)
                end = perf_counter()
                elapsed = end - start
                time_meaures.append(elapsed)

            mean_time = np.mean(time_meaures).item()
            print(f"[{func.__name__}] Inference time: {mean_time:.4f} seconds")
            return mean_time
        
        return wrapper
    
    return decorator

In [4]:
import torch

@measure_func_time(trails=100)
def basic_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output

@measure_func_time(trails=100)
@torch.no_grad()
def no_grad_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output

@measure_func_time(trails=100)
@torch.inference_mode()
def inference_forward(inputs: dict[str, torch.Tensor], model: AutoModel):
    output = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )
    return output

In [5]:
text = "How many people live in London?"

encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

In [6]:
model.train()
reference = basic_forward(encoded, model);

model.eval()
eval_mode_time = basic_forward(encoded, model);
no_grad_time = no_grad_forward(encoded, model);
inference_time = inference_forward(encoded, model);

[basic_forward] Inference time: 0.0339 seconds
[basic_forward] Inference time: 0.0328 seconds
[no_grad_forward] Inference time: 0.0328 seconds
[inference_forward] Inference time: 0.0322 seconds


In [7]:
print(f"Eval speedup: {(reference / eval_mode_time):.4f}")
print(f"No grad speedup: {(reference / no_grad_time):.4f}")
print(f"Inference speedup: {(reference / inference_time):.4f}")

Eval speedup: 1.0329
No grad speedup: 1.0333
Inference speedup: 1.0502


#### Excercise 2

In [8]:
start = perf_counter()
model.eval()
compiled_model = torch.compile(model)
warmup = compiled_model(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])
elapsed = perf_counter() - start

print(f"Model compilation and first inference took: {elapsed:.4f} seconds")

Model compilation and first inference took: 6.9826 seconds


In [9]:
compiled_model_time = inference_forward(encoded, compiled_model)
print(f"Compilation speedup: {(reference / compiled_model_time):.4f}")

[inference_forward] Inference time: 0.0622 seconds
Compilation speedup: 0.5447


#### Excercise 3

In [10]:
from copy import deepcopy
import torchao
from torchao.quantization.quant_api import Int8DynamicActivationInt8WeightConfig

compiled_model = compiled_model.to("cpu")
quantized_model = deepcopy(compiled_model)

torchao.quantization.quantize_(
    quantized_model,
    Int8DynamicActivationInt8WeightConfig()
)

In [11]:
torch.save(compiled_model.state_dict(), "compiled_model.pt")
torch.save(quantized_model.state_dict(), "quantized_model.pt")

In [12]:
!du -sh *.pt

432M	compiled_model.pt
176M	quantized_model.pt


In [14]:
inference_forward(encoded, compiled_model);
inference_forward(encoded, quantized_model);

[inference_forward] Inference time: 0.0313 seconds
[inference_forward] Inference time: 0.0420 seconds


#### Excercise 4