In [1]:
import torch._dynamo as torchdynamo

print(torchdynamo.list_backends())

['ansor', 'aot_autograd', 'aot_cudagraphs', 'aot_eager', 'aot_inductor_debug', 'aot_print', 'aot_ts', 'aot_ts_nvfuser', 'aot_ts_nvfuser_nodecomps', 'cudagraphs', 'cudagraphs_ts', 'cudagraphs_ts_ofi', 'eager', 'fx2trt', 'inductor', 'ipex', 'nnc', 'nnc_ofi', 'nvprims_aten', 'nvprims_nvfuser', 'ofi', 'onednn', 'onnx2tensorrt', 'onnx2tensorrt_alt', 'onnx2tf', 'onnxrt', 'onnxrt_cpu', 'onnxrt_cpu_numpy', 'onnxrt_cuda', 'static_runtime', 'taso', 'tensorrt', 'torch2trt', 'torchxla_trace_once', 'torchxla_trivial', 'ts', 'ts_nvfuser', 'ts_nvfuser_ofi', 'tvm', 'tvm_meta_schedule']


## Random example

In [2]:
import torch

@torchdynamo.optimize("inductor")
def fn(x, y):
    a = torch.cos(x)
    b = torch.sin(y)
    return a + b


In [3]:
x = torch.randn(3, 5)
y = torch.randn(3, 5)

fn(x, y)

tensor([[ 0.0823,  1.1241,  0.3413,  0.5874,  0.4794],
        [-0.1441,  1.4301,  0.2987, -0.2521,  1.3261],
        [ 0.7339,  0.0656,  0.3527,  1.9016, -0.0326]])

In [4]:
%%timeit
fn(x, y)

10.1 µs ± 144 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


> ResetRequired: 
Must call `torch._dynamo.reset()` before changing backends.  Detected two calls to
`torch._dynamo.optimize(...)` with a different backend compiler arguments.

In [14]:
import torch

torchdynamo.reset()

@torchdynamo.optimize("fx2trt")
def fn(x, y):
    a = torch.cos(x)
    b = torch.sin(y)
    return a + b


In [None]:
x = torch.randn(3, 5)
y = torch.randn(3, 5)

fn(x, y)

> ModuleNotFoundError: No module named 'torch_tensorrt'

To use `fx2trt` backend, you need to install `torch-tensorrt` package.

In [18]:
import torch

torchdynamo.reset()

@torchdynamo.optimize("onnxrt_cuda")
def fn(x, y):
    a = torch.cos(x)
    b = torch.sin(y)
    return a + b


In [None]:
x = torch.randn(3, 5)
y = torch.randn(3, 5)

fn(x, y)

> ModuleNotFoundError: No module named 'onnxruntime'

To use `onnxrt` backend, you need to install `onnxruntime` package.

## Transformers example

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english").to(device)


Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [29]:
import torch._dynamo as torchdynamo
from torch._dynamo.optimizations import backends

optimizer = torchdynamo.optimize("inductor")

In [18]:
len(input_ids[0])

10

In [32]:
payload="I like you. I love" * 21
payload="I like you. I love" 

input_ids = tokenizer(payload ,return_tensors="pt")["input_ids"].to(device)


# print payload lenght
print(f"length: {input_ids.shape}")


def vanilla_inf(input_ids):
    with torch.inference_mode():
      logits = model(input_ids=input_ids).logits.softmax(-1)[0]
      pred = logits.tolist()
      return pred

vanilla_inf(input_ids=input_ids)

length: torch.Size([1, 8])


[0.0001205497101182118, 0.9998794794082642]

In [33]:
%%timeit
vanilla_inf(input_ids=input_ids)

4.22 ms ± 83.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
torchdynamo.reset()


@torchdynamo.optimize("inductor")
def dyn_inf(input_ids):
    logits = model(input_ids=input_ids).logits.softmax(-1)[0]
    pred = logits.tolist()
    return pred
    
dyn_inf(input_ids)

[0.0001205498119816184, 0.9998794794082642]

In [35]:
%%timeit
dyn_inf(input_ids)

2.07 ms ± 17.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Pipeline example

In [105]:
from transformers import pipeline

clf = pipeline("text-classification",device=0)

payload = "I like you. I love you"

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [106]:
%%timeit
clf(payload)

4.95 ms ± 70.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
torchdynamo.reset()


@torchdynamo.optimize("inductor")
def dyn_p(payload):
    logits = clf(payload)
    return logits
    
dyn_p(input_ids)


> ReferenceError: weakly-referenced object no longer exists


## Create Abstract model class for TorchDynamo

In [108]:
from torch import nn
from typing import Optional
import torch
import torch._dynamo as torchdynamo


class TorchDynamoModel(nn.Module):
    def __init__(self, transformer: nn.Module = None, optimizer: str = None):
        super(TorchDynamoModel, self).__init__()
        self.transformer = transformer
        self.config = transformer.config
        self.optimizer = optimizer
        self.forward = self.create_optimized_forward(optimizer)
    
    def create_optimized_forward(self, optimizer):
      torchdynamo.reset()

      @torchdynamo.optimize(optimizer)
      def new_fwd( *args, **kwargs):
        return self.transformer(*args, **kwargs)
      
      return new_fwd
    

trt = TorchDynamoModel(transformer=model,optimizer="inductor")
trt(input_ids)
  

SequenceClassifierOutput(loss=None, logits=tensor([[-4.3282,  4.6951]], device='cuda:0',
       grad_fn=<CompiledFunctionBackward>), hidden_states=None, attentions=None)

In [109]:
%%timeit
trt(input_ids)

1.92 ms ± 27.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [136]:
%%timeit
model(input_ids)

6.43 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Test pipeline with `TorchDynamoModel`

In [None]:
dyn_clf = pipeline("text-classification", model=trt, tokenizer=tokenizer, framework="pt", device=0)

In [114]:
%%timeit
dyn_clf(payload)

2.08 ms ± 13.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [113]:
%%timeit
clf(payload)



5 ms ± 75.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


it will "re-optimize" once the input length changes 

In [115]:
dyn_clf("I hate you")
# 15s



[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [117]:
dyn_clf("I hate you")
# 0.2s



[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]

## Test Accelerate approach 

based on https://github.com/huggingface/accelerate/pull/829/files#diff-2d7515874eaecac2687c7fc1a9c720be53f802bf14b4c3dcebe14ad443d075dc

In [2]:
import torch._dynamo as dynamo
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english").to(device)

optimizer = "inductor"
o_model = dynamo.optimize(optimizer)(model)

o_clf = pipeline("text-classification", model=o_model, tokenizer=tokenizer, framework="pt", device=0)

In [3]:
o_model

<torch._dynamo.eval_frame._TorchDynamoContext.__call__.<locals>.TorchDynamoNNModuleWrapper at 0x7f368d2a0070>

In [9]:
payload="I like you. I love you" 

input_ids = tokenizer(payload ,return_tensors="pt")["input_ids"].to(device)

In [10]:
%%timeit
o_model(input_ids)

477 µs ± 113 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
o_clf(payload)



4.86 ms ± 18.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Results

| kind                             | model latency | pipeline latency | payload              |
|----------------------------------|---------------|------------------|----------------------|
| vanilla transformers             | 4.22ms        | 5ms              | "I like you. I love" |
| TorchDynamoModel (wrapped fwd)   | 1.92ms        | 2.08ms           | "I like you. I love" |
| accelerate (_TorchDynamoContext) | 1.93ms        | 4.95ms           | "I like you. I love" |