## Testing TorchDynamo to speed up Pegasus inference

In [1]:
!nvidia-smi

Thu Aug 11 12:46:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   31C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## General utils

In [15]:
from time import perf_counter
import numpy as np

def measure_latency(payload,model, tokenizer,num_beams=1, n_warmup=2, n_runs=50):
    """Benchmark ``generate_from_model`` and summarise its wall-clock latency.

    Args:
        payload: Input text forwarded unchanged to ``generate_from_model``.
        model: Model object forwarded unchanged to ``generate_from_model``.
        tokenizer: Tokenizer forwarded unchanged to ``generate_from_model``.
        num_beams: Beam count forwarded to ``generate_from_model``.
        n_warmup: Number of untimed calls made before measuring (default 2,
            the value that used to be hard-coded).
        n_runs: Number of timed calls (default 50, the value that used to be
            hard-coded). Must be at least 1.

    Returns:
        A ``(summary, p95_ms)`` tuple: ``summary`` is a human-readable string
        with the 95th percentile, mean and standard deviation in milliseconds,
        and ``p95_ms`` is the 95th percentile on its own as a number.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1, because no statistics can
            be computed from an empty sample.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # warm up: results are discarded, only the side effect of running matters
    for _ in range(n_warmup):
        generate_from_model(payload, model, tokenizer, num_beams)

    # Timed run
    latencies = []
    for _ in range(n_runs):
        start_time = perf_counter()
        generate_from_model(payload, model, tokenizer, num_beams)
        latencies.append(perf_counter() - start_time)

    # Compute run statistics (perf_counter returns seconds, hence the * 1000)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies, 95)

    # The backslash is escaped explicitly: the previous "\-" was an invalid
    # escape sequence. The emitted text (a literal backslash followed by "-")
    # is unchanged, so previously recorded outputs still match.
    summary = (
        f"P95 latency (ms) - {time_p95_ms}; "
        f"Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};"
    )
    return summary, time_p95_ms

def generate_from_model(payload,model, tokenizer,num_beams=1):
    """Tokenize ``payload``, run ``model.generate`` and decode the first result.

    Args:
        payload: Text passed to ``tokenizer``.
        model: Object exposing ``generate(input_ids=..., num_beams=...)``.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry,
            and exposing ``decode``.
        num_beams: Forwarded to ``model.generate``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the first generated sequence,
        decoded with ``skip_special_tokens=True``.
    """
    encoded_input = tokenizer(payload, return_tensors='pt', pad_to_multiple_of=8)

    # Prefer the device the model reports for itself so this helper does not
    # depend on a notebook-level `device` variable that is only defined in a
    # later cell. Fall back to that global for wrapped models that do not
    # expose a `.device` attribute.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    output_sequences = model.generate(
        input_ids=encoded_input['input_ids'].to(target_device),
        num_beams=num_beams,
    )
    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)


## Regular transformers

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The tokenizer and the model weights are loaded from the same checkpoint id.
checkpoint = "RobertoFont/pegasus-large-samsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

In [10]:
# Benchmark input: one chat-style dialogue, repeated three times (`* 3`) to
# produce a long sequence.
payload="Renee: Just saying Hi. Thought of you this morning.Layla misses you. She is having knee surgery. Hope you are doing well. Rachel: Renee! Hey! Whoa! So crazy that you wrote. I was literally thinking of you the other day as well. Rachel: I am doing really well, getting settled here and everything. Looking for a new job. Rachel: Layla is getting knew surgery?! What happened? Renee: Her arthritis got really bad. I saw her limping every time we went out for a walk, and so I took her to the vet and they decided she needs surgery. Rachel: Oh wow. That is really intense. I am sending her lots of love. Renee: Thanks Rachel. Renee: Here is a pic of Layla from this morning. Renee: <file_picture> Renee: She is all cute in her usual spot. Rachel: Ha! Yeah, I remember, she would always try to fit in there even though she's obviously too big Rachel: 😂 Renee: Yeah, she's always getting into some sort of silly situations. Renee: What kind of work are you looking for? Rachel: Just the usual, something with teaching. I am not too stressed yet, I have some savings. Renee: That's good, yeah, you don't want to have to feel pressured into taking a certain job. Renee: Just so you know, if you are ever back, you are more than welcome to come back and work for me. Renee: Layla is definitely missing you taking her on walks everyday. Rachel: Aw, yeah, I miss you guys too. Of course If I am ever back I am" * 3

# Report how many token ids the tokenizer produces for the payload
# (the recorded run printed 1023).
print(f"Payload token length {len(tokenizer(payload)['input_ids'])}")

Payload token length 1023


In [11]:
# Sanity check: run one plain generation outside the benchmark helper and
# display the decoded text.
encoded = tokenizer(payload, return_tensors="pt")
input_ids = encoded["input_ids"].to(device)
# NOTE: despite its name, `logits` holds the output of `model.generate`,
# which is what gets decoded to text below.
logits = model.generate(input_ids=input_ids)
tokenizer.decode(logits[0], skip_special_tokens=True)


"Renee's dog Layla is having knee surgery. Rachel is looking for a new job."

In [17]:
# Baseline run with the helper's default num_beams=1; display only the
# summary string (first element of the returned tuple).
greedy_stats = measure_latency(payload, model, tokenizer)
greedy_stats[0]

'P95 latency (ms) - 464.1055683999639; Average latency (ms) - 455.84 +\\- 5.26;'

In [16]:
# Baseline run with 5 beams; display only the summary string (first element
# of the returned tuple).
beam_stats = measure_latency(payload, model, tokenizer, num_beams=5)
beam_stats[0]

'P95 latency (ms) - 681.1002442500694; Average latency (ms) - 674.37 +\\- 3.89;'

**Greedy Search**
```
GPU: 'P95 latency (ms) - 464.1055683999639; Average latency (ms) - 455.84 +\\- 5.26;'
TorchDynamo: 
```

**Beam Search (5)**

```
GPU: 'P95 latency (ms) - 681.1002442500694; Average latency (ms) - 674.37 +\\- 3.89;'
TorchDynamo: 
```


## TorchDynamo

In [19]:
# List the backends reported by torchdynamo.
# Requires the `torchdynamo` package to be importable in this kernel; the
# recorded run of this cell failed with ModuleNotFoundError because it was not.
import torchdynamo

torchdynamo.list_backends()

ModuleNotFoundError: No module named 'torchdynamo'

Select the backend and build the optimizer.

In [None]:
from torchdynamo.optimizations import backends

# Build the TorchDynamo optimizer around the `fx2trt_compiler_fp16` backend.
# A stray bare identifier (`dynamo_fx2trt_fp32`) used to sit on its own line
# here; it was never defined, so the cell would have raised NameError before
# reaching the assignment.
optimizer = torchdynamo.optimize(backends.fx2trt_compiler_fp16)

Optimize the model.

In [None]:
# `optimizer` is already the object returned by `torchdynamo.optimize(<backend>)`,
# so it is applied to the model directly. Passing it through a second
# `torchdynamo.optimize(...)` call would hand that wrapper to TorchDynamo as
# if it were a compiler backend.
optimized_mod = optimizer(model)


In [None]:
# Same benchmark as the baseline (default num_beams=1), run against the
# optimized module; display only the summary string.
dynamo_stats = measure_latency(payload, optimized_mod, tokenizer)
dynamo_stats[0]