## Hardware

In [None]:
# Print the host CPU description so the latency figures below can be tied to specific hardware.
!lscpu

In [None]:
# Print NVIDIA GPU / driver status for the benchmark host.
!nvidia-smi

## 0. Setup

In [None]:
!pip install --upgrade diffusers[torch]
!pip install tomesd
!pip install "optimum[onnxruntime, openvino]"

In [None]:
import torch
import time
import contextlib
from tqdm import tqdm


class Profile(contextlib.ContextDecorator):
    """Accumulating wall-clock timer (adapted from the YOLOv5 ``Profile`` class).

    Usage: ``@Profile()`` as a decorator or ``with Profile() as p:`` as a
    context manager. The same instance may be entered repeatedly; each pass
    stores its own duration in ``dt`` and adds it to the running total ``t``.

    Attributes:
        t: Total seconds accumulated over all completed passes (starts at the
            value given to the constructor).
        dt: Seconds taken by the most recently completed pass. Only set once
            a pass has finished.
        cuda: Whether CUDA was available when the instance was created.
    """

    def __init__(self, t=0.0):
        """Create a timer whose accumulated total starts at ``t`` seconds."""
        self.t = t
        self.cuda = torch.cuda.is_available()

    def __enter__(self):
        """Record the start timestamp and return the timer itself."""
        self.start = self.time()
        return self

    def __exit__(self, type, value, traceback):
        """Store the elapsed time; exceptions from the body are not suppressed."""
        self.dt = self.time() - self.start  # delta-time
        self.t += self.dt  # accumulate dt

    def time(self):
        """Return the current timestamp in seconds.

        When CUDA is available the device is synchronized first, so queued
        GPU work is finished before the clock is read.
        """
        if self.cuda:
            torch.cuda.synchronize()
        # perf_counter is monotonic and uses the highest-resolution clock
        # available, so intervals cannot be skewed by system clock adjustments
        # the way time.time() differences can.
        return time.perf_counter()

def measure_latency(pipeline, prompt, nsteps=20, nimg=1):
    """Return the mean wall-clock time, in seconds, of one pipeline call.

    The pipeline is first called ``nimg`` times untimed as a warm-up, then
    ``nimg`` more times under a ``Profile`` timer with a tqdm progress bar.

    Args:
        pipeline: Callable invoked as ``pipeline(prompt, num_inference_steps=nsteps)``.
        prompt: Prompt forwarded unchanged to every pipeline call.
        nsteps: Value passed as ``num_inference_steps``.
        nimg: Number of warm-up calls and, separately, of timed calls.

    Returns:
        Total timed seconds divided by ``nimg``.

    Raises:
        ValueError: If ``nimg`` is smaller than 1, since no call would be
            timed and the average would be undefined.
    """
    if nimg < 1:
        raise ValueError(f"nimg must be >= 1, got {nimg}")

    timer = Profile()

    # Warm-up: keep one-off costs (lazy initialisation, caches) out of the
    # timed section.
    for _ in range(nimg):
        pipeline(prompt, num_inference_steps=nsteps)

    # Timed run: the timer accumulates across iterations in timer.t.
    for _ in tqdm(range(nimg)):
        with timer:
            pipeline(prompt, num_inference_steps=nsteps)

    return timer.t / nimg

In [None]:
import torch
from diffusers import StableDiffusionPipeline
# from datasets import load_dataset

# CONFIG = {
#     "model_id": "runwayml/stable-diffusion-v1-5",
#     "dtype": torch.float16,
#     "gen": torch.manual_seed(0),
#     "inference_steps": 25,
#     "num_images_per_prompt": 4,
#     "resolution": 512,
#     "num_parti_prompts": 100,
#     "challenge": "basic",
#     "seed": 0,
#     "tome_ratio": 0.5,
# }

# prompts = load_dataset("nateraw/parti-prompts", split="train")
# prompts = prompts.shuffle()
# prompts = [prompts[i]["Prompt"] for i in range(10)]
# Single fixed text prompt passed to every measure_latency call in this notebook.
prompt = 'Fire and Ice Dragon'

## 1. CPU Benchmark

In [None]:
# Shared settings for the CPU benchmark cells.
# Device string handed to pipeline.to(...).
CPU_DEVICE = 'cpu'
# Base value for num_inference_steps in the timing cells.
NUM_STEPS = 20
# Number of warm-up calls and of timed calls made by measure_latency.
NUM_IMAGES = 1

### Fine-tuned Pipeline (PyTorch baseline)

In [None]:
# Baseline for the other variants: load the checkpoint with the stock
# diffusers pipeline (default revision) and move it to the CPU device.
model_id = "Zero-nnkn/stable-diffusion-2-pokemon"
pipeline = StableDiffusionPipeline.from_pretrained(model_id)
pipeline.to(CPU_DEVICE)

In [None]:
# Time the currently loaded pipeline and print the mean seconds per call.
t = measure_latency(pipeline, prompt, nsteps=NUM_STEPS , nimg=NUM_IMAGES)
print(f'\n{t} (s/image)')

In [None]:
# Drop the notebook's reference to the pipeline so its memory can be reclaimed
# before the next variant is loaded.
del pipeline
# NOTE(review): the pipeline was moved to the CPU device, so this CUDA cache
# flush presumably has nothing to free here — confirm whether it is needed.
torch.cuda.empty_cache()

### Pipeline + ToMe

In [None]:
import tomesd

# Same checkpoint as the baseline cell, loaded through the stock diffusers pipeline.
model_id = "Zero-nnkn/stable-diffusion-2-pokemon"
pipeline = StableDiffusionPipeline.from_pretrained(model_id)
# Patch the pipeline with tomesd (token merging) at ratio 0.5 before moving it to the CPU device.
tomesd.apply_patch(pipeline, ratio=0.5)
pipeline.to(CPU_DEVICE)

In [None]:
# Time the ToMe-patched pipeline with the same settings as the baseline cell.
t = measure_latency(pipeline, prompt, nsteps=NUM_STEPS , nimg=NUM_IMAGES)
print(f'\n{t} (s/image)')

In [None]:
# Drop the reference to the patched pipeline before loading the next variant.
del pipeline
# NOTE(review): the pipeline was moved to the CPU device, so this CUDA cache
# flush presumably has nothing to free here — confirm whether it is needed.
torch.cuda.empty_cache()

### ONNX

In [None]:
from optimum.onnxruntime import ORTStableDiffusionPipeline

# Load the "onnx" revision of the model repository through Optimum's
# ONNX Runtime pipeline (presumably that revision holds the ONNX export).
model_id = "Zero-nnkn/stable-diffusion-2-pokemon"
revision="onnx"
pipeline = ORTStableDiffusionPipeline.from_pretrained(
    model_id,
    revision=revision
)
pipeline.to(CPU_DEVICE)

In [None]:
# Time the ONNX Runtime pipeline and print the mean seconds per call.
# NOTE(review): this passes NUM_STEPS - 1 while the PyTorch and ToMe cells pass
# NUM_STEPS — confirm the one-step difference is intentional, otherwise the
# variants are not compared on equal work.
t = measure_latency(pipeline, prompt, nsteps=NUM_STEPS - 1, nimg=NUM_IMAGES)
print(f'\n{t} (s/image)')

In [None]:
# Drop the reference to the ONNX Runtime pipeline before loading the next variant.
del pipeline
# NOTE(review): this pipeline was set to the CPU device, so this CUDA cache
# flush presumably has nothing to free here — confirm whether it is needed.
torch.cuda.empty_cache()

### ONNX UINT8

In [None]:
from optimum.onnxruntime import ORTStableDiffusionPipeline

# Load the "onnx-u8" revision of the model repository through Optimum's
# ONNX Runtime pipeline (presumably the UINT8-quantized ONNX export).
model_id = "Zero-nnkn/stable-diffusion-2-pokemon"
revision="onnx-u8"
pipeline = ORTStableDiffusionPipeline.from_pretrained(
    model_id,
    revision=revision
)

pipeline.to(CPU_DEVICE)

In [None]:
# Time the UINT8 ONNX pipeline and print the mean seconds per call.
# NOTE(review): this passes NUM_STEPS - 1 while the PyTorch and ToMe cells pass
# NUM_STEPS — confirm the one-step difference is intentional.
t = measure_latency(pipeline, prompt, nsteps=NUM_STEPS -1, nimg=NUM_IMAGES)
print(f'\n{t} (s/image)')

In [None]:
# Drop the reference to the UINT8 ONNX pipeline before loading the next variant.
del pipeline
# NOTE(review): this pipeline was set to the CPU device, so this CUDA cache
# flush presumably has nothing to free here — confirm whether it is needed.
torch.cuda.empty_cache()

### OpenVINO

In [None]:
from optimum.intel import OVStableDiffusionPipeline

# Load the "openvino" revision of the model repository through Optimum's
# OpenVINO pipeline, targeting the CPU. compile=False postpones compilation
# so the model can be reshaped first and compiled explicitly afterwards.
model_id = "Zero-nnkn/stable-diffusion-2-pokemon"
revision="openvino"
pipeline = OVStableDiffusionPipeline.from_pretrained(
    model_id,
    revision=revision,
    device='CPU',
    compile=False,
)


# Fixed input geometry used for the static reshape: one prompt, one image, 512x512.
batch_size, num_images, height, width = 1, 1, 512, 512
# Statically reshape the model
pipeline.reshape(batch_size, height, width, num_images)
# Compile the model before inference
pipeline.compile()

In [None]:
# Time the compiled OpenVINO pipeline and print the mean seconds per call.
# NOTE(review): this passes NUM_STEPS - 1 while the PyTorch and ToMe cells pass
# NUM_STEPS — confirm the one-step difference is intentional.
t = measure_latency(pipeline, prompt, nsteps=NUM_STEPS - 1, nimg=NUM_IMAGES)
print(f'\n{t} (s/image)')

In [None]:
# Drop the reference to the OpenVINO pipeline now that the last benchmark is done.
del pipeline
# NOTE(review): this pipeline was created for the CPU device, so this CUDA cache
# flush presumably has nothing to free here — confirm whether it is needed.
torch.cuda.empty_cache()