TODO: TensorRT
* https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#tensorrt-text2image-stable-diffusion-pipeline
* https://github.com/huggingface/diffusers/blob/main/examples/community/stable_diffusion_tensorrt_txt2img.py
* https://github.com/NVIDIA/TensorRT/tree/main/demo/Diffusion
* https://www.cerebrium.ai/blog/improve-stable-diffusion-inference-by-50-with-tensorrt-or-aitemplate
https://github.com/facebookincubator/AITemplate/tree/main/examples/05_stable_diffusion




In [1]:
# %pip install diffusers==0.17.1
# %pip install --upgrade transformers accelerate
# %pip install tomesd
# %pip install xformers
# %pip install torchsde

from PIL import Image
from IPython.display import display

import torch
from diffusers import DiffusionPipeline, StableDiffusionPipeline
SD1_5 = "runwayml/stable-diffusion-v1-5"
from diffusers import EulerAncestralDiscreteScheduler

import utils


# Load the checkpoint named by SD1_5; no extra loading options are passed.
pipe = StableDiffusionPipeline.from_pretrained(SD1_5)

# Replace the pipeline's scheduler with an Euler-ancestral one built from the
# configuration of the scheduler that was loaded with the checkpoint.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipe.scheduler.config
)

# Move the pipeline to the GPU; all later cells assume a CUDA device.
pipe = pipe.to("cuda")


def inference(pipe=pipe):
    """Generate one image for the fixed astronaut prompt.

    Args:
        pipe: Pipeline object to call. Defaults to the module-level ``pipe``
            as it was bound when this function was defined.

    Returns:
        The first element of the ``images`` attribute of the pipeline result.
    """
    # A new CUDA generator with a fixed seed is created on every call, so each
    # benchmark variant starts from the same random state.
    rng = torch.Generator(device="cuda").manual_seed(10)
    prompts = ["a photo of an astronaut riding a horse on mars"]

    result = pipe(
        prompts,
        height=512,
        width=512,
        num_inference_steps=20,
        generator=rng,
    )
    return result.images[0]


  from .autonotebook import tqdm as notebook_tqdm
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [2]:

# NOTE: vanilla
# Baseline measurement: time a single call of inference() with its default
# pipeline (`pipe`). utils.measure_time is a project-local context manager;
# the recorded output shows it printing the elapsed time for the given label.
with utils.measure_time("vanilla, float32"):
    inference()

100%|██████████| 20/20 [00:06<00:00,  3.26it/s]


[PERF ] execution time for codeblock 'vanilla, float32': 6.706009s


In [3]:

# NOTE: tf32
# Same run as the baseline, but with TF32 allowed for CUDA matmuls.
# The flag is set inside the timed block and stays set after the block ends;
# a later cell explicitly resets it to False.
with utils.measure_time("vanilla, tf32"):
    torch.backends.cuda.matmul.allow_tf32 = True
    inference()

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:05<00:00,  3.48it/s]


[PERF ] execution time for codeblock 'vanilla, tf32': 6.074373s


In [10]:
# Undo the TF32 setting from the earlier experiment before measuring.
torch.backends.cuda.matmul.allow_tf32 = False

with utils.measure_time("vanilla, channel_last"):
    # NOTE: Channel last memory format
    # A second pipeline is built from the components of `pipe`. Construction
    # happens inside the timed block, so it is part of the reported time.
    # NOTE(review): the components are presumably shared by reference with
    # `pipe`; if so, the in-place `.to(memory_format=...)` below also changes
    # `pipe.unet` and every later "vanilla" run — confirm.
    pipe_channel_last = StableDiffusionPipeline(**pipe.components)

    pipe_channel_last.unet.to(memory_format=torch.channels_last)
    inference(pipe_channel_last)

100%|██████████| 20/20 [00:05<00:00,  3.49it/s]


[PERF ] execution time for codeblock 'vanilla, channel_last': 6.077928s


In [11]:
# Attention-processor experiment: build a pipeline from the components of
# `pipe_channel_last` (defined in an earlier cell), install AttnProcessor2_0 on
# its UNet, and time one generation.
# NOTE(review): the label says "half", but no float16 cast is visible in this
# cell or in the pipelines it derives from — confirm the intended dtype.
with utils.measure_time("half, FlashAttention?"):
    try:
        # Imported here so that an installation without this processor only
        # skips the experiment instead of breaking the whole notebook.
        from diffusers.models.attention_processor import AttnProcessor2_0

        pipe_flash = StableDiffusionPipeline(**pipe_channel_last.components)
        pipe_flash.unet.set_attn_processor(AttnProcessor2_0())  # NOTE: FlashAttention is applied by default?
        pipe_flash.unet.to(memory_format=torch.channels_last)

        with torch.no_grad():
            image = inference(pipe_flash)
    except Exception as e:
        # Still best-effort (the notebook keeps running), but report the
        # failure: otherwise the printed timing would look like a valid result
        # even though nothing was generated.
        print(f"[WARN ] FlashAttention experiment failed and was skipped: {e!r}")

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:05<00:00,  3.47it/s]


[PERF ] execution time for codeblock 'half, torch.compile': 6.110065s


In [12]:
# Token-merging experiment: build a pipeline from the components of `pipe`,
# patch it with tomesd at ratio 0.5, and time one generation.
with utils.measure_time("vanilla, token merging"):
    import tomesd  # the import statement runs inside the timed block

    pipe_tomesd = StableDiffusionPipeline(**pipe.components)
    tomesd.apply_patch(pipe_tomesd, ratio=0.5)
    # Disabled variants kept for reference. They refer to `pipe_16_tomesd`,
    # which is not defined in any visible cell, so they cannot be re-enabled as-is.
    #pipe_16_tomesd.unet.set_attn_processor(AttnProcessor2_0())
    #pipe_16_tomesd.unet.to(memory_format=torch.channels_last)
    #pipe_16_tomesd.unet = torch.compile(pipe_16_tomesd.unet, mode="max-autotune", fullgraph=True)


    inference(pipe_tomesd)

100%|██████████| 20/20 [00:05<00:00,  3.51it/s]


[PERF ] execution time for codeblock 'half, token merging': 6.174046s


Benchmark

https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335


In [7]:
import tomesd
import torch
import torch.utils.benchmark as benchmark
from diffusers import StableDiffusionPipeline


def benchmark_torch_function(f, *args, **kwargs):
    """Time ``f(*args, **kwargs)`` with ``torch.utils.benchmark``.

    Args:
        f: Callable to measure.
        *args: Positional arguments forwarded to ``f`` on every run.
        **kwargs: Keyword arguments forwarded to ``f`` on every run.

    Returns:
        The ``mean`` of the ``blocked_autorange`` measurement (requested with
        ``min_run_time=1``), rounded to two decimal places.
    """
    # The timed statement is evaluated as a string, so the callable and its
    # arguments are handed to the timer through its globals mapping.
    timer_namespace = {"f": f, "args": args, "kwargs": kwargs}
    timer = benchmark.Timer(stmt="f(*args, **kwargs)", globals=timer_namespace)
    measurement = timer.blocked_autorange(min_run_time=1)
    return round(measurement.mean, 2)


# Benchmark of the same pipeline in three configurations: unpatched, with the
# ToMe patch, and with the ToMe patch plus xformers attention.

# The scheduler class is otherwise only imported in an earlier notebook cell;
# importing it here lets this benchmark cell run on its own.
from diffusers import EulerAncestralDiscreteScheduler

# Benchmark configuration. These values are used for the run and also echoed
# in the summary printed at the end.
model_id = "runwayml/stable-diffusion-v1-5"
prompt = "a photo of an astronaut riding a horse on mars"
steps = 20
num_images_per_prompt = 1
dtype = torch.float32
resolution = 512

# Load from `model_id` (not a duplicated literal) so the printed summary always
# names the checkpoint that was actually benchmarked.
pipe_ = StableDiffusionPipeline.from_pretrained(
    model_id, torch_dtype=dtype, safety_checker=None
)
pipe_.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe_.scheduler.config)
pipe_ = pipe_.to("cuda")
pipe_.set_progress_bar_config(disable=True)


def f():
    """Run one generation with the current state of ``pipe_`` and return its images.

    ``pipe_`` is looked up at call time, so the patches applied to it between
    the benchmark runs below are picked up without redefining this function.
    """
    return pipe_(
        prompt,
        height=resolution,
        width=resolution,
        num_inference_steps=steps,
        num_images_per_prompt=num_images_per_prompt,
    ).images


# Vanilla
print("Running benchmark with vanilla pipeline...")
time_vanilla = benchmark_torch_function(f)

# With ToMe
print("Running benchmark with ToMe patched pipeline...")
tomesd.apply_patch(pipe_, ratio=0.5)
time_tome = benchmark_torch_function(f)

# With ToMe + xformers
print("Running benchmark with ToMe patched + xformers enabled pipeline...")
# The ToMe patch is removed before xformers attention is enabled and then
# applied again, keeping the original order of operations.
tomesd.remove_patch(pipe_)
pipe_.enable_xformers_memory_efficient_attention()
tomesd.apply_patch(pipe_, ratio=0.5)
time_tome_xformers = benchmark_torch_function(f)


print(
    f"Model: {model_id}, dtype: {dtype}, steps: {steps}, num_images_per_prompt: {num_images_per_prompt}, resolution: {resolution} x {resolution}"
)
print(f"Vanilla        : {time_vanilla} s")
print(f"ToMe           : {time_tome} s")
print(f"ToMe + xformers: {time_tome_xformers} s")

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Running benchmark with vanilla pipeline...
Running benchmark with ToMe patched pipeline...
Running benchmark with ToMe patched + xformers enabled pipeline...


Blocksparse is not available: the current GPU does not expose Tensor cores


Model: runwayml/stable-diffusion-v1-5, dtype: torch.float32, steps: 20, num_images_per_prompt: 1, resolution: 512 x 512
Vanilla        : 6.09 s
ToMe           : 5.38 s
ToMe + xformers: 5.35 s
