In [1]:
!pip install --upgrade diffusers[torch]
!pip install "optimum[onnxruntime, openvino]"

Collecting diffusers[torch]
  Downloading diffusers-0.20.1-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.13.2 (from diffusers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1 (from diffusers[torch])
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.11.0 (from diffusers[torch])
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, hugg

In [4]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; the upload cells further down push folders to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Placeholder git identity: replace "your_email" and "your_user_name" with real values before running.
!git config --global user.email "your_email"
!git config --global user.name "your_user_name"

## ONNX code
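The script below exports each pipeline component (text encoder, UNet, VAE encoder/decoder and, if present, the safety checker) to ONNX step by step. Read it if you want to understand the export process in detail; the "Optimum Code" section further down performs the export with a single call.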

In [None]:
%%writefile export_sd_onnx.py
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import shutil
from pathlib import Path

import onnx
import torch

from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline, StableDiffusionPipeline
from diffusers.models.attention_processor import AttnProcessor


def onnx_export(
    model,
    model_args: tuple,
    save_dir: Path,
    ordered_input_names,
    output_names,
    dynamic_axes,
    opset,
):
    """Trace ``model`` with ``model_args`` and write it to disk as an ONNX file.

    Args:
        model: The module handed to ``torch.onnx.export``.
        model_args: Example inputs used for the export.
        save_dir: Despite its name, the path of the ``.onnx`` file to write; its
            parent directory is created when missing.
        ordered_input_names: Names given to the graph inputs, in positional order.
        output_names: Names given to the graph outputs.
        dynamic_axes: Mapping of input/output name to the axes that stay dynamic.
        opset: ONNX opset version to target.
    """
    target_file = save_dir
    # make sure the destination folder exists before the exporter opens the file
    target_file.parent.mkdir(parents=True, exist_ok=True)

    export_options = {
        "f": target_file.as_posix(),
        "input_names": ordered_input_names,
        "output_names": output_names,
        "dynamic_axes": dynamic_axes,
        "do_constant_folding": True,
        "opset_version": opset,
    }
    torch.onnx.export(model, model_args, **export_options)



@torch.no_grad()
def export_pipeline_onnx(
    model_path: str,
    save_dir: str,
    opset: int,
    fp16: bool = False
):
    """Export every component of a Stable Diffusion pipeline to ONNX and save the result.

    The text encoder, UNet, VAE encoder, VAE decoder and (when the pipeline has
    one) the safety checker are exported one after another to
    ``<save_dir>/<component>/model.onnx``. The exported parts are then assembled
    into an ``OnnxStableDiffusionPipeline``, saved to ``save_dir`` and loaded back
    once with the CPU execution provider as a sanity check.

    Args:
        model_path: Identifier passed to ``StableDiffusionPipeline.from_pretrained``.
        save_dir: Output directory for the ONNX pipeline.
        opset: ONNX opset version used for every export.
        fp16: Export in ``float16`` on CUDA instead of ``float32`` on CPU.

    Raises:
        ValueError: If ``fp16`` is requested but CUDA is not available.
    """
    DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder"
    DIFFUSION_MODEL_UNET_SUBFOLDER = "unet"
    DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER = "vae_encoder"
    DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER = "vae_decoder"
    DIFFUSION_MODEL_SAFETY_CHECKER_SUBFOLDER = "safety_checker"
    WEIGHT_ONNX = "model.onnx"

    # float16 is only exported on CUDA; float32 always runs on CPU
    if fp16 and not torch.cuda.is_available():
        raise ValueError("`float16` model export is only supported on GPUs with CUDA")
    dtype = torch.float16 if fp16 else torch.float32
    device = "cuda" if fp16 else "cpu"
    pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype).to(device)
    save_dir = Path(save_dir)

    # TEXT ENCODER
    num_tokens = pipeline.text_encoder.config.max_position_embeddings
    text_hidden_size = pipeline.text_encoder.config.hidden_size
    text_input = pipeline.tokenizer(
        "A sample prompt",
        padding="max_length",
        max_length=pipeline.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    onnx_export(
        pipeline.text_encoder,
        # casting to torch.int32 until the CLIP fix is released: https://github.com/huggingface/transformers/pull/18515/files
        # the trailing comma makes this a real one-element tuple, as `onnx_export` declares
        model_args=(text_input.input_ids.to(device=device, dtype=torch.int32),),
        save_dir=save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / WEIGHT_ONNX,
        ordered_input_names=["input_ids"],
        output_names=["last_hidden_state", "pooler_output"],
        dynamic_axes={
            "input_ids": {0: "batch", 1: "sequence"},
        },
        opset=opset,
    )
    # drop each torch module as soon as it has been exported
    del pipeline.text_encoder

    # UNET
    # PyTorch does not support the ONNX export of torch.nn.functional.scaled_dot_product_attention
    # https://github.com/huggingface/diffusers/issues/2878
    pipeline.unet.set_attn_processor(AttnProcessor())
    unet_in_channels = pipeline.unet.config.in_channels
    unet_sample_size = pipeline.unet.config.sample_size
    unet_path = save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / WEIGHT_ONNX
    onnx_export(
        pipeline.unet,
        model_args=(
            torch.randn(2, unet_in_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
            torch.randn(2).to(device=device, dtype=dtype),
            torch.randn(2, num_tokens, text_hidden_size).to(device=device, dtype=dtype),
            False,
        ),
        save_dir=unet_path,
        ordered_input_names=["sample", "timestep", "encoder_hidden_states", "return_dict"],
        output_names=["out_sample"],  # has to be different from "sample" for correct tracing
        dynamic_axes={
            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
            "timestep": {0: "batch"},
            "encoder_hidden_states": {0: "batch", 1: "sequence"},
        },
        opset=opset,
        # use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
    )
    unet_model_path = str(unet_path.absolute().as_posix())
    unet_dir = os.path.dirname(unet_model_path)
    # load the exported graph into memory before its folder is wiped
    unet = onnx.load(unet_model_path)
    # clean up existing tensor files
    shutil.rmtree(unet_dir)
    os.mkdir(unet_dir)
    # collate external tensor files into one
    onnx.save_model(
        unet,
        unet_model_path,
        save_as_external_data=True,
        all_tensors_to_one_file=True,
        location="weights.pb",
        convert_attribute=False,
    )
    # the in-memory ONNX graph is not used again; release it before the next exports
    del unet
    del pipeline.unet

    # VAE ENCODER
    pipeline.vae.set_attn_processor(AttnProcessor())
    vae_encoder = pipeline.vae
    vae_in_channels = vae_encoder.config.in_channels
    vae_sample_size = vae_encoder.config.sample_size
    # need to get the raw tensor output (sample) from the encoder
    vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
    onnx_export(
        vae_encoder,
        model_args=(
            torch.randn(1, vae_in_channels, vae_sample_size, vae_sample_size).to(device=device, dtype=dtype),
            False,
        ),
        save_dir=save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / WEIGHT_ONNX,
        ordered_input_names=["sample", "return_dict"],
        output_names=["latent_sample"],
        dynamic_axes={
            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
        },
        opset=opset,
    )

    # VAE DECODER
    # `vae_decoder` is the same `pipeline.vae` object as `vae_encoder`; only `forward` is swapped
    pipeline.vae.set_attn_processor(AttnProcessor())
    vae_decoder = pipeline.vae
    vae_latent_channels = vae_decoder.config.latent_channels
    vae_out_channels = vae_decoder.config.out_channels
    # forward only through the decoder part
    vae_decoder.forward = vae_encoder.decode
    onnx_export(
        vae_decoder,
        model_args=(
            torch.randn(1, vae_latent_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
            False,
        ),
        save_dir=save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / WEIGHT_ONNX,
        ordered_input_names=["latent_sample", "return_dict"],
        output_names=["sample"],
        dynamic_axes={
            "latent_sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
        },
        opset=opset,
    )
    del pipeline.vae

    # SAFETY CHECKER
    if pipeline.safety_checker is not None:
        safety_checker = pipeline.safety_checker
        clip_num_channels = safety_checker.config.vision_config.num_channels
        clip_image_size = safety_checker.config.vision_config.image_size
        safety_checker.forward = safety_checker.forward_onnx
        onnx_export(
            pipeline.safety_checker,
            model_args=(
                torch.randn(
                    1,
                    clip_num_channels,
                    clip_image_size,
                    clip_image_size,
                ).to(device=device, dtype=dtype),
                torch.randn(1, vae_sample_size, vae_sample_size, vae_out_channels).to(device=device, dtype=dtype),
            ),
            save_dir=save_dir / DIFFUSION_MODEL_SAFETY_CHECKER_SUBFOLDER / WEIGHT_ONNX,
            ordered_input_names=["clip_input", "images"],
            output_names=["out_images", "has_nsfw_concepts"],
            dynamic_axes={
                "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
                "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
            },
            opset=opset,
        )
        del pipeline.safety_checker
        # from here on `safety_checker` refers to the exported ONNX model, not the torch module
        safety_checker = OnnxRuntimeModel.from_pretrained(save_dir / DIFFUSION_MODEL_SAFETY_CHECKER_SUBFOLDER)
        feature_extractor = pipeline.feature_extractor
    else:
        safety_checker = None
        feature_extractor = None

    # assemble the ONNX pipeline from the files written above
    onnx_pipeline = OnnxStableDiffusionPipeline(
        vae_encoder=OnnxRuntimeModel.from_pretrained(save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER),
        vae_decoder=OnnxRuntimeModel.from_pretrained(save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER),
        text_encoder=OnnxRuntimeModel.from_pretrained(save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER),
        tokenizer=pipeline.tokenizer,
        unet=OnnxRuntimeModel.from_pretrained(save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER),
        scheduler=pipeline.scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
        requires_safety_checker=safety_checker is not None,
    )

    onnx_pipeline.save_pretrained(save_dir)
    print("ONNX pipeline saved to", save_dir)

    del pipeline
    del onnx_pipeline
    # reload once from disk to verify that the saved pipeline can be opened
    _ = OnnxStableDiffusionPipeline.from_pretrained(save_dir, provider="CPUExecutionProvider")
    print("ONNX pipeline is loadable")


if __name__ == "__main__":
    # Command-line entry point: declare the options in a table, parse them,
    # and forward the values to `export_pipeline_onnx`.
    argument_specs = (
        (
            "--model_path",
            {
                "type": str,
                "required": True,
                "help": "Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
            },
        ),
        (
            "--save_dir",
            {"type": str, "required": True, "help": "Path to the output model."},
        ),
        (
            "--opset",
            {"default": 15, "type": int, "help": "The version of the ONNX operator set to use."},
        ),
        (
            "--fp16",
            {"action": "store_true", "default": False, "help": "Export the models in `float16` mode"},
        ),
    )

    cli = argparse.ArgumentParser()
    for flag, spec in argument_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    export_pipeline_onnx(options.model_path, options.save_dir, options.opset, options.fp16)

Writing export_sd_onnx.py


In [None]:
# Run the script written above to export the float32 pipeline into ./onnx.
# The last line is a shell comment, so the --fp16 flag is not passed here.
!python export_sd_onnx.py \
  --model_path="Zero-nnkn/stable-diffusion-2-pokemon" \
  --opset=15 \
  --save_dir="onnx" \
  # --fp16

Downloading (…)ain/model_index.json: 100% 598/598 [00:00<00:00, 3.07MB/s]
Fetching 13 files:   0% 0/13 [00:00<?, ?it/s]
Downloading (…)cial_tokens_map.json: 100% 460/460 [00:00<00:00, 2.76MB/s]

Downloading (…)tokenizer/vocab.json:   0% 0.00/1.06M [00:00<?, ?B/s][A

Downloading (…)rocessor_config.json: 100% 518/518 [00:00<00:00, 4.15MB/s]
Fetching 13 files:   8% 1/13 [00:01<00:14,  1.19s/it]

Downloading (…)cheduler_config.json: 100% 374/374 [00:00<00:00, 3.16MB/s]


Downloading (…)_encoder/config.json: 100% 615/615 [00:00<00:00, 5.13MB/s]


Downloading (…)okenizer_config.json: 100% 737/737 [00:00<00:00, 5.22MB/s]

Downloading (…)tokenizer/vocab.json: 100% 1.06M/1.06M [00:00<00:00, 2.13MB/s]

Downloading (…)d54/unet/config.json: 100% 1.81k/1.81k [00:00<00:00, 6.64MB/s]

Downloading (…)9d54/vae/config.json: 100% 661/661 [00:00<00:00, 5.26MB/s]

Downloading (…)tokenizer/merges.txt:   0% 0.00/525k [00:00<?, ?B/s][A

Downloading model.safetensors:   0% 0.00/681M [00:00<?, ?B/s][A[A
Dow

In [None]:
# Same export in float16, written to ./onnx16.
# The script raises ValueError for --fp16 when CUDA is not available.
!python export_sd_onnx.py \
  --model_path="Zero-nnkn/stable-diffusion-2-pokemon" \
  --opset=15 \
  --save_dir="onnx16" \
  --fp16

## Optimum Code

### ONNX32

In [None]:
from optimum.onnxruntime import ORTStableDiffusionPipeline

# Load the PyTorch checkpoint from the Hub and convert it to ONNX while loading (export=True)
pipeline = ORTStableDiffusionPipeline.from_pretrained(
    "Zero-nnkn/stable-diffusion-2-pokemon",
    export=True,
)

# Save the converted pipeline to ./onnx.
# NOTE: this is the same directory the export_sd_onnx.py cell above writes to with --save_dir="onnx".
pipeline.save_pretrained("./onnx")

In [None]:
from huggingface_hub import create_repo, create_branch, delete_folder, upload_folder

# Target repository, branch, and the local folder to push
repo_id = "Zero-nnkn/stable-diffusion-2-pokemon"
branch_name = "onnx"
folder = "onnx"

# Create the branch; exist_ok=True is passed so that re-running the cell does not fail on an existing branch
create_branch(repo_id, branch=branch_name, exist_ok=True)
# To start from scratch, delete the branch first (needs `from huggingface_hub import delete_branch`):
# delete_branch(repo_id=repo_id, branch=branch_name)

# Push the local folder to the branch in one commit.
# NOTE(review): delete_patterns="*.*" presumably removes matching remote files in the same commit; confirm against the upload_folder docs.
upload_folder(
    repo_id=repo_id,
    revision=branch_name,
    folder_path=folder,
    commit_message="Upload ONNX",
    delete_patterns="*.*",
)

model.onnx:   0%|          | 0.00/1.95M [00:00<?, ?B/s]

'https://huggingface.co/Zero-nnkn/stable-diffusion-2-pokemon/tree/onnx/'

### ONNX8

In [None]:
from optimum.onnxruntime import ORTStableDiffusionPipeline

# Load the already exported ONNX pipeline from the "onnx" revision of the Hub repo
# (no export=True here, so nothing is converted from PyTorch)
pipeline = ORTStableDiffusionPipeline.from_pretrained(
    "Zero-nnkn/stable-diffusion-2-pokemon",
    revision="onnx",
)

# Save a local copy to ./onnx; the quantization cell below reads from this folder
pipeline.save_pretrained("./onnx")

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
import shutil
from pathlib import Path

from onnxruntime.quantization import quantize_dynamic, QuantType


def quantize_sd_onnx(model_dir, save_dir):
    """Copy an exported ONNX pipeline and dynamically quantize its sub-models to uint8.

    The whole ``model_dir`` tree is first copied to ``save_dir``, so every file
    that is not a model is carried over unchanged. Each direct sub-folder of the
    copy that contains a ``model.onnx`` is then quantized with
    ``onnxruntime.quantization.quantize_dynamic``, overwriting that file in place.

    Args:
        model_dir: Directory containing the exported pipeline.
        save_dir: Destination directory. It may already exist (for example when
            the cell is re-run); its files are overwritten by the fresh copy.

    Returns:
        None.
    """
    import inspect

    # dirs_exist_ok=True keeps the cell re-runnable instead of raising FileExistsError
    shutil.copytree(model_dir, save_dir, dirs_exist_ok=True)

    model_paths = sorted(
        subdir / "model.onnx"
        for subdir in Path(save_dir).glob('*')
        if subdir.is_dir() and (subdir / "model.onnx").is_file()
    )
    if not model_paths:
        # nothing to quantize; the plain copy is the result
        return

    quantize_options = {
        "per_channel": True,
        "reduce_range": True,
        "weight_type": QuantType.QUInt8,
    }
    # NOTE(review): `optimize_model` appears to have been removed from newer
    # onnxruntime releases - confirm. It is only passed when the installed
    # `quantize_dynamic` still accepts it, so older versions behave as before.
    if "optimize_model" in inspect.signature(quantize_dynamic).parameters:
        quantize_options["optimize_model"] = True

    for onnx_submodel_path in model_paths:
        external_data_path = onnx_submodel_path.with_name("model.onnx_data")
        # check before quantizing: the side file belongs to the unquantized model
        has_external_data = external_data_path.is_file()

        quantize_dynamic(
            model_input=onnx_submodel_path,
            model_output=onnx_submodel_path,
            **quantize_options,
        )
        print(f"Quantized model saved at: {onnx_submodel_path}")

        if has_external_data:
            # remove the external-data file that was copied along with the original model
            external_data_path.unlink()

In [None]:
# Quantize the local ./onnx pipeline into a new ./onnx-u8 folder
quantize_sd_onnx(model_dir='onnx', save_dir='onnx-u8')

Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.0/transformer_blocks.0/attn1/MatMul]
Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.0/transformer_blocks.0/attn1/MatMul_1]
Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.0/transformer_blocks.0/attn2/MatMul]
Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.0/transformer_blocks.0/attn2/MatMul_1]
Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.1/transformer_blocks.0/attn1/MatMul]
Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.1/transformer_blocks.0/attn1/MatMul_1]
Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.1/transformer_blocks.0/attn2/MatMul]
Ignore MatMul due to non constant B: /[/down_blocks.0/attentions.1/transformer_blocks.0/attn2/MatMul_1]
Ignore MatMul due to non constant B: /[/down_blocks.1/attentions.0/transformer_blocks.0/attn1/MatMul]
Ignore MatMul due to non constant B: /[/down_blocks.1/attentions.0/transfo

In [None]:
from huggingface_hub import create_repo, create_branch, delete_folder, upload_folder

# Target repository, branch, and the local folder holding the quantized pipeline
repo_id = "Zero-nnkn/stable-diffusion-2-pokemon"
branch_name = "onnx-u8"
folder = "onnx-u8"

# Create the branch; exist_ok=True is passed so that re-running the cell does not fail on an existing branch
create_branch(repo_id, branch=branch_name, exist_ok=True)
# To start from scratch, delete the branch first (needs `from huggingface_hub import delete_branch`):
# delete_branch(repo_id=repo_id, branch=branch_name)

# Push the local folder to the branch in one commit.
# NOTE(review): delete_patterns="*.*" presumably removes matching remote files in the same commit; confirm against the upload_folder docs.
upload_folder(
    repo_id=repo_id,
    revision=branch_name,
    folder_path=folder,
    commit_message="Upload ONNX uint8",
    delete_patterns="*.*",
)

'https://huggingface.co/Zero-nnkn/stable-diffusion-2-pokemon/tree/onnx-u8/'

### OpenVINO32

In [2]:
from optimum.intel import OVStableDiffusionPipeline

# Load the PyTorch checkpoint from the Hub and convert it to OpenVINO while loading (export=True)
pipeline = OVStableDiffusionPipeline.from_pretrained(
    "Zero-nnkn/stable-diffusion-2-pokemon",
    export=True
)

# Save the converted pipeline to ./openvino
pipeline.save_pretrained("./openvino")

Downloading (…)ain/model_index.json:   0%|          | 0.00/598 [00:00<?, ?B/s]

Framework not specified. Using pt to export to ONNX.


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/374 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)_encoder/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)d54/unet/config.json:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading (…)9d54/vae/config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

Keyword arguments {'subfolder': '', 'trust_remote_code': False} are not expected by StableDiffusionPipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

Using framework PyTorch: 2.0.1+cu118
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):


verbose: False, log level: Level.ERROR



Using framework PyTorch: 2.0.1+cu118
  if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
  assert hidden_states.shape[1] == self.channels
  assert hidden_states.shape[1] == self.channels
  assert hidden_states.shape[1] == self.channels
  if hidden_states.shape[0] >= 64:
  if not return_dict:
Saving external data to one file...


verbose: False, log level: Level.ERROR



Using framework PyTorch: 2.0.1+cu118
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


verbose: False, log level: Level.ERROR



Using framework PyTorch: 2.0.1+cu118


verbose: False, log level: Level.ERROR



Compiling the vae_decoder...
Compiling the unet...
Compiling the text_encoder...
Compiling the vae_encoder...


In [6]:
from huggingface_hub import create_repo, create_branch, delete_folder, upload_folder

# Target repository, branch, and the local folder holding the OpenVINO pipeline
repo_id = "Zero-nnkn/stable-diffusion-2-pokemon"
branch_name = "openvino"
folder = "openvino"

# Create the branch; exist_ok=True is passed so that re-running the cell does not fail on an existing branch
create_branch(repo_id, branch=branch_name, exist_ok=True)
# To start from scratch, delete the branch first (needs `from huggingface_hub import delete_branch`):
# delete_branch(repo_id=repo_id, branch=branch_name)

# Push the local folder to the branch in one commit.
# NOTE(review): delete_patterns="*.*" presumably removes matching remote files in the same commit; confirm against the upload_folder docs.
upload_folder(
    repo_id=repo_id,
    revision=branch_name,
    folder_path=folder,
    commit_message="Upload OpenVINO",
    delete_patterns="*.*",
)

openvino_model.bin:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

openvino_model.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/137M [00:00<?, ?B/s]

'https://huggingface.co/Zero-nnkn/stable-diffusion-2-pokemon/tree/openvino/'