# Introduction - Stable Diffusion Example using Snapdragon Elite X NPU

This notebook follows a Qualcomm reference guide to get Stable Diffusion running on the NPU. The original work is Qualcomm's: https://docs.qualcomm.com/bundle/publicresource/topics/80-64748-1/model_execution_windows.html

In [None]:
# Copy the required libraries and binaries to libs folder 
import shutil 
import os

# Every location below is resolved relative to the notebook's working directory.
execution_ws = os.getcwd()

# Root of the unzipped QNN SDK. Change this if the SDK was extracted somewhere
# other than "<working directory>/qnn_assets".
SDK_dir = os.path.join(execution_ws, "qnn_assets")

# os.path.join supplies the separators, so no hand-written "\\" escapes are needed.
# The trailing "" keeps a separator at the end of lib_dir so that
# `lib_dir + <file name>` still yields a valid path.
lib_dir = os.path.join(SDK_dir, "lib", "aarch64-windows-msvc", "")
binary = os.path.join(SDK_dir, "bin", "aarch64-windows-msvc", "qnn-net-run.exe")
skel = os.path.join(SDK_dir, "lib", "hexagon-v68", "unsigned", "libQnnHtpV68Skel.so")
des_dir = os.path.join(execution_ws, "qnn_assets", "QNN_binaries")

# shutil.copy writes a *file* called QNN_binaries when the destination folder
# does not exist, so make sure the folder is there first.
os.makedirs(des_dir, exist_ok=True)

# Copy necessary libraries to a common location
libs = ["QnnHtp.dll", "QnnHtpNetRunExtensions.dll", "QnnHtpPrepare.dll", "QnnHtpV68Stub.dll"]
for lib in libs:
    shutil.copy(os.path.join(lib_dir, lib), des_dir)

# Copy binary
shutil.copy(binary, des_dir)

# Copy Skel
shutil.copy(skel, des_dir)

In [None]:
import numpy as np

# Prompt text supplied by the user (any string)
user_prompt = "decorated modern country house interior, 8 k, light reflections"

# Seed value supplied by the user (must be a numpy int64)
user_seed = np.int64(1.36477711e+14)

# Step count supplied by the user: either 20 or 50
user_step = 20

# Text guidance supplied by the user: any float in [5.0, 15.0]
user_text_guidance = 7.5

# --- Validate the settings above -------------------------------------------

# user_seed
assert isinstance(user_seed, np.int64), "user_seed should be of type int64"

# user_step
assert isinstance(user_step, int), "user_step should be of type int"
assert user_step in (20, 50), "user_step should be either 20 or 50"

# user_text_guidance
assert isinstance(user_text_guidance, float), "user_text_guidance should be of type float"
assert 5.0 <= user_text_guidance <= 15.0, "user_text_guidance should be a float from [5.0, 15.0]"

In [None]:
import torch
from diffusers import UNet2DConditionModel
from diffusers.models.embeddings import get_timestep_embedding

# Load the Stable Diffusion v1.5 UNet once and keep only its time-embedding
# module, which get_time_embedding() applies to each timestep.
time_embeddings = UNet2DConditionModel.from_pretrained(
    'runwayml/stable-diffusion-v1-5',
    subfolder='unet',
    cache_dir='./cache/diffusers',
).time_embedding

def get_time_embedding(timestep):
    """Return the time embedding for a single timestep as a NumPy array.

    The timestep is wrapped in a one-element tensor, expanded with
    ``get_timestep_embedding`` and then passed through the pre-loaded
    ``time_embeddings`` module.
    """
    timestep_batch = torch.tensor([timestep])
    raw_embedding = get_timestep_embedding(
        timestep_batch,
        embedding_dim=320,
        flip_sin_to_cos=True,
        downscale_freq_shift=0,
    )

    # detach() drops the autograd graph so the result can be converted to NumPy
    return time_embeddings(raw_embedding).detach().numpy()

In [None]:
import numpy as np
from tokenizers import Tokenizer

# Length of every token sequence produced by the tokenizer (must be 77)
tokenizer_max_length = 77

# Load the pretrained tokenizer
tokenizer = Tokenizer.from_pretrained("openai/clip-vit-base-patch32")

# Truncate long prompts and pad short ones (with token id 49407) so that every
# encoding contains exactly tokenizer_max_length ids
tokenizer.enable_truncation(max_length=tokenizer_max_length)
tokenizer.enable_padding(pad_id=49407, length=tokenizer_max_length)

def run_tokenizer(prompt):
    """Encode ``prompt`` and return its token ids as a float32 NumPy array."""
    encoding = tokenizer.encode(prompt)
    # The ids are deliberately stored as float32, not as an integer dtype
    return np.array(encoding.ids, dtype=np.float32)

In [None]:
import numpy as np
import torch
from diffusers import DPMSolverMultistepScheduler

# Create the DPM-Solver multistep scheduler
scheduler = DPMSolverMultistepScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
)
# Configure it for the number of steps chosen by the user
scheduler.set_timesteps(user_step)

def run_scheduler(noise_pred_uncond, noise_pred_text, latent_in, timestep):
    """Combine the two noise predictions and advance the latent by one step.

    The three array arguments are NHWC NumPy arrays. They are converted to
    NCHW torch tensors, the noise predictions are merged using the global
    ``user_text_guidance``, and the global ``scheduler`` performs one step.

    Returns the scheduler's previous-sample output as an NHWC NumPy array.
    """
    def nhwc_to_nchw_tensor(array):
        # Copy after transposing so the tensor is built from a fresh array
        return torch.from_numpy(np.transpose(array, (0, 3, 1, 2)).copy())

    uncond = nhwc_to_nchw_tensor(noise_pred_uncond)
    text = nhwc_to_nchw_tensor(noise_pred_text)
    latent = nhwc_to_nchw_tensor(latent_in)

    # Merge the unconditional and text-conditioned predictions
    guided_noise = uncond + user_text_guidance * (text - uncond)

    # One scheduler step, then back to NumPy
    step_output = scheduler.step(guided_noise, timestep, latent)
    latent_nchw = step_output.prev_sample.numpy()

    # NCHW -> NHWC for the caller
    return np.transpose(latent_nchw, (0, 2, 3, 1)).copy()

# Look up the scheduler timestep for a given step index
def get_timestep(step):
    """Return the scheduler timestep at index ``step`` as a NumPy int32."""
    all_timesteps = scheduler.timesteps.numpy()
    return np.int32(all_timesteps[step])

In [None]:
# Tokenize the empty prompt (unconditional branch) and the user prompt
uncond_tokens = run_tokenizer("")
cond_tokens = run_tokenizer(user_prompt)

# Run Text Encoder on Tokens
uncond_text_embedding = run_text_encoder(uncond_tokens)
user_text_embedding = run_text_encoder(cond_tokens)

# Seeded random initial latent, generated channels-first (1, 4, 64, 64) and
# then rearranged to channels-last for the rest of the pipeline
random_init_latent = torch.randn((1, 4, 64, 64), generator=torch.manual_seed(user_seed)).numpy()
latent_in = random_init_latent.transpose((0, 2, 3, 1)).copy()

# Run the loop for user_step times
for step in range(user_step):
    print(f'Step {step} Running...')

    # Get timestep from step
    timestep = get_timestep(step)

    # Both U-net passes of this step use the same timestep, so the time
    # embedding is computed once and shared between them.
    step_time_embedding = get_time_embedding(timestep)

    # Run U-net for the unconditional (empty prompt) text embedding
    unconditional_noise_pred = run_unet(latent_in, step_time_embedding, uncond_text_embedding)

    # Run U-net for user text embeddings
    conditional_noise_pred = run_unet(latent_in, step_time_embedding, user_text_embedding)

    # Run Scheduler; its output becomes the latent for the next step
    latent_in = run_scheduler(unconditional_noise_pred, conditional_noise_pred, latent_in, timestep)

# Run VAE on the final latent
output_image = run_vae(latent_in)

In [None]:
from PIL import Image
from IPython.display import display

# Wrap the generated array in an RGB image and show it inline
generated_image = Image.fromarray(output_image, mode="RGB")
display(generated_image)