In [9]:
import diffusers
import torch
from diffusers import DiffusionPipeline
from diffusers.pipelines.audioldm2.modeling_audioldm2 import AudioLDM2UNet2DConditionModel
from src.hooked_model.hooked_model_audioldm2 import HookedAudioLDM2Model
import numpy as np
import src.hooked_model.scheduler
from src.hooked_model.utils import get_timesteps
from IPython.display import clear_output

## How to use the hooked model interface with a model from diffusers

In [None]:
# Load the full AudioLDM2 pipeline in float16 and move it to the GPU.
model_name = "cvssp/audioldm2-large"
pipe = DiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float16, use_safetensors=True)
pipe = pipe.to("cuda")

# Load the UNet denoiser on its own from the checkpoint's "unet" subfolder.
model = AudioLDM2UNet2DConditionModel.from_pretrained(
    model_name, subfolder="unet", torch_dtype=torch.float16, use_safetensors=True
)
model = model.to("cuda")

# Plug the separately loaded UNet and the project's DDIMScheduler (built from
# the pipeline's own scheduler config) into the pipeline.
pipe.unet = model
scheduler = src.hooked_model.scheduler.DDIMScheduler.from_config(pipe.scheduler.config)
pipe.scheduler = scheduler

# Wrap everything in the hooked-model interface.
hooked_model = HookedAudioLDM2Model(
    model=model,
    scheduler=scheduler,
    vae=pipe.vae,
    pipeline=pipe,
    encode_prompt=pipe.encode_prompt,
    get_timesteps=get_timesteps,
)
clear_output()

You need to provide:
- denoiser model - either UNet- or Transformer-based; it is assumed to predict noise
- scheduler - it must have certain fields and implement the `scale_model_input()` and `step()` methods
- encode_prompt - a function that encodes the prompt into embeddings
- get_timesteps - a function that returns the discrete timesteps for the diffusion process, given the number of inference steps
- vae - the VAE model used for latent-space encoding/decoding, if a latent-space model is used

In [5]:
# Sampling settings for the hooked-model run.
seed = 22
num_inference_steps = 100
guidance_scale = 5.0
audio_length_in_s = 9
num_waveforms_per_prompt = 1

# In this cell the prompt and negative prompt are single-element lists.
prompt = [
    'Male man singing a song. The low quality recording features a ballad song that contains sustained strings, mellow piano melody and soft male vocal singing over it. It sounds sad and soulful, like something you would hear at Sunday services. Male man voice.'
]
negative_prompt = ["Low quality, average quality."]

# Generate with the hooked model, using a CUDA generator seeded with `seed`.
out = hooked_model(
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
    audio_length_in_s=audio_length_in_s,
    num_waveforms_per_prompt=num_waveforms_per_prompt,
    generator=torch.Generator(device="cuda").manual_seed(seed),
)
clear_output()

In [None]:
# Switch the pipeline to the stock diffusers DDIMScheduler, built from the
# config of whatever scheduler the pipeline currently holds.
# NOTE(review): this reassigns `pipe.scheduler` on the same `pipe` object that
# was passed to `hooked_model` - confirm later hooked runs do not rely on it.
current_scheduler_config = pipe.scheduler.config
pipe.scheduler = diffusers.DDIMScheduler.from_config(current_scheduler_config)

# Run the plain pipeline with the same settings and seed as the hooked run,
# then keep the first element of what it returns.
pipeline_output = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
    audio_length_in_s=audio_length_in_s,
    num_waveforms_per_prompt=num_waveforms_per_prompt,
    generator=torch.Generator(device="cuda").manual_seed(seed),
)
out2 = pipeline_output[0]
clear_output()

In [None]:
# Compare the first entry of the hooked-model output with the first entry of
# the reference pipeline output, element-wise within NumPy's default
# tolerances. `np.allclose` is the built-in form of `np.all(np.isclose(...))`.
np.allclose(out[0], out2[0])

## How to gather activations at specific positions in the model

All you need to do is pass a list of the positions you want to cache via the `positions_to_cache` argument.

In [None]:
# Sampling settings for the cached run.
seed = 22
num_inference_steps = 100
guidance_scale = 5.0
audio_length_in_s = 9
num_waveforms_per_prompt = 1

# In this cell the prompt and negative prompt are plain strings.
prompt = (
    'Male man singing a song. The low quality recording features a ballad song that contains sustained strings, mellow piano melody and soft male vocal singing over it. It sounds sad and soulful, like something you would hear at Sunday services. Male man voice.'
)
negative_prompt = "Low quality, average quality."

# Generate while caching at the listed positions; the call returns the output
# together with a dictionary of cached values.
out, cache_dict = hooked_model.run_with_cache(
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
    audio_length_in_s=audio_length_in_s,
    num_waveforms_per_prompt=num_waveforms_per_prompt,
    generator=torch.Generator(device="cuda").manual_seed(seed),
    positions_to_cache=[
        "up_blocks.1.attentions.5",
        "up_blocks.1.attentions.10",
    ],
)

In [None]:
# Print the shape of the cached 'output' entry for each of the two positions.
for position in ("up_blocks.1.attentions.5", "up_blocks.1.attentions.10"):
    print(cache_dict['output'][position].shape)