## Load

In [1]:
import os
import torch

from animatediff.pipelines.pipeline_animation_inpaint import AnimationInpaintPipeline

from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL
from animatediff.models.unet import UNet3DConditionModel

# All checkpoints are looked up under "<current working directory>/models".
models_dir = os.path.join(os.getcwd(), "models")
stable_diffusion_model_path = os.path.join(models_dir, "StableDiffusion", "ACertainThing")
motion_module_path = os.path.join(models_dir, "Motion_Module", "mm_sd_v14.ckpt")

# Settings of the motion module inserted into the inflated 2D UNet.
motion_module_kwargs = {
    "num_attention_heads": 8,
    "num_transformer_block": 1,
    "attention_block_types": ["Temporal_Self", "Temporal_Self"],
    "temporal_position_encoding": True,
    "temporal_position_encoding_max_len": 24,
    "temporal_attention_dim_div": 1
}
unet_additional_kwargs = {
    "unet_use_cross_frame_attention": False,
    "unet_use_temporal_attention": False,
    "use_motion_module": True,
    "motion_module_resolutions": [1, 2, 4, 8],
    "motion_module_mid_block": False,
    "motion_module_decoder_only": False,
    "motion_module_type": "Vanilla",
    "motion_module_kwargs": motion_module_kwargs,
}

# Text side: tokenizer plus the text encoder (moved to the GPU).
tokenizer = CLIPTokenizer.from_pretrained(stable_diffusion_model_path, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(stable_diffusion_model_path, subfolder="text_encoder")
text_encoder = text_encoder.cuda()

# Image side: VAE and the 3D UNet built from the 2D Stable Diffusion weights.
vae = AutoencoderKL.from_pretrained(stable_diffusion_model_path, subfolder="vae")
vae = vae.cuda()
unet = UNet3DConditionModel.from_pretrained_2d(
    stable_diffusion_model_path,
    subfolder="unet",
    unet_additional_kwargs=unet_additional_kwargs,
)
unet = unet.cuda()

# Load the motion-module checkpoint on top of the UNet. `strict=False` lets the
# checkpoint cover only part of the UNet; keys the UNet does not know are an error.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
motion_module_state_dict = torch.load(motion_module_path, map_location="cpu")
missing, unexpected = unet.load_state_dict(motion_module_state_dict, strict=False)
assert not unexpected
unet.enable_xformers_memory_efficient_attention()

scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="linear")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at /home/tonimono/AnimateDiff/models/StableDiffusion/ACertainThing were not used when initializing CLIPTextModel: ['text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.laye

loaded temporal unet's pretrained weights from /home/tonimono/AnimateDiff/models/StableDiffusion/ACertainThing/unet ...
### missing keys: 560; 
### unexpected keys: 0;
### Temporal Module Parameters: 417.1376 M


## Prompt

In [2]:
# `import PIL` alone does not load the `Image` submodule; importing `PIL.Image`
# still binds the name `PIL` and guarantees `PIL.Image` exists even when no
# other library has imported it first.
import PIL.Image

# Text conditioning.
prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
negative_prompt = ""

# Sampling settings.
num_inference_steps = 25  # number of scheduler timesteps
guidance_scale = 7.5      # not referenced by the cells shown in this notebook yet
width = 512               # output width in pixels
height = 512              # output height in pixels
video_length = 16         # number of frames in the latent video
seed = 1                  # value passed to torch.manual_seed

# First-frame image, loaded relative to the current working directory.
start_image = PIL.Image.open(os.path.join(os.getcwd(), "images", "start_image.png")).convert("RGB")



## Pipeline

In [3]:
from typing import List
import inspect
import numpy as np

# Factor by which prepare_latents divides pixel height/width to get latent
# height/width: 2 raised to (number of VAE block_out_channels entries - 1).
vae_scale_factor = pow(2, len(vae.config.block_out_channels) - 1)

def preprocess_image(image):
    """Convert a PIL image into a batched, normalized float tensor.

    The image is resized so that width and height are each rounded down to a
    multiple of 32, converted to float32, given a leading batch axis, moved
    from channel-last to channel-first layout and rescaled with
    ``2 * (x / 255) - 1`` (i.e. 8-bit pixel values end up in [-1, 1]).

    Args:
        image: PIL image exposing ``size`` and ``resize``. Assumes a
            multi-channel image (e.g. RGB) so the array is (H, W, C).

    Returns:
        A ``torch.Tensor`` of shape (1, C, H', W') where H' and W' are the
        rounded-down height and width.

    Raises:
        ValueError: If either side is smaller than 32 pixels, because it
            would round down to 0.
    """
    width, height = (side - side % 32 for side in image.size)
    if not (width and height):
        raise ValueError(
            f"image size {tuple(image.size)} is too small: both sides must be at least 32 pixels"
        )

    # Pillow 9.1 introduced the `Image.Resampling` enum and deprecated the
    # module-level `Image.LANCZOS` alias; fall back to the alias on older
    # Pillow versions that do not have the enum.
    resampling = getattr(PIL.Image, "Resampling", PIL.Image)
    resized = image.resize((width, height), resample=resampling.LANCZOS)

    pixels = np.array(resized).astype(np.float32) / 255.0
    batched = pixels[None].transpose(0, 3, 1, 2)  # (H, W, C) -> (1, C, H, W)
    return 2.0 * torch.from_numpy(batched) - 1.0

def encode_prompt(prompt, device, negative_prompt):
    batch_size = len(prompt) if isinstance(prompt, list) else 1
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
    if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
        removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
    if hasattr(text_encoder.config, "use_attention_mask") and text_encoder.config.use_attention_mask:
        attention_mask = text_inputs.attention_mask.to(device)
    else:
        attention_mask = None
    text_embeddings = text_encoder(
        text_input_ids.to(device),
        attention_mask=attention_mask,
    )
    text_embeddings = text_embeddings[0]
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    bs_embed, seq_len, _ = text_embeddings.shape
    text_embeddings = text_embeddings.view(bs_embed, seq_len, -1)
    # get unconditional embeddings for classifier free guidance
    uncond_tokens: List[str]
    if negative_prompt is None:
        uncond_tokens = [""] * batch_size
    elif type(prompt) is not type(negative_prompt):
        raise TypeError(
            f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
            f" {type(prompt)}."
        )
    elif isinstance(negative_prompt, str):
        uncond_tokens = [negative_prompt]
    elif batch_size != len(negative_prompt):
        raise ValueError(
            f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
            f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
            " the batch size of `prompt`."
        )
    else:
        uncond_tokens = negative_prompt
    max_length = text_input_ids.shape[-1]
    uncond_input = tokenizer(
        uncond_tokens,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    if hasattr(text_encoder.config, "use_attention_mask") and text_encoder.config.use_attention_mask:
        attention_mask = uncond_input.attention_mask.to(device)
    else:
        attention_mask = None
    uncond_embeddings = text_encoder(
        uncond_input.input_ids.to(device),
        attention_mask=attention_mask,
    )
    uncond_embeddings = uncond_embeddings[0]
    # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
    seq_len = uncond_embeddings.shape[1]
    uncond_embeddings = uncond_embeddings.view(batch_size, seq_len, -1)
    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
    return text_embeddings

def prepare_latents(batch_size, num_channels_latents, video_length, height, width, dtype, device):
    """Draw the initial noise latents for a video.

    Args:
        batch_size: Size of the first (batch) dimension.
        num_channels_latents: Size of the second (channel) dimension.
        video_length: Size of the third (frame) dimension.
        height: Pixel height; floor-divided by the notebook-level `vae_scale_factor`.
        width: Pixel width; floor-divided by the notebook-level `vae_scale_factor`.
        dtype: dtype of the sampled tensor.
        device: Device the noise is sampled on.

    Returns:
        Standard-normal noise of shape
        (batch_size, num_channels_latents, video_length, height // vae_scale_factor,
        width // vae_scale_factor), multiplied by `scheduler.init_noise_sigma`.
    """
    latent_height = height // vae_scale_factor
    latent_width = width // vae_scale_factor
    noise_shape = (batch_size, num_channels_latents, video_length, latent_height, latent_width)
    noise = torch.randn(noise_shape, device=device, dtype=dtype)
    # The scheduler defines the standard deviation the initial noise must have.
    return noise * scheduler.init_noise_sigma

def prepare_image_latents(image, timestep, batch_size, dtype, device):
    image = image.to(device=device, dtype=dtype)
    init_latent_dist = vae.encode(image).latent_dist
    init_latents = init_latent_dist.sample()
    init_latents = 0.18215 * init_latents
    init_latents = torch.cat([init_latents] * batch_size, dim=0)
    init_latents_orig = init_latents
    noise = torch.randn(init_latents.shape, device=device, dtype=dtype)
    init_latents = scheduler.add_noise(init_latents, noise, timestep)
    latents = init_latents
    return latents, init_latents_orig, noise



def prepare_extra_step_kwargs(generator, eta):
    """Build the optional keyword arguments for `scheduler.step`.

    Scheduler `step` methods do not all share one signature, so `eta` and
    `generator` are only included when the notebook-level scheduler's `step`
    declares a parameter of that name. `eta` is the η of the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Args:
        generator: Value stored under "generator" when `step` accepts it.
        eta: Value stored under "eta" when `step` accepts it.

    Returns:
        A dict containing "eta" and/or "generator", or an empty dict when
        `step` accepts neither.
    """
    step_parameters = inspect.signature(scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}

In [4]:
# Number of prompts: 1 for a single string, otherwise the length of the list.
batch_size = 1
if isinstance(prompt, list):
    batch_size = len(prompt)

device = "cuda"

# encode_prompt requires prompt and negative_prompt to have the same type,
# so both are normalized to lists here.
prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
if negative_prompt is not None:
    negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size
text_embeddings = encode_prompt(
    prompt, device, negative_prompt
)

scheduler.set_timesteps(num_inference_steps, device=device)
timesteps = scheduler.timesteps

num_channels_latents = unet.in_channels

# Seed before the first random draw: the initial latents, the VAE posterior
# sample and the image noise below are all sampled in this cell, so seeding
# only in a later cell would leave them non-reproducible.
torch.manual_seed(seed)

latents = prepare_latents(batch_size, num_channels_latents, video_length, height, width, text_embeddings.dtype, device)
print(latents.shape)
latents_dtype = latents.dtype

# Guarded so that re-running this cell does not feed the already converted
# tensor back into preprocess_image (which expects a PIL image).
if not torch.is_tensor(start_image):
    start_image = preprocess_image(start_image)

latent_timestep = timesteps[:1].repeat(batch_size)
image_latents, init_latents_orig, noise = prepare_image_latents(start_image, timesteps[0], batch_size, latents_dtype, device)
# NOTE(review): the recorded output of this cell shows image_latents with
# spatial size 96x64 while latents are 64x64 per frame; the start image is not
# resized to (width, height) anywhere in this cell - confirm this is intended.
print(image_latents.shape)

# Built from the scheduler's actual `step` signature instead of a hard-coded
# dict, so the kwargs stay valid if the scheduler is swapped.
extra_step_kwargs = prepare_extra_step_kwargs(None, 0.0)

# Duplicate the latents so the unconditional and conditional branches share one batch.
latent_model_input = torch.cat([latents] * 2)
latent_model_input = scheduler.scale_model_input(latent_model_input, timesteps[0])


    


torch.Size([1, 4, 16, 64, 64])


  image = image.resize((w, h), resample=PIL.Image.LANCZOS)


torch.Size([1, 4, 96, 64])


: 

## Inference

In [None]:
import time
from datetime import datetime

from animatediff.utils.util import save_videos_grid

# Seeds torch's global RNG; this only affects random draws made after this line.
torch.manual_seed(seed)

# Output layout: <cwd>/sample/Gradio-<timestamp>/sample/<unix time>.mp4
savedir = os.path.join(os.getcwd(), "sample", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
savedir_sample = os.path.join(savedir, "sample")
# Create the directory the file is actually written to; this also creates
# `savedir`, its parent.
os.makedirs(savedir_sample, exist_ok=True)

save_sample_path = os.path.join(savedir_sample, f"{int(time.time())}.mp4")
# NOTE(review): `sample` is not assigned in any cell visible in this notebook;
# the denoising/decoding step that produces it has to run before this line.
save_videos_grid(sample, save_sample_path)