In [2]:
!pip install diffusers accelerate

Collecting diffusers
  Downloading diffusers-0.25.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: diffusers, accelerate
Successfully installed accelerate-0.25.0 diffusers-0.25.0


In [8]:
import diffusers
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler, PNDMScheduler
from torch import autocast
import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler, DPMSolverMultistepScheduler
import numpy as np
import time
import tqdm
from PIL import Image

In [9]:
# Autoencoder for converting images to latent space and latent space to images
# This makes the images 8 times smaller, turining (3, 512, 512) into (3, 64, 64)
# This is what makes Stable diffusion (and other latent diffusion models) so efficent.
vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")

# Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

# the UNet model for generating the latents.
# This consists of an encoder that downsamples the latent space, and then reverses the process.
# The model contains short cut connections to avoid loss of detail, and uses cross attention to include the text embeddigns
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
#scheduler = DPMSolverMultistepScheduler()

# Allow cuda, but dont force it.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
vae.to(torch_device)
text_encoder.to(torch_device)
unet.to(torch_device)

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

UNet2DConditionModel(
  (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=320, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (down_blocks): ModuleList(
    (0): CrossAttnDownBlock2D(
      (attentions): ModuleList(
        (0-1): 2 x Transformer2DModel(
          (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
          (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
          (transformer_blocks): ModuleList(
            (0): BasicTransformerBlock(
              (attn1): CrossAttention(
                (to_q): Linear(in_features=320, out_features=320, bias=False)
                (to_k): Linear(in_features=320, out_features=320, bias=False)
                (to_v): Linear(in_features=320, out_features=320, bias=False)
                (to_out): ModuleList(
          

In [10]:
# Added code start
def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    """helper function to spherically interpolate two arrays v1 v2"""

    if not isinstance(v0, np.ndarray):
        inputs_are_torch = True
        input_device = v0.device
        v0 = v0.cpu().numpy()
        v1 = v1.cpu().numpy()

    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
    if np.abs(dot) > DOT_THRESHOLD:
        v2 = (1 - t) * v0 + t * v1
    else:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)
        theta_t = theta_0 * t
        sin_theta_t = np.sin(theta_t)
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1

    if inputs_are_torch:
        v2 = torch.from_numpy(v2).to(input_device)

    return v2
# Added code end

In [11]:
def generate(startprompts, endprompts, interp=0, seed1=time.time(), seed2=time.time(), steps=20, width=512, height=512, guidance_scale=7.5):
    batch_size = len(startprompts)
    generator1 = torch.manual_seed(seed1)
    generator2 = torch.manual_seed(seed2)

    # Convert the text into embedings, just a vector representing the texts meaning
    start_text_input = tokenizer(startprompts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    start_text_embeddings = text_encoder(start_text_input.input_ids.to(torch_device))[0]

    # Convert the text into embedings, just a vector representing the texts meaning
    end_text_input = tokenizer(endprompts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    end_text_embeddings = text_encoder(end_text_input.input_ids.to(torch_device))[0]

    # Generate dummy text embeddings for classifier-free guidance
    max_length = start_text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
                )
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

    text_embeddings = torch.cat([uncond_embeddings, start_text_embeddings, end_text_embeddings])
    text_embeddings.to(torch_device)

    scheduler.set_timesteps(steps)
    timesteps = list(scheduler.timesteps)

    latents1 = torch.randn(
        (batch_size, unet.in_channels, height // 8, width // 8),
        generator=generator1,
    )
    # latents1 = latents1 * scheduler.init_noise_sigma

    latents2 = torch.randn(
        (batch_size, unet.in_channels, height // 8, width // 8),
        generator=generator2,
    )
    # latents2 = latents2 * scheduler.init_noise_sigma

    latents = slerp(interp, latents1, latents2)


    latents = latents.to(torch_device)

    #.....

    def decode_latents(latents, vae):
        """
        Utility function to covert latent space
        """
        latents = 1 / 0.18215 * latents
        with torch.no_grad():
            image = vae.decode(latents).sample

        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
        images = (image * 255).round().astype("uint8")
        return [Image.fromarray(image) for image in images]

    #This is the diffusion process
    for i,t in enumerate(tqdm.tqdm(timesteps)):
        # Expand the latents, 3 times becuase we now have 3 embeddings
        latent_model_input = torch.cat([latents] * 3)

        latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)

        # predict the noise residual
        with torch.no_grad():
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

        # Split the results to extract noise predictions for every embedding
        noise_pred_uncond, noise_pred_start, noise_pred_end = noise_pred.chunk(3)

        # Interpolate the predicitons before performing classifyer free guidance
        start_guidance = (noise_pred_start - noise_pred_uncond) * (1-interp)
        end_guidance = (noise_pred_end - noise_pred_uncond) * (interp)

        noise_pred = noise_pred_uncond + (start_guidance + end_guidance) * guidance_scale

        # compute the previous less noisy sample.
        latents = scheduler.step(noise_pred, t, latents).prev_sample

    return decode_latents(latents, vae)

In [12]:
prompts =  [
    (43,"A man walking alone"),
    (1334,"A woman"),
]

interpolated_images = 10
steps = 10

# Keep a counter of generated images
frame_counter = 0
import os, shutil
shutil.rmtree('img')
os.mkdir('img')
# For every prompt pair...
for prompt in range(len(prompts) - 1):
    start = prompts[prompt]
    end = prompts[prompt + 1]
    print(start, end)
    # Generate interpolation frames
    for i in range(interpolated_images):
        interp = i/interpolated_images
        generate([start[1]], [end[1]], seed1=start[0], seed2=end[0], interp=interp, steps=steps)[0].save(f"img/out_{frame_counter}.png")
        frame_counter = frame_counter + 1

print(f"Generated {frame_counter} images!")

(43, 'A man walking alone') (1334, 'A woman')


100%|██████████| 10/10 [00:06<00:00,  1.45it/s]
100%|██████████| 10/10 [00:06<00:00,  1.47it/s]
100%|██████████| 10/10 [00:06<00:00,  1.47it/s]
100%|██████████| 10/10 [00:06<00:00,  1.45it/s]
100%|██████████| 10/10 [00:06<00:00,  1.45it/s]
100%|██████████| 10/10 [00:06<00:00,  1.44it/s]
100%|██████████| 10/10 [00:06<00:00,  1.44it/s]
100%|██████████| 10/10 [00:07<00:00,  1.42it/s]
100%|██████████| 10/10 [00:07<00:00,  1.41it/s]
100%|██████████| 10/10 [00:07<00:00,  1.40it/s]


Generated 10 images!


In [38]:
import cv2
import os
import imageio

image_folder = './img'
video_name = 'my_outout.mp4'

images = [img for img in os.listdir(image_folder) if img.startswith("out_") and img.endswith(".png")]
# images.sort()  # Ensure images are in the correct order
images = sorted(images, key=lambda x: int(x.split('_')[1].split('.')[0]))
print(images)

# frames = []
# for image in images:
#     frames.append(cv2.imread(os.path.join(image_folder, image)))

# imageio.mimsave(gif_name, frames, format='MP4', duration=50)  # Adjust duration as needed

frame = cv2.imread(os.path.join(image_folder, images[0]))
height, width, layers = frame.shape

video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), 3, (width, height))

for image in images:
    video.write(cv2.imread(os.path.join(image_folder, image)))

cv2.destroyAllWindows()
video.release()

['out_0.png', 'out_1.png', 'out_2.png', 'out_3.png', 'out_4.png', 'out_5.png', 'out_6.png', 'out_7.png', 'out_8.png', 'out_9.png', 'out_10.png', 'out_11.png', 'out_12.png', 'out_13.png', 'out_14.png', 'out_15.png', 'out_16.png', 'out_17.png', 'out_18.png', 'out_19.png', 'out_20.png', 'out_21.png', 'out_22.png', 'out_23.png', 'out_24.png', 'out_25.png', 'out_26.png', 'out_27.png', 'out_28.png', 'out_29.png', 'out_30.png', 'out_31.png', 'out_32.png', 'out_33.png', 'out_34.png', 'out_35.png', 'out_36.png', 'out_37.png', 'out_38.png', 'out_39.png']


In [1]:
pip install stable_diffusion_videos



In [2]:
from stable_diffusion_videos import StableDiffusionWalkPipeline
import torch



In [3]:
pipeline = StableDiffusionWalkPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float16,
).to("cuda")

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

safety_checker/pytorch_model.fp16.safetensors not found


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

pytorch_model.fp16.bin:   0%|          | 0.00/608M [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

safety_checker/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/608M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

(…)kpoints/scheduler_config-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/492M [00:00<?, ?B/s]

pytorch_model.fp16.bin:   0%|          | 0.00/246M [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.bin:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.bin:   0%|          | 0.00/1.72G [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/1.72G [00:00<?, ?B/s]

diffusion_pytorch_model.non_ema.bin:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

(…)fusion_pytorch_model.non_ema.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.bin:   0%|          | 0.00/335M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
The config attributes {'scaling_factor': 0.18215} were passed to AutoencoderKL, but are not expected and will be ignored. Please verify your config.json configuration file.


In [5]:
video_path = pipeline.walk(
    prompts=['a cat', 'a tiger'],
    seeds=[42, 1337],
    num_interpolation_steps=10,
    height=512,  # use multiples of 64 if > 512. Multiples of 8 if < 512.
    width=512,   # use multiples of 64 if > 512. Multiples of 8 if < 512.
    output_dir='dreams',        # Where images/videos will be saved
    name='animals_test',        # Subdirectory of output_dir where images/videos will be saved
    guidance_scale=8.5,         # Higher adheres to prompt more, lower lets model take the wheel
    num_inference_steps=20,     # Number of diffusion steps per image generated. 50 is good default
)

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]