In [None]:
%%capture
!pip install diffusers["torch"] transformers

In [None]:
from diffusers import DDPMScheduler, UNet2DModel

# The UNet weights and the scheduler configuration are both read from this checkpoint id.
checkpoint = "google/ddpm-cat-256"

model = UNet2DModel.from_pretrained(checkpoint)
model = model.to("cuda")

scheduler = DDPMScheduler.from_pretrained(checkpoint)
# Number of timesteps to run the denoising process for.
scheduler.set_timesteps(50)



In [None]:
# Display the timesteps the scheduler holds after set_timesteps(50).
scheduler.timesteps

In [None]:
# Display the loaded UNet's configuration (sample_size is read from it in the next cell).
model.config

In [None]:
import torch

# Starting point of sampling: one 3-channel tensor of standard-normal noise
# whose height and width equal the model's configured sample size.
sample_size = model.config.sample_size
noise_shape = (1, 3, sample_size, sample_size)
noise = torch.randn(noise_shape).to("cuda")

In [None]:
# Display the shape of the noise tensor created above.
noise.shape

In [None]:
#noise.to("cuda")

Create the denoising loop: the model predicts the noise residual, and the residual is passed to the scheduler. `scheduler.step()` returns the less noisy previous sample,
which is fed back into the loop as the next input.

In [None]:
# NOTE: `input` shadows the Python builtin; the name is kept because the
# image-conversion cell that follows reads the final sample from it.
input = noise

for t in scheduler.timesteps:
    # Predict the noise residual; sampling needs no gradients.
    with torch.no_grad():
        noisy_residual = model(input, t).sample

    # Let the scheduler turn the current sample into the previous (less noisy) one,
    # which becomes the input of the next iteration.
    step_output = scheduler.step(model_output=noisy_residual, timestep=t, sample=input)
    input = step_output.prev_sample

In [None]:
from PIL import Image
import numpy as np

# Shift/scale the final sample with x / 2 + 0.5 and clamp it to [0, 1].
pixels = (input / 2 + 0.5).clamp(0, 1)
# Move to the CPU, put channels last, and keep the first image of the batch.
pixels = pixels.cpu().permute(0, 2, 3, 1).numpy()[0]
# Convert to 8-bit values and wrap in a PIL image; the last line displays it.
image = Image.fromarray((pixels * 255).round().astype("uint8"))
image

## Deconstruct stable diffusion

In [None]:
from PIL import Image
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler

# Hub id that the VAE, UNet, tokenizer, text encoder and scheduler are loaded from below
# (each from its own subfolder). Rebinds the `checkpoint` name used in the DDPM section.
checkpoint = "CompVis/stable-diffusion-v1-4"

In [None]:
# Text side: tokenizer and text encoder.
tokenizer = CLIPTokenizer.from_pretrained(checkpoint, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(checkpoint, subfolder="text_encoder")

# Image side: autoencoder and the conditional UNet.
# `model` is rebound here; it no longer refers to the DDPM UNet loaded earlier.
vae = AutoencoderKL.from_pretrained(checkpoint, subfolder="vae")
model = UNet2DConditionModel.from_pretrained(checkpoint, subfolder="unet")

In [None]:
from diffusers import UniPCMultistepScheduler

# Bind the Stable Diffusion scheduler to `scheduler` (the original name was
# misspelled, so the cells below kept using the DDPM scheduler of the cat checkpoint).
scheduler = UniPCMultistepScheduler.from_pretrained(checkpoint, subfolder="scheduler")

In [None]:
device = "cuda"

# nn.Module.to() moves a module's parameters in place. Looping, instead of ending
# the cell on `model.to(device)`, keeps the notebook from printing the full UNet repr.
for module in (vae, text_encoder, model):
    module.to(device)

### Create text embeddings

In [None]:
# Text prompt(s); the batch size follows from how many prompts are listed.
prompt = ["a cat is playing football wearing the Argentina jersey of Messi"]
batch_size = len(prompt)

# Output resolution in pixels.
height, width = 512, 512

# Number of scheduler steps and the weight applied in the guidance formula
# of the denoising loop.
num_inference_steps = 25
guidance_scale = 7.5

# torch.manual_seed seeds the global RNG and returns its generator,
# which is passed to torch.randn when the latents are created.
generator = torch.manual_seed(0)

In [None]:
# Tokenize the prompt, padding/truncating to the tokenizer's maximum length.
text_input = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)

# Encode the token ids on the target device; index 0 of the encoder output is kept.
prompt_ids = text_input.input_ids.to(device)
with torch.no_grad():
    text_embeddings = text_encoder(prompt_ids)[0]

In [None]:
# Unconditional (empty-prompt) embeddings, padded to the same length as the prompt tokens.
max_length = text_input.input_ids.shape[-1]
uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")

# Inference only: run under no_grad like the prompt branch, so the result does not
# carry an autograd graph into the concatenated embeddings.
with torch.no_grad():
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]

In [None]:
# Stack unconditional and prompt embeddings along dim 0 (unconditional first), matching the
# order in which the denoising loop splits the prediction with chunk(2).
# Gotcha: `text_embeddings` is rebound to the result, so re-running this cell concatenates again.
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

## Create Random Noise

In [None]:
# Read the value through `model.config`; the attribute shortcut on the model
# object itself is deprecated in diffusers.
model.config.in_channels

In [None]:
# Number of input channels of the UNet; used as the channel dimension of the latents below.
model.config.in_channels

In [None]:
# Why height and width are divided by 8 when creating the latents: the VAE model has
# 3 down-sampling layers. The expression computes 2 ** (number of VAE blocks - 1)
# and the cell displays whether that equals 8.
2 ** (len(vae.config.block_out_channels) - 1) == 8

In [None]:
# Draw the initial latents with the seeded generator, at 1/8 of the image resolution.
latents = torch.randn((batch_size, model.config.in_channels, height//8, width//8), generator=generator)
# Tensor.to() returns a new tensor rather than moving in place, so the result must be kept.
latents = latents.to(device)

## Denoise the image

In [None]:
# Display the scheduler's initial noise sigma, which the next cell multiplies into the latents.
scheduler.init_noise_sigma

In [None]:
# Scale the initial latents by the scheduler's initial noise sigma.
# Gotcha: `latents` is rebound, so re-running this cell applies the factor again.
latents = latents * scheduler.init_noise_sigma

In [None]:
# Tensor.to() is not in-place: keep the returned tensor (this also stops the cell
# from dumping the whole tensor as output).
latents = latents.to(device)

In [None]:
# Tensor.to() is not in-place: keep the returned tensor (this also stops the cell
# from dumping the whole tensor as output).
text_embeddings = text_embeddings.to(device)

In [None]:
%%capture
# Make sure the UNet is on `device`; %%capture hides the module repr that
# `model.to(...)` would otherwise print as the cell's output.
model.to(device)

In [None]:
# Display the configuration of the conditional UNet.
model.config

In [None]:
from tqdm.auto import tqdm

scheduler.set_timesteps(num_inference_steps)

# Put the loop inputs on the model's device once, up front, instead of hard-coding
# .cuda() transfers (which ignore `device`) inside every iteration.
latents = latents.to(device)
text_embeddings = text_embeddings.to(device)

for t in tqdm(scheduler.timesteps):
    # Duplicate the latents so the unconditional and text-conditioned predictions
    # come out of a single forward pass (classifier-free guidance).
    latent_model_input = torch.cat([latents] * 2)
    latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)

    with torch.no_grad():
        noise_pred = model(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

    # Perform guidance: start from the unconditional prediction and move towards the
    # text-conditioned one, scaled by guidance_scale.
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # Compute the previous noisy sample x_t -> x_t-1.
    latents = scheduler.step(noise_pred, t, latents).prev_sample

### Decode Image

In [None]:
# Scale and decode the image latents with the VAE.
# The scaled tensor gets its own name so that `latents` is left untouched and the cell
# can be re-run without dividing by the factor a second time.
# NOTE(review): 0.18215 is presumably this VAE's latent scaling factor
# (cf. `vae.config.scaling_factor`) — confirm.
scaled_latents = 1 / 0.18215 * latents
with torch.no_grad():
    image = vae.decode(scaled_latents).sample

In [None]:
# Shift/scale the decoded batch with x / 2 + 0.5 and clamp it to [0, 1].
image = (image / 2 + 0.5).clamp(0, 1)

# Detach, move to the CPU and put the channel axis last before converting to NumPy.
image = image.detach().cpu().permute(0, 2, 3, 1).numpy()

# 8-bit pixel values for the whole batch.
images = (image * 255).round().astype("uint8")

# One PIL image per batch entry; the first one is displayed.
pil_images = list(map(Image.fromarray, images))

pil_images[0]

## Training

In [None]:
%%capture
!pip install dataclass

In [None]:
from dataclasses import dataclass

@dataclass
class TrainingConfig:
    """Hyper-parameters and output settings for the training section.

    Every attribute carries a type annotation so that ``@dataclass`` registers it
    as a field: defaults can then be overridden per instance, e.g.
    ``TrainingConfig(num_epochs=1)``, and they appear in the generated ``repr``.
    Without annotations the decorator sees no fields at all.
    """

    image_size: int = 128  # the generated image resolution
    train_batch_size: int = 16
    eval_batch_size: int = 16  # how many images to sample during evaluation
    num_epochs: int = 50
    gradient_accumulation_steps: int = 1
    learning_rate: float = 1e-4
    lr_warmup_steps: int = 500
    save_image_epochs: int = 10
    save_model_epochs: int = 30
    mixed_precision: str = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
    output_dir: str = "ddpm-butterflies-128"  # the model name locally and on the HF Hub
    push_to_hub: bool = False  # whether to upload the saved model to the HF Hub
    hub_private_repo: bool = False
    overwrite_output_dir: bool = True  # overwrite the old model when re-running the notebook
    seed: int = 0


In [None]:
# Instantiate the configuration with its default values; the cells below read from `config`.
config = TrainingConfig()

In [None]:
from datasets import load_dataset
# Record the Hub dataset id on the config object, then load that dataset's "train" split.
config.dataset_name = "huggan/few-shot-obama"
dataset = load_dataset(config.dataset_name, split="train")

In [None]:
import matplotlib.pyplot as plt

# Preview the first four dataset images side by side and print each image's size.
fig, axs = plt.subplots(1, 4, figsize=(16, 4))
for ax, sample in zip(axs, dataset[:4]["image"]):
    print(sample.size)
    ax.imshow(sample)
    ax.set_axis_off()
# plt.show() renders the figure with the notebook's inline backend; Figure.show()
# needs a GUI backend and otherwise only emits a warning.
plt.show()

## Transformation

In [None]:
from torchvision import transforms

# Per-image pipeline: resize to a square of side config.image_size, random horizontal
# flip, conversion to a tensor, then normalisation with mean 0.5 and std 0.5.
image_side = config.image_size
preprocess_steps = [
    transforms.Resize((image_side, image_side)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
]
preprocess = transforms.Compose(preprocess_steps)

In [None]:
def transform(examples):
    """Convert a batch of dataset examples into preprocessed image tensors.

    Args:
        examples: mapping whose "image" entry is an iterable of images; each one
            is converted to RGB and passed through the notebook-level `preprocess`.

    Returns:
        A dict with the single key "images" holding the preprocessed results,
        in input order.
    """
    images = []
    for image in examples["image"]:
        rgb_image = image.convert("RGB")
        images.append(preprocess(rgb_image))
    return {"images": images}

In [None]:
# Register `transform` on the dataset; afterwards items are read through the "images" key.
# NOTE(review): presumably applied on the fly at access time — confirm in the `datasets` docs.
dataset.set_transform(transform)

In [None]:
# Display the dataset object's summary.
dataset

In [None]:
# Shape of the first transformed image tensor.
dataset[:1]["images"][0].shape

In [None]:
import torch

# Shuffled training batches of config.train_batch_size items each.
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True)

### Create UNet2DModel

In [None]:
from diffusers import UNet2DModel
# Down-sampling path, one block type per entry of block_out_channels.
down_block_types = (
    "DownBlock2D",  # a regular ResNet downsampling block
    "DownBlock2D",
    "DownBlock2D",
    "DownBlock2D",
    "AttnDownBlock2D",  # a ResNet downsampling block with spatial self-attention
    "DownBlock2D",
)

# Up-sampling path.
up_block_types = (
    "UpBlock2D",  # a regular ResNet upsampling block
    "AttnUpBlock2D",  # a ResNet upsampling block with spatial self-attention
    "UpBlock2D",
    "UpBlock2D",
    "UpBlock2D",
    "UpBlock2D",
)

# Fresh UNet for 3-channel images of side config.image_size.
# This rebinds `model`, replacing the Stable Diffusion UNet used above.
model = UNet2DModel(
    sample_size=config.image_size,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(128, 128, 256, 256, 512, 512),
    down_block_types=down_block_types,
    up_block_types=up_block_types,
)

In [None]:
%%capture
!pip install einops

In [None]:
# Size of a single transformed image tensor (no batch dimension).
dataset[0]["images"].size()

In [None]:
# Same tensor with a leading dimension of size 1 added by unsqueeze(0).
dataset[0]["images"].unsqueeze(0).size()

In [None]:
import einops

In [None]:
# Take the first dataset item and prepend a batch axis of size 1 with einops
# ("c h w -> 1 c h w"); the last line displays the resulting shape.
sample_image = einops.rearrange(dataset[0]["images"], "c h w -> 1 c h w")
sample_image.shape

In [None]:
# Shape check: run the sample through the UNet at timestep 0 and display the output's shape.
model(sample_image, timestep=0).sample.shape

### Create a scheduler

In [None]:
import torch
from PIL import Image
from diffusers import DDPMScheduler

# Scheduler with 1000 training timesteps; also passed to the training loop later on.
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)

# Forward-diffusion demo: add noise to the sample image at timestep 50.
timesteps = torch.tensor([50], dtype=torch.long)
noise = torch.randn(sample_image.size())
noisy_image = noise_scheduler.add_noise(sample_image, noise, timesteps)

In [None]:
# Display the shape of the noised image.
noisy_image.shape

In [None]:
# PIL expects the channel axis last: reorder (b, c, h, w) -> (b, h, w, c).
x = noisy_image.permute(0, 2, 3, 1)
x.shape

In [None]:
# Map values with (x + 1) * 127.5 and show the first image of the batch.
# Added noise can push values outside [-1, 1]; clamping to 0-255 before the uint8 cast
# makes them saturate instead of wrapping around to arbitrary bytes.
Image.fromarray(((x + 1.0) * 127.5).clamp(0, 255).type(torch.uint8).numpy()[0])

The purpose of training is to predict the noise added to the image

In [None]:
import torch.nn.functional as F
# Single illustrative evaluation of the training objective: the model's output for the
# noisy image is compared, by mean squared error, with the noise that was added.
noise_pred = model(noisy_image, timesteps).sample
loss = F.mse_loss(noise_pred, noise)

## Training the model

Define optimizer and learning rate scheduler

In [None]:
from diffusers.optimization import get_cosine_schedule_with_warmup

# One scheduler step is taken per batch, so the schedule spans batches-per-epoch * epochs.
total_training_steps = len(train_dataloader) * config.num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

# Cosine learning-rate schedule with config.lr_warmup_steps warm-up steps.
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=config.lr_warmup_steps,
    num_training_steps=total_training_steps,
)

Define the evaluation helpers: sample images from the model and save them as a grid

In [None]:
from diffusers import DDPMPipeline
import math
import os

def make_grid(images, rows, cols):
    """Paste equally sized images onto one RGB canvas, filling it row by row.

    Args:
        images: sequence of images; the first one's `size` defines the tile size.
        rows: number of tile rows of the canvas.
        cols: number of tile columns of the canvas.

    Returns:
        The canvas image of size (cols * tile width, rows * tile height).
    """
    tile_w, tile_h = images[0].size
    canvas = Image.new("RGB", size=(cols * tile_w, rows * tile_h))
    for index, tile in enumerate(images):
        # Row-major placement: index -> (row, column).
        row, col = divmod(index, cols)
        canvas.paste(tile, box=(col * tile_w, row * tile_h))
    return canvas

In [None]:
def evaluate(config, epoch, pipeline):
    """Sample a batch of images from `pipeline` and save them as one grid image.

    Args:
        config: object providing `eval_batch_size`, `seed` and `output_dir`.
        epoch: epoch number, used (zero-padded to 4 digits) as the file name.
        pipeline: callable accepting `batch_size` and `generator` keyword arguments
            and returning an object with an `images` list.

    Side effects:
        Creates `<output_dir>/samples` if needed and writes `<epoch>.png` into it.
    """
    # A dedicated generator makes the samples reproducible without re-seeding the
    # global RNG, which the training loop keeps drawing noise and timesteps from.
    generator = torch.Generator(device="cpu").manual_seed(config.seed)
    images = pipeline(batch_size=config.eval_batch_size, generator=generator).images

    # Make a grid out of the images: 4 columns and as many rows as needed
    # (a 4 x 4 grid for the default batch of 16).
    cols = 4
    rows = math.ceil(len(images) / cols)
    image_grid = make_grid(images, rows=rows, cols=cols)

    # Save the images
    test_dir = os.path.join(config.output_dir, "samples")
    os.makedirs(test_dir, exist_ok=True)
    image_grid.save(f"{test_dir}/{epoch:04d}.png")

In [None]:
%%capture
! pip install --upgrade accelerate

In [None]:
# Print the metadata (including the version) of the installed accelerate package.
!pip show accelerate

In [None]:
# Print the output of the nvidia-smi command-line tool.
!nvidia-smi

#### Training loop

In [None]:
from accelerate import Accelerator
from tqdm.auto import tqdm
from pathlib import Path
import os

def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler):
    """Train `model` to predict the noise that `noise_scheduler` adds to clean images.

    Relies on notebook-level names: `torch`, `F`, `os`, `tqdm`, `Accelerator`,
    `DDPMPipeline` and `evaluate`.

    Args:
        config: settings object; reads mixed_precision, gradient_accumulation_steps,
            output_dir, num_epochs, save_image_epochs and save_model_epochs
            (plus whatever `evaluate` reads).
        model: the network being trained; called as model(noisy_images, timesteps).
        noise_scheduler: provides `config.num_train_timesteps` and `add_noise`.
        optimizer: optimizer over the model's parameters.
        train_dataloader: yields batches with an "images" tensor.
        lr_scheduler: learning-rate scheduler stepped once per batch.

    Side effects:
        Logs loss/lr to the tensorboard tracker, writes sample grids through
        `evaluate`, and saves the pipeline to `config.output_dir`.
    """
    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        # `project_dir` replaces the former `logging_dir` keyword, which current
        # accelerate releases no longer accept.
        project_dir=os.path.join(config.output_dir, "logs"),
    )

    if accelerator.is_main_process:
        if config.output_dir is not None:
            os.makedirs(config.output_dir, exist_ok=True)
        accelerator.init_trackers("train_example")

    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    global_step = 0

    for epoch in range(config.num_epochs):
        progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")

        for batch in train_dataloader:
            clean_images = batch["images"]
            batch_size = clean_images.shape[0]

            # Draw the noise directly on the images' device (no host-to-device copy).
            noise = torch.randn(clean_images.shape, device=clean_images.device)

            # Sample a random timestep for each image.
            timesteps = torch.randint(
                0,
                noise_scheduler.config.num_train_timesteps,
                (batch_size,),
                device=clean_images.device,
                dtype=torch.int64,
            )

            # Add noise to each clean image according to its timestep
            # (this is the forward diffusion process).
            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

            with accelerator.accumulate(model):
                noise_pred = model(noisy_images, timesteps, return_dict=False)[0]
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                # Clip only on steps where gradients are actually synchronised, i.e. at
                # the end of an accumulation cycle (every step when accumulation is 1).
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            progress_bar.update(1)
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
            global_step += 1

        # Finish this epoch's bar so finished bars do not pile up as open widgets.
        progress_bar.close()

        # After each epoch, optionally sample some demo images with evaluate() and save the model.
        if accelerator.is_main_process:
            pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler)
            is_last_epoch = epoch == config.num_epochs - 1
            if (epoch + 1) % config.save_image_epochs == 0 or is_last_epoch:
                evaluate(config, epoch, pipeline)
            if (epoch + 1) % config.save_model_epochs == 0 or is_last_epoch:
                pipeline.save_pretrained(config.output_dir)

    # Let the trackers started by init_trackers() flush and shut down.
    accelerator.end_training()
        

In [None]:
from accelerate import notebook_launcher
args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler)
# Earlier cells of this notebook already moved models to CUDA in this kernel, and
# notebook_launcher refuses to start a multi-process run (num_processes > 1) once CUDA
# has been initialised. A single process works after "Restart & Run All".
notebook_launcher(train_loop, args, num_processes=1)