# Diffusion

Exploring various Diffusion model architectures

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 27/02/2026   | Martin | Created   | Writing the DDPM algorithm | 

# Content

* [DDPM Algorithm](#ddpm-algorithm)

# DDPM Algorithm

The DDPM algorithm is the one that popularised diffusion models. It aims to predict the noise that needs to be removed from a Gaussian noise image to return back to the original distribution

__Forward Noise__

$$
x_t = \sqrt{\alpha_t}x_0 + \sqrt{1 - \alpha_t} \epsilon
$$

__Reverse Noise__

$$
E_{q, t}
  \left[ \frac{\beta_t^2}{2\sigma_t^2 \alpha_t (1 - \bar{\alpha_t})} ||\epsilon - \epsilon_\theta (x_t, t)||^2 \right]
$$

Where the model is trying to predict the noise term

$$\epsilon_\theta (x_t, t)$$

In [24]:
import torch
import deepinv
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image

In [6]:
# ========== Dataset Config ==========
device = "cuda"
batch_size = 32
image_size = 256

# ========== Training Config ==========
lr = 1e-4
epochs = 100
# Beta schedule
beta_start = 1e-4
beta_end = 0.02
timesteps = 1000
betas = torch.linspace(beta_start, beta_end, timesteps, device=device)
# Alpha terms
alphas = 1.0 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)
sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)

In [12]:
# --------------------------------------------------
# Data Loader
# --------------------------------------------------
class ImageFolderDataset(Dataset):
  """
  Loads all images from a folder (and optionally subfolders).
  Resizes images to image_size x image_size and normalised to [-1, 1]
  """
  def __init__(
    self,
    folder: str,
    image_size: int = 256,
    recursive: bool = True
  ):
    self.paths = []
    folder = Path(folder)
    glob = folder.rglob('*') if recursive else folder.glob('*')
    for p in glob:
      self.paths.append(p)
    
    if len(self.paths) == 0:
      raise ValueError(f"No images found in {folder}")
    
    print(f"Found {len(self.paths)} images in {folder}")

    self.transform = transforms.Compose([
      transforms.Resize(image_size, interpolation=transforms.InterpolationMode.LANCZOS),
      transforms.CenterCrop(image_size),
      transforms.ToTensor(),
      transforms.Normalize([0.0, 0.0, 0.0], [1.0, 1.0, 1.0])
    ])
  
  def __len__(self):
    return len(self.paths)
  
  def __getitem__(self, idx):
    try:
      img = Image.open(self.paths[idx]).convert('RGB')
      return self.transform(img)
    except Exception as e:
      print(f"Warning: failed to load {self.paths[idx]}: {e}. Returning zeros.")
      return torch.zeros(3, self.transform.transforms[1].size, self.transform.transforms[1].size)

In [22]:
# --------------------------------------------------
# Model Definition
# --------------------------------------------------
model = deepinv.models.DiffUNet(
  in_channels=3,
  out_channels=3,
  pretrained=None
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
mse = deepinv.loss.MSE()

data = ImageFolderDataset(
  folder="./data/Celebrity Faces Dataset/Angelina Jolie",
  image_size=image_size,
)
loader = DataLoader(
  data,
  batch_size=batch_size,
  shuffle=True,
  num_workers=0,
  drop_last=True,
)

Found 100 images in data/Celebrity Faces Dataset/Angelina Jolie


In [26]:
# --------------------------------------------------
# Training Loop
# --------------------------------------------------
for epoch in range(epochs):
  model.train()
  for data in loader:
    imgs = data.to(device)
    noise = torch.randn_like(imgs)
    t = torch.randint(0, timesteps, (imgs.size(0), ), device=device)

    noised_imgs = (
      sqrt_alphas_cumprod[t, None, None, None] * imgs
      + sqrt_one_minus_alphas_cumprod * noise
    )

    optimizer.zero_grad()
    estimated_noise = model(noised_imgs, t, type_t='timestep')
    loss = mse(estimated_noise, noise)
    loss.backward()
    optimizer.step()

RuntimeError: The size of tensor a (1000) must match the size of tensor b (256) at non-singleton dimension 3

In [None]:
%load_ext watermark
%watermark