In [None]:
import os
import torch
from PIL import Image
from diffusers.models import AutoencoderKL
from torchvision import transforms
from pathlib import Path
from IPython.display import Image as displayimage
#from torchv

In [None]:
# Fetch the pretrained SDXL VAE and place it on the GPU when one is available
print("Loading SDXL VAE...")
device = "cuda" if torch.cuda.is_available() else "cpu"
vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae").to(device)

In [None]:
# Preprocessing pipeline: convert to a tensor, then normalize with
# mean 0.5 / std 0.5 so values land in [-1, 1]
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

In [None]:
def img_to_latent(img_path):
    """Encode an image file into a latent tensor using the module-level VAE.

    The file is opened with PIL, converted to RGB, run through the
    module-level ``transform``, given a leading batch dimension and moved to
    ``device``. The latent is one sample drawn from the distribution that
    ``vae.encode`` returns.

    Args:
        img_path: Location of the image file, passed to ``Image.open``.

    Returns:
        The sampled latent tensor for a batch containing this single image.
        NOTE(review): ``latent_dist.sample()`` is presumably a random draw,
        so repeated calls on the same file may differ — confirm if
        reproducibility matters.
    """
    # The context manager closes the file handle deterministically instead of
    # leaving it to garbage collection; convert() returns an independent image.
    with Image.open(img_path) as src:
        img = src.convert("RGB")
    img_tensor = transform(img).unsqueeze(0).to(device)
    # Inference only: no autograd graph is needed for encoding.
    with torch.no_grad():
        latent = vae.encode(img_tensor).latent_dist.sample()
    return latent

In [None]:
def latent_to_img(latent):
    """Decode a latent tensor into an image tensor using the module-level VAE.

    Args:
        latent: Latent tensor passed straight to ``vae.decode``.

    Returns:
        A CPU tensor with the leading dimension squeezed (when it has size 1)
        and values rescaled by ``* 0.5 + 0.5`` and clamped to [0, 1]. Wrap the
        result in ``transforms.ToPILImage()`` to obtain a PIL image.
    """
    # Inference only: decoding under no_grad avoids building an autograd graph
    # (and holding its activations), matching how img_to_latent encodes.
    with torch.no_grad():
        decoded = vae.decode(latent).sample
    # Reverse the mean-0.5 / std-0.5 normalization and clip any overshoot.
    return (decoded.squeeze(0).cpu() * 0.5 + 0.5).clamp(0, 1)

In [None]:
# Example picture used throughout the notebook; rendered as this cell's output
img_path = '../datasets/caltech256/159.people/159_0015.jpg'
displayimage(filename=img_path)

In [None]:
# Encode the example picture into the VAE latent space
latent = img_to_latent(img_path)

In [None]:
# Size of the first (and only) latent in the batch
latent[0].size()

In [None]:
# Noise schedule: betas are spaced linearly between sqrt(beta_start) and
# sqrt(beta_end), then squared
num_train_timesteps = 1000
beta_start, beta_end = 0.00085, 0.0120
betas = torch.linspace(
    beta_start ** 0.5,
    beta_end ** 0.5,
    num_train_timesteps,
    dtype=torch.float32,
).pow(2)

# Cumulative product of (1 - beta) over timesteps, moved to the compute device
alphas = 1.0 - betas
alphas_cumprod = alphas.cumprod(dim=0).to(device)

In [None]:
# Standard-normal noise with the same shape, dtype and device as the latent
noise_epsilon = torch.randn_like(input=latent)

In [None]:
# Decode the pure-noise latent and show the result as a PIL image
transforms.ToPILImage()(
    latent_to_img(noise_epsilon)
)

In [None]:
# Decode the latent after reversing its last axis and axis 1
# (assumes latent is (batch, channels, height, width) — TODO confirm);
# the 0.5 / 2 scaling pair is kept exactly as in the experiment.
transforms.ToPILImage()(
    latent_to_img(
        torch.flip(torch.flip(latent * 0.5, dims=(-1,)) * 2, dims=(1,))
    )
)