In [None]:
#!/usr/bin/env python3
import os
import glob

# Root directory that contains the ``augmented_inputs_*`` run folders.
# This is an absolute, machine-specific path; adjust it for your environment.
search_root = "/dtu/blackhole/00/215456/tta-vlm/benchmark_results/"


def latest_numbered_png(folder):
    """Return the absolute path of the highest-numbered ``<int>.png`` in ``folder``.

    Only files whose base name (without the extension) parses as an integer
    are considered, and they are compared numerically, so ``10.png`` wins over
    ``9.png``. The folder is not searched recursively.

    Args:
        folder: Directory to scan.

    Returns:
        Absolute path of the PNG with the largest integer name, or ``None``
        when the folder contains no integer-named PNG.
    """
    best_index, best_path = None, None
    # Escape the folder part so characters such as ``[`` in a directory name
    # are matched literally instead of being read as glob syntax.
    for png_path in glob.glob(os.path.join(glob.escape(folder), "*.png")):
        stem, _ext = os.path.splitext(os.path.basename(png_path))
        try:
            index = int(stem)
        except ValueError:
            # Not an integer-named file (e.g. "preview.png"); ignore it.
            continue
        if best_index is None or index > best_index:
            best_index, best_path = index, os.path.abspath(png_path)
    return best_path


# Find folders starting with 'augmented_inputs_'. They are sorted so the
# resulting list is reproducible: glob returns matches in arbitrary,
# filesystem-dependent order.
pattern = os.path.join(search_root, "augmented_inputs_*")
folders = sorted(folder for folder in glob.glob(pattern) if os.path.isdir(folder))

# One entry per run folder that holds at least one integer-named PNG.
input_images_paths = [
    path for path in map(latest_numbered_png, folders) if path is not None
]

Image-to-image

In [None]:
import torch
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image, make_image_grid
from IPython.display import display

# Build the image-to-image pipeline from the FLUX.1-dev checkpoint with the
# library's default loading options.
# Alternative tried earlier, kept for reference:
#   "stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.bfloat16, use_safetensors=True
# NOTE(review): no torch_dtype is passed here, whereas the FLUX control cell
# further down loads the same checkpoint with torch.bfloat16 -- confirm that
# default precision is intended, since it affects memory use.
pipeline = AutoPipelineForImage2Image.from_pretrained("black-forest-labs/FLUX.1-dev")

# Offload idle sub-models to the CPU to lower peak GPU memory.
pipeline.enable_model_cpu_offload()
# xFormers attention is currently left disabled:
# pipeline.enable_xformers_memory_efficient_attention()

In [None]:
#url = '/dtu/blackhole/00/215456/tta-vlm/benchmark_results/augmented_inputs_1736425391.8340986/4.png'
# url = "/dtu/blackhole/00/215456/tta-vlm/benchmark_results/augmented_inputs_1737465403.7203906/9.png"
# The prompt is the same for every image, so define it once outside the loop.
prompt = "realistic image"

# NOTE: this cell expects ``load_image`` to be ``diffusers.utils.load_image``
# (it must return a PIL image exposing ``.crop``/``.height``).
for url in input_images_paths:
    source_image = load_image(url)
    # Keep the left-most strip, at most 512 px wide and at full height.
    # Clamping to the real width stops PIL from padding narrower images with
    # black pixels.
    crop_box = (0, 0, min(512, source_image.width), source_image.height)
    init_image = source_image.crop(crop_box)

    image = pipeline(prompt, image=init_image, guidance_scale=3.0, strength=0.3).images[0]
    # Input on the left, generated result (resized to the input size) on the right.
    display(make_image_grid([init_image, image.resize(init_image.size)], rows=1, cols=2))

DDIM Inversion

In [None]:
from typing import Union, Tuple, Optional

import matplotlib.pyplot as plt
import torch
from PIL import Image
from diffusers import StableDiffusionPipeline, DDIMInverseScheduler, AutoencoderKL, DDIMScheduler, StableDiffusion3Pipeline
from torchvision import transforms as tvt


def load_image(imgname: str, target_size: Optional[Union[int, Tuple[int, int]]] = None) -> torch.Tensor:
    """Read an image file into a batched RGB tensor.

    NOTE: once this cell has run, this function shadows
    ``diffusers.utils.load_image`` (which other cells use to get a PIL image);
    those cells re-import the diffusers version before using it.

    Args:
        imgname: Path of the image file to open.
        target_size: Optional size to resize to. An ``int`` means a square of
            that side; a tuple is passed to ``PIL.Image.resize`` unchanged
            (PIL interprets it as ``(width, height)``).

    Returns:
        The image converted by ``torchvision.transforms.ToTensor`` with a
        leading batch dimension of size 1 added.
    """
    # Open inside a context manager so the file handle is released right away;
    # ``convert`` decodes the pixel data into a new, independent image.
    with Image.open(imgname) as raw_img:
        pil_img = raw_img.convert('RGB')
    if target_size is not None:
        if isinstance(target_size, int):
            target_size = (target_size, target_size)
        pil_img = pil_img.resize(target_size, Image.Resampling.LANCZOS)
    return tvt.ToTensor()(pil_img)[None, ...]  # add batch dimension


def img_to_latents(x: torch.Tensor, vae: "AutoencoderKL") -> torch.Tensor:
    """Encode an image tensor into scaled VAE latents.

    The input is mapped from ``[0, 1]`` to ``[-1, 1]``, encoded with ``vae``,
    and the mean of the returned latent distribution is scaled using the
    factors stored in ``vae.config``:

    * ``scaling_factor`` (falls back to 0.18215, the SD < 3 value, if absent)
    * ``shift_factor`` (optional; used by SD >= 3 VAEs, e.g. 0.0609 with a
      scale of 1.5305)

    For VAEs without a shift factor the result is ``mean * scaling_factor``,
    i.e. exactly the previous hard-coded SD < 3 behaviour.

    Args:
        x: Image tensor with values in ``[0, 1]``.
        vae: Autoencoder providing ``encode(x).latent_dist.mean`` and ``config``.

    Returns:
        The scaled (and, if configured, shifted) latent mean.
    """
    x = 2. * x - 1.
    posterior = vae.encode(x).latent_dist

    scale = getattr(vae.config, "scaling_factor", 0.18215)
    shift = getattr(vae.config, "shift_factor", None)

    latents = posterior.mean
    if shift:
        # Only SD >= 3 style VAEs define a non-zero shift.
        latents = latents - shift
    return latents * scale


@torch.no_grad()
def ddim_inversion(imgname: str, num_steps: int = 50, verify: bool = False,
                   prompt: str = "a realistic image") -> torch.Tensor:
    """Invert an image to latents with Stable Diffusion 2.1 and a DDIM inverse scheduler.

    The image is loaded, cropped to its left-most 512 columns, trimmed so that
    height and width are multiples of 8, encoded with the pipeline's VAE and
    then run through the pipeline with ``DDIMInverseScheduler``.

    The model is loaded on every call, so repeated calls are expensive.

    Args:
        imgname: Path of the image file to invert.
        num_steps: Number of scheduler steps for the inversion (and for the
            optional reconstruction).
        verify: When true, swap in a ``DDIMScheduler``, regenerate an image
            from the inverted latents and plot it next to the input.
        prompt: Text prompt used for the inversion and, so that the check is a
            true round trip, also for the verification pass.

    Returns:
        The latents produced by the inversion run (``output_type='latent'``).
    """
    if torch.cuda.is_available():
        device, dtype = 'cuda', torch.float16
    else:
        # Use full precision on CPU rather than forcing half precision there.
        device, dtype = 'cpu', torch.float32

    # model_path = 'stabilityai/stable-diffusion-3.5-large'
    model_path = 'stabilityai/stable-diffusion-2-1'

    inverse_scheduler = DDIMInverseScheduler.from_pretrained(model_path, subfolder='scheduler')
    pipe = StableDiffusionPipeline.from_pretrained(model_path,
                                                   scheduler=inverse_scheduler,
                                                   safety_checker=None,
                                                   torch_dtype=dtype)
    pipe.to(device)
    vae = pipe.vae

    input_img = load_image(imgname).to(device=device, dtype=dtype)
    # Keep only the left-most 512 columns.
    input_img = input_img[..., :512]
    # Height and width should be divisible by 8.
    input_img = input_img[:, :, :input_img.shape[-2] // 8 * 8, :input_img.shape[-1] // 8 * 8]

    latents = img_to_latents(input_img, vae)

    inv_latents = pipe(prompt=prompt, negative_prompt="", guidance_scale=1.,
                       width=input_img.shape[-1], height=input_img.shape[-2],
                       output_type='latent', return_dict=False,
                       num_inference_steps=num_steps, latents=latents)[0]

    # Optional round-trip check: denoise the inverted latents with the regular
    # DDIM scheduler under the same prompt and compare against the input.
    if verify:
        pipe.scheduler = DDIMScheduler.from_pretrained(model_path, subfolder='scheduler')
        image = pipe(prompt=prompt, negative_prompt="", guidance_scale=1.,
                     num_inference_steps=num_steps, latents=inv_latents)
        fig, ax = plt.subplots(1, 2)

        fig.set_figwidth(100)
        fig.set_figheight(50)

        # Move to CPU / float32 before converting so the conversion does not
        # depend on the device or half-precision support.
        ax[0].imshow(tvt.ToPILImage()(input_img[0].float().cpu()))
        ax[0].set_title("input")
        ax[1].imshow(image.images[0])
        ax[1].set_title("reconstruction")
        plt.show()
    return inv_latents

In [None]:
# Invert the reference image and plot the round-trip reconstruction. The
# result is bound to a name so the cell shows only the latent shape instead
# of printing the whole tensor.
inv_latents = ddim_inversion('/dtu/blackhole/00/215456/tta-vlm/benchmark_results/augmented_inputs_1737465403.7203906/9.png', num_steps=999, verify=True)
inv_latents.shape

Conditioning

In [None]:
from diffusers import StableDiffusionImageVariationPipeline
from PIL import Image
from torchvision import transforms

device = "cuda"
sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
    "lambdalabs/sd-image-variations-diffusers",
    revision="v2.0",
)
sd_pipe = sd_pipe.to(device)

# Open via a context manager so the file handle is closed, and force three
# channels: the Normalize step below has 3-channel statistics and would fail
# on RGBA, palette or grayscale PNGs.
with Image.open('/dtu/blackhole/00/215456/tta-vlm/benchmark_results/augmented_inputs_1737465403.7203906/9.png') as raw_im:
    im = raw_im.convert("RGB")
# Keep only the left-most 512-pixel-wide strip at full height.
crop_box = (0, 0, 512, im.height)
im = im.crop(crop_box)

# Preprocessing: tensor conversion, bicubic resize to 224x224 without
# antialiasing, then per-channel normalisation (these mean/std values match
# the published CLIP image preprocessing constants).
tform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(
        (224, 224),
        interpolation=transforms.InterpolationMode.BICUBIC,
        antialias=False,
    ),
    transforms.Normalize(
        [0.48145466, 0.4578275, 0.40821073],
        [0.26862954, 0.26130258, 0.27577711]),
])
inp = tform(im).to(device).unsqueeze(0)  # add batch dimension

out = sd_pipe(inp, guidance_scale=0.1)


In [None]:
from diffusers.utils import make_image_grid

# Cropped input on the left, generated variation (resized to the input size)
# on the right.
make_image_grid(
    images=[im, out.images[0].resize(im.size)],
    rows=1,
    cols=2,
)

lambdalabs/sd-image-variations-diffusers

In [None]:
from diffusers.utils import load_image
# Reference picture for the image-variation experiment.
url = '/dtu/blackhole/00/215456/tta-vlm/benchmark_results/augmented_inputs_1737465403.7203906/9.png'
source_image = load_image(url)
# Keep only the left-most 512-pixel-wide strip at full height.
crop_box = (0, 0, 512, source_image.height)
init_image = source_image.crop(crop_box)

In [None]:
from diffusers import StableDiffusionImageVariationPipeline
from PIL import Image
import torchvision.transforms as transforms

device = "cuda:0"

# Load revision v2.0 of the image-variation pipeline and move it to the GPU.
sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
    "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
).to(device)

# Preprocessing steps, named individually for readability.
resize_224 = transforms.Resize(
    (224, 224),
    interpolation=transforms.InterpolationMode.BICUBIC,
    antialias=False,
)
normalize = transforms.Normalize(
    [0.48145466, 0.4578275, 0.40821073],
    [0.26862954, 0.26130258, 0.27577711],
)
tform = transforms.Compose([transforms.ToTensor(), resize_224, normalize])

# Preprocess the cropped input, move it to the device and add a batch axis.
inp = tform(init_image).to(device).unsqueeze(0)

out = sd_pipe(inp, guidance_scale=5)

In [None]:
# Display the cropped input image that was fed to the variation pipeline.
init_image

In [None]:
# Display the first image returned by the variation pipeline.
out["images"][0]

sayakpaul/FLUX.1-dev-edit-v0

In [None]:
from diffusers.utils import load_image
# Reference picture for the FLUX edit experiment.
url = '/dtu/blackhole/00/215456/tta-vlm/benchmark_results/augmented_inputs_1737465403.7203906/9.png'
full_image = load_image(url)
# Crop to the left-most 512 columns, keeping the full height.
crop_box = (0, 0, 512, full_image.height)
init_image = full_image.crop(crop_box)

In [None]:
from diffusers import FluxControlPipeline, FluxTransformer2DModel
from diffusers.utils import load_image
import torch 

# Fine-tuned edit transformer that replaces the stock FLUX.1-dev transformer.
path = "sayakpaul/FLUX.1-dev-edit-v0"
edit_transformer = FluxTransformer2DModel.from_pretrained(path, torch_dtype=torch.bfloat16)

# NOTE: this rebinds ``pipeline``, replacing the image-to-image pipeline
# created earlier in the notebook.
pipeline = FluxControlPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", transformer=edit_transformer, torch_dtype=torch.bfloat16
)
pipeline = pipeline.to("cuda")

# The cropped input serves as the control image.
image = init_image
print(image.size)

prompt = "make the image grayscale"
edit_kwargs = dict(
    control_image=image,
    prompt=prompt,
    guidance_scale=30.,  # change this as needed.
    num_inference_steps=50,  # change this as needed.
    max_sequence_length=512,
    height=image.height,
    width=image.width,
    generator=torch.manual_seed(0),  # fixed seed
)
output_image = pipeline(**edit_kwargs).images[0]

In [None]:
# Display the edited image produced by the FLUX control pipeline.
output_image

In [None]:
# Display the control image passed to the edit pipeline (same object as init_image).
image