In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### **CLIP-I, CLIP-T, DINO**

- **CLIP-I** - average pairwise cosine similarity between the CLIP image embeddings of generated and real images
- **CLIP-T** - average cosine similarity between the CLIP text embedding of the prompt and the CLIP image embeddings of the generated images
- **DINO** - average pairwise cosine similarity between the ViT-S/16 DINO embeddings of generated and real images

In [20]:
from PIL import Image
import matplotlib.pyplot as plt
import torch
from torchvision import transforms
from torch.nn import functional as F
from transformers import AutoModel, AutoProcessor, AutoTokenizer
import argparse
import os

import torch
from diffusers import DiffusionPipeline, StableDiffusionXLImg2ImgPipeline
import itertools


In [3]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [21]:
from PIL import Image

def image_grid(imgs, rows, cols):
    """Tile PIL images into a single ``rows`` x ``cols`` image.

    Images are pasted left-to-right, top-to-bottom in the order given. The
    tile size is taken from the first image, so all images are assumed to
    share that size.

    Args:
        imgs: Sequence of PIL images; its length must equal ``rows * cols``.
        rows: Number of tile rows in the grid.
        cols: Number of tile columns in the grid.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)``, where ``(w, h)`` is
        the size of ``imgs[0]``.

    Raises:
        AssertionError: If ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows * cols, (
        f"expected rows*cols = {rows * cols} images, got {len(imgs)}"
    )

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))

    for i, img in enumerate(imgs):
        # Row-major placement: index i lands in row i // cols, column i % cols.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col * w, row * h))
    return grid

In [24]:
def get_model_outs(pretrained_model_name_or_path: str, prompts: list, samples_per_prompt: int):
    """Generate ``samples_per_prompt`` images for every prompt with a diffusion pipeline.

    The pipeline is loaded from ``pretrained_model_name_or_path`` and sampling
    uses a single generator seeded with 0 that is shared across all prompts.

    Args:
        pretrained_model_name_or_path: Model id or local path passed to
            ``DiffusionPipeline.from_pretrained``.
        prompts: Text prompts to sample from.
        samples_per_prompt: Number of images generated for each prompt.

    Returns:
        A tuple ``(tasks_prompts, images)``:
        ``tasks_prompts`` holds one list per input prompt, containing that
        prompt repeated ``samples_per_prompt`` times; ``images`` is a flat
        list of the generated images, in prompt order.
    """
    model_path = pretrained_model_name_or_path

    # Use the GPU in half precision when available; otherwise fall back to the
    # CPU in full precision instead of failing on a hard-coded "cuda" device.
    use_cuda = torch.cuda.is_available()
    run_device = "cuda" if use_cuda else "cpu"
    diffusion_pipe = DiffusionPipeline.from_pretrained(
        model_path, torch_dtype=torch.float16 if use_cuda else torch.float32
    )
    diffusion_pipe = diffusion_pipe.to(run_device)
    generator = torch.Generator(run_device).manual_seed(0)

    tasks_prompts = []
    images = []
    for current_prompt in prompts:
        task_prompts = [current_prompt] * samples_per_prompt
        task_samples = diffusion_pipe(
            prompt=task_prompts, output_type="pil", generator=generator
        )
        # Collect into the outer list (the original appended the batch to itself,
        # so the returned prompt list was always empty).
        tasks_prompts.append(task_prompts)
        images.extend(task_samples.images)

    return tasks_prompts, images

In [25]:
N_TASKS = 5
MODELS = ["wolf_plushie_sd1", "backpack_sd2", "dog6_sd3", "candle_sd4", "cat2_sd5"]
TOKENS = ["sks stuffed animal", "zwz backpack", "sbu dog", "uwu candle", "pdw cat"]
# Fail early if the per-task lists drift out of sync with the task count.
assert len(MODELS) == len(TOKENS) == N_TASKS, "MODELS and TOKENS need one entry per task"

# 1-based task number -> name of the model saved for that task.
AFTER_TASK_MODEL = dict(zip(range(1, N_TASKS + 1), MODELS))
# 1-based task number -> subject token substituted into the prompt templates.
# (Previously built from MODELS, which put checkpoint names into the prompts.)
TOKEN_TASK = dict(zip(range(1, N_TASKS + 1), TOKENS))

# Prompt templates; `{}` is filled with the task's subject token.
PROMPTS = ['a {} in a purple wizard outfit', 'a {} in a police outfit', 'a {} wearing a santa hat', 'a {} in a jail', 'a {} looking into a mirror']


In [6]:
def run_model(task_number, samples_per_prompt=8):
    """Generate prompt samples for every task from 1 up to ``task_number``.

    For each task the model at ``./models/<AFTER_TASK_MODEL[task]>`` is loaded
    via ``get_model_outs`` and every template in ``PROMPTS`` is filled with
    ``TOKEN_TASK[task]``.

    NOTE(review): each task is sampled with its *own* model, not with the model
    for ``task_number`` — confirm this is the intended evaluation protocol.

    Args:
        task_number: Last task (1-based, inclusive) to generate samples for.
        samples_per_prompt: Images generated per prompt (defaults to 8).

    Returns:
        Dict mapping each task number to ``{"prompts": ..., "samples": ...}``
        as returned by ``get_model_outs``.

    Raises:
        AssertionError: If ``task_number < 1``.
        KeyError: If any task in ``1..task_number`` has no configured model or
            token. Raised before any generation starts.
    """
    assert task_number >= 1
    task_ids = range(1, task_number + 1)

    # Validate up front so a bad task number does not surface only after the
    # earlier tasks have already been generated.
    unknown = [t for t in task_ids if t not in AFTER_TASK_MODEL or t not in TOKEN_TASK]
    if unknown:
        raise KeyError(f"no model/token configured for task(s) {unknown}")

    per_task_outs = {}
    for task_id in task_ids:
        model_path = f"./models/{AFTER_TASK_MODEL[task_id]}"
        prompts = [prompt.format(TOKEN_TASK[task_id]) for prompt in PROMPTS]
        out_prompts, out_samples = get_model_outs(
            pretrained_model_name_or_path=model_path,
            prompts=prompts,
            samples_per_prompt=samples_per_prompt,
        )
        per_task_outs[task_id] = {"prompts": out_prompts, "samples": out_samples}
    return per_task_outs

In [7]:
# Task 1 only: run_model loads the task-1 model and samples every prompt in PROMPTS.
task_outs_1 = run_model(1)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  return F.conv2d(input, weight, bias, self.stride,


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [26]:
# Tasks 1-2: run_model reloads each task's model and regenerates its samples from scratch.
task_outs_2 = run_model(2)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Tasks 1-3: run_model reloads each task's model and regenerates its samples from scratch.
task_outs_3 = run_model(3)

In [None]:
# Tasks 1-4: run_model reloads each task's model and regenerates its samples from scratch.
task_outs_4 = run_model(4)

In [None]:
# Tasks 1-5: run_model reloads each task's model and regenerates its samples from scratch.
task_outs_5 = run_model(5)

# Metrics

In [18]:
# `imgs` was not defined by any other cell, so this cell raised NameError on a
# fresh kernel.
# NOTE(review): assuming the grid previews the task-1 samples (5 prompts x 8
# samples per prompt = 40 images, matching rows * cols) — confirm the source.
imgs = task_outs_1[1]["samples"]
grid = image_grid(imgs, rows=8, cols=5)


In [None]:
# Paths to three images in the `rc_car` folder of the DreamBooth dataset directory.
RC_CAR_DIR = '/net/tscratch/people/plgkzaleska/ziplora-analysis/data/dreambooth/dataset/rc_car'
img_path_1, img_path_2, img_path_3 = (
    f'{RC_CAR_DIR}/{index:02d}.jpg' for index in (1, 2, 3)
)

In [None]:
# Open the three reference images with PIL.
image1, image2, image3 = (
    Image.open(path) for path in (img_path_1, img_path_2, img_path_3)
)

In [None]:
# Show the three reference images side by side.
fig, ax = plt.subplots(1, 3, figsize=(15, 15))
# Use a separate loop name so `ax` keeps referring to the whole axes array
# (the original `for ax in ax` rebound it to the last panel).
for panel, img in zip(ax, (image1, image2, image3)):
    panel.imshow(img)
    # Hide the axes.
    panel.axis('off')
plt.show()

#### **CLIP-I**

In [None]:
# Load the CLIP preprocessor and model from the same checkpoint, then move the
# model to the selected device.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_processor = AutoProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = AutoModel.from_pretrained(CLIP_CHECKPOINT)
clip_model = clip_model.to(device)

In [None]:
# Embed the first two reference images with CLIP. This is inference only, so
# autograd is disabled to avoid building a graph and retaining activations.
with torch.no_grad():
    inputs1 = clip_processor(images=image1, return_tensors="pt").to(device)
    image_features1 = clip_model.get_image_features(**inputs1)

    inputs2 = clip_processor(images=image2, return_tensors="pt").to(device)
    image_features2 = clip_model.get_image_features(**inputs2)

In [None]:
# CLIP-I for one image pair: cosine similarity of the two image embeddings.
# Index 0 selects the single image in each batch of one.
embedding_a = image_features1[0]
embedding_b = image_features2[0]
sim = F.cosine_similarity(embedding_a, embedding_b, dim=0)
print(f"Similarity: {sim.item():.3f}")

#### **CLIP-T**

In [None]:
# Tokenizer for the same checkpoint name used for the CLIP model, for encoding prompts.
clip_tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
# Tokenize the prompt and move the resulting tensors to the model's device.
input_text = "a photo of a toy car"
text = clip_tokenizer(input_text, return_tensors="pt", padding=True)
text = text.to(device)

In [None]:
# Preprocess the first reference image for CLIP and move it to the device.
input_img = clip_processor(images=image1, return_tensors="pt").to(device)

In [None]:
# Embed the image and the prompt with CLIP. This is inference only, so autograd
# is disabled to avoid building a graph and retaining activations.
with torch.no_grad():
    img_embeddings = clip_model.get_image_features(**input_img)
    text_features = clip_model.get_text_features(**text)

In [None]:
# CLIP-T for one (image, prompt) pair: cosine similarity of the image and text
# embeddings. Index 0 selects the single item in each batch of one.
image_embedding = img_embeddings[0]
prompt_embedding = text_features[0]
sim = F.cosine_similarity(image_embedding, prompt_embedding, dim=0)
print(f"Similarity: {sim.item():.3f}")

#### **DINO**

In [None]:
# Load the DINO checkpoint without its pooling layer; the cells below read
# `last_hidden_state` directly.
dino_model = AutoModel.from_pretrained("facebook/dino-vits16", add_pooling_layer=False).to(device)

In [None]:
# Preprocessing for DINO: resize the shorter side to 256, center-crop to 224,
# convert to a tensor and normalize each channel with the given mean/std.
T = transforms.Compose([
    # Bicubic resampling; the enum replaces the deprecated integer code 3.
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

In [None]:
# get DINO features
inputs1 = T(image1).unsqueeze(0).to(device)
image_features1 = dino_model(inputs1).last_hidden_state

inputs2 = T(image2).unsqueeze(0).to(device)
image_features2 = dino_model(inputs2).last_hidden_state

In [None]:
# DINO score for one image pair: cosine similarity of the vectors at index
# [0, 0] of each hidden state (first batch item, first token position —
# presumably the [CLS] token; confirm against the model's token layout).
first_token_a = image_features1[0, 0]
first_token_b = image_features2[0, 0]
sim = F.cosine_similarity(first_token_a, first_token_b, dim=0)
print('Similarity:', sim.item())