In [None]:
# Clone the repositories
!git clone https://github.com/openai/CLIP
# Install the requirements
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 ninja==1.11.1.1
!pip install ftfy==6.1.3 regex==2023.12.25 tqdm==4.66.2
# Download the pre-trained models
!mkdir pretrained_models
!curl -L --output pretrained_models/ViT-B-32.pt 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt'

# Download dataset:
!gdown -O dataset.zip https://drive.google.com/uc?id=1JtRIbZDuZlBEA6vp870eW3ei0r32uYqF
# https://drive.google.com/file/d/1JtRIbZDuZlBEA6vp870eW3ei0r32uYqF/view?usp=sharing
# !gdown -O dataset2.zip https://drive.google.com/uc?id=1opIDLclS5g0GBG8xGwlTtYL2sa8YKxnV
!unzip -q dataset.zip

# Download stylegan2-db inference output:
!gdown -O stylegan2-db.zip https://drive.google.com/uc?id=1-KHWXHULA6LhKqmuhnx57ZaIOGT7rM0d
# https://drive.google.com/file/d/1-KHWXHULA6LhKqmuhnx57ZaIOGT7rM0d/view?usp=sharing
!unzip -q stylegan2-db.zip

# Download dalle-mini-vqgan inference output:
!gdown -O dalle-mini-vqgan.zip https://drive.google.com/uc?id=1yh5zYzs0Z4mS9Ra1vz02gsyjZKWYk_j7
# https://drive.google.com/file/d/1yh5zYzs0Z4mS9Ra1vz02gsyjZKWYk_j7/view?usp=sharing
!unzip -q dalle-mini-vqgan.zip

# Download dalle-mini-vqgan inference output:
!gdown -O dalle2.zip https://drive.google.com/uc?id=1RtYQzK6U41FZdbFN2T0wN4m2udH7yrNX
# https://drive.google.com/file/d/1RtYQzK6U41FZdbFN2T0wN4m2udH7yrNX/view?usp=sharing
!unzip -q dalle2.zip

# Download Stable Diffusion Inference output:
!gdown -O stable-diffusion.zip https://drive.google.com/uc?id=1qVu0aC_YFJKWFxV9_uqyXHsR84TuNPJz
#https://drive.google.com/file/d/1qVu0aC_YFJKWFxV9_uqyXHsR84TuNPJz/view?usp=sharing
!unzip -q stable-diffusion.zip

Cloning into 'CLIP'...
remote: Enumerating objects: 251, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 251 (delta 3), reused 2 (delta 0), pack-reused 243[K
Receiving objects: 100% (251/251), 8.93 MiB | 14.04 MiB/s, done.
Resolving deltas: 100% (127/127), done.
Collecting torch==2.1.2
  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.16.2
  Downloading torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.1.2
  Downloading torchaudio-2.1.2-cp310-cp310-manylinux1_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m100.5 MB/s[0m eta [

In [5]:
import torch
import CLIP.clip.clip as clip
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# CLIP ViT-B/32 model together with the image preprocessing callable returned by clip.load.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Read the caption table, forcing both columns to str so that the image names
# stay textual and can later be concatenated into file paths.
dtype_dict = dict(name=str, caption=str)
df = pd.read_csv(
    'dataset/image_captions_cleaned.txt',
    sep=",",
    header=0,
    dtype=dtype_dict,
)
print(f'Shape is: {df.shape}')

# Default number of caption/image pairs to score: one per row of the table.
max_iters = len(df)
df.head()

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 142MiB/s]


Shape is: (2048, 2)


Unnamed: 0,name,caption
0,1,a man with a black jacket
1,2,a young child with a very look on his face
2,3,a woman with a big smile on her face
3,4,a man in a white shirt
4,5,a man with a bandana on his head


In [6]:
def clip_score(image_folder, max_iter_override = None, scale_factor = 0.1):
    """Average sigmoid-squashed CLIP image/text score over the first rows of ``df``.

    For row ``idx`` of the module-level ``df``, column 0 supplies the image
    file stem (the file read is ``image_folder + stem + '.png'``) and column 1
    supplies the text prompt. Both are encoded with the module-level
    ``clip_model`` and compared with a dot product of the returned features.

    Args:
        image_folder: Path prefix that is concatenated directly with the file
            stem, so it must already end with a path separator.
        max_iter_override: Number of leading rows of ``df`` to score. When
            ``None``, the module-level ``max_iters`` is used.
        scale_factor: Multiplier applied to the raw score before the sigmoid;
            controls how sensitive the final score is to the raw value.

    Returns:
        The mean of the per-pair scores as a Python float.

    Raises:
        ValueError: If the number of pairs to score is not positive.
    """
    # Resolve the iteration count into a fresh local name. Assigning to
    # `max_iters` inside this function would make that name local and raise
    # UnboundLocalError whenever no override is supplied.
    n_pairs = max_iters if max_iter_override is None else max_iter_override
    if n_pairs <= 0:
        raise ValueError(f"number of image/caption pairs must be positive, got {n_pairs}")

    total_clip_score = 0.0

    # Pure inference: disable autograd so no graph is built for each forward pass.
    with torch.no_grad():
        for idx in tqdm(range(n_pairs)):
            prompt = df.iloc[idx, 1]
            photo = image_folder + df.iloc[idx, 0] + '.png'

            # Close the file handle as soon as the image has been preprocessed.
            with Image.open(photo) as img:
                image = preprocess(img).unsqueeze(0).to(device)
            text = clip.tokenize([prompt]).to(device)

            image_features = clip_model.encode_image(image)
            text_features = clip_model.encode_text(text)

            # Dot product of the features exactly as returned by the encoders.
            # No L2 normalization is applied here, so this is a raw similarity
            # rather than a true cosine similarity; it is kept unnormalized so
            # scores stay comparable with previously reported numbers.
            raw_similarity = (image_features @ text_features.T).diag().mean().item()

            # Squash the scaled raw score into (0, 1).
            similarity = torch.sigmoid(torch.tensor(raw_similarity * scale_factor)).item()
            total_clip_score += similarity

    return total_clip_score / n_pairs

In [None]:
# Reference run: score the dataset's own images against their captions,
# using every row of the caption table.
image_folder = 'dataset/images/'
average_clip_score = clip_score(image_folder=image_folder)
print(f"Average CLIP Score: {average_clip_score}")

  0%|          | 0/2048 [00:00<?, ?it/s]

Average CLIP Score: 0.9271204391843639


In [None]:
# Score the images produced by our StyleGANv2 model against the captions,
# using every row of the caption table.
image_folder = 'stylegan2-db/'
average_clip_score = clip_score(image_folder=image_folder)
print(f"Average CLIP Score: {average_clip_score}")

  0%|          | 0/2048 [00:00<?, ?it/s]

Average CLIP Score: 0.8936234570574015


In [None]:
# Score the images produced by the DALLE mini + VQGAN model against the
# captions, using every row of the caption table.
image_folder = 'dalle-mini-vqgan/'
average_clip_score = clip_score(image_folder=image_folder)
print(f"Average CLIP Score: {average_clip_score}")

  0%|          | 0/2048 [00:00<?, ?it/s]

Average CLIP Score: 0.9560539780941326


In [None]:
# Score the images produced by the OpenAI DALLE 2 model against the captions.
# Only the first 128 caption rows are scored for this model.
image_folder = 'dalle2/'
average_clip_score = clip_score(image_folder=image_folder, max_iter_override=128)
print(f"Average CLIP Score: {average_clip_score}")

  0%|          | 0/128 [00:00<?, ?it/s]

Average CLIP Score: 0.9446187829598784


In [7]:
# Score the images produced by the Stable Diffusion v1.4 model against the
# captions. Only the first 128 caption rows are scored for this model.
image_folder = 'stable-diffusion/'
average_clip_score = clip_score(image_folder=image_folder, max_iter_override=128)
print(f"Average CLIP Score: {average_clip_score}")

  0%|          | 0/128 [00:00<?, ?it/s]

Average CLIP Score: 0.9492019736208022
