Getting CLIP to run

In [42]:
from PIL import Image
import requests
import torch

from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP checkpoint and its matching pre-processor.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Forward slash on purpose: "data\coco_285.png" contains the invalid escape
# sequence "\c" (a warning in recent Python versions) and only resolves on
# Windows. "data/..." works on every platform and matches the other cells.
path = "data/coco_285.png"
image = Image.open(path)

# One image scored against three candidate captions.
inputs = processor(text=["a photo of a cat", "a photo of a dog", "A big burly grizzly bear is show with grass in the background."], images=image, return_tensors="pt", padding=True)

# Inference only, so skip gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # normalise the scores along dim 1

In [50]:
# Download the COCO images referenced by the first rows of
# data/algonauts_2023_caption_data.csv into data/cocotest.
#
# The first CSV column is used as the COCO image id. The image file on
# images.cocodataset.org is named after that id zero-padded to 12 digits,
# and is looked up in the train2017 split first, then in val2017.
# Files are saved locally as data/cocotest/<unpadded id>.jpg.

import os
from io import BytesIO

import pandas as pd
import requests
from PIL import Image

N_IMAGES = 100                             # number of CSV rows to fetch images for
OUT_DIR = "data/cocotest"                  # local download folder
COCO_SPLITS = ("train2017", "val2017")     # tried in this order
REQUEST_TIMEOUT_S = 30                     # do not hang forever on a stalled connection

# read data
df = pd.read_csv("data/algonauts_2023_caption_data.csv")

# download images
os.makedirs(OUT_DIR, exist_ok=True)

# min(...) guards against a CSV with fewer than N_IMAGES rows.
for i in range(min(N_IMAGES, len(df))):
    image_id = df.iloc[i, 0]
    # prepend 0s to the image id to have a 12 digit number
    padded_id = str(image_id).zfill(12)
    print(padded_id)

    out_path = f"{OUT_DIR}/{image_id}.jpg"
    if os.path.exists(out_path):
        # Already downloaded by an earlier run; do not hit the network again.
        continue

    last_error = None
    for split in COCO_SPLITS:
        url = f"http://images.cocodataset.org/{split}/{padded_id}.jpg"
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
            # A missing image comes back as an HTTP error; fail here instead
            # of handing an error page to PIL.
            response.raise_for_status()
            with Image.open(BytesIO(response.content)) as img:
                img.save(out_path)
            break
        except (requests.RequestException, OSError) as err:
            # Network/HTTP failure or undecodable image data: remember the
            # cause and fall through to the next split.
            last_error = err
    else:
        # No split delivered the image. Stop with an explicit message, as the
        # original cell also stopped when the fallback download failed.
        raise RuntimeError(
            f"could not download COCO image {padded_id} from any of {COCO_SPLITS}"
        ) from last_error

000000532481
000000245764
000000385029
000000311303
000000393226
000000475150
000000554328
000000438269
000000008211
000000327701
000000376856
000000221213
000000372349
000000024610
000000558114
000000213033
000000466986
000000213035
000000368684
000000188465
000000333069
000000475191
000000499768
000000076468
000000131131
000000275727
000000499775
000000254016
000000393282
000000106563
000000376900
000000120572
000000507975
000000023023
000000241677
000000262227
000000008277
000000324614
000000491613
000000532575
000000090208
000000311394
000000490171
000000327780
000000290833
000000522940
000000029984
000000163951
000000360564
000000049269
000000032887
000000434297
000000450686
000000032901
000000540932
000000565391
000000335800
000000196759
000000180383
000000572303
000000082085
000000524456
000000090284
000000188592
000000278705
000000394611
000000336053
000000114871
000000537270
000000128372
000000286907
000000286908
000000296231
000000508101
000000450758
000000065736
000000131273

In [57]:
# Score every image in data/cocotest against the captions of the first 100
# CSV rows with CLIP. Relies on `df`, `processor` and `model` from earlier cells.
# `probs[i]` holds the softmax-normalised scores of images[i] over `captions`.

import os

import torch
from PIL import Image

IMAGE_DIR = "data/cocotest"
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# get the text descriptions of the first 100 rows of df (column position 3)
captions = df.iloc[:100, 3].tolist()

# load all images from the cocotest folder and save them in a list
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   image order (and therefore the row order of `probs`) reproducible.
#   NOTE: this is file-name order, not CSV row order.
# - extension filter: skip stray non-image files instead of crashing on them.
# - convert("RGB") reads the pixel data into a new image, so the `with`
#   block can close the file handle straight away.
images = []
for filename in sorted(os.listdir(IMAGE_DIR)):
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue
    with Image.open(os.path.join(IMAGE_DIR, filename)) as img:
        images.append(img.convert("RGB"))

# use the captions and images as input for the CLIP model
# truncation=True keeps an over-long caption from exceeding the text
# encoder's maximum sequence length.
inputs = processor(text=captions, images=images, return_tensors="pt", padding=True, truncation=True)

# Inference only: no_grad avoids keeping the autograd graph for the batch.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)




In [None]:

# Show each image followed by its highest-scoring captions.
# Relies on `images`, `captions` and `probs` from the scoring cell.
import matplotlib.pyplot as plt
import numpy as np
import torch

TOP_K = 5
# topk raises if k exceeds the number of candidates, so clamp it.
top_k = min(TOP_K, len(captions))

# Iterate over the images that were actually loaded instead of a hard-coded
# range(100), which raises IndexError when fewer images are present.
for i, img in enumerate(images):
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(img)
    ax.axis("off")
    plt.show()
    plt.close(fig)  # release the figure; many figures are created in this loop

    print("Top predictions:")
    values, indices = torch.topk(probs[i], top_k)
    # tolist() yields plain Python numbers: clean list indexing and readable
    # probabilities instead of tensor reprs.
    for value, index in zip(values.tolist(), indices.tolist()):
        print(f"{captions[index]} with a probability of {value:.4f}")

In [6]:
from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPVisionConfig

# Default CLIPConfig (openai/clip-vit-base-patch32 style configuration).
configuration = CLIPConfig()

# Build a CLIPModel from that configuration: random weights, not the
# pretrained checkpoint. Note that this rebinds the notebook-level `model`.
model = CLIPModel(configuration)

# Read the configuration back from the model.
configuration = model.config

# Alternative construction: compose a CLIPConfig from separate text and
# vision configurations.
config_text, config_vision = CLIPTextConfig(), CLIPVisionConfig()
config = CLIPConfig.from_text_vision_configs(config_text, config_vision)

In [None]:
# Inspect the configuration of whatever `model` currently refers to.
# As the last expression of the cell, the notebook displays it.
model.config

In [37]:
from transformers import AutoTokenizer, CLIPTextModel

# Text-only half of the CLIP checkpoint plus its tokenizer.
# Note that this rebinds the notebook-level `model`.
model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
pooled_output = outputs.pooler_output

# Only the last expression of a cell is displayed, so the shapes are printed
# explicitly; as a bare mid-cell expression they were silently discarded.
print(last_hidden_state.shape, pooled_output.shape)

# check the tokenized inputs (displayed as the cell result)
inputs


Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.2.self_attn.out_proj.bias', 'vision_model.encoder.layers.2.mlp.fc2.weight', 'visual_projection.weight', 'vision_model.encoder.layers.0.self_attn.k_proj.bias', 'vision_model.encoder.layers.6.self_attn.out_proj.weight', 'vision_model.encoder.layers.8.mlp.fc2.weight', 'vision_model.encoder.layers.7.self_attn.v_proj.bias', 'vision_model.encoder.layers.8.layer_norm2.weight', 'vision_model.encoder.layers.1.self_attn.v_proj.bias', 'vision_model.encoder.layers.1.self_attn.q_proj.bias', 'vision_model.encoder.layers.6.self_attn.q_proj.bias', 'vision_model.encoder.layers.6.self_attn.q_proj.weight', 'vision_model.encoder.layers.7.self_attn.v_proj.weight', 'vision_model.encoder.layers.6.self_attn.k_proj.weight', 'vision_model.encoder.layers.11.self_attn.k_proj.weight', 'vision_model.encoder.layers.11.mlp.fc2.weight', 'vision_model.encoder.layers.4.layer_n

{'input_ids': tensor([[49406,   320,  1125,   539,   320,  2368, 49407],
        [49406,   320,  1125,   539,   320,  1929, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}

In [33]:
# Display the current contents of `inputs` (depends on which cell assigned it last).
inputs

{'input_ids': tensor([[49406,   320,  1125,   539,   320,  2368, 49407],
        [49406,   320,  1125,   539,   320,  1929, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}

In [41]:
# NOTE(review): `summary` is imported but not used in this cell; confirm
# whether a later cell needs it before removing the import.
from torchsummary import summary
# Print the module structure of whatever `model` currently refers to.
print(model)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0): CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05, ele