<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/Clip_Visual_Language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q transformers

CLIP Visual-Language Lib:

In [5]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP checkpoint together with its matching processor.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)

# Fetch the example image (COCO val2017) that the captions are scored against.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

# NOTE(review): the first caption starts with a space — presumably unintentional; confirm.
captions = [" a photo of a cat", "a photo of a dog"]
inputs = processor(
    text=captions,
    images=image,
    return_tensors="pt",
    padding=True,
)

outputs = model(**inputs)
# Image-to-text similarity scores, one column per caption.
logits_per_image = outputs.logits_per_image
# Softmax over the caption axis turns the scores into label probabilities.
probs = logits_per_image.softmax(dim=1)
print("probability for the question of a photo of a cat or dog:", probs)

probability for the question of a photo of a cat or dog: tensor([[0.9949, 0.0051]], grad_fn=<SoftmaxBackward0>)


CLIP Visual-Language Custom:

In [6]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel

class CLIPModel_CUSTOM(nn.Module):
    """Re-assembles the pretrained CLIP towers into a hand-written forward pass.

    The text encoder, vision encoder, both projection layers and the learned
    logit scale are taken from the ``openai/clip-vit-base-patch32`` checkpoint.
    """

    def __init__(self):
        super().__init__()
        pretrained = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.config = pretrained.config
        self.text_model = pretrained.text_model
        self.vision_model = pretrained.vision_model
        self.visual_projection = pretrained.visual_projection
        self.text_projection = pretrained.text_projection
        self.logit_scale = pretrained.logit_scale

    def forward(
        self,
        input_ids=None,
        pixel_values=None,
        attention_mask=None,
        position_ids=None,
        return_loss=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Score every image against every text prompt.

        Args:
            input_ids: token ids forwarded to the text encoder.
            pixel_values: image tensor forwarded to the vision encoder.
            attention_mask: text attention mask forwarded to the text encoder.
            position_ids: optional position ids forwarded to the text encoder.
            return_loss: accepted but not used by this implementation.
            output_attentions: forwarded to both encoders; falls back to
                ``self.config.output_attentions`` when ``None``.
            output_hidden_states: forwarded to both encoders; falls back to
                ``self.config.output_hidden_states`` when ``None``.
            return_dict: forwarded unchanged to both encoders.

        Returns:
            A tuple ``(logits_per_image, logits_per_text)`` where
            ``logits_per_image`` is the transpose of ``logits_per_text``.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        # Options that both encoders receive identically.
        shared_options = dict(
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        vision_outputs = self.vision_model(pixel_values=pixel_values, **shared_options)
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **shared_options,
        )

        # Element 1 of each encoder output is what gets projected into the joint space.
        image_embeds = self.visual_projection(vision_outputs[1])
        text_embeds = self.text_projection(text_outputs[1])

        # Scale every embedding to unit L2 norm so the dot product is a cosine similarity.
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # Cosine similarities multiplied by the exponentiated logit scale.
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale.exp()
        return logits_per_text.t(), logits_per_text


# Build the hand-assembled CLIP model and the processor for the same checkpoint.
model = CLIPModel_CUSTOM()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Fetch the example image (COCO val2017).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

captions = [" a photo of a cat", "a photo of a dog"]
inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

# The custom forward takes the processor outputs as explicit keyword arguments.
logits_per_image, logits_per_text = model(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pixel_values=inputs["pixel_values"],
)
probs_per_image = logits_per_image.softmax(dim=1)
print("logits_per_image:", logits_per_image, "logits_per_text:", logits_per_text)

print("probability for the question of a photo of a cat or dog:", probs_per_image)


logits_per_image: tensor([[24.5701, 19.3049]], grad_fn=<TBackward0>) logits_per_text: tensor([[24.5701],
        [19.3049]], grad_fn=<MulBackward0>)
probability for the question of a photo of a cat or dog: tensor([[0.9949, 0.0051]], grad_fn=<SoftmaxBackward0>)


CLIP Language-Language Custom Scalar Probability:

In [76]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel

class CLIPModel_Text(nn.Module):
    """Text-only CLIP wrapper returning one similarity score for a prompt pair.

    Only the text encoder and text projection of the
    ``openai/clip-vit-base-patch32`` checkpoint take part in the forward pass.
    """

    def __init__(self):
        super().__init__()
        pretrained = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.config = pretrained.config
        self.text_model = pretrained.text_model
        self.text_projection = pretrained.text_projection
        # Kept on the module, but forward() does not apply it.
        self.logit_scale = pretrained.logit_scale

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        return_loss=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Return the cosine similarity between the first two prompts of the batch.

        Args:
            input_ids: token ids forwarded to the text encoder.
            attention_mask: attention mask forwarded to the text encoder.
            position_ids: optional position ids forwarded to the text encoder.
            return_loss: accepted but not used by this implementation.
            output_attentions: falls back to ``self.config.output_attentions`` when ``None``.
            output_hidden_states: falls back to ``self.config.output_hidden_states`` when ``None``.
            return_dict: forwarded unchanged to the text encoder.

        Returns:
            A 0-dim tensor: the dot product of the unit-normalized embeddings
            of rows 0 and 1. Any further rows in the batch are ignored.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        encoded = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project element 1 of the encoder output, then scale each row to unit L2 norm.
        projected = self.text_projection(encoded[1])
        unit_embeds = projected / projected.norm(p=2, dim=-1, keepdim=True)

        # Dot product of two unit vectors; the logit scale is deliberately not applied.
        first, second = unit_embeds[0], unit_embeds[1]
        return torch.matmul(first, second)


model = CLIPModel_Text()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Score each sentence pair in turn; every pair is tokenized and padded on its own.
for sentence_pair in (
    [" a photo of a cat", "a photo of a dog"],
    ["hello, how are you?", "a photo of a dog"],
):
    inputs = processor(text=sentence_pair, return_tensors="pt", padding=True)
    prob_per_pair = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
    print("prob per pair:", prob_per_pair)


prob per pair: tensor(0.9310, grad_fn=<DotBackward0>)
prob per pair: tensor(0.8267, grad_fn=<DotBackward0>)


CLIP Language-Language Custom Vector Probability or Feature Map Single Pair:

In [13]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel

class CLIPModel_Text(nn.Module):
    """Text-only CLIP wrapper returning a per-dimension similarity vector.

    Only the text encoder and text projection of the
    ``openai/clip-vit-base-patch32`` checkpoint take part in the forward pass.
    """

    def __init__(self):
        super().__init__()
        pretrained = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.config = pretrained.config
        self.text_model = pretrained.text_model
        self.text_projection = pretrained.text_projection
        # Kept on the module, but forward() does not apply it.
        self.logit_scale = pretrained.logit_scale

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        return_loss=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Return the element-wise product of the first two normalized embeddings.

        Args:
            input_ids: token ids forwarded to the text encoder.
            attention_mask: attention mask forwarded to the text encoder.
            position_ids: optional position ids forwarded to the text encoder.
            return_loss: accepted but not used by this implementation.
            output_attentions: falls back to ``self.config.output_attentions`` when ``None``.
            output_hidden_states: falls back to ``self.config.output_hidden_states`` when ``None``.
            return_dict: forwarded unchanged to the text encoder.

        Returns:
            A 1-D tensor with one entry per embedding dimension. Because both
            rows are unit-normalized, summing it gives their cosine similarity.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        encoded = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project element 1 of the encoder output, then scale each row to unit L2 norm.
        projected = self.text_projection(encoded[1])
        unit_embeds = projected / projected.norm(p=2, dim=-1, keepdim=True)

        # Keep the per-dimension contributions instead of reducing them to a scalar.
        first, second = unit_embeds[0], unit_embeds[1]
        return first * second


model = CLIPModel_Text()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# For each pair, report the shape, extrema and sum of the per-dimension products.
for sentence_pair in (
    ["hello, how are you?", "a photo of a dog"],
    [" a photo of a cat", "a photo of a dog"],
):
    inputs = processor(text=sentence_pair, return_tensors="pt", padding=True)
    prob_per_pair = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
    print(
        "prob per pair:",
        prob_per_pair.shape,
        prob_per_pair.max(),
        prob_per_pair.min(),
        prob_per_pair.sum(),
    )

prob per pair: torch.Size([512]) tensor(0.3729, grad_fn=<MaxBackward1>) tensor(-0.0018, grad_fn=<MinBackward1>) tensor(0.8267, grad_fn=<SumBackward0>)
prob per pair: torch.Size([512]) tensor(0.3537, grad_fn=<MaxBackward1>) tensor(-0.0006, grad_fn=<MinBackward1>) tensor(0.9310, grad_fn=<SumBackward0>)


CLIP Language-Language Custom Vector Probability or Feature Map Double Pair:

In [56]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel

class CLIPModel_Text(nn.Module):
    """Text-only CLIP wrapper returning similarity vectors for two prompt pairs.

    Only the text encoder and text projection of the
    ``openai/clip-vit-base-patch32`` checkpoint take part in the forward pass.
    """

    def __init__(self):
        super().__init__()
        pretrained = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.config = pretrained.config
        self.text_model = pretrained.text_model
        self.text_projection = pretrained.text_projection
        # Kept on the module, but forward() does not apply it.
        self.logit_scale = pretrained.logit_scale

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        return_loss=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Return element-wise products for the pairs (row 0, row 1) and (row 2, row 3).

        Args:
            input_ids: token ids forwarded to the text encoder; the batch must
                hold at least four prompts.
            attention_mask: attention mask forwarded to the text encoder.
            position_ids: optional position ids forwarded to the text encoder.
            return_loss: accepted but not used by this implementation.
            output_attentions: falls back to ``self.config.output_attentions`` when ``None``.
            output_hidden_states: falls back to ``self.config.output_hidden_states`` when ``None``.
            return_dict: forwarded unchanged to the text encoder.

        Returns:
            A tensor with two rows, one per pair, each holding the per-dimension
            products of the two unit-normalized embeddings.

        Side effects:
            Prints the shape of the normalized embedding matrix.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        encoded = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project element 1 of the encoder output, then scale each row to unit L2 norm.
        projected = self.text_projection(encoded[1])
        unit_embeds = projected / projected.norm(p=2, dim=-1, keepdim=True)
        print(unit_embeds.shape)

        # Rows (0, 1) form the first pair and rows (2, 3) the second.
        pair_products = [unit_embeds[start] * unit_embeds[start + 1] for start in (0, 2)]
        return torch.stack(pair_products)


model = CLIPModel_Text()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Two sentence pairs flattened into one batch: rows (0, 1) and rows (2, 3).
pair2 = ["a photo of a cat cow", "a photo of a dog"]
pairs_all = ["a photo of a dog", "a photo of a dog"] + pair2

pairs = processor(text=pairs_all, return_tensors="pt", padding=True)
prob_per_pair = model(
    input_ids=pairs["input_ids"],
    attention_mask=pairs["attention_mask"],
)
# Summing over the embedding axis gives one similarity value per pair.
print("Logit per pair:", prob_per_pair.shape, prob_per_pair.sum(dim=1))

torch.Size([4, 512])
Logit per pair: torch.Size([2, 512]) tensor([1.0000, 0.8543], grad_fn=<SumBackward1>)


CLIP Language-Language Custom Vector Probability or Feature Map Double Pair [Efficient]:

In [75]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel

class CLIPModel_Text(nn.Module):
    """Text-only CLIP wrapper returning similarity vectors for consecutive prompt pairs.

    Only the text encoder and text projection of the
    ``openai/clip-vit-base-patch32`` checkpoint take part in the forward pass.
    """

    def __init__(self):
        super().__init__()
        pretrained = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.config = pretrained.config
        self.text_model = pretrained.text_model
        self.text_projection = pretrained.text_projection
        # Kept on the module, but forward() does not apply it.
        self.logit_scale = pretrained.logit_scale

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        return_loss=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Return element-wise products for the pairs (0, 1), (2, 3), ... of the batch.

        Args:
            input_ids: token ids forwarded to the text encoder; prompts are
                expected to be laid out as consecutive pairs.
            attention_mask: attention mask forwarded to the text encoder.
            position_ids: optional position ids forwarded to the text encoder.
            return_loss: accepted but not used by this implementation.
            output_attentions: falls back to ``self.config.output_attentions`` when ``None``.
            output_hidden_states: falls back to ``self.config.output_hidden_states`` when ``None``.
            return_dict: forwarded unchanged to the text encoder.

        Returns:
            A tensor with one row per pair, each holding the per-dimension
            products of the two unit-normalized embeddings.

        Side effects:
            Prints the shape of the normalized embedding matrix.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        encoded = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project element 1 of the encoder output, then scale each row to unit L2 norm.
        projected = self.text_projection(encoded[1])
        unit_embeds = projected / projected.norm(p=2, dim=-1, keepdim=True)
        print(unit_embeds.shape)

        # Even rows are the first member of each pair, odd rows the second.
        row_count = len(unit_embeds)
        first_rows = torch.arange(0, row_count, 2)
        second_rows = torch.arange(1, row_count, 2)
        return unit_embeds[first_rows, :] * unit_embeds[second_rows, :]


model = CLIPModel_Text()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Sentence pairs laid out back to back: even rows are matched with the following odd rows.
pair2 = ["a photo of a cat cow", "a photo of a dog"]
pairs_all = ["a photo of a dog", "a photo of a dog", *pair2]

pairs = processor(text=pairs_all, return_tensors="pt", padding=True)
prob_per_pair = model(
    input_ids=pairs["input_ids"],
    attention_mask=pairs["attention_mask"],
)
# Summing over the embedding axis gives one similarity value per pair.
print("Logit per pair:", prob_per_pair.shape, prob_per_pair.sum(dim=1))

torch.Size([4, 512])
Logit per pair: torch.Size([2, 512]) tensor([1.0000, 0.8543], grad_fn=<SumBackward1>)


In [None]:
from transformers import AutoTokenizer, CLIPModel

# Text features straight from the full CLIP model, using only the tokenizer.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

captions = ["a photo of a cat", "a photo of a dog"]
inputs = tokenizer(captions, padding=True, return_tensors="pt")
text_features = model.get_text_features(**inputs)
print("text_features:", text_features.shape)

text_features: torch.Size([2, 512])


In [None]:
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPModel

# Image features straight from the full CLIP model; no text is involved.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

# Fetch the example image (COCO val2017).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

inputs = processor(images=image, return_tensors="pt")
image_features = model.get_image_features(**inputs)

print("image_features:", image_features.shape)

image_features: torch.Size([1, 512])


In [None]:
from transformers import AutoTokenizer, CLIPTextModel

# Stand-alone text encoder (CLIPTextModel): inspect its pooled output.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPTextModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

captions = ["a photo of a cat", "a photo of a dog"]
inputs = tokenizer(captions, padding=True, return_tensors="pt")

outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
# Pooled (EOS token) states.
pooled_output = outputs.pooler_output
print("pooled_output:", pooled_output.shape)

pooled_output: torch.Size([2, 512])


In [None]:
from transformers import AutoTokenizer, CLIPTextModelWithProjection

# Text encoder with its projection head (CLIPTextModelWithProjection): inspect text_embeds.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPTextModelWithProjection.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

captions = ["a photo of a cat", "a photo of a dog"]
inputs = tokenizer(captions, padding=True, return_tensors="pt")

outputs = model(**inputs)
text_embeds = outputs.text_embeds
print("text_embeds", text_embeds.shape)

text_embeds torch.Size([2, 512])
