In [19]:
from transformers import CLIPModel, CLIPTokenizerFast, CLIPImageProcessor, CLIPProcessor
from torchvision.transforms import ToPILImage
from torch.utils.data import DataLoader
from torchvision.io import read_image
from torch.nn import CrossEntropyLoss
from dataset import TextImagePairSet
from pathlib import Path
from torchvision import transforms
from torchvision.io import read_video
from PIL import Image
from torch.optim import AdamW
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch

In [36]:
# Every component below is loaded from this single Hugging Face checkpoint id.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip: CLIPModel = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
tokenizer: CLIPTokenizerFast = CLIPTokenizerFast.from_pretrained(CLIP_CHECKPOINT)
vision_processor: CLIPImageProcessor = CLIPImageProcessor.from_pretrained(CLIP_CHECKPOINT)
processor: CLIPProcessor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [3]:
# Freeze every CLIP weight: turn off gradient tracking parameter by parameter.
for weight in clip.parameters():
    weight.requires_grad_(False)

In [107]:
# Display the module tree of the loaded CLIP model.
clip

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [None]:
# Print the number of scalar weights in each projection head: vision first, then text.
for head in (clip.visual_projection, clip.text_projection):
    n_weights = sum(p.numel() for p in head.parameters())
    print(n_weights)

In [4]:
# One dataset per split; each is built from the split's CSV file and its directory.
trainset, devset, testset = (
    TextImagePairSet("./data/train/data.csv", "./data/train"),
    TextImagePairSet("./data/dev/data.csv", "./data/dev"),
    TextImagePairSet("./data/test/data.csv", "./data/test"),
)

# Wrap every split in a DataLoader that yields one sample per batch.
trainLoader, devLoader, testLoader = (
    DataLoader(split, batch_size=1) for split in (trainset, devset, testset)
)

In [5]:
# Objective, linear head and optimizer; only the head's weights are handed to AdamW.
loss_fn = CrossEntropyLoss()
fc = torch.nn.Linear(768 + 512, 1)
optimizer = AdamW(params=fc.parameters(), lr=1e-5)

In [None]:
# Scratch globals: a placeholder `x` and an empty list `l`.
x, l = None, []

In [None]:

def train(trainLoader, clip, fc, optimizer, loss_fn):
    """Run one optimisation pass over ``trainLoader`` and return the summed loss.

    Each batch must unpack as ``(X, y, label)``. ``X`` is passed through the
    notebook-level ``vision_processor`` and ``label`` through the notebook-level
    ``tokenizer``; both must be defined before this function is called.

    Args:
        trainLoader: Iterable yielding ``(X, y, label)`` batches.
        clip: Model called with the processed image and text inputs; its output
            must expose ``'logits_per_image'``.
        fc: Head module. It is only switched to train mode here; it is not
            applied to the CLIP outputs.
        optimizer: Optimizer whose gradients are cleared and stepped once per batch.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.

    Returns:
        The sum of ``loss.item()`` over all batches (``0`` for an empty loader).
    """
    epoch_loss = 0
    fc.train()
    for X, y, label in trainLoader:
        pixel_inputs = vision_processor(X, return_tensors="pt")
        tok_label = tokenizer(label, padding=True, return_tensors="pt")
        # The processed pixels are deliberately NOT stored anywhere: collecting every
        # batch in a global debug list keeps all tensors alive for the whole run and
        # makes the function depend on a scratch cell having been executed first.
        optimizer.zero_grad()
        clip_outputs = clip(**pixel_inputs, **tok_label)

        # NOTE(review): softmax is applied before loss_fn. If loss_fn is
        # CrossEntropyLoss (which normalises its input itself) this double-normalises,
        # and dim=0 normalises across images rather than across captions - confirm.
        y_hat = torch.nn.Softmax(dim=0)(clip_outputs['logits_per_image'])

        # NOTE(review): `fc` never takes part in this forward pass, so an optimizer
        # built only from fc.parameters() receives no gradient from this loss, and if
        # every CLIP weight is frozen backward() has nothing to differentiate - confirm
        # the intended architecture.
        loss = loss_fn(y_hat, torch.Tensor([[y]]))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    return epoch_loss

In [None]:
def validate(devLoader, clip, fc, loss_fn):
    """Evaluate on ``devLoader`` without gradient tracking and return the summed loss.

    Each batch must unpack as ``(X, y, label)``. ``X`` is passed through the
    notebook-level ``vision_processor`` and ``label`` through the notebook-level
    ``tokenizer``; both must be defined before this function is called.

    Args:
        devLoader: Iterable yielding ``(X, y, label)`` batches.
        clip: Model called with the processed image and text inputs; its output
            must expose ``'logits_per_image'``.
        fc: Head module. It is only switched to eval mode here; it is not applied
            to the CLIP outputs.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.

    Returns:
        The sum of ``loss.item()`` over all batches (``0`` for an empty loader).
    """
    epoch_loss = 0
    fc.eval()
    with torch.no_grad():
        for X, y, label in devLoader:
            pixel_inputs = vision_processor(X, return_tensors="pt")
            tok_label = tokenizer(label, padding=True, return_tensors="pt")
            clip_outputs = clip(**pixel_inputs, **tok_label)
            # NOTE(review): softmax before loss_fn double-normalises if loss_fn is
            # CrossEntropyLoss - confirm this is intended.
            y_hat = torch.nn.Softmax(dim=0)(clip_outputs['logits_per_image'])
            loss = loss_fn(y_hat, y)
            # No backward pass: evaluation must not compute gradients, and calling
            # backward() on a tensor produced under no_grad() raises RuntimeError.
            epoch_loss += loss.item()
    return epoch_loss

In [None]:
def test(testLoader, clip, fc, loss_fn):
    epoch_loss = 0
    fc.eval()
    with torch.no_grad():
        for X, y, label in testLoader:
            X, tok_label = vision_processor(X, return_tensors="pt"), tokenizer(label, padding=True, return_tensors="pt")
            clip_outputs = clip(**X, **tok_label)
            y_hat = torch.nn.Softmax(dim=0)(clip_outputs['logits_per_image'])
            
            loss = loss_fn(y_hat, y)
            loss.backward()            

            epoch_loss += loss.item()
    return epoch_loss       


In [None]:
epochs = 10

# One slot per epoch. The lists are sized from `epochs` so that changing the epoch
# count can never index past their end.
train_loss = [0] * epochs
dev_loss = [0] * epochs
# Not used by the loop below; still defined in case other cells rely on it.
cos_sim_fn = torch.nn.CosineSimilarity(dim=0)

# Each epoch: one training pass, then one pass over the dev split.
for i in tqdm(range(epochs)):
    print("antes")
    train_loss[i] = train(trainLoader, clip, fc, optimizer, loss_fn)
    print("depois")
    dev_loss[i] = validate(devLoader, clip, fc, loss_fn)

In [None]:
# Check the image tensor's shape, then tokenize a throwaway caption for a manual forward pass.
# NOTE(review): `le_image` is not defined in any visible cell - confirm where it comes from.
print(le_image.shape)
es_una_label = tokenizer(
    ["texto maluco que eu nao sei"],
    return_tensors="pt",
    padding=True,
)

In [None]:
# Manual forward pass: the image tensor goes in as `pixel_values`, the tokenized caption as the rest.
clip_outputs = clip(pixel_values=le_image, **es_una_label)

In [None]:
# List the fields returned by the CLIP forward pass.
clip_outputs.keys()
# Disabled experiment, kept for reference:
#y_hat = torch.nn.Softmax(dim=0)(clip_outputs['logits_per_image'])

In [None]:
# Largest value in the vision encoder's last hidden state.
clip_outputs['vision_model_output']['last_hidden_state'].max()

In [None]:
# Print the 'logits_per_image' field of the forward-pass output.
print(clip_outputs['logits_per_image'])

In [None]:
# Show the shape of the 'logits_per_image' tensor.
clip_outputs['logits_per_image'].shape

In [109]:
# Load one training image from disk with torchvision's read_image.
x = read_image("./data/train/1.png")

In [110]:
# Encode one image/caption pair two ways: through the combined processor (`inputs`), and
# through the separate image processor and tokenizer, whose outputs are fed to CLIP.
libras_caption = "mulher em um fundo verde fazendo sinais de libras"
inputs = processor(text=libras_caption, images=x, return_tensors="pt", padding=True)
x_pross = vision_processor(x, return_tensors="pt")
tok_y = tokenizer([libras_caption], padding=True, return_tensors="pt")
outputs = clip(**x_pross, **tok_y)

In [111]:
# List the fields returned by the CLIP forward pass on the single example.
outputs.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

In [44]:
from torch.optim import AdamW
import torch.optim
import torch

In [78]:
# Optimizer settings. "model" holds the dotted path of the optimizer class; the other
# keys look like constructor keyword arguments - TODO confirm against the consumer.
configs = {"model": "torch.optim.AdamW", "lr": 1e-4, "betas": (0.9, 0.99)}

In [79]:
# Take the "model" entry out of `configs` and keep its value. `configs` is mutated, so
# re-running this cell without rebuilding `configs` raises KeyError.
model = configs.pop("model")

In [63]:
# Display the value popped from the config.
model

'torch.optim.AdamW'

In [69]:
#exec("import torch")
# Resolve the dotted path stored in `model` to the object it names.
# NOTE(review): eval executes arbitrary code; this is only acceptable while `model`
# comes from a trusted, hard-coded config - never feed it external input.
adam = eval(model)
adam

torch.optim.adamw.AdamW

In [83]:
# Build AdamW (resolved through eval) over the CLIP parameters; `**{}` expands to no
# extra keyword arguments, so only `params` is supplied.
eval('torch.optim.AdamW')(**{}, params=clip.parameters())

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [86]:
# Build AdamW (resolved through eval) with the CLIP parameters passed positionally.
eval('torch.optim.AdamW')(clip.parameters()) 

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)