In [1]:
# Imports used to load and evaluate the trained model

import torch
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from datasets import load_from_disk
from transformers import CLIPTokenizer, CLIPProcessor, AutoTokenizer
from transformers import AutoProcessor, FlavaModel
from PIL import Image
import base64
import io
import numpy as np

from tqdm.notebook import tqdm
import pandas as pd
from time import time

In [2]:
%load_ext autoreload
%autoreload 2

In [5]:
from models.clip import CLIPClassifier, CLIPProcessDataset

In [6]:

# model = torch.load('clip_entire_model_added_sigmoid_gradient_clip.pt')
# Deserialize the saved model object onto the CUDA device, then put it in
# evaluation mode. `model.eval()` stays the last expression so the notebook
# echoes the module tree.
checkpoint_path = 'model_output/clip_entire_model_added_sigmoid_gradclip_laion-CLIP-ViT-B-32-laion2B-s34B-b79K-cross.pt'
model = torch.load(
    checkpoint_path,
    map_location=torch.device('cuda'),
)
model.eval()


CLIPClassifier(
  (image_encoder): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(50, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0): CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Li

In [7]:
# class ImageCaptioningDataset(Dataset):
#     def __init__(self, dataset):

#         self.dataset = dataset
#         self.image_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
#         self.text_processor = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')

#     def __len__(self):
#         return len(self.dataset)

#     def __getitem__(self, idx):
#         item = self.dataset[idx]
#         # import pdb; pdb.set_trace()
#         # encoding = self.processor(images=base64str_to_PILobj(item["image"]), text=item["text"], padding="max_length", return_tensors="pt")
#         # # remove batch dimension
#         # encoding = {k:v.squeeze() for k,v in encoding.items()}
#         # # import pdb; pdb.set_trace()
#         # encoding['label'] = item['label']
#         # encoding['image'] = item["image"]
#         # return encoding
#         pixel_values = self.image_processor(images=base64str_to_PILobj(item["image"]).convert("RGB"),
#                                             return_tensors="pt")['pixel_values']
#         # caption_output = self.text_processor(item["caption"], 
#         #                                      padding=True,
#         #                                      return_tensors="pt",
#         #                                      truncation=True)
#         text_output = self.text_processor(item['text'],
#                                           padding='max_length', 
#                                           return_tensors="pt", 
#                                           truncation=True)
#         # pdb.set_trace()
#         # print(idx, pixel_values.shape)
#         # import pdb; pdb.set_trace()
#         label = torch.LongTensor([item['label']])
#         # import pdb; pdb.set_trace()
#         return {
#             'pixel_values': pixel_values,
#             'input_ids': text_output['input_ids'],
#             'attention_mask': text_output['attention_mask'],
#             'labels': label,
#             # 'input_ids_caption': caption_output['input_ids'],
#             # 'attention_mask_caption': caption_output['attention_mask_caption'],
#             'idx_memes': item['id'],
#             'image': item['image']
#         }

# def base64str_to_PILobj(base64_string):
#     '''
#     Args
#     - base64_string (str): base64-encoded string representing an image

#     Output
#     - PIL object (use .show() to display)
#     '''
#     image_data = base64.b64decode(base64_string)
#     img = Image.open(io.BytesIO(image_data))
#     #img.show()
#     return img


In [8]:

batch_size = 32

combined = load_from_disk('./processed_data/combined_hateful_memes_dataset')


def _build_split(split_name):
    """Return (raw split, CLIPProcessDataset wrapper, shuffled DataLoader) for one split of ``combined``."""
    raw_split = combined[split_name]
    wrapped = CLIPProcessDataset(raw_split)
    loader = DataLoader(wrapped, batch_size=batch_size, shuffle=True)
    return raw_split, wrapped, loader


# Same nine notebook-level names as before, built one split at a time.
train_data, train_dataset, train_loader = _build_split('train')
dev_seen_data, dev_seen_dataset, dev_seen_loader = _build_split('dev_seen')
dev_unseen_data, dev_unseen_dataset, dev_unseen_loader = _build_split('dev_unseen')


In [9]:
from tqdm.notebook import tqdm
import pandas as pd
device='cuda'

def get_result_df(data_loader, sample_n=None, type_='train'):
    """Run the notebook-level ``model`` over ``data_loader`` and tabulate its predictions.

    Relies on the globals ``model`` and ``device`` defined in other cells.

    Args:
        data_loader: iterable of batches. Each batch is passed whole to
            ``model(batch, device=device)`` and must expose the keys
            ``'idx_memes'``, ``'labels'`` and ``'image'``.
        sample_n: optional cap on the number of rows returned. Iteration stops
            after the first batch that reaches the cap and the result is
            trimmed to exactly ``sample_n`` rows. ``None`` processes every batch.
        type_: value written to the ``'type'`` column of every row.

    Returns:
        pandas.DataFrame with one row per example and the columns ``pred``
        (1 when the score is above 0.5, else 0), ``pred_score`` (the raw model
        output), ``idx``, ``img``, ``labels`` and ``type``.
    """
    columns = {
        'pred': [],
        'pred_score': [],
        'idx': [],
        'img': [],
        'labels': [],
        'type': []
    }

    with torch.no_grad():
        for batch in tqdm(data_loader):

            output = model(batch, device=device)
            predicted = torch.as_tensor((output - 0.5) > 0, dtype=torch.int32)

            columns['pred'].extend(predicted.detach().cpu().numpy().reshape(-1).tolist())
            columns['pred_score'].extend(output.detach().cpu().numpy().reshape(-1).tolist())
            columns['idx'].extend(batch['idx_memes'])
            columns['labels'].extend(batch['labels'].detach().cpu().numpy().reshape(-1).tolist())
            columns['img'].extend(batch['image'])
            columns['type'].extend([type_] * len(batch['image']))

            if sample_n is not None and len(columns['idx']) >= sample_n:
                break

    result = pd.DataFrame(columns)
    # The loop can only stop on a batch boundary, so the last batch may push the
    # row count past the cap; trim so callers get at most `sample_n` rows.
    if sample_n is not None and len(result) > sample_n >= 0:
        result = result.iloc[:sample_n]
    return result


In [10]:
# Score every batch of the dev_unseen loader; each row is tagged type='dev_unseen'.
df = get_result_df(dev_unseen_loader, type_='dev_unseen')
# Bare last expression so the notebook renders the frame.
df

  0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,pred,pred_score,idx,img,labels,type
0,1,0.585058,75639,iVBORw0KGgoAAAANSUhEUgAAAiYAAAFmCAIAAADWHMbgAA...,1,dev_unseen
1,0,0.001186,78134,iVBORw0KGgoAAAANSUhEUgAAAsoAAAMgCAIAAAC1XNNHAA...,0,dev_unseen
2,0,0.340448,46971,iVBORw0KGgoAAAANSUhEUgAAARQAAAGQCAIAAADgMwjgAA...,1,dev_unseen
3,1,0.853451,27635,iVBORw0KGgoAAAANSUhEUgAAATkAAAGQCAIAAABakICAAA...,1,dev_unseen
4,0,0.454527,19530,iVBORw0KGgoAAAANSUhEUgAAAPAAAAGQCAIAAAAftHorAA...,1,dev_unseen
...,...,...,...,...,...,...
535,0,0.000075,96284,iVBORw0KGgoAAAANSUhEUgAAAiYAAAFzCAIAAACFB1XIAA...,1,dev_unseen
536,0,0.000484,57369,iVBORw0KGgoAAAANSUhEUgAAAZAAAAGQCAIAAAAP3aGbAA...,0,dev_unseen
537,0,0.012439,43810,iVBORw0KGgoAAAANSUhEUgAAAzkAAAI6CAIAAACLkxFUAA...,1,dev_unseen
538,0,0.000270,89642,iVBORw0KGgoAAAANSUhEUgAAAzkAAAIlCAIAAAB5E6EaAA...,1,dev_unseen


In [11]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the raw model scores in `df` against its true labels.
roc_auc_score(df['labels'], df['pred_score'])

0.7450588235294117

In [6]:
device = 'cuda'  # 'cuda' if torch.cuda.is_available() else 'cpu'
model.eval()
model = model.to(device)

accuracy = {}  # split name -> fraction of correctly classified examples
# Column name -> per-example values; every split evaluated below appends to it.
fina_res_dict = {
    'pred': [],
    'pred_score': [],
    'idx': [],
    'img': [],
    'labels': [],
    'type': []
}
gen_df = True  # when False only the accuracy counters are updated


def _evaluate_split(data_loader, split_name):
    """Score one split with the global ``model`` and count correct predictions.

    Reads the notebook globals ``model``, ``device`` and ``gen_df``. When
    ``gen_df`` is true, the per-example results are appended to the global
    ``fina_res_dict`` with ``split_name`` in the ``'type'`` column.

    Args:
        data_loader: iterable of batches exposing ``'labels'``, ``'idx_memes'``
            and ``'image'``; each batch is passed whole to the model.
        split_name: label recorded in the ``'type'`` column for this split.

    Returns:
        Tuple ``(correct, seen)``: number of predictions equal to the label and
        number of examples processed.
    """
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            labels = batch['labels'].to(device).reshape(-1)

            output = model(batch, device=device)
            predicted = torch.as_tensor((output - 0.5) > 0, dtype=torch.int32)

            seen += labels.size(0)
            # Compare flattened tensors: an (N,) prediction against an (N, 1)
            # label would broadcast to N x N and inflate the count.
            correct += (predicted.reshape(-1) == labels).sum().item()

            if gen_df:
                fina_res_dict['pred'].extend(predicted.detach().cpu().numpy().reshape(-1).tolist())
                fina_res_dict['pred_score'].extend(output.detach().cpu().numpy().reshape(-1).tolist())
                fina_res_dict['idx'].extend(batch['idx_memes'])
                fina_res_dict['labels'].extend(batch['labels'].detach().cpu().numpy().reshape(-1).tolist())
                fina_res_dict['img'].extend(batch['image'])
                fina_res_dict['type'].extend([split_name] * len(batch['image']))
    return correct, seen


t0 = time()
correct_normal, total = _evaluate_split(dev_seen_loader, 'dev_seen')
accuracy['dev_seen'] = correct_normal/total
print(f"dev_seen completed in {(time()-t0):.2f}s. total count={total}. Accuracy={accuracy['dev_seen']}")

t0 = time()
correct_normal, total = _evaluate_split(dev_unseen_loader, 'dev_unseen')
accuracy['dev_unseen'] = correct_normal/total
print(f"dev_unseen completed {(time()-t0):.2f}s. total count={total}. Accuracy={accuracy['dev_unseen']}")

t0 = time()
correct_normal, total = _evaluate_split(train_loader, 'train')
accuracy['train'] = correct_normal/total
print(f"train completed {(time()-t0):.2f}s. total count={total}. Accuracy={accuracy['train']}")


  0%|          | 0/16 [00:00<?, ?it/s]

dev_seen completed in 19.24s. total count=500. Accuracy=0.652


  0%|          | 0/17 [00:00<?, ?it/s]

dev_unseen completed 20.18s. total count=540. Accuracy=0.7092592592592593


  0%|          | 0/266 [00:00<?, ?it/s]

train completed 319.57s. total count=8500. Accuracy=0.8874117647058823


In [7]:

# Turn the accumulated per-example result lists into one frame (one column per key).
df = pd.DataFrame(fina_res_dict)
# Bare last expression so the notebook renders the frame.
df


Unnamed: 0,pred,pred_score,idx,img,labels,type
0,0,0.062818,68530,iVBORw0KGgoAAAANSUhEUgAAAhMAAAMgCAIAAAAWWHfEAA...,1,dev_seen
1,0,0.301817,64510,iVBORw0KGgoAAAANSUhEUgAAAiYAAAFyCAIAAABOW4ZtAA...,0,dev_seen
2,0,0.032403,65832,iVBORw0KGgoAAAANSUhEUgAAAUAAAAMgCAIAAADdmSyOAA...,1,dev_seen
3,0,0.074875,46971,iVBORw0KGgoAAAANSUhEUgAAARQAAAGQCAIAAADgMwjgAA...,1,dev_seen
4,0,0.013474,03798,iVBORw0KGgoAAAANSUhEUgAAAzkAAAIVCAIAAAB9aaa3AA...,0,dev_seen
...,...,...,...,...,...,...
9535,0,0.028382,13469,iVBORw0KGgoAAAANSUhEUgAAAzkAAAIlCAIAAAB5E6EaAA...,0,train
9536,1,0.945997,72489,iVBORw0KGgoAAAANSUhEUgAAAiYAAAFuCAIAAAA6T0SNAA...,1,train
9537,1,0.936970,13875,iVBORw0KGgoAAAANSUhEUgAAAiYAAAFuCAIAAAA6T0SNAA...,1,train
9538,0,0.025401,34520,iVBORw0KGgoAAAANSUhEUgAAAhQAAAMgCAIAAAD0hGy9AA...,0,train


In [10]:

from sklearn.metrics import roc_auc_score

# clip_entire_model_added_sigmoid_gradclip.pt
# ROC AUC of the raw scores, reported separately for each value of the 'type' column.
for type_ in ('train', 'dev_seen', 'dev_unseen'):
    temp_df = df.loc[df['type'] == type_]
    auc = roc_auc_score(
        temp_df['labels'],
        temp_df['pred_score'],
    )
    print('%s AUC: %.3f' % (type_, auc))

# Echo the per-split accuracy dict as the cell's displayed value.
accuracy


train AUC: 0.948
dev_seen AUC: 0.760
dev_unseen AUC: 0.750


{'dev_seen': 0.652,
 'dev_unseen': 0.7092592592592593,
 'train': 0.8874117647058823}

In [20]:

from sklearn.metrics import roc_auc_score

# clip_entire_model_added_sigmoid_gradclip_maplayers_5.pt
# ROC AUC of the raw scores, reported separately for each value of the 'type' column.
for type_ in ('train', 'dev_seen', 'dev_unseen'):
    temp_df = df.loc[df['type'] == type_]
    auc = roc_auc_score(
        temp_df['labels'],
        temp_df['pred_score'],
    )
    print('%s AUC: %.3f' % (type_, auc))


train AUC: 0.898
dev_seen AUC: 0.707
dev_unseen AUC: 0.682


In [14]:

from sklearn.metrics import roc_auc_score

# ROC AUC of the raw scores, reported separately for each value of the 'type' column.
for type_ in ('train', 'dev_seen', 'dev_unseen'):
    temp_df = df.loc[df['type'] == type_]
    auc = roc_auc_score(
        temp_df['labels'],
        temp_df['pred_score'],
    )
    print('%s AUC: %.3f' % (type_, auc))


train AUC: 0.897
dev_seen AUC: 0.728
dev_unseen AUC: 0.704


In [23]:
# Pull the 'label' column of the training split.
all_labels = train_data['label']

from collections import Counter
# Tally how many training examples carry each label value.
Counter(all_labels)

Counter({0: 5481, 1: 3019})

In [30]:
# NOTE(review): this cell fails with "IndexError: index out of bounds" (see the recorded output).
# The label tally for train_data totals 8500 examples, so 75210 is presumably a meme id
# rather than a row position — confirm, and look the row up by id instead.
dir(train_data.data[75210])

IndexError: index out of bounds

In [24]:
# Ratio of label-1 to label-0 training examples, using the counts 3019 and 5481 from the Counter output.
3019/5481

0.5508118956394819