In [1]:
import torch
# import clip
from transformers import AutoProcessor, CLIPModel
from PIL import Image
import numpy as np
import json

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device is {device}")
# Legacy loader from the OpenAI `clip` package, kept for reference:
# model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

Device is cuda


In [2]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score

import os

# Load Model and Processor

In [3]:
# The weights and the matching pre-processing pipeline are both loaded from the
# same checkpoint id, so spell it out once.
checkpoint = 'openai/clip-vit-large-patch14'
model = CLIPModel.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

In [4]:
# Move the model onto the selected device; the returned module is echoed as the cell output.
model.to(device=device)

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

# Run a Single Zero-Shot Example

In [5]:
# Illustrative example: score one image directly against the two class prompts
# "non-hateful" and "hateful".
# The image is opened in a context manager so its file handle is released as
# soon as the processor has turned it into tensors.
with Image.open("./dataset/img/01235.png") as example_image:
    inputs = processor(text=["non-hateful", "hateful"], images=example_image, return_tensors='pt', padding=True).to(device)

with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    # Softmax over the last (prompt) axis turns the logits into per-prompt probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)

Unused or unrecognized kwargs: padding.


Label probs: [[0.75677943 0.24322055]]


In [6]:
# Dimensions of the image-to-prompt logit tensor from the example above.
logits_per_image.size()

torch.Size([1, 2])

In [7]:
# Raw (pre-softmax) logits for the example image.
logits_per_image

tensor([[17.3501, 16.2150]], device='cuda:0')

In [8]:
# Softmax probabilities computed in the example cell (same values as "Label probs").
print(probs)

[[0.75677943 0.24322055]]


In [9]:
# Index of the highest-probability prompt (argmax over the flattened array).
probs.argmax()

0

# Test the Model's Zero-Shot Prediction Capability on the Train, Dev and Test JSONL Splits

In [10]:
# Cast the model via `.float()` before evaluation; the returned module is echoed below.
# NOTE(review): presumably a holdover from the earlier `clip`-package workflow. If the
# checkpoint already loads in float32 this is a no-op; confirm and drop if so.
model.float()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [11]:
def test_model_zero_shot_capability(model, processor, device, jsonl_path, dataset_root='./dataset'):
    """Evaluate zero-shot hateful / non-hateful classification on one JSONL split.

    Every non-blank line of ``jsonl_path`` is parsed as a JSON object. Its
    ``'img'`` entry is joined onto ``dataset_root`` to locate the image and its
    ``'label'`` entry is used as the ground truth. Each image is scored against
    the prompts ``'non-hateful'`` (index 0) and ``'hateful'`` (index 1); the
    softmax over the two image-to-prompt logits gives the class probabilities.

    Args:
        model: Callable as ``model(**inputs)``; the result must expose
            ``logits_per_image``.
        processor: Callable with ``text=``, ``images=``, ``return_tensors='pt'``
            and ``padding=True``; its result must support ``.to(device)``.
        device: Device the processed inputs are moved to before the forward pass.
        jsonl_path: Path of the JSONL annotation file to evaluate.
        dataset_root: Directory that each record's ``'img'`` path is relative
            to. Defaults to ``'./dataset'``, the location that was previously
            hard-coded.

    Returns:
        A ``(roc_auc, accuracy)`` tuple: ``roc_auc_score`` of the labels against
        the probability of index 1 ("hateful"), and ``accuracy_score`` of the
        labels against the arg-max prediction.
    """
    # 0 -> Non-hateful, 1 -> Hateful
    prompts = ['non-hateful', 'hateful']

    # Explicit encoding keeps parsing independent of the platform default;
    # blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    with open(jsonl_path, 'r', encoding='utf-8') as json_f:
        records = [json.loads(line) for line in json_f if line.strip()]

    labels = []
    model_probs = []
    model_preds = []
    with torch.no_grad():
        for record in tqdm(records):
            image_path = os.path.join(dataset_root, record['img'])
            # Close each image file once it has been converted to tensors, so
            # handles do not accumulate over thousands of iterations.
            with Image.open(image_path) as image:
                inputs = processor(text=prompts, images=image, return_tensors='pt', padding=True).to(device)
            outputs = model(**inputs)
            # Shape is (1, 2): one image scored against the two prompts.
            probs = outputs.logits_per_image.softmax(dim=-1).cpu().numpy()
            labels.append(record['label'])
            model_probs.append(float(probs[0, 1]))
            model_preds.append(int(probs[0].argmax()))
    return roc_auc_score(labels, model_probs), accuracy_score(labels, model_preds)


In [12]:
# Zero-shot evaluation on ./dataset/dev_unseen.jsonl
# Previously ROC = 0.453558, Acc = 0.512962
dev_roc, dev_acc = test_model_zero_shot_capability(
    model,
    processor,
    device,
    './dataset/dev_unseen.jsonl',
)
print(f'Dev ROC: {dev_roc}', f'Dev Accuracy: {dev_acc}', sep='\n')

  0%|          | 0/540 [00:00<?, ?it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 3/540 [00:00<00:25, 21.41it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 6/540 [00:00<00:25, 21.24it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  2%|▏         | 9/540 [00:00<00:26, 20.29it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  2%|▏         | 12/540 [00:00<00:26, 20.20it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  3%|▎         | 15/540 [00:00<00:25, 20.74it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  3%

Dev ROC: 0.539514705882353
Dev Accuracy: 0.6





In [14]:
# Zero-shot evaluation on ./dataset/dev_seen.jsonl
# Previously ROC = 0.4644, Acc = 0.478
dev_roc, dev_acc = test_model_zero_shot_capability(
    model,
    processor,
    device,
    './dataset/dev_seen.jsonl',
)
print(f'Dev ROC: {dev_roc}', f'Dev Accuracy: {dev_acc}', sep='\n')

  0%|          | 0/500 [00:00<?, ?it/s]Unused or unrecognized kwargs: padding.


  0%|          | 1/500 [00:00<02:16,  3.66it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 3/500 [00:00<00:54,  9.09it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 6/500 [00:00<00:36, 13.47it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  2%|▏         | 8/500 [00:00<00:32, 15.10it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  2%|▏         | 11/500 [00:00<00:28, 17.20it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  3%|▎         | 13/500 [00:00<00:27, 17.78it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  3%|▎         | 15/500 [00:00<00:27, 17.73it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  3%|▎         | 17/500 [00:

Dev ROC: 0.5213230705221552
Dev Accuracy: 0.496





In [15]:
# Zero-shot evaluation on ./dataset/test_seen.jsonl
# Previously ROC = 0.514215, Acc = 0.516
test_roc, test_acc = test_model_zero_shot_capability(
    model,
    processor,
    device,
    './dataset/test_seen.jsonl',
)
print(f'test ROC: {test_roc}', f'test Accuracy: {test_acc}', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]Unused or unrecognized kwargs: padding.


Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 3/1000 [00:00<00:50, 19.56it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 5/1000 [00:00<00:51, 19.49it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 7/1000 [00:00<00:52, 18.82it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 10/1000 [00:00<00:51, 19.29it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 12/1000 [00:00<00:51, 19.37it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|▏         | 14/1000 [00:00<00:50, 19.41it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  2%|▏         | 16/1000 [00:00<00:50, 19.45it/s]Unused or unrecognized kwargs: padding.
Unused or unrecogniz

test ROC: 0.5453881552621048
test Accuracy: 0.524





In [16]:
# Zero-shot evaluation on ./dataset/test_unseen.jsonl
# Previously ROC = 0.51722, Acc = 0.534
test_roc, test_acc = test_model_zero_shot_capability(
    model,
    processor,
    device,
    './dataset/test_unseen.jsonl',
)
print(f'test ROC: {test_roc}', f'test Accuracy: {test_acc}', sep='\n')

  0%|          | 0/2000 [00:00<?, ?it/s]

Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 3/2000 [00:00<01:36, 20.63it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 6/2000 [00:00<01:47, 18.52it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 8/2000 [00:00<01:45, 18.89it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 10/2000 [00:00<01:45, 18.79it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 12/2000 [00:00<01:48, 18.28it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 14/2000 [00:00<01:47, 18.56it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  1%|          | 16/2000 [00:00<01:49, 18.08it/s]Unused or unrecogniz

test ROC: 0.5372421333333333
test Accuracy: 0.606





In [17]:
# Zero-shot evaluation on ./dataset/train.jsonl
# Previously ROC = 0.476339, Acc = 0.521647
train_roc, train_acc = test_model_zero_shot_capability(
    model,
    processor,
    device,
    './dataset/train.jsonl',
)
print(f'train ROC: {train_roc}', f'train Accuracy: {train_acc}', sep='\n')

  0%|          | 0/8500 [00:00<?, ?it/s]Unused or unrecognized kwargs: padding.


Unused or unrecognized kwargs: padding.
  0%|          | 2/8500 [00:00<07:13, 19.62it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 4/8500 [00:00<08:08, 17.40it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 6/8500 [00:00<07:49, 18.09it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 8/8500 [00:00<08:15, 17.15it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 10/8500 [00:00<07:55, 17.86it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 12/8500 [00:00<08:02, 17.60it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 14/8500 [00:00<08:11, 17.26it/s]Unused or unrecognized kwargs: padding.
Unused or unrecognized kwargs: padding.
  0%|          | 16/8500 [00:00<08:20, 16.96it/s]Unused or un

train ROC: 0.49242473275893794
train Accuracy: 0.5775294117647058



