In [1]:
import torch
import clip
from PIL import Image
import numpy as np
import json

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device is {device}")

# Load the pretrained CLIP ViT-B/32 model together with its matching image
# preprocessing transform; jit=False requests the non-JIT model.
model, preprocess = clip.load(
    "ViT-B/32",
    device=device,
    jit=False,
)

Device is cuda


In [2]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score

import os

# Load Some Examples

In [3]:
# Illustrative example: score one meme image directly against the two class
# prompts "non-hateful" and "hateful".
# The context manager closes the image file once it has been preprocessed.
with Image.open("./dataset/img/01235.png") as example_img:
    image = preprocess(example_img).unsqueeze(0).to(device)
text = clip.tokenize(["non-hateful", "hateful"]).to(device)

with torch.no_grad():
    # The forward pass returns both image-to-text and text-to-image logits.
    # (model.encode_image(image) / model.encode_text(text) could be called
    # instead to obtain the separate image and text features.)
    logits_per_image, logits_per_text = model(image, text)
    # Softmax over the prompts turns the per-image logits into label probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)

Label probs: [[0.676  0.3242]]


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


In [4]:
# Shape of the image-to-text logits: one row per image, one column per text prompt.
logits_per_image.shape

torch.Size([1, 2])

In [5]:
# Raw logits for the example image, i.e. the values before the softmax is applied.
logits_per_image

tensor([[24.0938, 23.3594]], device='cuda:0', dtype=torch.float16)

In [6]:
# Shape of the text-to-image logits: one row per text prompt, one column per image.
logits_per_text.shape

torch.Size([2, 1])

In [7]:
# Swapping the two dimensions of logits_per_text gives the same shape as logits_per_image.
torch.transpose(logits_per_text, 0, 1).shape

torch.Size([1, 2])

In [8]:
# Shape of the preprocessed image after unsqueeze(0) added the leading batch
# dimension; presumably (batch, channels, height, width) — confirm against `preprocess`.
image.shape

torch.Size([1, 3, 224, 224])

In [9]:
# Shape of the tokenized prompts: one row per prompt (two prompts were tokenized).
text.shape

torch.Size([2, 77])

In [10]:
# Softmax probabilities for the example image, in prompt order (non-hateful, hateful).
print(probs)

[[0.676  0.3242]]


In [11]:
# Index of the most probable prompt (0 = non-hateful, 1 = hateful). np.argmax
# flattens its input, which equals the prompt index here because probs has one row.
np.argmax(probs)

0

# Test the Model's Zero-Shot Prediction Capability on the Train, Dev and Test JSONL Files

In [12]:
# Cast the model's floating-point parameters and buffers to float32 (the logits
# inspected earlier were float16). The call returns the module, so the notebook
# displays its full repr as this cell's output.
model.float()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [13]:
def test_model_zero_shot_capability(model, jsonl_path, dataset_root='./dataset',
                                    prompts=("non-hateful", "hateful")):
    """Measure zero-shot classification quality of a CLIP model on one JSONL split.

    Every image listed in the JSONL file is scored against the two text
    prompts. The softmax over the image-to-text logits is used as the class
    probabilities: index 0 is the first prompt (non-hateful by default) and
    index 1 is the second prompt (hateful by default).

    Relies on the notebook-level ``preprocess`` transform and ``device``.

    Args:
        model: CLIP model, called as ``model(image, text)`` and expected to
            return ``(logits_per_image, logits_per_text)``.
        jsonl_path: Path to a JSON-lines file. Each non-blank line must be a
            JSON object with an ``'img'`` entry (image path relative to
            ``dataset_root``) and a ``'label'`` entry.
        dataset_root: Directory that the ``'img'`` paths are relative to.
        prompts: Exactly two text prompts, ordered (class 0, class 1).

    Returns:
        Tuple ``(roc_auc, accuracy)``: ROC-AUC computed from the class-1
        probability, and accuracy computed from the argmax prediction.

    Raises:
        ValueError: If ``prompts`` does not contain exactly two entries.
    """
    if len(prompts) != 2:
        raise ValueError("prompts must contain exactly two entries (class 0, class 1)")

    labels = []
    model_probs = []
    model_preds = []
    # 0 -> Non-hateful, 1 -> Hateful
    text = clip.tokenize(list(prompts)).to(device)

    with open(jsonl_path, 'r', encoding='utf-8') as json_f:
        # Skip blank lines (e.g. a trailing newline) so json.loads does not fail on them.
        records = [json.loads(line) for line in json_f if line.strip()]

    with torch.no_grad():
        for record in tqdm(records):
            labels.append(record['label'])
            # Read image; the context manager closes the file once it is preprocessed.
            with Image.open(os.path.join(dataset_root, record['img'])) as img:
                image = preprocess(img).unsqueeze(0).to(device)
            logits_per_image, _ = model(image, text)
            probs = logits_per_image.softmax(dim=-1).cpu().numpy()
            # Probability of class 1 (the second prompt) is the ROC-AUC score.
            model_probs.append(probs[0][1])
            # probs has a single row, so the flattened argmax is the class index.
            model_preds.append(np.argmax(probs))
    return roc_auc_score(labels, model_probs), accuracy_score(labels, model_preds)


In [14]:
# Zero-shot metrics on the dev_unseen split.
dev_roc, dev_acc = test_model_zero_shot_capability(
    model,
    './dataset/dev_unseen.jsonl',
)
print(f'Dev ROC: {dev_roc}', f'Dev Accuracy: {dev_acc}', sep='\n')

  1%|          | 4/540 [00:00<00:45, 11.69it/s]

100%|██████████| 540/540 [00:19<00:00, 28.17it/s]

Dev ROC: 0.4535588235294118
Dev Accuracy: 0.512962962962963





In [15]:
# Zero-shot metrics on the dev_seen split (dev_roc / dev_acc are reassigned here).
dev_roc, dev_acc = test_model_zero_shot_capability(
    model,
    './dataset/dev_seen.jsonl',
)
print(f'Dev ROC: {dev_roc}', f'Dev Accuracy: {dev_acc}', sep='\n')

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [00:14<00:00, 33.49it/s]

Dev ROC: 0.464402874013858
Dev Accuracy: 0.478





In [16]:
# Zero-shot metrics on the test_seen split.
test_roc, test_acc = test_model_zero_shot_capability(
    model,
    './dataset/test_seen.jsonl',
)
print(f'test ROC: {test_roc}', f'test Accuracy: {test_acc}', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:41<00:00, 23.83it/s]

test ROC: 0.5142156862745099
test Accuracy: 0.516





In [17]:
# Zero-shot metrics on the test_unseen split (test_roc / test_acc are reassigned here).
test_roc, test_acc = test_model_zero_shot_capability(
    model,
    './dataset/test_unseen.jsonl',
)
print(f'test ROC: {test_roc}', f'test Accuracy: {test_acc}', sep='\n')

  0%|          | 0/2000 [00:00<?, ?it/s]

100%|██████████| 2000/2000 [01:16<00:00, 26.16it/s]

test ROC: 0.5172202666666667
test Accuracy: 0.534





In [18]:
# Zero-shot metrics on the train split.
train_roc, train_acc = test_model_zero_shot_capability(
    model,
    './dataset/train.jsonl',
)
print(f'train ROC: {train_roc}', f'train Accuracy: {train_acc}', sep='\n')

  0%|          | 0/8500 [00:00<?, ?it/s]

100%|██████████| 8500/8500 [06:01<00:00, 23.54it/s]

train ROC: 0.4763393538907239
train Accuracy: 0.5216470588235295



