In [1]:
import torch
# import clip
from transformers import AutoProcessor, BlipModel
from PIL import Image
import numpy as np
import json

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device is {device}")

Device is cuda


In [2]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score

import os

# Load Model and Processor

In [3]:
# The model and its processor are both loaded from the same Hub checkpoint id.
# NOTE(review): loading this captioning checkpoint into BlipModel emitted a
# warning that `logit_scale` and the `text_model.*` weights were newly
# initialized rather than loaded, so the text-side features used for the
# zero-shot scores below are presumably not pretrained -- confirm before
# trusting the reported metrics.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = AutoProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipModel.from_pretrained(BLIP_CHECKPOINT)

Some weights of BlipModel were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_model.encoder.layer.0.attention.self.value.weight', 'text_model.encoder.layer.0.crossattention.output.LayerNorm.bias', 'text_model.

In [4]:
# Move the model onto the device chosen in the first cell. As the last
# expression of the cell, the returned module's (long) repr is displayed.
model.to(device)

BlipModel(
  (text_model): BlipTextModel(
    (embeddings): BlipTextEmbeddings(
      (word_embeddings): Embedding(30524, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BlipTextEncoder(
      (layer): ModuleList(
        (0-11): 12 x BlipTextLayer(
          (attention): BlipTextAttention(
            (self): BlipTextSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BlipTextSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
         

# Load Some Examples

In [5]:
# Illustrating example: score a single meme against the two text prompts
# "non-hateful" (index 0) and "hateful" (index 1).
# The image is opened in a `with` block so its file handle is released as soon
# as the processor has turned it into tensors.
with Image.open("./dataset/img/01235.png") as example_image:
    inputs = processor(text=["non-hateful", "hateful"], images=example_image, return_tensors='pt', padding=True).to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    # Softmax over the prompt axis turns the two logits into label probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)

Label probs: [[0.6097522  0.39024785]]


In [6]:
# Sanity check: one image scored against two prompts should give shape (1, 2).
logits_per_image.shape

torch.Size([1, 2])

In [7]:
# Raw (pre-softmax) logits of the example image for the two prompts,
# in the order ["non-hateful", "hateful"].
logits_per_image

tensor([[0.7134, 0.2671]], device='cuda:0')

In [8]:
# Softmax probabilities for the example image (same values as printed above).
print(probs)

[[0.6097522  0.39024785]]


In [9]:
# Index of the most probable prompt: 0 -> "non-hateful", 1 -> "hateful"
# (the order of the text list passed to the processor).
np.argmax(probs)

0

# Test the Model's Zero-Shot Prediction Capability on the train, dev and test JSONL files

In [10]:
def test_model_zero_shot_capability(model, processor, device, jsonl_path):
    labels = []
    model_probs = []
    model_preds = []
    # 0 -> Non-hateful, 1 -> Hateful
    # text = clip.tokenize(["non-hateful", "hateful"]).to(device)
    with open(jsonl_path, 'r') as json_f:
        json_list = list(json_f)
    for json_str in tqdm(json_list):
        result = json.loads(json_str)
        img_path, label = result['img'], result['label']
        labels.append(label)
        # image = preprocess(Image.open(os.path.join('./dataset', img_path))).unsqueeze(0).to(device)
        with torch.no_grad():
            inputs = processor(text=['non-hateful', 'hateful'], images=Image.open(os.path.join('./dataset', img_path)), return_tensors='pt', padding=True).to(device)
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
            probs = logits_per_image.softmax(dim=-1).cpu().numpy()
            class_1_prob = probs[0][1]
            model_probs.append(class_1_prob)
            model_preds.append(np.argmax(probs))
    return roc_auc_score(labels, model_probs), accuracy_score(labels, model_preds)


In [11]:
# Zero-shot metrics on the dev_unseen split.
dev_roc, dev_acc = test_model_zero_shot_capability(
    model=model,
    processor=processor,
    device=device,
    jsonl_path='./dataset/dev_unseen.jsonl',
)
print(f'Dev ROC: {dev_roc}', f'Dev Accuracy: {dev_acc}', sep='\n')

100%|██████████| 540/540 [00:21<00:00, 25.39it/s]

Dev ROC: 0.48675
Dev Accuracy: 0.6055555555555555





In [12]:
# Zero-shot metrics on the test_unseen split.
test_roc, test_acc = test_model_zero_shot_capability(
    model=model,
    processor=processor,
    device=device,
    jsonl_path='./dataset/test_unseen.jsonl',
)
print(f'test ROC: {test_roc}', f'test Accuracy: {test_acc}', sep='\n')

100%|██████████| 2000/2000 [01:12<00:00, 27.48it/s]

test ROC: 0.5236778666666667
test Accuracy: 0.6075



