In [1]:
import torch

from datasets import load_from_disk
from PIL import Image
import base64
import io
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from time import time
from torch.utils.data import DataLoader, Dataset
from transformers import AutoProcessor, AutoTokenizer
from transformers import CLIPModel, CLIPTokenizer, CLIPProcessor
from sklearn.metrics import roc_auc_score, accuracy_score


def base64str_to_PILobj(base64_string):
    """Decode a base64-encoded image string into a PIL image object.

    Args
    - base64_string (str): base64 encoding of the bytes of an image file

    Output
    - the object returned by ``Image.open`` on the decoded bytes
      (use .show() on it to display the picture)
    """
    # Wrap the decoded bytes in an in-memory buffer so PIL can read them
    # as if they were a file.
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(io.BytesIO(raw_bytes))

In [2]:
# from tqdm.notebook import tqdm
import pandas as pd

def get_result_df(data_loader, model, sample_n=None, type_='train', device=None):
    """Run ``model`` over ``data_loader`` and collect per-example predictions.

    Args
    - data_loader: iterable of batches. Every batch must provide the keys
      'idx_memes', 'labels' (a tensor) and 'image' (a sized sequence); the
      whole batch is handed to ``model`` unchanged.
    - model: callable invoked as ``model(batch, device=device)`` that returns
      a tensor of scores. Scores strictly above 0.5 are predicted as class 1.
      If it is a ``torch.nn.Module`` it is switched to eval mode (and left
      there) so dropout / batch-norm behave deterministically while scoring.
    - sample_n (int, optional): stop iterating once at least this many rows
      have been collected. Batches are never split, so the result can hold
      more than ``sample_n`` rows (up to one extra batch).
    - type_ (str): tag written to the 'type' column of every row.
    - device (optional): forwarded to ``model``. When omitted, the
      module-level ``device`` variable is used if one exists (the behaviour
      callers of the older signature relied on); otherwise 'cuda' when
      available, else 'cpu'.

    Output
    - pandas.DataFrame with the columns 'pred', 'pred_score', 'idx', 'img',
      'labels' and 'type', one row per example.
    """
    # Resolve the device explicitly instead of silently depending on a global
    # that may not have been defined yet when this cell is run on its own.
    if device is None:
        device = globals().get('device')
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Inference must not run with training-time layers active.
    if isinstance(model, torch.nn.Module):
        model.eval()

    columns = {
        'pred': [],
        'pred_score': [],
        'idx': [],
        'img': [],
        'labels': [],
        'type': []
    }

    with torch.no_grad():
        for batch in data_loader:
            output = model(batch, device=device)
            predicted = torch.as_tensor((output - 0.5) > 0, dtype=torch.int32)

            # The default collate function turns integer ids into a tensor;
            # unwrap it so the column holds plain values, not 0-d tensors.
            ids = batch['idx_memes']
            if torch.is_tensor(ids):
                ids = ids.detach().cpu().reshape(-1).tolist()

            columns['pred'].extend(predicted.detach().cpu().reshape(-1).tolist())
            columns['pred_score'].extend(output.detach().cpu().reshape(-1).tolist())
            columns['idx'].extend(ids)
            columns['labels'].extend(batch['labels'].detach().cpu().reshape(-1).tolist())
            columns['img'].extend(batch['image'])
            columns['type'].extend([type_] * len(batch['image']))

            # Early exit for quick spot checks on a subset of the data.
            if sample_n is not None and len(columns['idx']) >= sample_n:
                break

    return pd.DataFrame(columns)


In [3]:

def _clip_entry(pretrained_model, head, po_layer, epo=20, map_dim=1024, batch_size=64):
    """Build one ``res_map`` value for a CLIP-based checkpoint.

    All entries share the same layout; only the backbone name and the values
    under 'param' differ, so they are spelled out once here.
    """
    return {
        'model_': 'CLIPModel',
        'pretrained_model': pretrained_model,
        'param': {
            'epo': epo,
            'head': head,
            'map_dim': map_dim,
            'batch_size': batch_size,
            'po_layer': po_layer,
        },
    }


# Checkpoint file name (looked up under model_output/) -> description of the
# run that produced it. Only 'pretrained_model' is read by the evaluation
# loop in this notebook; 'param' presumably records the training
# hyper-parameters of each run -- confirm against the training notebook.
#
# Disabled variants, kept here for reference only:
#   - 'clip_entire_model_added_sigmoid_gradclip.pt' as
#     epo=15, head='concat', map_dim=32, batch_size=16, po_layer=1
#   - 'clip_entire_model_added_sigmoid_gradclip-cross-unfreeze-last-block.pt' as
#     epo=20, head='cross', map_dim=1024, batch_size=8, po_layer=5
res_map = {
    'clip_entire_model_added_sigmoid_gradclip.pt':
        _clip_entry("openai/clip-vit-large-patch14", head='concat', po_layer=1, epo=15),
    'clip_entire_model_added_sigmoid_gradclip-cross.pt':  # BEST
        _clip_entry("openai/clip-vit-large-patch14", head='cross', po_layer=5),
    'clip_entire_model_added_sigmoid_gradclip_laion-CLIP-ViT-B-32-laion2B-s34B-b79K-cross.pt':
        _clip_entry("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", head='cross', po_layer=5),
    'clip_entire_model_added_sigmoid_gradclip-att-layer5.pt':
        _clip_entry("openai/clip-vit-large-patch14", head='self-att', po_layer=5),
    'clip_entire_model_added_sigmoid_gradclip-cross-layer10.pt':
        _clip_entry("openai/clip-vit-large-patch14", head='cross', po_layer=10),
}


In [4]:
# Load the dataset previously saved to disk with the `datasets` library.
# The result is indexed by split name below and in the evaluation cell
# ('train', 'dev_seen', 'dev_unseen', 'test_seen', 'test_unseen').
combined = load_from_disk('./processed_data/combined_hateful_memes_dataset')

# Training split, used to build the train dataset in the evaluation cell.
train_data = combined['train']

In [5]:
class CLIPProcessDataset(Dataset):
    """Torch ``Dataset`` that converts raw meme records into CLIP inputs.

    Every record of the wrapped dataset is read through the keys 'image'
    (a base64 string decoded with ``base64str_to_PILobj``), 'text', 'label'
    and 'id'.
    """

    def __init__(self, dataset, pretrain_model):
        """
        Args
        - dataset: sized, indexable collection of records (see class docs)
        - pretrain_model (str): identifier passed to ``from_pretrained`` for
          both the CLIP processor and the CLIP tokenizer
        """
        self.dataset = dataset
        # Images are force-resized to a square of this side before being
        # handed to the CLIP processor.
        self.image_size = 224
        self.image_processor = CLIPProcessor.from_pretrained(pretrain_model)
        self.text_processor = CLIPTokenizer.from_pretrained(pretrain_model)

    def __len__(self):
        """Number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Build the model inputs for the record at position ``idx``.

        Output
        - dict with 'pixel_values', 'input_ids' and 'attention_mask' taken
          from the processor / tokenizer outputs (requested as PyTorch
          tensors; presumably each keeps a leading dimension of 1 -- confirm
          against the model's forward), 'labels' as a one-element
          ``LongTensor``, plus the record's 'id' under 'idx_memes' and its
          untouched base64 string under 'image'.
        """
        record = self.dataset[idx]
        side = self.image_size

        # Decode -> force 3 channels -> square resize, then let the CLIP
        # processor do its own preprocessing on top.
        picture = base64str_to_PILobj(record["image"]).convert("RGB")
        picture = picture.resize((side, side))
        image_inputs = self.image_processor(images=picture, return_tensors="pt")

        # Pad/truncate every caption to the tokenizer's maximum length so
        # that samples can be stacked into a batch.
        text_inputs = self.text_processor(
            record['text'],
            padding='max_length',
            return_tensors="pt",
            truncation=True,
        )

        sample = dict(
            pixel_values=image_inputs['pixel_values'],
            input_ids=text_inputs['input_ids'],
            attention_mask=text_inputs['attention_mask'],
            labels=torch.LongTensor([record['label']]),
            idx_memes=record['id'],
            image=record['image'],
        )
        return sample

In [6]:
batch_size = 64
# Fall back to CPU when no GPU is visible so the cell still runs there.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Held-out splits to score, in the order they are reported.
eval_splits = ('dev_unseen', 'dev_seen', 'test_unseen', 'test_seen')

for fp, map_dict in res_map.items():
    print('==========')
    print(fp)
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code -- only load checkpoints you produced yourself.
    # NOTE: these files store whole pickled models; on PyTorch releases where
    # `weights_only` defaults to True this call needs `weights_only=False`.
    # map_location keeps the model on the same device the batches are sent to.
    model = torch.load(f'model_output/{fp}', map_location=device)

    # One loader per evaluation split, each using the processor/tokenizer
    # that matches this checkpoint's backbone. Evaluation does not need
    # shuffling; a fixed order keeps the collected rows reproducible.
    data_dict = {
        split: DataLoader(
            CLIPProcessDataset(combined[split], map_dict['pretrained_model']),
            shuffle=False,
            batch_size=batch_size,
        )
        for split in eval_splits
    }

    for type_, type_loader in data_dict.items():
        df = get_result_df(type_loader, model, type_=type_)

        # split name, ROC-AUC on the raw scores, accuracy on thresholded preds
        print(type_, roc_auc_score(df['labels'], df['pred_score']), accuracy_score(df['labels'], df['pred']))

    # Drop the reference before the next checkpoint is loaded so two models
    # are not kept alive at the same time.
    del model


clip_entire_model_added_sigmoid_gradclip.pt
dev_unseen 0.7605147058823529 0.7018518518518518
dev_seen 0.7700948936646876 0.69
test_unseen 0.7846026666666668 0.7325
test_seen 0.8000560224089636 0.731
clip_entire_model_added_sigmoid_gradclip-cross.pt
dev_unseen 0.8117941176470589 0.7574074074074074
dev_seen 0.8291433966491176 0.696
test_unseen 0.8240634666666667 0.756
test_seen 0.8336214485794315 0.729
clip_entire_model_added_sigmoid_gradclip_laion-CLIP-ViT-B-32-laion2B-s34B-b79K-cross.pt


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

dev_unseen 0.7381470588235294 0.6907407407407408
dev_seen 0.753900561680882 0.67
test_unseen 0.7659136 0.7325
test_seen 0.7693477390956383 0.691
clip_entire_model_added_sigmoid_gradclip-att-layer5.pt
dev_unseen 0.5291323529411764 0.6296296296296297
dev_seen 0.4927109503768543 0.506
test_unseen 0.5095573333333333 0.625
test_seen 0.5049779911964786 0.51
clip_entire_model_added_sigmoid_gradclip-cross-layer10.pt
dev_unseen 0.7653823529411765 0.7388888888888889
dev_seen 0.7817765758269191 0.692
test_unseen 0.7662570666666667 0.733
test_seen 0.7825490196078432 0.696
