In [1]:
import torch

from datasets import load_from_disk
from PIL import Image
import base64
import io
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from time import time
from torch.utils.data import DataLoader, Dataset
from transformers import AutoProcessor, AutoTokenizer, Blip2Processor, Blip2Model, AutoTokenizer
from sklearn.metrics import roc_auc_score, accuracy_score


def base64str_to_PILobj(base64_string):
    """Decode a base64-encoded image string into a PIL image.

    Args
    - base64_string (str): base64 text whose decoded bytes form an image file

    Output
    - PIL image object (call .show() on it to display)
    """
    # Wrap the decoded bytes in an in-memory buffer so PIL can read them like a file.
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(io.BytesIO(raw_bytes))

In [2]:
%load_ext autoreload

%autoreload 2

In [3]:
# from tqdm.notebook import tqdm
import pandas as pd

def get_result_df(data_loader, model, sample_n=None, type_='train', device=None):
    """Run `model` over `data_loader` and collect one result row per example.

    Args
    - data_loader: iterable of batches. Each batch is a mapping providing
      'idx_memes', 'labels' (tensor) and 'image', plus whatever `model` reads.
    - model: callable invoked as ``model(batch, device=device)``. Its output
      tensor is stored as the score and thresholded at 0.5 to get the
      predicted class (assumes the output is a probability - TODO confirm).
    - sample_n (int | None): stop after the first batch that brings the number
      of collected rows to at least `sample_n`. Whole batches are kept, so the
      result can hold more than `sample_n` rows. None consumes everything.
    - type_ (str): value written to the 'type' column of every row.
    - device: forwarded to `model`. When None, the module-level `device`
      variable is used if it exists (the previous implicit behaviour),
      otherwise 'cuda' when available and 'cpu' if not.

    Output
    - pandas.DataFrame with columns pred, pred_score, idx, img, labels, type
    """
    if device is None:
        # Keep working for callers that relied on the notebook-global `device`.
        device = globals().get('device')
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

    results = {
        'pred': [],
        'pred_score': [],
        'idx': [],
        'img': [],
        'labels': [],
        'type': []
    }

    # Score in eval mode so dropout / batch-norm do not perturb the predictions,
    # then put the model back into whatever mode the caller had it in.
    is_module = isinstance(model, torch.nn.Module)
    was_training = is_module and model.training
    if is_module:
        model.eval()

    try:
        with torch.no_grad():
            for batch in data_loader:
                output = model(batch, device=device)
                # Strictly greater than 0.5 -> class 1 (a score of exactly 0.5 is class 0).
                predicted = (output > 0.5).to(torch.int32)

                ids = batch['idx_memes']
                if torch.is_tensor(ids):
                    # Collated integer ids arrive as a tensor; store plain Python
                    # scalars instead of 0-d tensors.
                    ids = ids.reshape(-1).tolist()

                results['pred'].extend(predicted.detach().cpu().reshape(-1).tolist())
                results['pred_score'].extend(output.detach().cpu().reshape(-1).tolist())
                results['idx'].extend(ids)
                results['labels'].extend(batch['labels'].detach().cpu().reshape(-1).tolist())
                results['img'].extend(batch['image'])
                results['type'].extend([type_] * len(batch['image']))

                if sample_n is not None and len(results['idx']) >= sample_n:
                    break
    finally:
        if was_training:
            model.train()

    return pd.DataFrame(results)


In [4]:

# Checkpoint file name -> settings used to evaluate it.
#   'model_'            label of the architecture (not read by the cells visible in this notebook)
#   'pretrained_model'  hub id handed to the dataset class to build its processor
#   'processer'         name of the dataset class that prepares the inputs
# Insertion order is the order in which the checkpoints get evaluated.
# Entries that are commented out are experiments that are currently disabled.
res_map = dict([
    # ('blip_entire_model_kx_Salesforce-BlipModel-blip-image-captioning-large-inn.pt',
    #  dict(model_='BlipModel',
    #       pretrained_model="Salesforce/blip-image-captioning-large",
    #       processer='BLIPProcessDataset')),
    # ('blip_entire_model_kx_Salesforce-BlipForImageTextRetrieval-blip-itm-large-coco-new.pt',
    #  dict(model_='BlipForImageTextRetrieval',
    #       pretrained_model='Salesforce/blip-itm-large-coco',
    #       processer='BLIPProcessDataset')),

    # BEST BlipModel "Salesforce/blip-image-captioning-large"
    (
        'blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-LR-EPO.pt',
        dict(
            model_='BlipModel',
            pretrained_model="Salesforce/blip-image-captioning-large",
            processer='BLIPProcessDataset',
        ),
    ),

    # BEST BlipForImageTextRetrieval "Salesforce/blip-itm-large-coco"
    (
        'blip_entire_model_Salesforce-BlipForImageTextRetrieval-blip-itm-large-coco-new-LR-EPO.pt',
        dict(
            model_='BlipForImageTextRetrieval',
            pretrained_model="Salesforce/blip-itm-large-coco",
            processer='BLIPProcessDataset',
        ),
    ),

    # BEST BlipModel "Salesforce/blip-image-captioning-large" CROSS
    # ('blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-cross.pt',
    #  dict(model_='BlipModel',
    #       pretrained_model="Salesforce/blip-image-captioning-large",
    #       processer='BLIPProcessDataset',
    #       fusion='cross')),

    # BEST Blip2Model "Salesforce/blip2-opt-2.7b"
    (
        'blip_entire_model_Salesforce-BlipModel-blip2-inn-concat.pt',
        dict(
            model_='Blip2Model',
            pretrained_model="Salesforce/blip2-opt-2.7b",
            processer='BLIP2ProcessDataset',
        ),
    ),

    # ('blip_entire_model_kx_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat.pt',
    #  dict(model_='Blip2Model',
    #       pretrained_model="Salesforce/blip2-flan-t5-xl",
    #       processer='BLIP2ProcessDataset')),
    # ('blip_entire_model_kx_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5.pt',
    #  dict(model_='Blip2Model',
    #       pretrained_model="Salesforce/blip2-flan-t5-xl",
    #       processer='BLIP2ProcessDataset')),

    (
        'blip_entire_model_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5-LR-5e-3.pt',
        dict(
            model_='Blip2Model',
            pretrained_model="Salesforce/blip2-flan-t5-xl",
            processer='BLIP2ProcessDataset',
        ),
    ),

    # ('blip_entire_model_kx_Salesforce-BlipModel-blip2-inn-concat-epo30.pt',
    #  dict(model_='Blip2Model',
    #       pretrained_model="Salesforce/blip2-opt-2.7b",
    #       processer='BLIP2ProcessDataset')),
    # ('dino_large_bge.pt',
    #  dict(model_='facebook/dinov2-large',
    #       pretrained_img_model='facebook/dinov2-large',
    #       pretrained_txt_model='BAAI/bge-m3',
    #       processer='DinoProcessDataset',
    #       fusion='concat')),
])


In [5]:
# Load the dataset previously saved to disk. Its splits are accessed by name
# below ('train' here; 'dev_seen', 'dev_unseen', 'test_seen' and 'test_unseen'
# are read in the evaluation cells).
combined = load_from_disk('./processed_data/combined_hateful_memes_dataset')

# Training split.
train_data = combined['train']

In [6]:
# from FlagEmbedding import BGEM3FlagModel

# class DinoProcessDataset(Dataset):
#     def __init__(self, dataset, pretrained_img_model, pretrained_txt_model, device='cuda'):
#         self.image_size = 224
#         self.dataset = dataset
#         # self.processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
#         # Image processer
#         self.processor = AutoImageProcessor.from_pretrained(pretrained_model, device=device)
#         self.model = AutoModel.from_pretrained(pretrained_model)
#         self.model.to(device)

#         self.text_model = BGEM3FlagModel(pretrained_txt_model,  use_fp16=False, device=device)

#     def __len__(self):
#         return len(self.dataset)

#     def __getitem__(self, idx):
#         item = self.dataset[idx]
#         device='cuda'
       
#         # pixel_values = self.processor(images=base64str_to_PILobj(item["image"]).convert("RGB").resize((self.image_size, self.image_size)),
#         #                                     return_tensors="pt")['pixel_values']
    
#         # text_output = self.processor(text = item['text'],
#         #                                   padding='max_length', 
#         #                                   return_tensors="pt", 
#         #                                   truncation=True)
#         # img = base64str_to_PILobj(item["image"]).convert("RGB").resize(self.image_size, self.image_size).to(device)
#         inputs = self.processor(images=base64str_to_PILobj(item["image"]).convert("RGB").resize((self.image_size, self.image_size)), 
#                                 return_tensors="pt").to(device)
#         pixel_out = self.model(**inputs)
        
#         pixel_values = pixel_out.last_hidden_state

#         text_embeddings = self.text_model.encode(item['text'])['dense_vecs']#.to(device)

#         label = torch.LongTensor([item['label']])

#         return {
#             'pixel_values': pixel_values,
#             'text_output': text_embeddings,
#             'labels': label,
#             'idx_memes': item['id'],
#             'image': item['image']
#         }

In [6]:
class BLIPProcessDataset(Dataset):
    """Dataset wrapper that turns raw records into processor outputs for BLIP models.

    Every record of `dataset` is read through the keys 'image' (base64 string),
    'text', 'label' and 'id'.
    """

    def __init__(self, dataset, pretrained_model):
        # Side length of the square every image is resized to before processing.
        self.image_size = 224
        self.dataset = dataset
        # One processor instance serves both the image and the text branch.
        self.processor = AutoProcessor.from_pretrained(pretrained_model)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        record = self.dataset[idx]
        side = self.image_size

        # Image branch: decode base64 -> force RGB -> square resize -> processor.
        picture = base64str_to_PILobj(record["image"])
        picture = picture.convert("RGB").resize((side, side))
        pixel_values = self.processor(images=picture, return_tensors="pt")['pixel_values']

        # Text branch: pad to the processor's maximum length and truncate longer text.
        encoded_text = self.processor(
            text=record['text'],
            padding='max_length',
            return_tensors="pt",
            truncation=True,
        )

        label = torch.LongTensor([record['label']])

        sample = {'pixel_values': pixel_values}
        sample['input_ids'] = encoded_text['input_ids']
        sample['attention_mask'] = encoded_text['attention_mask']
        sample['labels'] = label
        # The id and the raw base64 image travel along so results can be traced back.
        sample['idx_memes'] = record['id']
        sample['image'] = record['image']
        return sample

In [7]:
class BLIP2ProcessDataset(Dataset):
    """Dataset wrapper that turns raw records into inputs for BLIP-2 models.

    Images go through a `Blip2Processor`, text through a separate tokenizer.
    Every record of `dataset` is read through the keys 'image' (base64 string),
    'text', 'label' and 'id'.
    """

    def __init__(self, dataset, pretrained_model):
        # Side length of the square every image is resized to before processing
        # (224 is kept in the original as the commented-out alternative).
        self.image_size = 518
        self.dataset = dataset
        self.processor = Blip2Processor.from_pretrained(pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        record = self.dataset[idx]
        side = self.image_size

        # Image branch: decode base64 -> force RGB -> square resize -> processor.
        picture = base64str_to_PILobj(record["image"])
        picture = picture.convert("RGB").resize((side, side))
        pixel_values = self.processor(images=picture, return_tensors="pt")['pixel_values']

        # Text branch: always padded / truncated to exactly 512 tokens.
        encoded_text = self.tokenizer(
            text=record['text'],
            padding='max_length',
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )

        label = torch.LongTensor([record['label']])

        sample = {'pixel_values': pixel_values}
        sample['input_ids'] = encoded_text['input_ids']
        sample['attention_mask'] = encoded_text['attention_mask']
        sample['labels'] = label
        # The id and the raw base64 image travel along so results can be traced back.
        sample['idx_memes'] = record['id']
        sample['image'] = record['image']
        # Exposed per sample so the consumer can tell padding apart from real tokens.
        sample['pad_token_id'] = self.tokenizer.pad_token_id
        return sample


In [9]:
# Evaluate every checkpoint listed in `res_map` on the four held-out splits and
# print "<split> <ROC-AUC> <accuracy>" for each of them.
import gc  # cell-local: only needed to release a checkpoint once it has been scored

batch_size = 64
device = 'cuda'  # picked up by get_result_df() as a module-level variable

# Splits to score, in the order their metrics are printed.
eval_splits = ['dev_unseen', 'dev_seen', 'test_unseen', 'test_seen']

# Checkpoints that were evaluated completely. It survives a crash of this
# cell, so a follow-up cell can skip what is already finished.
done = []
for fp, map_dict in res_map.items():

    print('==========')
    print(fp)
    if 'flan' in fp:
        # Presumably the flan-T5 checkpoints need the model code from this
        # module to be importable when they are unpickled - TODO confirm.
        import blip2_feasExtract_flan as blip2_feasExtract
        # The original line assigned to `models.blip2_feasExtract`, but `models`
        # is never imported in this notebook, so it raised NameError. Import the
        # package when it exists and skip the alias otherwise.
        # NOTE(review): verify whether this alias is needed at all.
        try:
            import models as _models_pkg
        except ImportError:
            _models_pkg = None
        if _models_pkg is not None:
            _models_pkg.blip2_feasExtract = blip2_feasExtract

    # SECURITY: torch.load unpickles arbitrary Python objects - only load
    # checkpoints from a trusted source.
    model = torch.load(f'model_output/selected/{fp}')

    try:
        # Resolve the dataset class by name instead of eval()-ing the config string.
        processer = globals()[map_dict['processer']]

        # Only the evaluation splits are wrapped; the train split is not scored
        # here, so no dataset/loader is built for it. Shuffling is off because
        # the metrics do not depend on row order and a fixed order keeps the
        # result frames reproducible.
        data_dict = {
            split: DataLoader(
                processer(combined[split], map_dict['pretrained_model']),
                shuffle=False,
                batch_size=batch_size,
            )
            for split in eval_splits
        }

        for type_, type_loader in data_dict.items():
            df = get_result_df(type_loader, model, type_=type_)

            print(type_, roc_auc_score(df['labels'], df['pred_score']), accuracy_score(df['labels'], df['pred']))
    finally:
        # Drop the checkpoint even when scoring fails (e.g. CUDA OOM) and hand
        # the cached GPU blocks back before the next checkpoint is loaded.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    done.append(fp)



blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-LR-EPO.pt
dev_unseen 0.6379558823529412 0.6407407407407407
dev_seen 0.6485893968731498 0.592
test_unseen 0.678768 0.661
test_seen 0.6574309723889555 0.603
blip_entire_model_Salesforce-BlipForImageTextRetrieval-blip-itm-large-coco-new-LR-EPO.pt
dev_unseen 0.6908676470588235 0.6462962962962963
dev_seen 0.699876782256645 0.546
test_unseen 0.6792970666666667 0.652
test_seen 0.6891156462585034 0.565
blip_entire_model_Salesforce-BlipModel-blip2-inn-concat.pt


RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 31.74 GiB total capacity; 19.32 GiB already allocated; 1.55 GiB free; 20.95 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [10]:
# Show which checkpoints were fully evaluated before the loop above stopped.
done

['blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-LR-EPO.pt',
 'blip_entire_model_Salesforce-BlipForImageTextRetrieval-blip-itm-large-coco-new-LR-EPO.pt']

In [10]:
# Resume the evaluation with a smaller batch size: every checkpoint of `res_map`
# that is not yet listed in `done` is scored on the four held-out splits and
# "<split> <ROC-AUC> <accuracy>" is printed for each of them.
# Requires `done` from the previous evaluation cell.
import gc  # cell-local: only needed to release a checkpoint once it has been scored

batch_size = 8
device = 'cuda'  # picked up by get_result_df() as a module-level variable

# Splits to score, in the order their metrics are printed.
eval_splits = ['dev_unseen', 'dev_seen', 'test_unseen', 'test_seen']

for fp, map_dict in res_map.items():
    if fp in done:
        continue
    print('==========')
    print(fp)

    # SECURITY: torch.load unpickles arbitrary Python objects - only load
    # checkpoints from a trusted source.
    model = torch.load(f'model_output/selected/{fp}')

    try:
        # Resolve the dataset class by name instead of eval()-ing the config string.
        processer = globals()[map_dict['processer']]

        # Only the evaluation splits are wrapped; the train split is not scored
        # here, so no dataset/loader is built for it. Shuffling is off because
        # the metrics do not depend on row order and a fixed order keeps the
        # result frames reproducible.
        data_dict = {
            split: DataLoader(
                processer(combined[split], map_dict['pretrained_model']),
                shuffle=False,
                batch_size=batch_size,
            )
            for split in eval_splits
        }

        for type_, type_loader in data_dict.items():
            df = get_result_df(type_loader, model, type_=type_)

            print(type_, roc_auc_score(df['labels'], df['pred_score']), accuracy_score(df['labels'], df['pred']))
    finally:
        # Drop the checkpoint even when scoring fails (e.g. CUDA OOM) and hand
        # the cached GPU blocks back before the next checkpoint is loaded.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    done.append(fp)


blip_entire_model_Salesforce-BlipModel-blip2-inn-concat.pt
dev_unseen 0.7262352941176472 0.6907407407407408
dev_seen 0.7405226352594774 0.64
test_unseen 0.7454357333333334 0.6975
test_seen 0.7449099639855942 0.652
blip_entire_model_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5-LR-5e-3.pt


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

dev_unseen 0.7332205882352941 0.7092592592592593
dev_seen 0.7509881422924901 0.68
test_unseen 0.764216 0.7155
test_seen 0.7682553021208484 0.688


In [26]:
# # To free up CUDA mem after every load
# import gc
# torch.cuda.empty_cache()
# gc.collect()

# print(torch.cuda.memory_reserved(0))
# print(torch.cuda.memory_allocated(0))

26449281024
23763481088
