In [1]:
# Calculate #Params
import torch

from datasets import load_from_disk
from PIL import Image
import base64
import io
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from time import time
from torch.utils.data import DataLoader, Dataset
from transformers import AutoProcessor, AutoTokenizer
from transformers import CLIPModel, CLIPTokenizer, CLIPProcessor
from sklearn.metrics import roc_auc_score



In [2]:

res_map = {
    # 'clip_entire_model_added_sigmoid_gradclip.pt':{
    #     'model_':'CLIPModel',
    #     'pretrained_model': "openai/clip-vit-large-patch14",
    #     'param': {
    #         'epo': 15,
    #         'head': 'concat',
    #         'map_dim': 32,
    #         'batch_size': 16,
    #         'po_layer': 1
    #     }
    # },
    'clip_entire_model_added_sigmoid_gradclip.pt':{
        'model_':'CLIPModel',
        'pretrained_model': "openai/clip-vit-large-patch14",
        'param': {
            'epo': 15,
            'head': 'concat',
            'map_dim': 1024,
            'batch_size': 64,
            'po_layer': 1
        }
    },
    'clip_entire_model_added_sigmoid_gradclip-cross.pt':{ #BEST
        'model_':'CLIPModel',
        'pretrained_model': "openai/clip-vit-large-patch14",
        'param': {
            'epo': 20,
            'head': 'cross',
            'map_dim': 1024,
            'batch_size': 64,
            'po_layer': 5
        }
    },
    'clip_entire_model_added_sigmoid_gradclip_laion-CLIP-ViT-B-32-laion2B-s34B-b79K-cross.pt':{
        'model_':'CLIPModel',
        'pretrained_model': "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
        'param': {
            'epo': 20,
            'head': 'cross',
            'map_dim': 1024,
            'batch_size': 64,
            'po_layer': 5
        }
    },
    'clip_entire_model_added_sigmoid_gradclip-att-layer5.pt':{
        'model_':'CLIPModel',
        'pretrained_model': "openai/clip-vit-large-patch14",
        'param': {
            'epo': 20,
            'head': 'self-att',
            'map_dim': 1024,
            'batch_size': 64,
            'po_layer': 5
        }
    },
    'clip_entire_model_added_sigmoid_gradclip-cross-layer10.pt':{
        'model_':'CLIPModel',
        'pretrained_model': "openai/clip-vit-large-patch14",
        'param': {
            'epo': 20,
            'head': 'cross',
            'map_dim': 1024,
            'batch_size': 64,
            'po_layer': 10
        }
    },
    # 'clip_entire_model_added_sigmoid_gradclip-cross-unfreeze-last-block.pt':{
    #     'model_':'CLIPModel',
    #     'pretrained_model': "openai/clip-vit-large-patch14",
    #     'param': {
    #         'epo': 20,
    #         'head': 'cross',
    #         'map_dim': 1024,
    #         'batch_size': 8,
    #         'po_layer': 5
    #     }
    # }
    
}



In [7]:
for fp in res_map.keys():
    # print(model)
    model = torch.load(f'model_output/{fp}')
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(fp, '{:,}'.format(pytorch_total_params))


clip_entire_model_added_sigmoid_gradclip.pt 3,936,257
clip_entire_model_added_sigmoid_gradclip-cross.pt 1,075,580,929
clip_entire_model_added_sigmoid_gradclip_laion-CLIP-ViT-B-32-laion2B-s34B-b79K-cross.pt 1,075,056,641
clip_entire_model_added_sigmoid_gradclip-att-layer5.pt 24,922,113
clip_entire_model_added_sigmoid_gradclip-cross-layer10.pt 1,094,473,729


In [2]:
# For BLIP
res_map = {
    # 'blip_entire_model_kx_Salesforce-BlipModel-blip-image-captioning-large-inn.pt':{
    #     'model_':'BlipModel',
    #     'pretrained_model': "Salesforce/blip-image-captioning-large",
    #     'processer': 'BLIPProcessDataset'
    # },
    # 'blip_entire_model_kx_Salesforce-BlipForImageTextRetrieval-blip-itm-large-coco-new.pt':{
    #     'model_': 'BlipForImageTextRetrieval',
    #     'pretrained_model': 'Salesforce/blip-itm-large-coco',
    #     'processer': 'BLIPProcessDataset'
    # },
    # 'blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-LR-EPO.pt':{ # BEST BlipModel "Salesforce/blip-image-captioning-large"
    #     'model_': 'BlipModel', 
    #     'pretrained_model': "Salesforce/blip-image-captioning-large",
    #     'processer': 'BLIPProcessDataset'
    # },
    'blip_entire_model_Salesforce-BlipForImageTextRetrieval-blip-itm-large-coco-new-LR-EPO.pt':{ # BEST BlipForImageTextRetrieval "Salesforce/blip-itm-large-coco"
        'model_': 'BlipForImageTextRetrieval' , 
        'pretrained_model': "Salesforce/blip-itm-large-coco",
        'processer': 'BLIPProcessDataset'
    },
    'blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-cross.pt':{ # BEST BlipModel "Salesforce/blip-image-captioning-large" CROSS
        'model_': 'BlipModel' , 
        'pretrained_model': "Salesforce/blip-image-captioning-large",
        'processer': 'BLIPProcessDataset',
        'fusion': 'cross'
    },
    'blip_entire_model_Salesforce-BlipModel-blip2-inn-concat.pt':{ # BEST Blip2Model "Salesforce/blip2-opt-2.7b"
        'model_': 'Blip2Model' , 
        'pretrained_model': "Salesforce/blip2-opt-2.7b",
        'processer': 'BLIP2ProcessDataset'
    },
    # 'blip_entire_model_kx_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat.pt':{
    #     'model_': 'Blip2Model' , 
    #     'pretrained_model': "Salesforce/blip2-flan-t5-xl",
    #     'processer': 'BLIP2ProcessDataset'
    # },
    # 'blip_entire_model_kx_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5.pt':{
    #     'model_': 'Blip2Model' , 
    #     'pretrained_model': "Salesforce/blip2-flan-t5-xl",
    #     'processer': 'BLIP2ProcessDataset'
    # },
    'blip_entire_model_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5-LR-5e-3.pt':{
        'model_': 'Blip2Model' , 
        'pretrained_model': "Salesforce/blip2-flan-t5-xl",
        'processer': 'BLIP2ProcessDataset'
    },
    # 'blip_entire_model_kx_Salesforce-BlipModel-blip2-inn-concat-epo30.pt':{
    #     'model_': 'Blip2Model' , 
    #     'pretrained_model': "Salesforce/blip2-opt-2.7b",
    #     'processer': 'BLIP2ProcessDataset'
    # },
    # 'dino_large_bge.pt':{
    #     'model_': 'facebook/dinov2-large' , 
    #     'pretrained_img_model': 'facebook/dinov2-large',
    #     'pretrained_txt_model': 'BAAI/bge-m3',
    #     'processer': 'DinoProcessDataset',
    #     'fusion': 'concat'
    # },
    
}


In [3]:
for fp in res_map.keys():
    # print(model)
    model = torch.load(f'model_output/selected/{fp}')
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(fp, '{:,}'.format(pytorch_total_params))


blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-LR-EPO.pt 43,393
blip_entire_model_Salesforce-BlipForImageTextRetrieval-blip-itm-large-coco-new-LR-EPO.pt 1,488,642
blip_entire_model_Salesforce-BlipModel-blip-image-captioning-large-inn-cross.pt 43,393


RuntimeError: CUDA out of memory. Tried to allocate 26.00 MiB (GPU 0; 31.75 GiB total capacity; 14.98 GiB already allocated; 19.75 MiB free; 15.22 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [4]:

for fp in ['blip_entire_model_Salesforce-BlipModel-blip2-inn-concat.pt']:
    # print(model)
    model = torch.load(f'model_output/selected/{fp}')
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(fp, '{:,}'.format(pytorch_total_params))


blip_entire_model_Salesforce-BlipModel-blip2-inn-concat.pt 5,903,361


RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 31.75 GiB total capacity; 22.41 GiB already allocated; 18.75 MiB free; 22.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:

    
for fp in ['blip_entire_model_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5-LR-5e-3.pt']:
    # print(model)
    model = torch.load(f'model_output/selected/{fp}')
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(fp, '{:,}'.format(pytorch_total_params))


blip_entire_model_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5-LR-5e-3.pt 46,674,945


-----------

In [8]:
sum(p.numel() for p in model.parameters() if not p.requires_grad)
3,942,446,592

3942446592

In [9]:
# blip_entire_model_Salesforce-BlipModel-blip2-flan-t5-xlinn-concat-layer5-LR-5e-3.pt
for name, p in model.named_parameters():
    if p.requires_grad:
        print(name, p.numel())

image_map.0.weight 1441792
image_map.0.bias 1024
image_map.3.weight 1048576
image_map.3.bias 1024
image_map.6.weight 1048576
image_map.6.bias 1024
image_map.9.weight 1048576
image_map.9.bias 1024
image_map.12.weight 1048576
image_map.12.bias 1024
text_map.0.weight 32899072
text_map.0.bias 1024
text_map.3.weight 1048576
text_map.3.bias 1024
text_map.6.weight 1048576
text_map.6.bias 1024
text_map.9.weight 1048576
text_map.9.bias 1024
text_map.12.weight 1048576
text_map.12.bias 1024
qformer_map.0.weight 786432
qformer_map.0.bias 1024
pre_output.1.weight 3145728
pre_output.1.bias 1024
classifier.weight 1024
classifier.bias 1


In [10]:
1441792+1024+1048576

2491392

In [11]:
758*758*758

435519512

In [3]:
model

CustomBLIP(
  (blip): Blip2Model(
    (vision_model): Blip2VisionModel(
      (embeddings): Blip2VisionEmbeddings(
        (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
      )
      (encoder): Blip2Encoder(
        (layers): ModuleList(
          (0): Blip2EncoderLayer(
            (self_attn): Blip2Attention(
              (dropout): Dropout(p=0.0, inplace=False)
              (qkv): Linear(in_features=1408, out_features=4224, bias=True)
              (projection): Linear(in_features=1408, out_features=1408, bias=True)
            )
            (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
            (mlp): Blip2MLP(
              (activation_fn): GELUActivation()
              (fc1): Linear(in_features=1408, out_features=6144, bias=True)
              (fc2): Linear(in_features=6144, out_features=1408, bias=True)
            )
            (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          )
          (1