In [None]:
# Code Overview
# Extracts text, vision, and multimodal embeddings from the multimodal DeCLUTR-ViT backbone trained with CLIP, CLIP-ITM, and BLIP2 objectives.

In [None]:
"""
Python version: 3.10
Description: Performs Multimodal Authorship Attribution using CLIP training strategy
"""

# %% Importing Libraries
import os
import re
import sys
import argparse
import time
import datetime
import random
from pathlib import Path
from PIL import Image

import pandas as pd

from tqdm import tqdm
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader

from pytorch_lightning.loggers import WandbLogger

import lightning as L
import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.tuner.tuning import Tuner
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from transformers import AutoTokenizer, ViTImageProcessor

# Custom library
sys.path.append('../process/')
from utilities import map_images_with_text_for_clip_model
from loadData import FineTuneCLIPstyleModelDataset

import warnings
warnings.filterwarnings('ignore')

In [95]:
# Suppress TorchDynamo errors and fall back to eager execution.
# NOTE(review): with this flag set, compilation failures no longer raise, so a
# silently un-compiled (slower) model is possible; unset it when debugging
# torch.compile issues.
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [96]:
import os

class Args:
    """Run configuration for this notebook (plain-attribute stand-in for CLI arguments).

    ``Args()`` yields the defaults listed below. Any setting can be overridden by
    keyword, e.g. ``Args(model_type="BLIP2", seed=7)``. Unless ``logged_entry_name``
    is overridden explicitly, it is derived from ``seed`` so that the run name and
    the seed cannot drift apart.

    Raises:
        TypeError: if an override names a setting that is not defined here.
    """

    def __init__(self, **overrides):
        # Placeholder; the real value is derived from the seed once overrides are applied.
        self.logged_entry_name = None

        # --- Locations -------------------------------------------------------
        self.data_dir = '/workspace/persistent/HTClipper/data/processed'
        self.image_dir = "/workspace/persistent/HTClipper/data/IMAGES"
        # NOTE: the second component is absolute, so on POSIX os.path.join()
        # discards os.getcwd() and save_dir is exactly the literal path below.
        self.save_dir = os.path.join(os.getcwd(), "/workspace/persistent/HTClipper/models/grouped-and-masked/multimodal-baselines/pre-training/")
        self.model_dir_name = None

        # --- Model / data selection -------------------------------------------
        self.pairing_mode = "non-associated"
        self.model_type = "CLIP"  # Can be "CLIP", "BLIP2", or "CLIPITM"

        # --- Optimisation settings --------------------------------------------
        self.batch_size = 32
        self.nb_epochs = 40
        self.patience = 3
        self.nb_negatives = 1
        self.seed = 1111
        self.warmup_steps = 0
        self.grad_steps = 1
        self.learning_rate = 6e-4
        self.train_data_percentage = 1.0
        self.adam_epsilon = 1e-6
        self.min_delta_change = 0.01
        self.weight_decay = 0.01

        # --- Augmentation / loss / representation ------------------------------
        self.augment_data = False
        self.nb_augmented_samples = 1
        self.loss = 'CE'
        self.temp = 0.5  # presumably the contrastive-loss temperature -- TODO confirm
        self.repr = "CLS"

        # Reject typos early instead of silently creating an unused attribute.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(f"Unknown Args setting(s): {', '.join(unknown)}")
        for setting, value in overrides.items():
            setattr(self, setting, value)

        # Keep the logged run name in sync with whichever seed is in effect.
        if "logged_entry_name" not in overrides:
            self.logged_entry_name = f"multimodal-latent-fusion-seed:{self.seed}"

# Instantiate the arguments
args = Args()

In [97]:
class FineTuneCLIPClassifier(pl.LightningModule):
    """Classification head on top of the text and image encoders of a pretrained CLIP-style model.

    The text representation (CLS or last non-padding token, see
    ``extract_representation_from``) and the image CLS representation are averaged
    and fed to a single linear classifier. The same fused vector is returned as the
    embedding, and ``get_embedding`` exposes the per-modality / fused vectors with
    the same pooling that ``forward`` uses.

    Args:
        pretrained_model: object exposing ``text_model`` and ``image_model`` encoders.
        finetune_mode: ``"finetune_layers"`` freezes every encoder parameter whose
            layer index is not in ``layers_to_finetune``; ``"all"`` unfreezes both
            encoders; any other value leaves ``requires_grad`` untouched.
        extract_representation_from: ``"CLS"`` or ``"EOS"`` text pooling.
        num_classes: number of output classes of the linear head.
        weight_decay, eps, learning_rate: AdamW settings.
        warmup_steps, num_training_steps: linear warmup/decay schedule settings.
        loss_fn: loss name; ``"CE+SupCon"`` adds a supervised contrastive term to
            the cross-entropy loss for training and validation.
        temperature: temperature passed to ``SupConLoss``.
        layers_to_finetune: optional collection of layer indices kept trainable in
            ``"finetune_layers"`` mode. When omitted, the notebook-level
            ``args.layers_to_finetune`` is used if it exists (legacy behaviour).
    """

    # Captures the block index in names such as "encoder.layer.3.attention...".
    # Anchoring on "layer(s).<int>" avoids mis-parsing names that merely contain
    # the substring "layer" (e.g. a top-level "layernorm.weight").
    _LAYER_INDEX_PATTERN = re.compile(r"(?:^|\.)layers?\.(\d+)(?:\.|$)")

    def __init__(self, pretrained_model, finetune_mode, extract_representation_from, num_classes, weight_decay, eps, warmup_steps, num_training_steps, 
                learning_rate, loss_fn, temperature, layers_to_finetune=None):
        super(FineTuneCLIPClassifier, self).__init__()
        self.text_model = pretrained_model.text_model
        self.image_model = pretrained_model.image_model
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.eps = eps
        self.warmup_steps = warmup_steps
        self.num_training_steps = num_training_steps
        self.loss_fn_name = loss_fn
        self.extract_representation_from = extract_representation_from
        self.temperature = temperature
        self.finetune_mode = finetune_mode
        self.layers_to_finetune = layers_to_finetune

        self.validation_outputs = []  # Per-batch predictions/labels of the current validation epoch
        self.test_outputs = []  # Per-batch predictions/labels of the current test epoch

        # Classification head; sized by the text encoder's hidden size, so the image
        # encoder must produce vectors of the same width for the average in forward().
        self.classifier = nn.Linear(self.text_model.config.hidden_size, num_classes)

        # Loss function
        self.ce_loss = nn.CrossEntropyLoss()

        if self.loss_fn_name == "CE+SupCon":
            self.supcon_loss = SupConLoss(self.temperature)

    # ------------------------------------------------------------------ encoders
    def _pool_text(self, last_hidden_state, attention_mask):
        """Reduce token-level text states to one vector per sample.

        "CLS" takes position 0; "EOS" takes the last position whose attention mask
        is set (this assumes right-padded inputs).

        Raises:
            ValueError: if ``extract_representation_from`` is neither "CLS" nor "EOS".
        """
        if self.extract_representation_from == "CLS":
            return last_hidden_state[:, 0, :]
        if self.extract_representation_from == "EOS":
            last_token_positions = attention_mask.sum(dim=1) - 1
            # Build the row index on the same device as the states being indexed.
            row_index = torch.arange(last_hidden_state.size(0), device=last_hidden_state.device)
            return last_hidden_state[row_index, last_token_positions, :]
        raise ValueError("extract_representation_from must be 'CLS' or 'EOS'")

    def _encode_text(self, input_ids, attention_mask):
        """Run the text encoder and pool its output with the configured strategy."""
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        return self._pool_text(text_outputs.last_hidden_state, attention_mask)

    def _encode_image(self, pixel_values):
        """Run the image encoder and return its CLS (position 0) state."""
        image_outputs = self.image_model(pixel_values=pixel_values)
        return image_outputs.last_hidden_state[:, 0, :]

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return ``(logits, embeddings)`` where embeddings is the mean of text and image vectors."""
        text_embeddings = self._encode_text(input_ids, attention_mask)
        image_embeddings = self._encode_image(pixel_values)

        # Take the mean of text and image embeddings
        embeddings = (text_embeddings + image_embeddings) / 2

        logits = self.classifier(embeddings)
        return logits, embeddings

    # --------------------------------------------------------------------- steps
    def _compute_loss(self, batch, with_supcon):
        """Forward one batch; return ``(loss, logits)``.

        The supervised contrastive term is added only when ``with_supcon`` is true
        and the module was configured with ``loss_fn == "CE+SupCon"``.
        """
        logits, embeddings = self(
            batch["input_ids"], batch["attention_mask"], batch["pixel_values"]
        )
        loss = self.ce_loss(logits, batch["labels"])

        if with_supcon and self.loss_fn_name == "CE+SupCon":
            features = F.normalize(embeddings, dim=1)
            loss = loss + self.supcon_loss(features, batch["labels"])

        return loss, logits

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss, _ = self._compute_loss(batch, with_supcon=True)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss; stash predictions for epoch-end metrics."""
        loss, logits = self._compute_loss(batch, with_supcon=True)
        preds = torch.argmax(logits, dim=1)

        self.validation_outputs.append({'preds': preds, 'labels': batch["labels"]})
        self.log("val_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss (cross-entropy only); stash predictions."""
        loss, logits = self._compute_loss(batch, with_supcon=False)
        preds = torch.argmax(logits, dim=1)

        self.test_outputs.append({'preds': preds, 'labels': batch["labels"]})
        self.log("test_loss", loss)
        return loss

    # ------------------------------------------------------------- epoch metrics
    def _log_epoch_metrics(self, outputs, prefix):
        """Log balanced accuracy and weighted/micro/macro F1 under ``<prefix>_*`` keys.

        Does nothing when no batch was collected, instead of failing on an empty
        ``torch.cat``.
        """
        if not outputs:
            return
        # Move to host memory once; every metric below reuses these arrays.
        y_pred = torch.cat([x['preds'] for x in outputs]).cpu().numpy()
        y_true = torch.cat([x['labels'] for x in outputs]).cpu().numpy()

        self.log(f'{prefix}_acc', balanced_accuracy_score(y_true, y_pred), on_step=False, on_epoch=True)
        for average in ('weighted', 'micro', 'macro'):
            self.log(f'{prefix}_f1_{average}', f1_score(y_true, y_pred, average=average), on_step=False, on_epoch=True)

    # At the end of validation epoch, calculate accuracy and F1 scores
    def on_validation_epoch_end(self):
        self._log_epoch_metrics(self.validation_outputs, 'val')
        self.validation_outputs = []

    # At the end of the test epoch, calculate accuracy and F1 scores
    def on_test_epoch_end(self):
        self._log_epoch_metrics(self.test_outputs, 'test')
        self.test_outputs = []

    # ----------------------------------------------------------------- optimiser
    @classmethod
    def _layer_index(cls, param_name):
        """Return the encoder block index encoded in ``param_name``, or None if there is none."""
        match = cls._LAYER_INDEX_PATTERN.search(param_name)
        return int(match.group(1)) if match else None

    def _resolve_layers_to_finetune(self):
        """Return the layer indices that stay trainable in "finetune_layers" mode.

        Prefers the constructor argument; otherwise falls back to the notebook-level
        ``args.layers_to_finetune`` that earlier versions of this class read directly.

        Raises:
            ValueError: if neither source provides the layer list.
        """
        if self.layers_to_finetune is not None:
            return self.layers_to_finetune
        legacy_layers = getattr(globals().get("args"), "layers_to_finetune", None)
        if legacy_layers is None:
            raise ValueError(
                "finetune_mode='finetune_layers' requires layers_to_finetune "
                "(constructor argument or args.layers_to_finetune)."
            )
        return legacy_layers

    def _apply_finetune_mode(self, encoders):
        """Freeze or unfreeze encoder parameters according to ``finetune_mode``."""
        if self.finetune_mode == "finetune_layers":
            trainable_layers = self._resolve_layers_to_finetune()
            for encoder in encoders:
                for name, param in encoder.named_parameters():
                    # Parameters outside numbered blocks yield None and are frozen
                    # unless None is explicitly listed.
                    if self._layer_index(name) not in trainable_layers:
                        param.requires_grad = False
        elif self.finetune_mode == "all":
            # Unfreeze all layers
            for encoder in encoders:
                for param in encoder.parameters():
                    param.requires_grad = True

    def configure_optimizers(self):
        """Build AdamW with decay/no-decay parameter groups and a per-step linear warmup schedule."""
        # Freeze layers if needed
        self._apply_finetune_mode((self.text_model, self.image_model))

        # NOTE(review): matching is a case-sensitive substring test, so parameters
        # named e.g. "layernorm_before.weight" still receive weight decay -- confirm
        # this is intended.
        no_decay = ["bias", "LayerNorm.weight"]
        decayed, undecayed = [], []
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            if any(marker in name for marker in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)

        optimizer_grouped_parameters = [
            {"params": decayed, "weight_decay": self.weight_decay},
            {"params": undecayed, "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters, lr=self.learning_rate, eps=self.eps
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.num_training_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]

    # ---------------------------------------------------------------- embeddings
    def get_embedding(self, input_ids=None, attention_mask=None, pixel_values=None, embedding_type='multimodal'):
        """
        Generate text, image, or multimodal embeddings.

        The text vector is pooled exactly as in ``forward`` (according to
        ``extract_representation_from``), so padded batches never contribute a
        padding-token state. Embeddings are returned un-normalized.

        Args:
            input_ids: Tokenized input text (for text embeddings).
            attention_mask: Attention mask for input text (for text embeddings).
            pixel_values: Preprocessed image (for image embeddings).
            embedding_type: Specify 'text', 'image', or 'multimodal' to generate the respective embeddings.

        Returns:
            Embeddings as torch.Tensor.

        Raises:
            ValueError: if a required input is missing or ``embedding_type`` is unknown.
        """
        if embedding_type == 'text':
            if input_ids is None or attention_mask is None:
                raise ValueError("input_ids and attention_mask are required for text embeddings.")
            return self._encode_text(input_ids, attention_mask)

        elif embedding_type == 'image':
            if pixel_values is None:
                raise ValueError("pixel_values are required for image embeddings.")
            return self._encode_image(pixel_values)

        elif embedding_type == 'multimodal':
            if input_ids is None or attention_mask is None or pixel_values is None:
                raise ValueError("input_ids, attention_mask, and pixel_values are required for multimodal embeddings.")
            text_embeddings = self.get_embedding(input_ids, attention_mask, embedding_type='text')
            image_embeddings = self.get_embedding(pixel_values=pixel_values, embedding_type='image')
            # Average embeddings (same fusion as forward())
            return (text_embeddings + image_embeddings) / 2

        else:
            raise ValueError("Invalid embedding_type. Choose 'text', 'image', or 'multimodal'.")

In [98]:
class FineTuneCLIPITMClassifier(pl.LightningModule):
    """Classification head on top of a pretrained text encoder, image encoder and Q-Former.

    The text CLS state and the mean-pooled Q-Former output (computed from the image
    encoder's token states) are averaged and fed to a single linear classifier.

    Args:
        pretrained_model: object exposing ``text_model``, ``image_model`` and ``qformer``.
        finetune_mode: ``"finetune_layers"`` freezes every parameter of the three
            sub-models whose layer index is not in ``layers_to_finetune``; ``"all"``
            unfreezes them; any other value leaves ``requires_grad`` untouched.
        num_classes: number of output classes of the linear head.
        weight_decay, eps, learning_rate: AdamW settings.
        warmup_steps, num_training_steps: linear warmup/decay schedule settings.
        loss_fn: loss name; ``"CE+SupCon"`` adds a supervised contrastive term to
            the cross-entropy loss for training and validation.
        temperature: temperature passed to ``SupConLoss``.
        layers_to_finetune: optional collection of layer indices kept trainable in
            ``"finetune_layers"`` mode. When omitted, the notebook-level
            ``args.layers_to_finetune`` is used if it exists (legacy behaviour).
    """

    # Captures the block index in names such as "encoder.layer.3.attention...".
    # Anchoring on "layer(s).<int>" avoids mis-parsing names that merely contain
    # the substring "layer" or that carry extra leading name components.
    _LAYER_INDEX_PATTERN = re.compile(r"(?:^|\.)layers?\.(\d+)(?:\.|$)")

    def __init__(self, pretrained_model, finetune_mode, num_classes, weight_decay, eps, warmup_steps, num_training_steps, 
                learning_rate, loss_fn, temperature, layers_to_finetune=None):
        super(FineTuneCLIPITMClassifier, self).__init__()
        self.text_model = pretrained_model.text_model
        self.image_model = pretrained_model.image_model
        self.qformer = pretrained_model.qformer
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.eps = eps
        self.warmup_steps = warmup_steps
        self.num_training_steps = num_training_steps
        self.loss_fn_name = loss_fn
        self.finetune_mode = finetune_mode
        self.temperature = temperature
        self.layers_to_finetune = layers_to_finetune

        self.validation_outputs = []  # Per-batch predictions/labels of the current validation epoch
        self.test_outputs = []  # Per-batch predictions/labels of the current test epoch

        # Classification head; sized by the text encoder's hidden size, so the pooled
        # Q-Former output must have the same width for the average in forward().
        self.classifier = nn.Linear(self.text_model.config.hidden_size, num_classes)

        # Loss function
        self.ce_loss = nn.CrossEntropyLoss()

        if self.loss_fn_name == "CE+SupCon":
            self.supcon_loss = SupConLoss(self.temperature)

    # ------------------------------------------------------------------ encoders
    def _encode_text(self, input_ids, attention_mask):
        """Run the text encoder and return its [CLS] (position 0) state."""
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        return text_outputs.last_hidden_state[:, 0, :]

    def _encode_image(self, pixel_values):
        """Run the image encoder, pass all token states through the Q-Former and mean-pool over dim 1."""
        image_outputs = self.image_model(pixel_values=pixel_values)
        image_token_states = image_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Pass through the custom Q-Former
        query_embeddings = self.qformer(image_token_states)
        return query_embeddings.mean(dim=1)  # Mean pooling over queries

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return ``(logits, embeddings)`` where embeddings is the mean of text and Q-Former vectors."""
        text_embeddings = self._encode_text(input_ids, attention_mask)
        query_embeddings = self._encode_image(pixel_values)

        # Take the mean of text and image embeddings
        embeddings = (text_embeddings + query_embeddings) / 2

        logits = self.classifier(embeddings)
        return logits, embeddings

    # --------------------------------------------------------------------- steps
    def _compute_loss(self, batch, with_supcon):
        """Forward one batch; return ``(loss, logits)``.

        The supervised contrastive term is added only when ``with_supcon`` is true
        and the module was configured with ``loss_fn == "CE+SupCon"``.
        """
        logits, embeddings = self(
            batch["input_ids"], batch["attention_mask"], batch["pixel_values"]
        )
        loss = self.ce_loss(logits, batch["labels"])

        if with_supcon and self.loss_fn_name == "CE+SupCon":
            features = F.normalize(embeddings, dim=1)
            loss = loss + self.supcon_loss(features, batch["labels"])

        return loss, logits

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss, _ = self._compute_loss(batch, with_supcon=True)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss; stash predictions for epoch-end metrics."""
        loss, logits = self._compute_loss(batch, with_supcon=True)
        preds = torch.argmax(logits, dim=1)
        self.validation_outputs.append({'preds': preds, 'labels': batch["labels"]})

        self.log("val_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss (cross-entropy only); stash predictions."""
        loss, logits = self._compute_loss(batch, with_supcon=False)
        preds = torch.argmax(logits, dim=1)
        self.test_outputs.append({'preds': preds, 'labels': batch["labels"]})

        self.log("test_loss", loss)
        return loss

    # ------------------------------------------------------------- epoch metrics
    def _log_epoch_metrics(self, outputs, prefix):
        """Log balanced accuracy and weighted/micro/macro F1 under ``<prefix>_*`` keys.

        Does nothing when no batch was collected, instead of failing on an empty
        ``torch.cat``.
        """
        if not outputs:
            return
        # Move to host memory once; every metric below reuses these arrays.
        y_pred = torch.cat([x['preds'] for x in outputs]).cpu().numpy()
        y_true = torch.cat([x['labels'] for x in outputs]).cpu().numpy()

        self.log(f'{prefix}_acc', balanced_accuracy_score(y_true, y_pred), on_step=False, on_epoch=True)
        for average in ('weighted', 'micro', 'macro'):
            self.log(f'{prefix}_f1_{average}', f1_score(y_true, y_pred, average=average), on_step=False, on_epoch=True)

    # At the end of validation epoch, calculate accuracy and F1 scores
    def on_validation_epoch_end(self):
        self._log_epoch_metrics(self.validation_outputs, 'val')
        self.validation_outputs = []

    # At the end of the test epoch, calculate accuracy and F1 scores
    def on_test_epoch_end(self):
        self._log_epoch_metrics(self.test_outputs, 'test')
        self.test_outputs = []

    # ----------------------------------------------------------------- optimiser
    @classmethod
    def _layer_index(cls, param_name):
        """Return the encoder block index encoded in ``param_name``, or None if there is none."""
        match = cls._LAYER_INDEX_PATTERN.search(param_name)
        return int(match.group(1)) if match else None

    def _resolve_layers_to_finetune(self):
        """Return the layer indices that stay trainable in "finetune_layers" mode.

        Prefers the constructor argument; otherwise falls back to the notebook-level
        ``args.layers_to_finetune`` that earlier versions of this class read directly.

        Raises:
            ValueError: if neither source provides the layer list.
        """
        if self.layers_to_finetune is not None:
            return self.layers_to_finetune
        legacy_layers = getattr(globals().get("args"), "layers_to_finetune", None)
        if legacy_layers is None:
            raise ValueError(
                "finetune_mode='finetune_layers' requires layers_to_finetune "
                "(constructor argument or args.layers_to_finetune)."
            )
        return legacy_layers

    def _apply_finetune_mode(self, encoders):
        """Freeze or unfreeze sub-model parameters according to ``finetune_mode``."""
        if self.finetune_mode == "finetune_layers":
            trainable_layers = self._resolve_layers_to_finetune()
            for encoder in encoders:
                for name, param in encoder.named_parameters():
                    # Parameters outside numbered blocks yield None and are frozen
                    # unless None is explicitly listed.
                    if self._layer_index(name) not in trainable_layers:
                        param.requires_grad = False
        elif self.finetune_mode == "all":
            # Unfreeze all layers
            for encoder in encoders:
                for param in encoder.parameters():
                    param.requires_grad = True

    def configure_optimizers(self):
        """Build AdamW with decay/no-decay parameter groups and a per-step linear warmup schedule."""
        # Freeze layers if needed
        self._apply_finetune_mode((self.text_model, self.image_model, self.qformer))

        # NOTE(review): matching is a case-sensitive substring test, so parameters
        # named e.g. "layernorm_before.weight" still receive weight decay -- confirm
        # this is intended.
        no_decay = ["bias", "LayerNorm.weight"]
        decayed, undecayed = [], []
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            if any(marker in name for marker in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)

        optimizer_grouped_parameters = [
            {"params": decayed, "weight_decay": self.weight_decay},
            {"params": undecayed, "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters, lr=self.learning_rate, eps=self.eps
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.num_training_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]

    # ---------------------------------------------------------------- embeddings
    def get_embedding(self, input_ids=None, attention_mask=None, pixel_values=None, embedding_type='multimodal'):
        """
        Generate text, image, or multimodal embeddings.

        Every returned embedding is L2-normalized along the last dimension. The
        multimodal embedding is the re-normalized mean of the normalized text and
        image embeddings, so it is not numerically identical to the un-normalized
        fused vector returned by ``forward``.

        Args:
            input_ids: Tokenized input text (for text embeddings).
            attention_mask: Attention mask for input text (for text embeddings).
            pixel_values: Preprocessed image (for image embeddings).
            embedding_type: Specify 'text', 'image', or 'multimodal' to generate the respective embeddings.

        Returns:
            Embeddings as torch.Tensor.

        Raises:
            ValueError: if a required input is missing or ``embedding_type`` is unknown.
        """
        if embedding_type == 'text':
            if input_ids is None or attention_mask is None:
                raise ValueError("input_ids and attention_mask are required for text embeddings.")
            # CLS token embedding, unit length
            return F.normalize(self._encode_text(input_ids, attention_mask), p=2, dim=-1)

        elif embedding_type == 'image':
            if pixel_values is None:
                raise ValueError("pixel_values are required for image embeddings.")
            # Mean-pooled Q-Former output, unit length
            return F.normalize(self._encode_image(pixel_values), p=2, dim=-1)

        elif embedding_type == 'multimodal':
            if input_ids is None or attention_mask is None or pixel_values is None:
                raise ValueError("input_ids, attention_mask, and pixel_values are required for multimodal embeddings.")
            text_embeddings = self.get_embedding(input_ids, attention_mask, embedding_type='text')
            image_embeddings = self.get_embedding(pixel_values=pixel_values, embedding_type='image')
            # Average embeddings
            embeddings = (text_embeddings + image_embeddings) / 2
            return F.normalize(embeddings, p=2, dim=-1)

        else:
            raise ValueError("Invalid embedding_type. Choose 'text', 'image', or 'multimodal'.")

In [99]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import lightning.pytorch as pl

from transformers import AutoModel, ViTModel, get_linear_schedule_with_warmup
from transformers import BertConfig, BertModel
from transformers import T5ForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput

# import bitsandbytes as bnb
# from deepspeed.ops.adam import DeepSpeedCPUAdam

from sklearn.metrics import balanced_accuracy_score, f1_score, classification_report

sys.path.append("../architectures/")
from multimodalLayer import SupConLoss

class FineTuneBLIP2Classifier(pl.LightningModule):
    def __init__(self, pretrained_model, finetune_mode, num_classes, weight_decay, eps, warmup_steps, num_training_steps, 
                learning_rate, loss_fn, temperature):        
        super(FineTuneBLIP2Classifier, self).__init__()
        self.text_model = pretrained_model.text_model
        self.image_model = pretrained_model.image_model
        self.qformer = pretrained_model.qformer
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.eps = eps
        self.warmup_steps = warmup_steps
        self.num_training_steps = num_training_steps
        self.loss_fn_name = loss_fn
        self.finetune_mode = finetune_mode
        self.temperature = temperature

        # Classification head
        self.classifier = nn.Linear(self.text_model.config.hidden_size, num_classes)

        self.validation_outputs = []  # To store validation outputs
        self.test_outputs = []  # To store test outputs

        # Loss function
        self.ce_loss = nn.CrossEntropyLoss()

        if self.loss_fn_name == "CE+SupCon":
            self.supcon_loss = SupConLoss(self.temperature)

    def forward(self, input_ids, attention_mask, pixel_values):
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_embeddings = text_outputs.last_hidden_state[:, 0, :]  # [CLS] token

        image_outputs = self.image_model(pixel_values=pixel_values)
        image_embeddings = image_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Pass through the custom Q-Former
        query_embeddings = self.qformer(image_embeddings)
        query_embeddings = query_embeddings.mean(dim=1)  # Mean pooling over queries

        # Take the mean of text and image embeddings
        embeddings = (text_embeddings + query_embeddings) / 2

        logits = self.classifier(embeddings)
        return logits, embeddings

    def training_step(self, batch, batch_idx):
        logits, embeddings = self(
            batch["input_ids"], batch["attention_mask"], batch["pixel_values"]
        )
        loss = self.ce_loss(logits, batch["labels"])

        if self.loss_fn_name == "CE+SupCon":
            features = F.normalize(embeddings, dim=1)
            supcon_loss = self.supcon_loss(features, batch["labels"])
            loss += supcon_loss

        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        logits, embeddings = self(
            batch["input_ids"], batch["attention_mask"], batch["pixel_values"]
        )
        loss = self.ce_loss(logits, batch["labels"])

        if self.loss_fn_name == "CE+SupCon":
            features = F.normalize(embeddings, dim=1)
            supcon_loss = self.supcon_loss(features, batch["labels"])
            loss += supcon_loss

        preds = torch.argmax(logits, dim=1)
        # acc = (preds == batch["labels"]).float().mean()
        self.validation_outputs.append({'preds': preds, 'labels': batch["labels"]})

        self.log("val_loss", loss)
        # self.log("val_acc", acc)
        return loss

    def test_step(self, batch, batch_idx):
        logits, embeddings = self(
            batch["input_ids"], batch["attention_mask"], batch["pixel_values"]
        )
        loss = self.ce_loss(logits, batch["labels"])

        preds = torch.argmax(logits, dim=1)
        # acc = (preds == batch["labels"]).float().mean()
        self.test_outputs.append({'preds': preds, 'labels': batch["labels"]})

        self.log("test_loss", loss)
        # self.log("test_acc", acc)
        return loss

    def on_validation_epoch_end(self):
        val_preds = torch.cat([x['preds'] for x in self.validation_outputs])
        val_labels = torch.cat([x['labels'] for x in self.validation_outputs])
        val_acc = balanced_accuracy_score(val_labels.cpu().numpy(), val_preds.cpu().numpy())
        val_f1_weighted = f1_score(val_labels.cpu().numpy(), val_preds.cpu().numpy(), average='weighted')
        val_f1_micro = f1_score(val_labels.cpu().numpy(), val_preds.cpu().numpy(), average='micro')
        val_f1_macro = f1_score(val_labels.cpu().numpy(), val_preds.cpu().numpy(), average='macro')
        self.log('val_acc', val_acc, on_step=False, on_epoch=True)
        self.log('val_f1_weighted', val_f1_weighted, on_step=False, on_epoch=True)
        self.log('val_f1_micro', val_f1_micro, on_step=False, on_epoch=True)
        self.log('val_f1_macro', val_f1_macro, on_step=False, on_epoch=True)
        self.validation_outputs = []

    # At the end of the test epoch, calculate accuracy and F1 scores
    def on_test_epoch_end(self):
        test_preds = torch.cat([x['preds'] for x in self.test_outputs])
        test_labels = torch.cat([x['labels'] for x in self.test_outputs])
        test_acc = balanced_accuracy_score(test_labels.cpu().numpy(), test_preds.cpu().numpy())
        test_f1_weighted = f1_score(test_labels.cpu().numpy(), test_preds.cpu().numpy(), average='weighted')
        test_f1_micro = f1_score(test_labels.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_f1_macro = f1_score(test_labels.cpu().numpy(), test_preds.cpu().numpy(), average='macro')
        self.log('test_acc', test_acc, on_step=False, on_epoch=True)
        self.log('test_f1_weighted', test_f1_weighted, on_step=False, on_epoch=True)
        self.log('test_f1_micro', test_f1_micro, on_step=False, on_epoch=True)
        self.log('test_f1_macro', test_f1_macro, on_step=False, on_epoch=True)
        self.test_outputs = []

    def configure_optimizers(self):
        """
        Apply the fine-tuning mode to the backbones and build the optimizer and LR scheduler.

        Behaviour by ``self.finetune_mode``:
          * ``"finetune_layers"`` - every parameter of ``text_model``, ``image_model`` and
            ``qformer`` whose layer index is not listed in the module-level
            ``args.layers_to_finetune`` is frozen. Parameters that do not belong to a
            numbered layer are treated as index ``None``.
          * ``"all"`` - every parameter of the three backbones is made trainable.
          * anything else - ``requires_grad`` flags are left untouched.

        Returns:
            ``([optimizer], [scheduler_config])`` where the optimizer is AdamW with weight
            decay disabled for names containing ``bias`` / ``LayerNorm.weight``, and the
            scheduler is a linear warmup/decay schedule stepped every optimizer step.
        """
        def _layer_index(param_name):
            # Pull the index out of a "...layer.<N>..." (or "...layers.<N>...") path segment.
            # The previous positional parse, int(name.split(".")[2]), raised IndexError on
            # short names that merely contain the substring "layer" (e.g. "layernorm.weight")
            # and ValueError whenever the index was not the third path component.
            match = re.search(r"(?:^|\.)layers?\.(\d+)(?:\.|$)", param_name)
            return int(match.group(1)) if match else None

        if self.finetune_mode == "finetune_layers":
            for backbone in (self.text_model, self.image_model, self.qformer):
                for name, param in backbone.named_parameters():
                    if _layer_index(name) not in args.layers_to_finetune:
                        param.requires_grad = False

        elif self.finetune_mode == "all":
            for backbone in (self.text_model, self.image_model, self.qformer):
                for param in backbone.parameters():
                    param.requires_grad = True

        # Split the trainable parameters into the weight-decayed and non-decayed groups
        # in a single pass over the module.
        no_decay = ("bias", "LayerNorm.weight")
        decayed_params, undecayed_params = [], []
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            if any(marker in name for marker in no_decay):
                undecayed_params.append(param)
            else:
                decayed_params.append(param)

        optimizer_grouped_parameters = [
            {"params": decayed_params, "weight_decay": self.weight_decay},
            {"params": undecayed_params, "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters, lr=self.learning_rate, eps=self.eps
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.num_training_steps,
        )
        # "interval": "step" asks Lightning to advance the schedule after every optimizer step.
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]
    
    """
    def get_text_embeddings(self, input_ids, attention_mask):
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_embeddings = text_outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return text_embeddings

    def get_image_embeddings(self, pixel_values):
        # Extract image embeddings before Q-Former
        image_outputs = self.image_model(pixel_values=pixel_values)
        image_embeddings = image_outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return image_embeddings

    def get_multimodal_embeddings(self, pixel_values):
        # Get image embeddings from the image model
        image_outputs = self.image_model(pixel_values=pixel_values)
        image_embeddings = image_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Pass through the Q-Former
        query_embeddings = self.qformer(image_embeddings)
        # Aggregate the query embeddings, e.g., mean pooling
        multimodal_embeddings = query_embeddings.mean(dim=1)  # Shape: [batch_size, hidden_size]
        return multimodal_embeddings
    """
    
    # Embedding extraction entry point: get_embedding (text, image, or multimodal)
    def get_embedding(self, input_ids=None, attention_mask=None, pixel_values=None, embedding_type='multimodal'):
        """
        Produce an L2-normalised embedding for text, image, or both combined.

        Args:
            input_ids: Token ids for the text encoder (needed for 'text' and 'multimodal').
            attention_mask: Attention mask matching ``input_ids`` (needed for 'text' and 'multimodal').
            pixel_values: Preprocessed image batch (needed for 'image' and 'multimodal').
            embedding_type: One of 'text', 'image', or 'multimodal'.

        Returns:
            torch.Tensor with the requested embeddings, normalised along the last dimension.

        Raises:
            ValueError: If an input required by ``embedding_type`` is missing, or if
                ``embedding_type`` is not one of the three supported values.
        """
        if embedding_type == 'text':
            if input_ids is None or attention_mask is None:
                raise ValueError("input_ids and attention_mask are required for text embeddings.")
            encoder_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
            # The first position of the last hidden state is used as the sentence representation.
            first_token_state = encoder_out.last_hidden_state[:, 0, :]
            return F.normalize(first_token_state, p=2, dim=-1)

        if embedding_type == 'image':
            if pixel_values is None:
                raise ValueError("pixel_values are required for image embeddings.")
            vision_states = self.image_model(pixel_values=pixel_values).last_hidden_state
            # Feed the full vision sequence to the Q-Former, then average over dim 1 of its output.
            pooled_queries = self.qformer(vision_states).mean(dim=1)
            return F.normalize(pooled_queries, p=2, dim=-1)

        if embedding_type == 'multimodal':
            if input_ids is None or attention_mask is None or pixel_values is None:
                raise ValueError("input_ids, attention_mask, and pixel_values are required for multimodal embeddings.")
            text_part = self.get_embedding(input_ids, attention_mask, embedding_type='text')
            image_part = self.get_embedding(pixel_values=pixel_values, embedding_type='image')
            # Fuse by averaging the two unit vectors, then re-normalise the result.
            fused = (text_part + image_part) / 2
            return F.normalize(fused, p=2, dim=-1)

        raise ValueError("Invalid embedding_type. Choose 'text', 'image', or 'multimodal'.")

In [100]:
# Device shared by the cells below (first CUDA device when available, otherwise CPU).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
df = pd.read_csv("/workspace/persistent/HTClipper/data/processed/south.csv")
# The image mapper reads a 'region' column to build each image path.
df['region'] = 'south'

# Expand each ad into one row per image found under args.image_dir, then drop exact duplicate rows
df = map_images_with_text_for_clip_model(df, img_dir=args.image_dir).drop_duplicates()

# Identify and remove classes with fewer than 3 instances
class_counts = df['VENDOR'].value_counts()
valid_classes = class_counts[class_counts >= 3].index
df_filtered = df[df['VENDOR'].isin(valid_classes)].reset_index(drop=True)

# Encode the vendor names as integer class labels
label_encoder = LabelEncoder()
df_filtered['label'] = label_encoder.fit_transform(df_filtered['VENDOR'])

In [101]:
# Size of the classification head handed to the FineTune*Classifier constructors in
# load_clipstyle_models().
# NOTE(review): hard-coded; presumably it must equal the class count the fine-tuned
# checkpoints were trained with - confirm before changing.
num_classes = 1463

In [102]:
# Checkpoint selection read by load_clipstyle_models():
#   model_type - which backbone family branch to take ("CLIP", "CLIPITM" or "BLIP2")
#   repr       - forwarded as extract_representation_from and used to pick the CLIP checkpoint folder
#   loss       - fine-tuning loss, also used to pick the checkpoint folder
args.model_type = "CLIP"
args.repr = "EOS"
args.loss = "CE"

In [107]:
def load_clipstyle_models(args):
    """
    Build a fine-tuned classifier on top of its pre-trained backbone and load both checkpoints.

    The function first restores the pre-trained CLIP / CLIPITM / BLIP2 backbone, wraps it in the
    matching ``FineTune*Classifier``, then loads the fine-tuned classifier checkpoint selected by
    ``args.loss`` (and, for CLIP, ``args.repr``). The returned model is in eval mode and lives on
    the notebook-level ``device``.

    Args:
        args: Namespace providing ``model_type`` ("CLIP", "CLIPITM" or "BLIP2"), ``loss``,
            ``repr`` (CLIP only), ``weight_decay``, ``adam_epsilon`` and ``learning_rate``.

    Returns:
        The fine-tuned classifier module, in eval mode, on ``device``.

    Raises:
        ValueError: If ``args.model_type`` is not one of the supported values (previously this
            surfaced as an UnboundLocalError on ``return model``).
        Exception: If no fine-tuned checkpoint exists yet for the requested CLIPITM/BLIP2 loss.

    Notes:
        Relies on the notebook-level names ``device``, ``num_classes`` and the three
        ``FineTune*Classifier`` classes.
    """
    supported_model_types = ("CLIP", "CLIPITM", "BLIP2")
    if args.model_type not in supported_model_types:
        raise ValueError(
            f"Unsupported model_type {args.model_type!r}; expected one of {supported_model_types}."
        )

    # Make the architecture modules importable without growing sys.path on every call.
    architectures_dir = '../architectures/'
    if architectures_dir not in sys.path:
        sys.path.append(architectures_dir)

    pretrained_root = "/workspace/persistent/HTClipper/models/grouped-and-masked/multimodal-baselines/pre-training"
    finetuned_root = "/workspace/persistent/HTClipper/models/grouped-and-masked/multimodal-baselines/classification/finetuned"

    def _finetuned_ckpt(model_dir, repr_dir, loss_dir):
        # Path layout: <root>/<model>/finetune-all/representations_<repr>/seed:1111/lr-0.0001/temp-0.1/<loss>/final_model.ckpt
        return os.path.join(finetuned_root, model_dir, "finetune-all", f"representations_{repr_dir}",
                            "seed:1111", "lr-0.0001", "temp-0.1", loss_dir, "final_model.ckpt")

    backbone_kwargs = dict(weight_decay=args.weight_decay, eps=args.adam_epsilon,
                           warmup_steps=100, num_training_steps=1000)
    classifier_kwargs = dict(finetune_mode="all", num_classes=num_classes, learning_rate=args.learning_rate,
                             loss_fn=args.loss, temperature=0.1, **backbone_kwargs)

    if args.model_type == "CLIP":
        from CLIPLayer import CLIPModel

        # Restore the pre-trained backbone (non-strict, as in the original notebook).
        model = CLIPModel(**backbone_kwargs)
        checkpoint = torch.load(
            os.path.join(pretrained_root, "CLIP", "non-associated", "seed:1111", "lr-0.0001",
                         "NTXENT", "0.1", "negatives-5", "final_model.ckpt"),
            map_location=device)
        model.load_state_dict(checkpoint['state_dict'], strict=False)

        model = FineTuneCLIPClassifier(pretrained_model=model, extract_representation_from=args.repr,
                                       **classifier_kwargs)

        # Any (loss, repr) combination not listed explicitly falls back to the EOS / CE checkpoint.
        if args.loss == "CE" and args.repr == "CLS":
            finetuned_path = _finetuned_ckpt("CLIP", "CLS", "CE")
        elif args.loss == "CE+SupCon" and args.repr == "CLS":
            finetuned_path = _finetuned_ckpt("CLIP", "CLS", "CE+SupCon")
        elif args.loss == "CE+SupCon" and args.repr == "EOS":
            finetuned_path = _finetuned_ckpt("CLIP", "EOS", "CE+SupCon")
        else:
            finetuned_path = _finetuned_ckpt("CLIP", "EOS", "CE")

    elif args.model_type == "CLIPITM":
        from CLIPITMLayer import CLIPITMModel

        model = CLIPITMModel(**backbone_kwargs)
        checkpoint = torch.load(
            os.path.join(pretrained_root, "CLIPITM", "non-associated", "seed:1111", "lr-0.0001",
                         "NTXENT", "BERTqformer", "0.1", "negatives-5", "final_model.ckpt"),
            map_location=device)
        model.load_state_dict(checkpoint['state_dict'])

        model = FineTuneCLIPITMClassifier(pretrained_model=model, **classifier_kwargs)

        if args.loss == "CE":
            finetuned_path = _finetuned_ckpt("CLIPITM", "CLS", "CE")
        else:
            raise Exception("Model Still to be trained ....")

    else:  # "BLIP2" - guaranteed by the validation at the top
        from BLIP2Layer import BLIP2Model

        model = BLIP2Model(**backbone_kwargs)
        checkpoint = torch.load(
            os.path.join(pretrained_root, "BLIP2", "non-associated", "seed:1111", "lr-0.0001",
                         "NTXENT", "0.1", "negatives-5", "final_model.ckpt"),
            map_location=device)
        model.load_state_dict(checkpoint['state_dict'])

        model = FineTuneBLIP2Classifier(pretrained_model=model, **classifier_kwargs)

        if args.loss == "CE":
            finetuned_path = _finetuned_ckpt("BLIP2", "CLS", "CE")
        elif args.loss == "CE+SupCon":
            finetuned_path = _finetuned_ckpt("BLIP2", "CLS", "CE+SupCon")
        else:
            raise Exception("Model Still to be trained ....")

    # map_location keeps GPU-saved checkpoints loadable on a CPU-only machine,
    # matching how the pre-training checkpoints are loaded above.
    checkpoint = torch.load(finetuned_path, map_location=device)
    model.load_state_dict(checkpoint['state_dict'], strict=False)

    # Every branch now returns the model ready for inference on the target device.
    model.eval()
    model = model.to(device)

    return model

In [104]:
# With model_type="CLIP", repr="EOS" and loss="CE", load_clipstyle_models() takes its CLIP
# branch and resolves the representations_EOS / CE fine-tuned checkpoint.
args.model_type = "CLIP"
args.repr = "EOS"
args.loss = "CE"

model = load_clipstyle_models(args)

# Set the model to evaluation mode
model.eval()
# Move the model to the desired device
model = model.to(device)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Helper functions

In [105]:
from tqdm import tqdm

In [106]:
# %% Initialize the text tokenizer and the image processor
# These are handed to the dataset / embedding-extraction functions defined below.
text_tokenizer = AutoTokenizer.from_pretrained('johngiorgi/declutr-small')
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

In [49]:
# Function to map images with text for CLIP model
def map_images_with_text_for_clip_model(df, img_dir, filter_by="vendor"):
    """
    Expand each ad row into one row per image file that exists on disk.

    Args:
        df: DataFrame with 'TEXT', 'IMAGES' ('|'-separated file names), 'region', and the label
            column selected by ``filter_by`` ('VENDOR' or 'ID').
        img_dir: Root image directory; files are looked up at
            ``<img_dir>/<region>/image/image/<file name>``.
        filter_by: "vendor" to label rows with the 'VENDOR' column, "id" (or its alias "ids",
            the spelling used by process_dataset_for_CLIPModel) to label them with 'ID'.

    Returns:
        DataFrame with columns 'TEXT', 'IMAGES' (full path), 'VENDOR' (the selected label) and
        'region'. The columns are present even when no image was found.

    Raises:
        ValueError: If ``filter_by`` is not a supported value (previously this left the label
            variable unbound and failed on the first matching image).
    """
    label_columns = {"vendor": "VENDOR", "id": "ID", "ids": "ID"}
    if filter_by not in label_columns:
        raise ValueError(f"filter_by must be one of {sorted(label_columns)}, got {filter_by!r}")
    label_column = label_columns[filter_by]

    new_rows = []
    for _, row in df.iterrows():
        text = row['TEXT']
        label = row[label_column]
        region = row['region']

        for image in str(row['IMAGES']).split('|'):
            # An empty token (e.g. from a trailing '|') would resolve to the directory itself.
            if not image:
                continue
            full_image_path = os.path.join(img_dir, region, "image", "image", image)

            # Keep the pair only when the path points at an actual file.
            if os.path.isfile(full_image_path):
                new_rows.append({
                    'TEXT': text,
                    'IMAGES': full_image_path,  # Store the full image path
                    'VENDOR': label,
                    'region': region
                })

    # Fixing the columns keeps downstream column access valid for an empty result.
    return pd.DataFrame(new_rows, columns=['TEXT', 'IMAGES', 'VENDOR', 'region'])

In [31]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
from torch.utils.data import DataLoader
from pathlib import Path
import torch


# Function to process each dataset
def process_dataset_for_ClassifierModel(region_name, data_dir, image_dir, model, model_name, text_tokenizer, image_processor, filter_by="vendor", batch_size=32):
    """
    Extract and save text, image and multimodal embeddings for one region's train/test split.

    Steps: read ``<data_dir>/<region_name>.csv``, pair every ad text with its images, integer-encode
    the 'VENDOR' column, split 80/20 (random_state=1111), run ``model.get_embedding`` over both
    splits, and write eight ``.npy`` files under the fixed embeddings directory for ``model_name``.

    Args:
        region_name: Region whose CSV is read; also part of every output file name.
        data_dir: Directory holding the per-region CSV files.
        image_dir: Root directory passed to the text/image mapper.
        model: Module exposing ``get_embedding(..., embedding_type=...)``.
        model_name: Output sub-directory; must be one of the names asserted below.
        text_tokenizer: Tokenizer handed to the dataset.
        image_processor: Image processor handed to the dataset.
        filter_by: "vendor" or "id"; selects the label column and is part of the file names.
        batch_size: DataLoader batch size.

    Returns:
        Tuple ``(train_text, train_image, train_multimodal, train_labels,
        test_text, test_image, test_multimodal, test_labels)``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    assert model_name in ["CLIP", "CLIP-EOS", "CLIP-EOS-CESupCon", "CLIPITM", "BLIP2", "BLIP2-CESupCon"]
    assert filter_by in ["vendor", "id"]

    # Read the region CSV and expand it into one row per (text, existing image) pair.
    region_frame = pd.read_csv(os.path.join(data_dir, f"{region_name}.csv"))
    region_frame['region'] = region_name
    pairs_frame = map_images_with_text_for_clip_model(region_frame, img_dir=image_dir, filter_by=filter_by).drop_duplicates()

    # Integer-encode the label column (no class filtering is applied here).
    pairs_frame['label'] = LabelEncoder().fit_transform(pairs_frame['VENDOR'])

    train_df, test_df = train_test_split(pairs_frame, test_size=0.2, random_state=1111)

    def _loader_for(split_frame):
        # Unshuffled loader, so embeddings come out in the same order as the split's rows are served.
        split_dataset = FineTuneCLIPstyleModelDataset(split_frame, text_tokenizer, image_processor)
        return DataLoader(split_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    def _embed(dataloader):
        # Run the three embedding types over every batch and gather the results on the CPU.
        text_chunks, image_chunks, multimodal_chunks, labels = [], [], [], []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc='Fetching Embeddings', leave=False):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                pixel_values = batch['pixel_values'].to(device)

                text_out = model.get_embedding(input_ids=input_ids, attention_mask=attention_mask, embedding_type='text')
                image_out = model.get_embedding(pixel_values=pixel_values, embedding_type='image')
                multimodal_out = model.get_embedding(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    pixel_values=pixel_values,
                    embedding_type='multimodal'
                )

                text_chunks.append(text_out.cpu().numpy())
                image_chunks.append(image_out.cpu().numpy())
                multimodal_chunks.append(multimodal_out.cpu().numpy())
                labels.extend(batch['labels'])

        return np.concatenate(text_chunks), np.concatenate(image_chunks), np.concatenate(multimodal_chunks), labels

    train_loader = _loader_for(train_df)
    test_loader = _loader_for(test_df)

    train_outputs = _embed(train_loader)
    test_outputs = _embed(test_loader)

    output_dir = os.path.join("/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/finetuned_declutr_vit/", model_name)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Persist every array as <split>_<kind>_<region>_<filter_by>.npy (train files first, then test).
    artifact_kinds = ("text_embeddings", "image_embeddings", "multimodal_embeddings", "labels")
    for split_name, split_outputs in (("train", train_outputs), ("test", test_outputs)):
        for kind, payload in zip(artifact_kinds, split_outputs):
            np.save(os.path.join(output_dir, f'{split_name}_{kind}_{region_name}_{filter_by}.npy'), payload)

    train_labels = train_outputs[3]
    test_labels = test_outputs[3]

    print(f"Processed region: {region_name}")
    print(f"Number of training samples: {len(train_labels)}")
    print(f"Number of testing samples: {len(test_labels)}\n")

    return (*train_outputs, *test_outputs)

In [127]:
# List of regions to process
regions = ['south', 'midwest', 'west', 'northeast']

# Extract and save embeddings for every region. The result variables are overwritten on
# each iteration, so after the loop they hold the arrays of the last region only.
# NOTE(review): model_name below is "CLIP-EOS-CESupCon", while the cell that loads `model`
# sets args.loss = "CE" - confirm the output folder matches the checkpoint actually loaded.
for region in regions:
    print("------------------------------" + region + "-------------------------")
    train_text_embeddings, train_image_embeddings, train_multimodal_embeddings, train_labels, test_text_embeddings, test_image_embeddings, test_multimodal_embeddings, test_labels = process_dataset_for_ClassifierModel(
    region_name=region,
    data_dir=args.data_dir,
    image_dir=args.image_dir,
    model=model,
    model_name="CLIP-EOS-CESupCon",
    text_tokenizer=text_tokenizer,
    # t5_tokenizer=t5_tokenizer,
    image_processor=image_processor,
    filter_by = "vendor",
    batch_size=32
    )

------------------------------south-------------------------


                                                                        

Processed region: south
Number of training samples: 52435
Number of testing samples: 13109

------------------------------midwest-------------------------


                                                                      

Processed region: midwest
Number of training samples: 29297
Number of testing samples: 7325

------------------------------west-------------------------


                                                                      

Processed region: west
Number of training samples: 11104
Number of testing samples: 2777

------------------------------northeast-------------------------


                                                                      

Processed region: northeast
Number of training samples: 11797
Number of testing samples: 2950



# New functions that extract embeddings only from unique text and image ads

In [211]:
from tqdm import tqdm
from PIL import Image

from tqdm import tqdm
from PIL import Image

def _extract_unique_ad_embeddings(region_name, data_dir, image_dir, model, text_tokenizer, image_processor, filter_by, output_subdir):
    """
    Shared implementation behind process_dataset_for_CLIPModel / process_dataset_for_CLIPITMModel.

    Embeds every unique ad text and every unique image of a region once, splits each modality
    80/20 (random_state=1111) and saves the arrays under
    ``.../trained_declutr_vit/<output_subdir>``.

    Relies on the notebook-level ``device`` for tensor placement and on ``model.get_embeddings``.
    """
    df = pd.read_csv(os.path.join(data_dir, f"{region_name}.csv"))
    df['region'] = region_name
    # The mapper defined in this notebook only understands "vendor" / "id"; translate the "ids" spelling.
    mapper_filter = "id" if filter_by == "ids" else filter_by
    df_filtered = map_images_with_text_for_clip_model(df, img_dir=image_dir, filter_by=mapper_filter).drop_duplicates()

    # First row per unique text / image: same order as .unique() and the same "first match" label,
    # without re-scanning the whole frame for every item.
    first_text_rows = df_filtered.drop_duplicates(subset='TEXT')
    first_image_rows = df_filtered.drop_duplicates(subset='IMAGES')

    text_embeddings, text_labels = [], []
    image_embeddings, image_labels = [], []
    seen_embeddings = set()  # raw bytes of image embeddings already kept

    # Inference only: no_grad avoids building an autograd graph for every sample.
    with torch.no_grad():
        for text, label in tqdm(zip(first_text_rows['TEXT'], first_text_rows['VENDOR']),
                                total=len(first_text_rows), desc="Extracting Text Embeddings"):
            inputs = text_tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
            text_embed = model.get_embeddings(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], embedding_type='text')
            text_embeddings.append(text_embed.detach().cpu().numpy())
            text_labels.append(label)

        for image_path, label in tqdm(zip(first_image_rows['IMAGES'], first_image_rows['VENDOR']),
                                      total=len(first_image_rows), desc="Extracting Image Embeddings"):
            # Close the file handle as soon as the pixels are decoded.
            with Image.open(image_path) as image_file:
                image = image_file.convert("RGB")
            image_tensor = image_processor(images=image, return_tensors="pt")['pixel_values'].to(device)
            image_embed = model.get_embeddings(pixel_values=image_tensor, embedding_type='image').detach().cpu().numpy()

            # Different files can hold the same picture; keep one entry per distinct embedding.
            embedding_key = image_embed.tobytes()
            if embedding_key not in seen_embeddings:
                seen_embeddings.add(embedding_key)
                image_embeddings.append(image_embed)
                image_labels.append(label)

    # Train-test split (texts and images are split independently)
    train_text_embeddings, test_text_embeddings, train_text_labels, test_text_labels = train_test_split(
        text_embeddings, text_labels, test_size=0.2, random_state=1111
    )
    train_image_embeddings, test_image_embeddings, train_image_labels, test_image_labels = train_test_split(
        image_embeddings, image_labels, test_size=0.2, random_state=1111
    )

    output_dir = os.path.join("/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/trained_declutr_vit/", output_subdir)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    def _as_matrix(embedding_list):
        # (n, 1, dim) -> (n, dim). Unlike squeeze(), this keeps the sample axis when n == 1.
        return np.asarray(embedding_list).reshape(len(embedding_list), -1)

    train_text_embeddings = _as_matrix(train_text_embeddings)
    test_text_embeddings = _as_matrix(test_text_embeddings)
    train_image_embeddings = _as_matrix(train_image_embeddings)
    test_image_embeddings = _as_matrix(test_image_embeddings)

    # File names end in "_vendors" for vendor labels and "_ids" otherwise.
    suffix = "vendors" if filter_by == "vendor" else "ids"
    artifacts = (
        ("train_text_embeddings", train_text_embeddings),
        ("train_image_embeddings", train_image_embeddings),
        ("train_text_labels", np.array(train_text_labels)),
        ("train_image_labels", np.array(train_image_labels)),
        ("test_text_embeddings", test_text_embeddings),
        ("test_image_embeddings", test_image_embeddings),
        ("test_text_labels", np.array(test_text_labels)),
        ("test_image_labels", np.array(test_image_labels)),
    )
    for stem, payload in artifacts:
        np.save(os.path.join(output_dir, f'{stem}_{region_name}_{suffix}.npy'), payload)

    print(f"Processed region: {region_name}")
    print(f"Number of training samples: {len(train_text_labels)}")
    print(f"Number of testing samples: {len(test_text_labels)}\n")

    return train_text_embeddings, train_image_embeddings, train_text_labels, train_image_labels, test_text_embeddings, test_image_embeddings, test_text_labels, test_image_labels


def process_dataset_for_CLIPModel(region_name, data_dir, image_dir, model, text_tokenizer, image_processor, filter_by="vendor", batch_size=32):
    """
    Embed the unique texts and unique images of a region with the CLIP model and save them.

    Args:
        region_name: Region whose ``<data_dir>/<region_name>.csv`` is processed.
        data_dir: Directory containing the per-region CSV files.
        image_dir: Root directory of the ad images.
        model: Model exposing ``get_embeddings(..., embedding_type='text'|'image')``.
        text_tokenizer: Tokenizer applied to each unique text.
        image_processor: Image processor applied to each unique image.
        filter_by: "vendor" for vendor labels, or "ids" / "id" for ad-id labels.
        batch_size: Unused (items are embedded one at a time); kept for call compatibility.

    Returns:
        ``(train_text_embeddings, train_image_embeddings, train_text_labels, train_image_labels,
        test_text_embeddings, test_image_embeddings, test_text_labels, test_image_labels)``;
        embeddings are 2-D arrays and labels are lists.
    """
    assert filter_by in ["vendor", "id", "ids"]
    return _extract_unique_ad_embeddings(region_name, data_dir, image_dir, model, text_tokenizer,
                                         image_processor, filter_by, output_subdir="CLIP")


def process_dataset_for_CLIPITMModel(region_name, data_dir, image_dir, model, text_tokenizer, image_processor, filter_by="vendor", batch_size=32):
    """
    Embed the unique texts and unique images of a region with the CLIP-ITM model and save them.

    Same arguments and return value as process_dataset_for_CLIPModel; results are written to the
    "CLIPITM" sub-directory. Any ``filter_by`` other than "vendor" is saved with the "_ids" suffix.
    ``batch_size`` is unused and kept for call compatibility.
    """
    return _extract_unique_ad_embeddings(region_name, data_dir, image_dir, model, text_tokenizer,
                                         image_processor, filter_by, output_subdir="CLIPITM")

def process_dataset_for_BLIP2Model(region_name, data_dir, image_dir, model, text_tokenizer, image_processor, filter_by="vendor", batch_size=32):
    """Embed every unique text and image of one region and save train/test splits.

    Reads ``<data_dir>/<region_name>.csv``, expands it with
    ``map_images_with_text_for_blip2_model``, embeds each unique ``TEXT`` value
    and each unique ``IMAGES`` path one sample at a time through
    ``model.get_embeddings``, splits the text and the image embeddings
    independently (80/20, ``random_state=1111``) and writes the resulting
    eight arrays as ``.npy`` files into the hard-coded BLIP2 output directory.

    Two names are resolved at module (notebook) scope rather than passed in:
    ``device`` and ``map_images_with_text_for_blip2_model``.

    Args:
        region_name: Name of the region; also the CSV file stem and part of
            every output filename.
        data_dir: Directory containing ``<region_name>.csv``.
        image_dir: Image root forwarded to the mapping helper as ``img_dir``.
        model: Object exposing ``get_embeddings(..., embedding_type=...)``.
        text_tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask``.
        image_processor: Callable image processor returning ``pixel_values``.
        filter_by: Forwarded to the mapping helper. ``"vendor"`` selects the
            ``_vendors`` filename suffix; any other value selects ``_ids``.
        batch_size: Unused (samples are embedded one at a time); kept so
            existing callers keep working.

    Returns:
        Tuple ``(train_text_embeddings, train_image_embeddings,
        train_text_labels, train_image_labels, test_text_embeddings,
        test_image_embeddings, test_text_labels, test_image_labels)``.
    """
    # Load the dataset and expand it with the text-image mapping helper
    df = pd.read_csv(os.path.join(data_dir, f"{region_name}.csv"))
    df['region'] = region_name
    df = map_images_with_text_for_blip2_model(df, img_dir=image_dir, filter_by=filter_by).drop_duplicates()

    # Label of a text/image = VENDOR of the first row it appears in. Building the
    # lookup once replaces a full-frame boolean scan per unique value.
    first_row_per_text = df.drop_duplicates(subset='TEXT')
    vendor_by_text = dict(zip(first_row_per_text['TEXT'].values, first_row_per_text['VENDOR'].values))
    first_row_per_image = df.drop_duplicates(subset='IMAGES')
    vendor_by_image = dict(zip(first_row_per_image['IMAGES'].values, first_row_per_image['VENDOR'].values))

    text_embeddings = {}
    text_labels = []
    image_embeddings = {}
    image_labels = []
    seen_embeddings = set()  # To track unique image embeddings

    # Inference only: no_grad avoids building an autograd graph for every sample.
    with torch.no_grad():
        # Extract text embeddings with tqdm progress bar
        for text in tqdm(df['TEXT'].unique(), desc="Extracting Text Embeddings"):
            inputs = text_tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
            text_embed = model.get_embeddings(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], embedding_type='text')
            text_embeddings[text] = text_embed.detach().cpu().numpy()
            text_labels.append(vendor_by_text[text])

        # Extract image embeddings with tqdm progress bar
        for image_path in tqdm(df['IMAGES'].unique(), desc="Extracting Image Embeddings"):
            # The context manager releases the file handle as soon as the RGB copy exists
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            image_tensor = image_processor(images=image, return_tensors="pt")['pixel_values'].to(device)
            image_embed = model.get_embeddings(pixel_values=image_tensor, embedding_type='image')

            # Move to CPU once; the flattened tuple is the hashable key used for de-duplication
            image_embed_np = image_embed.detach().cpu().numpy()
            embedding_tuple = tuple(image_embed_np.flatten())

            # Images whose embedding exactly equals one already stored are skipped
            if embedding_tuple not in seen_embeddings:
                seen_embeddings.add(embedding_tuple)
                image_embeddings[image_path] = image_embed_np
                image_labels.append(vendor_by_image[image_path])

    # Train-test split (text and image embeddings are split independently)
    train_text_embeddings, test_text_embeddings, train_text_labels, test_text_labels = train_test_split(
        list(text_embeddings.values()), text_labels, test_size=0.2, random_state=1111
    )
    train_image_embeddings, test_image_embeddings, train_image_labels, test_image_labels = train_test_split(
        list(image_embeddings.values()), image_labels, test_size=0.2, random_state=1111
    )

    output_dir = os.path.join("/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/trained_declutr_vit/", "BLIP2")
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Drop singleton axes so each array is (n_samples, embedding_dim).
    # NOTE(review): squeeze() also removes the sample axis when a split holds exactly one item.
    train_text_embeddings = np.array(train_text_embeddings).squeeze()
    test_text_embeddings = np.array(test_text_embeddings).squeeze()
    train_image_embeddings = np.array(train_image_embeddings).squeeze()
    test_image_embeddings = np.array(test_image_embeddings).squeeze()

    # The vendor and non-vendor runs differ only in the filename suffix
    suffix = "vendors" if filter_by == "vendor" else "ids"
    arrays_to_save = (
        ('train_text_embeddings', train_text_embeddings),
        ('train_image_embeddings', train_image_embeddings),
        ('train_text_labels', np.array(train_text_labels)),
        ('train_image_labels', np.array(train_image_labels)),
        ('test_text_embeddings', test_text_embeddings),
        ('test_image_embeddings', test_image_embeddings),
        ('test_text_labels', np.array(test_text_labels)),
        ('test_image_labels', np.array(test_image_labels)),
    )
    for stem, values in arrays_to_save:
        np.save(os.path.join(output_dir, f'{stem}_{region_name}_{suffix}.npy'), values)

    # Only the text split sizes are reported
    print(f"Processed region: {region_name}")
    print(f"Number of training samples: {len(train_text_labels)}")
    print(f"Number of testing samples: {len(test_text_labels)}\n")

    return train_text_embeddings, train_image_embeddings, train_text_labels, train_image_labels, test_text_embeddings, test_image_embeddings, test_text_labels, test_image_labels

In [None]:
# Regions whose embeddings are extracted with vendor-level labels
regions = ['south', 'midwest', 'west', 'northeast']

# Process each region; the returned arrays are discarded
for region in regions:
    print(f"{'-' * 50}{region}{'-' * 50}")
    _, _, _, _, _, _, _, _ = process_dataset_for_BLIP2Model(
        region,
        args.data_dir,
        args.image_dir,
        model,
        text_tokenizer,
        image_processor,
        filter_by="vendor",
        batch_size=32,
    )

--------------------------------------------------south--------------------------------------------------


Extracting Text Embeddings: 100%|██████████| 13677/13677 [02:05<00:00, 109.31it/s]
Extracting Image Embeddings:  16%|█▌        | 10529/65544 [09:36<52:20, 17.52it/s]  

In [None]:
# Regions whose embeddings are extracted for the non-vendor ("ids") run
regions = ['south', 'midwest', 'west', 'northeast']

# Process each region; the returned arrays are discarded
# NOTE(review): "ids" differs from the "id" value accepted by
# map_images_with_text_for_clip_model / process_dataset_for_ClassifierModel —
# confirm that map_images_with_text_for_blip2_model expects "ids".
for region in regions:
    print(f"{'-' * 50}{region}{'-' * 50}")
    _, _, _, _, _, _, _, _ = process_dataset_for_BLIP2Model(
        region,
        args.data_dir,
        args.image_dir,
        model,
        text_tokenizer,
        image_processor,
        filter_by="ids",
        batch_size=32,
    )

In [160]:
# Sanity check: the test image-embedding matrix and its label vector should report the same sample count.
# NOTE(review): the extraction loops shown in this notebook discard their return values, so
# `test_image_embeddings` / `test_image_labels` are presumably left over from an earlier run — confirm.
test_image_embeddings.shape, torch.tensor(test_image_labels).shape

((1528, 768), torch.Size([1528]))

In [35]:
# Function to map images with text for CLIP model
def map_images_with_text_for_clip_model(df, img_dir, filter_by="vendor"):
    """Expand a dataframe to one row per (text, existing image file) pair.

    Each input row's ``IMAGES`` value is split on ``'|'``; every image name is
    resolved to ``<img_dir>/<region>/image/image/<name>`` and a row is emitted
    only when that path is an existing file.

    Args:
        df: DataFrame with columns ``TEXT``, ``IMAGES``, ``region`` and the
            label column selected by ``filter_by``.
        img_dir: Root directory of the images.
        filter_by: ``"vendor"`` copies ``VENDOR`` into the output ``VENDOR``
            column; ``"id"`` copies ``ID`` into it.

    Returns:
        DataFrame with columns ``TEXT``, ``IMAGES`` (full path), ``VENDOR``
        and ``region``. The columns are present even when no row qualifies.

    Raises:
        ValueError: If ``filter_by`` is neither ``"vendor"`` nor ``"id"``.
    """
    label_column_by_filter = {"vendor": "VENDOR", "id": "ID"}
    # Fail up front: previously an unknown value left the label unbound and
    # surfaced as an UnboundLocalError while processing the first row.
    if filter_by not in label_column_by_filter:
        raise ValueError(
            f"filter_by must be one of {sorted(label_column_by_filter)}, got {filter_by!r}"
        )
    label_column = label_column_by_filter[filter_by]
    output_columns = ['TEXT', 'IMAGES', 'VENDOR', 'region']

    # Initialize a list to store the new rows
    new_rows = []

    # Iterate over each row in the dataframe
    for _, row in df.iterrows():
        text = row['TEXT']
        all_images = str(row['IMAGES']).split('|')
        label = row[label_column]
        region = row['region']

        # Create a new entry for each image
        for image in all_images:
            full_image_path = os.path.join(img_dir, region, "image", "image", image)

            # isfile (not exists): an empty image name, e.g. from a trailing '|',
            # resolves to the image directory itself, which must not count as an image.
            if os.path.isfile(full_image_path):
                new_rows.append({
                    'TEXT': text,
                    'IMAGES': full_image_path,  # Store the full image path
                    'VENDOR': label,
                    'region': region
                })

    # Explicit columns keep the schema stable when no image was found
    return pd.DataFrame(new_rows, columns=output_columns)

'/workspace/persistent/HTClipper/data/IMAGES'

In [92]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
from torch.utils.data import DataLoader
from pathlib import Path
import torch


# Function to process each dataset
def process_dataset_for_ClassifierModel(region_name, data_dir, image_dir, model, model_name, text_tokenizer, image_processor, filter_by="vendor", batch_size=32, seed=None):
    """Extract text, image and multimodal embeddings for one region and save them.

    Reads ``<data_dir>/<region_name>.csv``, label-encodes ``VENDOR``, keeps the
    classes that still have at least two distinct rows, makes a stratified
    train/test split, expands each split into text-image pairs with
    ``map_images_with_text_for_clip_model``, runs ``model.get_embedding`` over
    both splits and writes the embeddings and labels as ``.npy`` files under
    the hard-coded ``finetuned/<model_name>`` directory.

    Args:
        region_name: Region name; CSV file stem and part of every output filename.
        data_dir: Directory containing ``<region_name>.csv``.
        image_dir: Image root forwarded to the mapping helper as ``img_dir``.
        model: Object exposing ``get_embedding(..., embedding_type=...)``. It is
            not moved here; only the inputs are sent to the selected device.
        model_name: Name of the output sub-directory.
        text_tokenizer: Tokenizer handed to ``FineTuneCLIPstyleModelDataset``.
        image_processor: Image processor handed to ``FineTuneCLIPstyleModelDataset``.
        filter_by: ``"vendor"`` or ``"id"``; forwarded to the mapping helper and
            used as the filename suffix.
        batch_size: Batch size of both dataloaders.
        seed: Random state of the split. ``None`` (default) falls back to the
            notebook-level ``args.seed``, which was the only option before.

    Returns:
        Tuple ``(train_text_embeddings, train_image_embeddings,
        train_multimodal_embeddings, train_labels, test_text_embeddings,
        test_image_embeddings, test_multimodal_embeddings, test_labels)``.

    Raises:
        AssertionError: If ``filter_by`` is not ``"vendor"`` or ``"id"``.
        ValueError: If no class with at least two distinct rows remains.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    assert filter_by in ["vendor", "id"]

    # Keep the old behaviour (global args.seed) unless the caller supplies a seed
    split_seed = args.seed if seed is None else seed

    # Load the dataset
    df = pd.read_csv(os.path.join(data_dir, f"{region_name}.csv"))
    df['region'] = region_name

    # Encode the labels
    label_encoder = LabelEncoder()
    df['VENDOR'] = label_encoder.fit_transform(df['VENDOR'])

    # NOTE(review): "ID" is not kept here, while the notebook's
    # map_images_with_text_for_clip_model reads row['ID'] for filter_by="id" —
    # presumably that path fails with a KeyError; confirm before relying on it.
    df = df[["TEXT", "IMAGES", "VENDOR", "region"]].drop_duplicates()

    # Count classes AFTER de-duplication: counting before could keep a class that
    # shrinks to a single row, which the stratified split below rejects.
    class_counts = df['VENDOR'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    # .copy() so the re-encoding below writes to an independent frame, not a slice
    df_filtered = df[df['VENDOR'].isin(valid_classes)].copy()
    if df_filtered.empty:
        raise ValueError(f"No class with at least 2 distinct rows in region {region_name!r}")

    # Re-encode labels after filtering so they are contiguous again
    df_filtered['VENDOR'] = label_encoder.fit_transform(df_filtered['VENDOR'])

    # Dynamically adjust test_size based on the number of classes
    min_test_size = len(df_filtered['VENDOR'].unique()) / len(df_filtered)
    test_size = max(0.2, min_test_size)  # At least 20%, or large enough to include all classes

    train_df, test_df = train_test_split(
        df_filtered, test_size=test_size, random_state=split_seed, stratify=df_filtered['VENDOR'], shuffle=True
    )

    # Apply the mapping separately to avoid overlap of text-image pairs across splits
    train_df = map_images_with_text_for_clip_model(train_df, img_dir=image_dir, filter_by=filter_by).drop_duplicates()
    test_df = map_images_with_text_for_clip_model(test_df, img_dir=image_dir, filter_by=filter_by).drop_duplicates()

    # Create datasets and dataloaders (unshuffled)
    train_dataset = FineTuneCLIPstyleModelDataset(train_df, text_tokenizer, image_processor)
    test_dataset = FineTuneCLIPstyleModelDataset(test_df, text_tokenizer, image_processor)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    def get_embeddings(dataloader):
        """Return (text, image, multimodal) embedding arrays and the label list for `dataloader`."""
        image_embeddings = []
        text_embeddings = []
        multimodal_embeddings = []
        labels = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc='Fetching Embeddings', leave=False):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                pixel_values = batch['pixel_values'].to(device)
                batch_labels = batch['labels']

                # Unimodal embeddings
                text_embeds = model.get_embedding(input_ids=input_ids, attention_mask=attention_mask, embedding_type='text')
                image_embeds = model.get_embedding(pixel_values=pixel_values, embedding_type='image')
                # Multimodal embedding computed from both inputs
                multimodal_embeds = model.get_embedding(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    pixel_values=pixel_values,
                    embedding_type='multimodal'
                )

                text_embeddings.append(text_embeds.cpu().numpy())
                image_embeddings.append(image_embeds.cpu().numpy())
                multimodal_embeddings.append(multimodal_embeds.cpu().numpy())
                labels.extend(batch_labels)

        # Stack the per-batch arrays along the first axis
        text_embeddings = np.concatenate(text_embeddings)
        image_embeddings = np.concatenate(image_embeddings)
        multimodal_embeddings = np.concatenate(multimodal_embeddings)
        return text_embeddings, image_embeddings, multimodal_embeddings, labels

    # Get embeddings for train and test sets
    train_text_embeddings, train_image_embeddings, train_multimodal_embeddings, train_labels = get_embeddings(train_dataloader)
    test_text_embeddings, test_image_embeddings, test_multimodal_embeddings, test_labels = get_embeddings(test_dataloader)

    output_dir = os.path.join("/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/multimodal_baselines/finetuned/", model_name)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Save embeddings and labels; the filenames differ only in their stem
    arrays_to_save = (
        ('train_text_embeddings', train_text_embeddings),
        ('train_image_embeddings', train_image_embeddings),
        ('train_multimodal_embeddings', train_multimodal_embeddings),
        ('train_labels', train_labels),
        ('test_text_embeddings', test_text_embeddings),
        ('test_image_embeddings', test_image_embeddings),
        ('test_multimodal_embeddings', test_multimodal_embeddings),
        ('test_labels', test_labels),
    )
    for stem, values in arrays_to_save:
        np.save(os.path.join(output_dir, f'{stem}_{region_name}_{filter_by}.npy'), values)

    return train_text_embeddings, train_image_embeddings, train_multimodal_embeddings, train_labels, test_text_embeddings, test_image_embeddings, test_multimodal_embeddings, test_labels

In [None]:
for model_name in ["CLIP-CE", "CLIPITM-CE", "BLIP2-CE", "BLIP2-CE-SupCon"]:
    # Rebind `model` before the next checkpoint is loaded
    model = None

    print(model)
    print(f"model_name:{model_name}")

    # Attributes written onto `args` for this run name; any unlisted name gets the
    # BLIP2 CE+SupCon settings. `repr` is only written for CLIP-CE, so `args.repr`
    # keeps that value during the later iterations.
    arg_overrides = {
        "CLIP-CE": {"model_type": "CLIP", "repr": "EOS", "loss": "CE"},
        "CLIPITM-CE": {"model_type": "CLIPITM", "loss": "CE"},
        "BLIP2-CE": {"model_type": "BLIP2", "loss": "CE"},
    }.get(model_name, {"model_type": "BLIP2", "loss": "CE+SupCon"})
    for attr_name, attr_value in arg_overrides.items():
        setattr(args, attr_name, attr_value)

    model = load_clipstyle_models(args)

    # Evaluation mode, then move to the target device
    model.eval()
    model = model.to(device)

    # Extract and save vendor-level embeddings for every region
    for region in ["south", "midwest", "west", "northeast"]:
        print(f"------------------------------------------{region}-------------------------")
        process_dataset_for_ClassifierModel(
            region_name=region,
            data_dir=args.data_dir,
            image_dir=args.image_dir,
            model=model,
            model_name=model_name,
            text_tokenizer=text_tokenizer,
            image_processor=image_processor,
            filter_by="vendor",
            batch_size=32,
        )

None
model_name:CLIP-CE


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


------------------------------------------south-------------------------


                                                                        

------------------------------------------midwest-------------------------


                                                                      

------------------------------------------west-------------------------


                                                                      

------------------------------------------northeast-------------------------


                                                                      

None
model_name:CLIPITM-CE


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


------------------------------------------south-------------------------


                                                                        

------------------------------------------midwest-------------------------


                                                                      

------------------------------------------west-------------------------


                                                                      

------------------------------------------northeast-------------------------


                                                                      

None
model_name:BLIP2-CE


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


------------------------------------------south-------------------------


                                                                        

------------------------------------------midwest-------------------------


Fetching Embeddings: 100%|█████████▉| 915/917 [13:41<00:01,  1.16it/s]

In [110]:
# Function to calculate total trainable parameters
def count_trainable_parameters(model):
    """Return the total element count of all parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

for model_name in ["CLIP-CE", "CLIPITM-CE", "BLIP2-CE"]:
    # Rebind `model` before the next checkpoint is loaded
    model = None

    # Attributes written onto `args` for this run name; the fallback (BLIP2 with
    # CE+SupCon) is not reached by the three names listed above.
    arg_overrides = {
        "CLIP-CE": {"model_type": "CLIP", "repr": "EOS", "loss": "CE"},
        "CLIPITM-CE": {"model_type": "CLIPITM", "loss": "CE"},
        "BLIP2-CE": {"model_type": "BLIP2", "loss": "CE"},
    }.get(model_name, {"model_type": "BLIP2", "loss": "CE+SupCon"})
    for attr_name, attr_value in arg_overrides.items():
        setattr(args, attr_name, attr_value)

    model = load_clipstyle_models(args)

    # Report the number of trainable parameters of the loaded model
    trainable_params = count_trainable_parameters(model)
    print(f"model_name:{model_name}-param:{trainable_params}")

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_name:CLIP-CE-param:169632695


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_name:CLIPITM-CE-param:307506359


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_name:BLIP2-CE-param:307506359


In [None]:
# Scratch value: equals the trainable-parameter count printed for CLIP-CE by the cell above.
169632695