In [None]:
# Code Overview
# Extracts text, vision, and multmodal embeddings from the multimodal DeCLUTR-ViT backbone trained with CE, triplet, CE+triplet, and CE+SupCon objectives. 

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
"""
Python version: 3.10
Description: Trains a Declutr-small and ViT-patch16 based classifier to establish baselines for Multimodal Authorship tasks on Backpage advertisements.
"""

# %% Importing Libraries
import os
import re
import sys
import argparse
import time
import datetime
import random
from pathlib import Path
from PIL import Image
from tqdm import tqdm

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader

from pytorch_lightning.loggers import WandbLogger

import lightning as L
import lightning.pytorch as pl


from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.tuner.tuning import Tuner
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from transformers import AutoTokenizer, AutoModel, ViTModel, ViTImageProcessor

# Custom library
sys.path.append('../process/')
from utilities import map_images_with_text, augment_image_training_data
from loadData import MultimodalDataset

sys.path.append('../architectures/')
from multimodalLayer import multimodalFusionModel

import warnings
warnings.filterwarnings('ignore')

# Suppress TorchDynamo errors and fall back to eager execution
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [2]:
from collections import Counter

In [3]:
def parse_args():
    # %% Setting up the Argparser
    parser = argparse.ArgumentParser(description="Trains a Declutr-small and ViT-patch16 based classifier to establish baselines for Multimodal Authorship tasks on Backpage advertisements.")
    parser.add_argument('--logged_entry_name', type=str, default="multimodal-latent-fusion-seed:1111", help="Logged entry name visible on weights and biases")
    parser.add_argument('--data_dir', type=str, default='/workspace/persistent/HTClipper/data/processed', help="""Data directory""")
    parser.add_argument('--city', type=str, default='south', help="""Demography of data, can be only between chicago, atlanta, houston, dallas, detroit, ny, sf or all""")
    parser.add_argument('--fusion_technique', type=str, default='mean', help="""Kind of fusion technique to use. Can be amongst mean, concat, add, multiply, attention, or learned_fusion""")
    parser.add_argument('--save_dir', type=str, default=os.path.join(os.getcwd(), "/workspace/persistent/HTClipper/models/multimodal-baselines/latent_fusion/"), help="""Directory for models to be saved""")
    parser.add_argument('--model_dir_name', type=str, default=None, help="Save the model with the folder name as mentioned.")
    parser.add_argument('--batch_size', type=int, default=32, help="Batch Size")
    parser.add_argument('--nb_epochs', type=int, default=40, help="Number of Epochs")
    parser.add_argument('--patience', type=int, default=3, help="Patience for Early Stopping")
    parser.add_argument('--seed', type=int, default=1111, help='Random seed value')
    parser.add_argument('--warmup_steps', type=int, default=0, help="Warmup proportion")
    parser.add_argument('--grad_steps', type=int, default=4, help="Gradient accumulating step")
    parser.add_argument('--learning_rate', type=float, default=6e-4, help="learning rate")
    parser.add_argument('--train_data_percentage', type=float, default=1.0, help="Percentage of training data to be used")
    parser.add_argument('--adam_epsilon', type=float, default=1e-6, help="Epsilon value for adam optimizer")
    parser.add_argument('--min_delta_change', type=float, default=0.01, help="Minimum change in delta in validation loss for Early Stopping")
    parser.add_argument('--weight_decay', type=float, default=0.01, help="Weight decay")
    parser.add_argument('--augment_data', type=bool, default=False, help='Enables data augmentation')
    parser.add_argument('--nb_augmented_samples', type=int, default=1, help='Number of augmented samples to be generated')
    parser.add_argument('--loss', type=str, default='CE+SupCon', help='Loss function to use. Can be CE, CE+SupCon, or CE+SupCon+ITM')
    parser.add_argument('--temp', type=float, default=0.5, help="Tempertaure variable for the Constrastive loss function")
    
    # Check if running in Jupyter
    if 'ipykernel' in sys.modules:
        args = parser.parse_args([])
    else:
        args = parser.parse_args()

    return args

args = parse_args()

In [4]:
# Setting seed value for reproducibility    
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(args.seed)
random.seed(args.seed)
os.environ['PYTHONHASHSEED'] = str(args.seed)
# Set TOKENIZERS_PARALLELISM to false to disable parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
seed_everything(args.seed)

# Set matrix multiplication precision
# This setting offers a balance between precision and performance. It’s typically a good starting point for mixed precision training
#  with FP16.
torch.set_float32_matmul_precision("high")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# assert args.city in ["chicago", "atlanta", "dallas", "detroit", "houston", "sf", "ny", "all"]
assert args.fusion_technique in ["mean", "add", "concat", "multiply", "attention", "learned_fusion"]
assert args.loss in ["CE", "CE+SupCon", "CE+SupCon+ITM"]

# Creating directories
if args.model_dir_name == None:
    directory = os.path.join(args.save_dir, args.city, "seed:" + str(args.seed), "lr-" + str(args.learning_rate), args.loss, str(args.temp), args.fusion_technique)
else:
    directory = os.path.join(args.save_dir, args.city, "seed:" + str(args.seed), "lr-" + str(args.learning_rate), args.model_dir_name, args.loss, args.temp, args.fusion_technique)
Path(directory).mkdir(parents=True, exist_ok=True)
Path(args.save_dir).mkdir(parents=True, exist_ok=True)

# %% Load your DataFrame
data_dir = os.path.join(args.data_dir, args.city + ".csv")
args.image_dir = os.path.join("/workspace/persistent/HTClipper/data/IMAGES", args.city, "image", "image")
df = pd.read_csv(data_dir)

# mapping every image to it's corresponding text
df = map_images_with_text(df)
# Encode the labels
label_encoder = LabelEncoder()
df['VENDOR'] = label_encoder.fit_transform(df['VENDOR'])

# Identify and remove classes with fewer than 2 instances
# Since we use stratify during splitting, we should atleast have one training example in training and one in test dataset
class_counts = df['VENDOR'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df_filtered = df[df['VENDOR'].isin(valid_classes)]

# Split the data into train, validation, and test sets
train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=args.seed, stratify=df_filtered['VENDOR'])
# train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=args.seed, stratify=train_df['VENDOR'])

# Replacing all the numbers in the training dataset with the letter "N"
train_df['TEXT'] = train_df['TEXT'].apply(lambda x: re.sub(r'\d', 'N', str(x)))

# Augment the training data by adding multiple entries for each image
# train_df = augment_image_training_data(train_df)

# %% Intializing the tokenizers and models
# Since these are the two models that performed individually on the text and image modalities, we establish them as benchmarks and
# only run use them in our further experiments.
text_tokenizer = AutoTokenizer.from_pretrained('johngiorgi/declutr-small')
text_model = AutoModel.from_pretrained('johngiorgi/declutr-small')
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

# Create the datasets and dataloaders
train_dataset = MultimodalDataset(train_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=args.augment_data)
# val_dataset = MultimodalDataset(val_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)
test_dataset = MultimodalDataset(test_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)

train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=True)
# val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)

num_training_steps = args.nb_epochs * len(train_dataloader)
# Setting the warmup steps to 1/10th the size of training data
warmup_steps = int(0.1 * num_training_steps)

Seed set to 1111
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
# Create an instance of the model
model = multimodalFusionModel(
    text_model=text_model,
    image_model=image_model,
    fusion_technique="mean",
    num_classes=len(label_encoder.classes_),
    learning_rate=args.learning_rate,
    weight_decay=args.weight_decay,
    eps=args.adam_epsilon,
    num_training_steps=num_training_steps,
    warmup_steps=warmup_steps,
    temperature=args.temp,
    loss_function="CE",
    ce_weight = 1.0,
    supcon_weight = 1.0,
    itm_weight = 1.0,
    ntxent_weight = 1.0,
    num_hard_negatives = 5
)

# Load the checkpoint
checkpoint = torch.load("/workspace/persistent/HTClipper/models/grouped-and-masked/multimodal-baselines/classification/south/seed:1111/lr-0.0001/CE/0.5/mean/final_model.ckpt")

# Load the state dictionary into the model
model.load_state_dict(checkpoint['state_dict'])

# Set the model to evaluation mode
model.eval()

# Move the model to the desired device
model = model.to(device)

# Extract embeddings

In [None]:
# %% Intializing the tokenizers and models
# Since these are the two models that performed individually on the text and image modalities, we establish them as benchmarks and
# only run use them in our further experiments.
text_tokenizer = AutoTokenizer.from_pretrained('johngiorgi/declutr-small')
text_model = AutoModel.from_pretrained('johngiorgi/declutr-small')
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def generate_embeddings(model, city, dataloader):
    # Initialize lists to store embeddings
    all_multimodal_embeddings = []
    all_text_embeddings = []
    all_image_embeddings = []
    all_labels = []
    
    # Iterate through all batches in the dataloader with a progress bar
    for batch in tqdm(dataloader, desc=f"Extracting embeddings for {city}"):
        input_ids, attention_mask, pixel_values, labels = batch.get('input_ids'), batch.get('attention_mask'), batch.get('pixel_values'), batch['label']

        # Extract multimodal embeddings
        multimodal_embeddings = model.extract_embeddings(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), pixel_values=pixel_values.to(device))
        all_multimodal_embeddings.append(multimodal_embeddings.cpu().detach())

        # Extract text embeddings
        text_embeddings = model.extract_embeddings(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        all_text_embeddings.append(text_embeddings.cpu().detach())

        # Extract image embeddings
        image_embeddings = model.extract_embeddings(pixel_values=pixel_values.to(device))
        all_image_embeddings.append(image_embeddings.cpu().detach())
        
        # Extract labels
        all_labels.append(labels.cpu().detach())

    # Concatenate all embeddings into 2D tensors
    all_multimodal_embeddings = torch.cat(all_multimodal_embeddings, dim=0)
    all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
    all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    
    return all_multimodal_embeddings, all_text_embeddings, all_image_embeddings, all_labels

def generate_embeddings_for_city(model, city, folder_name):    
    # %% Load your DataFrame
    pickled_dir = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/multimodal_baselines/"
    data_dir = os.path.join(args.data_dir, city + ".csv")
    args.image_dir = os.path.join("/workspace/persistent/HTClipper/data/IMAGES", city, "image", "image")
    df = pd.read_csv(data_dir)

    # Encode the labels
    # label_encoder = LabelEncoder()
    # df['VENDOR'] = label_encoder.fit_transform(df['VENDOR'])

    # Identify and keep vendors with at least 2 instances
    class_counts = df['VENDOR'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    df_filtered = df[df['VENDOR'].isin(valid_classes)]

    # Re-encode labels after filtering
    # df_filtered['VENDOR'] = label_encoder.fit_transform(df_filtered['VENDOR'])

    df_filtered = df_filtered[["TEXT", "IMAGES", "VENDOR"]].drop_duplicates()

    # Dynamically adjust test_size based on the number of classes
    min_test_size = len(df_filtered['VENDOR'].unique()) / len(df_filtered)
    test_size = max(0.2, min_test_size)  # Ensure the test size is at least 20% or large enough to include all classes

    train_df, test_df = train_test_split(
        df_filtered, test_size=test_size, random_state=args.seed, stratify=df_filtered['VENDOR'], shuffle=True
    )

    # Apply map_images_with_text separately to avoid overlap of text-image pairs across splits
    train_df = map_images_with_text(train_df).drop_duplicates()
    test_df = map_images_with_text(test_df).drop_duplicates()

    # Replacing all the numbers in the training dataset with the letter "N"
    train_df['TEXT'] = train_df['TEXT'].apply(lambda x: re.sub(r'\d', 'N', str(x)))

    # Augment the training data by adding multiple entries for each image
    # train_df = augment_image_training_data(train_df)

    # Create the datasets and dataloaders
    train_dataset = MultimodalDataset(train_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=args.augment_data)
    # val_dataset = MultimodalDataset(val_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)
    test_dataset = MultimodalDataset(test_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)

    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)
    # val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4, pin_memory=True)
    
    directory = os.path.join(pickled_dir, folder_name)
    Path(directory).mkdir(parents=True, exist_ok=True)
    
    multimodal_embeddings, text_embeddings, image_embeddings, labels = generate_embeddings(model, city, train_dataloader)
    label_filename = city + "_labels_train.pt"
    multimodal_data_filename = city + "_multimodaldata_train.pt"
    text_data_filename = city + "_textdata_train.pt"
    image_data_filename = city + "_imagedata_train.pt"
    
    
    torch.save(multimodal_embeddings, os.path.join(directory, multimodal_data_filename))
    torch.save(text_embeddings, os.path.join(directory, text_data_filename))
    torch.save(image_embeddings, os.path.join(directory, image_data_filename))
    torch.save(labels, os.path.join(directory, label_filename))
    
    multimodal_embeddings, text_embeddings, image_embeddings, labels = generate_embeddings(model, city, test_dataloader)
    label_filename = city + "_labels_test.pt"
    multimodal_data_filename = city + "_multimodaldata_test.pt"
    text_data_filename = city + "_textdata_test.pt"
    image_data_filename = city + "_imagedata_test.pt"
    torch.save(multimodal_embeddings, os.path.join(directory, multimodal_data_filename))
    torch.save(text_embeddings, os.path.join(directory, text_data_filename))
    torch.save(image_embeddings, os.path.join(directory, image_data_filename))
    torch.save(labels, os.path.join(directory, label_filename))

In [59]:
"""
def generate_embeddings(city, dataloader):
    # Initialize lists to store embeddings and labels
    all_multimodal_embeddings = []
    all_text_embeddings = []
    all_image_embeddings = []
    all_text_labels = []  # Separate list for text labels
    all_image_labels = []  # Separate list for image labels
    all_labels = []
    
    seen_text_embeddings = set()  # To track unique text embeddings
    seen_image_embeddings = set()  # To track unique image embeddings

    # Iterate through all batches in the dataloader with a progress bar
    for batch in tqdm(dataloader, desc=f"Extracting embeddings for {city}"):
        input_ids, attention_mask, pixel_values, labels = batch.get('input_ids'), batch.get('attention_mask'), batch.get('pixel_values'), batch['label']

        # Extract multimodal embeddings
        multimodal_embeddings = model.extract_embeddings(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), pixel_values=pixel_values.to(device))
        all_multimodal_embeddings.append(multimodal_embeddings.cpu().detach())
        all_labels.append(labels.cpu().detach())

        # Extract text embeddings
        text_embeddings = model.extract_embeddings(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        text_embedding_tuple = tuple(text_embeddings.cpu().detach().numpy().flatten())

        if text_embedding_tuple not in seen_text_embeddings:
            seen_text_embeddings.add(text_embedding_tuple)  # Track the unique embedding
            all_text_embeddings.append(text_embeddings.cpu().detach())
            all_text_labels.append(labels.cpu().detach())  # Append label corresponding to the text

        # Extract image embeddings
        image_embeddings = model.extract_embeddings(pixel_values=pixel_values.to(device))
        image_embedding_tuple = tuple(image_embeddings.cpu().detach().numpy().flatten())

        if image_embedding_tuple not in seen_image_embeddings:
            seen_image_embeddings.add(image_embedding_tuple)  # Track the unique embedding
            all_image_embeddings.append(image_embeddings.cpu().detach())
            all_image_labels.append(labels.cpu().detach())  # Append label corresponding to the image
        
    # Concatenate all embeddings into 2D tensors
    all_multimodal_embeddings = torch.cat(all_multimodal_embeddings, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
    all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
    all_text_labels = torch.cat(all_text_labels, dim=0)
    all_image_labels = torch.cat(all_image_labels, dim=0)
    
    return all_multimodal_embeddings, all_text_embeddings, all_image_embeddings, all_text_labels, all_image_labels, all_labels
"""

"""
def generate_embeddings_for_city(city, folder_name):    
    # Load your DataFrame
    pickled_dir = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/trained_declutr_vit"
    data_dir = os.path.join(args.data_dir, city + ".csv")
    args.image_dir = os.path.join("/workspace/persistent/HTClipper/data/IMAGES", city, "image", "image")
    df = pd.read_csv(data_dir)

    # Mapping every image to its corresponding text
    df = map_images_with_text(df)
    # Encode the labels
    label_encoder = LabelEncoder()
    df['VENDOR'] = label_encoder.fit_transform(df['VENDOR'])

    # Identify and remove classes with fewer than 2 instances
    class_counts = df['VENDOR'].value_counts()
    valid_classes = class_counts[class_counts >= 3].index
    df_filtered = df[df['VENDOR'].isin(valid_classes)]

    # Split the data into train and test sets (80-20 split)
    train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=args.seed, stratify=df_filtered['VENDOR'])

    # Replacing all the numbers in the training dataset with the letter "N"
    train_df['TEXT'] = train_df['TEXT'].apply(lambda x: re.sub(r'\d', 'N', str(x)))

    # Augment the training data by adding multiple entries for each image
    train_df = augment_image_training_data(train_df)

    # Create the datasets and dataloaders
    train_dataset = MultimodalDataset(train_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=args.augment_data)
    test_dataset = MultimodalDataset(test_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)

    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4, pin_memory=True)
    
    directory = os.path.join(pickled_dir, folder_name)
    Path(directory).mkdir(parents=True, exist_ok=True)
    
    # Generate embeddings for the training data
    multimodal_embeddings_train, text_embeddings_train, image_embeddings_train, text_labels_train, image_labels_train, all_labels_train = generate_embeddings(city, train_dataloader)
    torch.save(multimodal_embeddings_train, os.path.join(directory, f"{city}_multimodaldata_train.pt"))
    torch.save(text_embeddings_train, os.path.join(directory, f"{city}_textdata_train.pt"))
    torch.save(image_embeddings_train, os.path.join(directory, f"{city}_imagedata_train.pt"))
    torch.save(text_labels_train, os.path.join(directory, f"{city}_labels_text_train.pt"))  # Save text labels separately
    torch.save(image_labels_train, os.path.join(directory, f"{city}_labels_image_train.pt"))  # Save image labels separately
    torch.save(all_labels_train, os.path.join(directory, f"{city}_labels_multimodal_train.pt"))  # Save image labels separately
    
    # Generate embeddings for the testing data
    multimodal_embeddings_test, text_embeddings_test, image_embeddings_test, text_labels_test, image_labels_test, all_labels_test = generate_embeddings(city, test_dataloader)
    torch.save(multimodal_embeddings_test, os.path.join(directory, f"{city}_multimodaldata_test.pt"))
    torch.save(text_embeddings_test, os.path.join(directory, f"{city}_textdata_test.pt"))
    torch.save(image_embeddings_test, os.path.join(directory, f"{city}_imagedata_test.pt"))
    torch.save(text_labels_test, os.path.join(directory, f"{city}_labels_text_test.pt"))  # Save text labels separately
    torch.save(image_labels_test, os.path.join(directory, f"{city}_labels_image_test.pt"))  # Save image labels separately
    torch.save(all_labels_test, os.path.join(directory, f"{city}_labels_multimodal_test.pt"))  # Save image labels separately
"""

'\ndef generate_embeddings_for_city(city, folder_name):    \n    # Load your DataFrame\n    pickled_dir = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/trained_declutr_vit"\n    data_dir = os.path.join(args.data_dir, city + ".csv")\n    args.image_dir = os.path.join("/workspace/persistent/HTClipper/data/IMAGES", city, "image", "image")\n    df = pd.read_csv(data_dir)\n\n    # Mapping every image to its corresponding text\n    df = map_images_with_text(df)\n    # Encode the labels\n    label_encoder = LabelEncoder()\n    df[\'VENDOR\'] = label_encoder.fit_transform(df[\'VENDOR\'])\n\n    # Identify and remove classes with fewer than 2 instances\n    class_counts = df[\'VENDOR\'].value_counts()\n    valid_classes = class_counts[class_counts >= 3].index\n    df_filtered = df[df[\'VENDOR\'].isin(valid_classes)]\n\n    # Split the data into train and test sets (80-20 split)\n    train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=arg

In [None]:
for city in ["chicago", "atlanta", "detroit", "houston", "dallas", "ny", "sf"]:
    generate_embeddings_for_city(city, "CE")

In [None]:
for city in ["south", "midwest", "west", "northeast"]:
    print("-"*50 + city + "-"*50)
    generate_embeddings_for_city(city, "E2E/CE-attention")

In [69]:
def load_e2e_ce_model(fusion_technique):
    # Create an instance of the model
    model = multimodalFusionModel(
        text_model=text_model,
        image_model=image_model,
        fusion_technique=fusion_technique,
        num_classes=len(label_encoder.classes_),
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        eps=args.adam_epsilon,
        num_training_steps=num_training_steps,
        warmup_steps=warmup_steps,
        temperature=args.temp,
        loss_function="CE",
        ce_weight = 1.0,
        supcon_weight = 1.0,
        itm_weight = 1.0,
        ntxent_weight = 1.0,
        num_hard_negatives = 5
    )

    # Load the checkpoload_e2e_ce_modelint
    checkpoint = torch.load(f"/workspace/persistent/HTClipper/models/grouped-and-masked/multimodal-baselines/classification/south/seed:1111/lr-0.0001/CE/0.5/{fusion_technique}/final_model.ckpt")

    # Load the state dictionary into the model
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    return model

In [70]:
for technique in ["attention", "concat", "learned_fusion", "mean"]:
    print(f"fusion technique: {technique}")
    model = None
    print(f"model:{model}")
    
    model = load_e2e_ce_model(technique)
    # Set the model to evaluation mode
    model.eval()

    # Move the model to the desired device
    model = model.to(device)

    for city in ["south", "midwest", "west", "northeast"]:
        print("-"*50 + city + "-"*50)
        generate_embeddings_for_city(model, city, f"E2E/CE-{technique}")

fusion technique: attention
model:None
--------------------------------------------------south--------------------------------------------------


Extracting embeddings for south: 100%|██████████| 409/409 [02:31<00:00,  2.70it/s]
Extracting embeddings for south: 100%|██████████| 104/104 [00:40<00:00,  2.55it/s]


--------------------------------------------------midwest--------------------------------------------------


Extracting embeddings for midwest: 100%|██████████| 230/230 [01:26<00:00,  2.65it/s]
Extracting embeddings for midwest: 100%|██████████| 58/58 [00:23<00:00,  2.48it/s]


--------------------------------------------------west--------------------------------------------------


Extracting embeddings for west: 100%|██████████| 87/87 [00:35<00:00,  2.42it/s]
Extracting embeddings for west: 100%|██████████| 22/22 [00:10<00:00,  2.03it/s]


--------------------------------------------------northeast--------------------------------------------------


Extracting embeddings for northeast: 100%|██████████| 90/90 [01:02<00:00,  1.45it/s]
Extracting embeddings for northeast: 100%|██████████| 26/26 [00:26<00:00,  1.03s/it]


fusion technique: concat
model:None
--------------------------------------------------south--------------------------------------------------


Extracting embeddings for south: 100%|██████████| 409/409 [02:32<00:00,  2.69it/s]
Extracting embeddings for south: 100%|██████████| 104/104 [00:40<00:00,  2.58it/s]


--------------------------------------------------midwest--------------------------------------------------


Extracting embeddings for midwest: 100%|██████████| 230/230 [01:28<00:00,  2.59it/s]
Extracting embeddings for midwest: 100%|██████████| 58/58 [00:23<00:00,  2.44it/s]


--------------------------------------------------west--------------------------------------------------


Extracting embeddings for west: 100%|██████████| 87/87 [00:35<00:00,  2.48it/s]
Extracting embeddings for west: 100%|██████████| 22/22 [00:10<00:00,  2.05it/s]


--------------------------------------------------northeast--------------------------------------------------


Extracting embeddings for northeast: 100%|██████████| 90/90 [00:36<00:00,  2.49it/s]
Extracting embeddings for northeast: 100%|██████████| 26/26 [00:12<00:00,  2.08it/s]


fusion technique: learned_fusion
model:None
--------------------------------------------------south--------------------------------------------------


Extracting embeddings for south: 100%|██████████| 409/409 [02:34<00:00,  2.65it/s]
Extracting embeddings for south: 100%|██████████| 104/104 [00:39<00:00,  2.62it/s]


--------------------------------------------------midwest--------------------------------------------------


Extracting embeddings for midwest: 100%|██████████| 230/230 [01:26<00:00,  2.64it/s]
Extracting embeddings for midwest: 100%|██████████| 58/58 [00:25<00:00,  2.30it/s]


--------------------------------------------------west--------------------------------------------------


Extracting embeddings for west: 100%|██████████| 87/87 [00:35<00:00,  2.44it/s]
Extracting embeddings for west: 100%|██████████| 22/22 [00:10<00:00,  2.01it/s]


--------------------------------------------------northeast--------------------------------------------------


Extracting embeddings for northeast: 100%|██████████| 90/90 [00:37<00:00,  2.41it/s]
Extracting embeddings for northeast: 100%|██████████| 26/26 [00:12<00:00,  2.08it/s]


fusion technique: mean
model:None
--------------------------------------------------south--------------------------------------------------


Extracting embeddings for south: 100%|██████████| 409/409 [02:33<00:00,  2.67it/s]
Extracting embeddings for south: 100%|██████████| 104/104 [00:40<00:00,  2.54it/s]


--------------------------------------------------midwest--------------------------------------------------


Extracting embeddings for midwest: 100%|██████████| 230/230 [01:30<00:00,  2.55it/s]
Extracting embeddings for midwest: 100%|██████████| 58/58 [00:24<00:00,  2.41it/s]


--------------------------------------------------west--------------------------------------------------


Extracting embeddings for west: 100%|██████████| 87/87 [00:34<00:00,  2.51it/s]
Extracting embeddings for west: 100%|██████████| 22/22 [00:10<00:00,  2.03it/s]


--------------------------------------------------northeast--------------------------------------------------


Extracting embeddings for northeast: 100%|██████████| 90/90 [00:36<00:00,  2.45it/s]
Extracting embeddings for northeast: 100%|██████████| 26/26 [00:12<00:00,  2.07it/s]


In [11]:
def load_e2e_ce_supcon_model(temp, fusion_technique="mean"):
    # Create an instance of the model
    model = multimodalFusionModel(
        text_model=text_model,
        image_model=image_model,
        fusion_technique=fusion_technique,
        num_classes=len(label_encoder.classes_),
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        eps=args.adam_epsilon,
        num_training_steps=num_training_steps,
        warmup_steps=warmup_steps,
        temperature=temp,
        loss_function="CE",
        ce_weight = 1.0,
        supcon_weight = 1.0,
        itm_weight = 1.0,
        ntxent_weight = 1.0,
        num_hard_negatives = 5
    )

    # Load the checkpoload_e2e_ce_modelint
    checkpoint = torch.load(f"/workspace/persistent/HTClipper/models/grouped-and-masked/multimodal-baselines/classification/south/seed:1111/lr-0.0001/CE+SupCon/{temp}/mean/final_model.ckpt")

    # Load the state dictionary into the model
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    return model

In [14]:
for temp in [0.5]:
    print(f"temp: {temp}")
    model = None
    print(f"model:{model}")
    
    model = load_e2e_ce_supcon_model(temp, fusion_technique="mean")
    # Set the model to evaluation mode
    model.eval()

    # Move the model to the desired device
    model = model.to(device)

    for city in ["south", "midwest", "west", "northeast"]:
        print("-"*50 + city + "-"*50)
        generate_embeddings_for_city(model, city, f"E2E/CE-SupCon-mean-{temp}")

temp: 0.5
model:None
--------------------------------------------------south--------------------------------------------------


Extracting embeddings for south: 100%|██████████| 409/409 [06:34<00:00,  1.04it/s]
Extracting embeddings for south: 100%|██████████| 104/104 [01:42<00:00,  1.01it/s]


--------------------------------------------------midwest--------------------------------------------------


Extracting embeddings for midwest: 100%|██████████| 230/230 [04:43<00:00,  1.23s/it]
Extracting embeddings for midwest: 100%|██████████| 58/58 [01:10<00:00,  1.22s/it]


--------------------------------------------------west--------------------------------------------------


Extracting embeddings for west: 100%|██████████| 87/87 [01:45<00:00,  1.21s/it]
Extracting embeddings for west: 100%|██████████| 22/22 [00:29<00:00,  1.34s/it]


--------------------------------------------------northeast--------------------------------------------------


Extracting embeddings for northeast: 100%|██████████| 90/90 [01:48<00:00,  1.21s/it]
Extracting embeddings for northeast: 100%|██████████| 26/26 [00:33<00:00,  1.30s/it]


In [20]:
# Create an instance of the model
model = multimodalFusionModel(
    text_model=text_model,
    image_model=image_model,
    fusion_technique="mean",
    num_classes=len(label_encoder.classes_),
    learning_rate=args.learning_rate,
    weight_decay=args.weight_decay,
    eps=args.adam_epsilon,
    num_training_steps=num_training_steps,
    warmup_steps=warmup_steps,
    temperature=args.temp,
    loss_function="CE+NTXent",
    ce_weight=1.0,
    supcon_weight=1.0,
    itm_weight=1.0,
    num_hard_negatives=5
)

# Load the checkpoint
checkpoint = torch.load("/workspace/persistent/HTClipper/models/multimodal-baselines/latent_fusion/chicago/seed:1111/lr-0.0001/CE+NTXent/0.3/mean/final_model.ckpt")

# Load the state dictionary into the model
model.load_state_dict(checkpoint['state_dict'])

# Set the model to evaluation mode
model.eval()

# Move the model to the desired device
model = model.to(device)

In [None]:
for city in ["chicago", "atlanta", "detroit", "houston", "dallas", "ny", "sf"]:
    generate_embeddings_for_city(city, "CE-NTXent")

Extracting embeddings for chicago:   1%|          | 8/1314 [00:09<15:50,  1.37it/s]  

In [31]:
# Create an instance of the model
model = multimodalFusionModel(
    text_model=text_model,
    image_model=image_model,
    fusion_technique=args.fusion_technique,
    num_classes=len(label_encoder.classes_),
    learning_rate=args.learning_rate,
    weight_decay=args.weight_decay,
    eps=args.adam_epsilon,
    num_training_steps=num_training_steps,
    warmup_steps=warmup_steps,
    temperature=args.temp,
    loss_function=args.loss
)

# Load the checkpoint
checkpoint = torch.load("/workspace/persistent/HTClipper/models/multimodal-baselines/latent_fusion/chicago/seed:1111/lr-0.0001/CE+SupCon+ITM/0.1/mean/final_model.ckpt")

# Load the state dictionary into the model
model.load_state_dict(checkpoint['state_dict'])

# Set the model to evaluation mode
model.eval()

# Move the model to the desired device
model = model.to(device)

In [None]:
for city in ["chicago", "atlanta", "detroit", "houston", "dallas", "ny", "sf"]:
    generate_embeddings_for_city(city, "CE+SupCon+ITM")

Extracting embeddings for chicago: 100%|██████████| 1314/1314 [06:28<00:00,  3.38it/s]
Extracting embeddings for chicago: 100%|██████████| 47/47 [00:16<00:00,  2.82it/s]
Extracting embeddings for atlanta:  19%|█▊        | 170/916 [01:58<06:11,  2.01it/s]

# Generating true positive and false positives

In [28]:
model = load_e2e_ce_supcon_model(temp=0.1, fusion_technique="mean")
# Set the model to evaluation mode
model.eval()

# Move the model to the desired device
model = model.to(device)

In [12]:
import lightning as L
import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

trainer = L.Trainer(max_epochs=32, accelerator="gpu", fast_dev_run=False, 
                    accumulate_grad_batches = 4, # To run the backward step after n batches, helps to increase the batch size
                    benchmark = True, # Fastens the training process
                    deterministic=True, # Ensures reproducibility 
                    limit_train_batches=1.0, # trains on 10% of the data,
                    check_val_every_n_epoch = 1, # run val loop every 1 training epochs
                    # callbacks=[model_checkpoint, early_stop_callback], # Enables model checkpoint and early stopping
                    # callbacks=[early_stop_callback],
                    # logger = wandb_logger,
                    # strategy=DeepSpeedStrategy(stage=3, offload_optimizer=True, offload_parameters=True, offload_params_device='cpu'), # Enable CPU Offloading, and offload parameters to CPU
                    # plugins=DeepSpeedPrecisionPlugin(precision='16-mixed') # Mixed Precision system
                    precision='16-mixed' # Mixed Precision system
                    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


In [13]:
trainer.test(model=model, dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 410/410 [01:01<00:00,  6.62it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc             0.940451443195343
      test_f1_macro         0.9314517378807068
      test_f1_micro          0.959435760974884
    test_f1_weighted        0.9595904350280762
        test_loss           0.2999718487262726
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.2999718487262726,
  'test_acc': 0.940451443195343,
  'test_f1_weighted': 0.9595904350280762,
  'test_f1_micro': 0.959435760974884,
  'test_f1_macro': 0.9314517378807068}]

In [None]:
from tqdm import tqdm

# Assuming 'pred' and 'actual' are lists intended to collect predictions and actual labels
pred, actual = ([] for i in range(2))

# Iterate over the test dataloader with a tqdm progress bar
for batch in tqdm(train_dataloader, desc="Extracting Train predictions"):    
    input_ids, attention_mask, pixel_values, labels = batch.get('input_ids'), batch.get('attention_mask'), batch.get('pixel_values'), batch['label']
    logits, _ = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
    preds = torch.argmax(logits, dim=1)
    
    # Append predictions and labels to their respective lists
    pred.append(preds.cpu().numpy())
    actual.append(labels.cpu().numpy())

Extracting Train predictions:  83%|████████▎ | 1356/1639 [1:04:47<14:29,  3.07s/it]

In [None]:
train_pred_labels = [int(item) for array in pred for item in array]
train_actual_labels = [int(item) for array in actual for item in array]

In [None]:
from tqdm import tqdm

# Assuming 'pred' and 'actual' are lists intended to collect predictions and actual labels
pred, actual = ([] for i in range(2))

# Iterate over the test dataloader with a tqdm progress bar
for batch in tqdm(test_dataloader, desc="Extracting Test predictions"):    
    input_ids, attention_mask, pixel_values, labels = batch.get('input_ids'), batch.get('attention_mask'), batch.get('pixel_values'), batch['label']
    logits, _ = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
    preds = torch.argmax(logits, dim=1)
    
    # Append predictions and labels to their respective lists
    pred.append(preds.cpu().numpy())
    actual.append(labels.cpu().numpy())

In [None]:
test_pred_labels = [int(item) for array in pred for item in array]
test_actual_labels = [int(item) for array in actual for item in array]

In [None]:
import pickle

with open('../error_analysis/multimodal_train_class_freq.pkl', 'wb') as f:
    pickle.dump(train_actual_labels, f)
    
with open('../error_analysis/multimodal_test_pred_labels.pkl', 'wb') as f:
    pickle.dump(test_pred_labels, f)
    
with open('../error_analysis/multimodal_test_act_labels.pkl', 'wb') as f:
    pickle.dump(test_actual_labels, f)

# Saving TP and FP results for images with and without faces

In [33]:
# %% Load your DataFrame
df = pd.read_csv("../data/processed/south.csv")
df['region'] = "south"

In [82]:
# Function to map images with text for CLIP model
def map_images_with_text_for_clip_model(df, img_dir="/workspace/persistent/HTClipper/data/IMAGES", filter_by="vendor"):
    # Initialize a list to store the new rows
    new_rows = []

    # Iterate over each row in the dataframe
    for _, row in df.iterrows():
        text = row['TEXT']
        all_images = str(row['IMAGES']).split('|')
        characteristics = str(row['FACES']).split('|')
        if filter_by == "vendor":
            vendor = row['VENDOR']
        elif filter_by == "id":
            vendor = row['ID']
        region = row['region']
        
        # Create a new entry for each image
        for index, image in enumerate(all_images):
            full_image_path = os.path.join(img_dir, region, "image", "image", image)
            
            # Only add the row if the image exists at the specified path
            if os.path.exists(full_image_path):
                new_rows.append({
                    'TEXT': text,
                    'IMAGES': full_image_path,  # Store the full image path
                    'VENDOR': vendor,
                    'region' : region,
                    'FACES' : characteristics[index]
                })

    # Create a new dataframe from the list of new rows
    return pd.DataFrame(new_rows)

In [35]:
# mapping every image to it's corresponding text
df = map_images_with_text_for_clip_model(df)

In [37]:
# Encode the labels
label_encoder = LabelEncoder()
df['VENDOR'] = label_encoder.fit_transform(df['VENDOR'])

# Identify and remove classes with fewer than 2 instances
# Since we use stratify during splitting, we should atleast have one training example in training and one in test dataset
class_counts = df['VENDOR'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df_filtered = df[df['VENDOR'].isin(valid_classes)]

In [40]:
# Split the data into train, validation, and test sets
train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=args.seed, stratify=df_filtered['VENDOR'])
# train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=args.seed, stratify=train_df['VENDOR'])

# Replacing all the numbers in the training dataset with the letter "N"
train_df['TEXT'] = train_df['TEXT'].apply(lambda x: re.sub(r'\d', 'N', str(x)))

# Augment the training data by adding multiple entries for each image
# train_df = augment_image_training_data(train_df)

# %% Intializing the tokenizers and models
# Since these are the two models that performed individually on the text and image modalities, we establish them as benchmarks and
# only run use them in our further experiments.
text_tokenizer = AutoTokenizer.from_pretrained('johngiorgi/declutr-small')
text_model = AutoModel.from_pretrained('johngiorgi/declutr-small')
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
train_face_vendor_dict = dict(Counter(train_df[train_df['FACES'] == "yes"]['VENDOR']))
train_noface_vendor_dict = dict(Counter(train_df[train_df['FACES'] == "no"]['VENDOR']))

In [53]:
import pickle

with open('../error_analysis/multimodal_face_train_class_freq.pkl', 'wb') as f:
    pickle.dump(train_face_vendor_dict, f)
    
with open('../error_analysis/multimodal_noface_train_class_freq.pkl', 'wb') as f:
    pickle.dump(train_noface_vendor_dict, f)

In [63]:
# Faces Dataset
# Create the datasets and dataloaders
train_dataset = MultimodalDataset(train_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=args.augment_data)
# val_dataset = MultimodalDataset(val_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)
test_dataset = MultimodalDataset(test_df[test_df.FACES == "yes"], text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)

train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=True)
# val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)

In [64]:
from tqdm import tqdm

# Assuming 'pred' and 'actual' are lists intended to collect predictions and actual labels
pred, actual = ([] for i in range(2))

# Iterate over the test dataloader with a tqdm progress bar
for batch in tqdm(test_dataloader, desc="Extracting Test predictions"):    
    input_ids, attention_mask, pixel_values, labels = batch.get('input_ids'), batch.get('attention_mask'), batch.get('pixel_values'), batch['label']
    
    logits, _ = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), pixel_values=pixel_values.to(device))
    preds = torch.argmax(logits, dim=1)
    
    # Append predictions and labels to their respective lists
    pred.append(preds.cpu().numpy())
    actual.append(labels.cpu().numpy())

Extracting Test predictions: 100%|██████████| 200/200 [00:51<00:00,  3.91it/s]


In [54]:
test_pred_labels = [int(item) for array in pred for item in array]
test_actual_labels = [int(item) for array in actual for item in array]

In [57]:
with open('../error_analysis/multimodal_faceclassification_text_test_pred_labels.pkl', 'wb') as f:
    pickle.dump(test_pred_labels, f)
    
with open('../error_analysis/mulitmodal_faceclassification_text_test_act_labels.pkl', 'wb') as f:
    pickle.dump(test_actual_labels, f)

In [58]:
# No Faces Dataset
# Create the datasets and dataloaders
train_dataset = MultimodalDataset(train_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=args.augment_data)
# val_dataset = MultimodalDataset(val_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)
test_dataset = MultimodalDataset(test_df[test_df.FACES == "no"], text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)

train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=True)
# val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)

In [59]:
from tqdm import tqdm

# Assuming 'pred' and 'actual' are lists intended to collect predictions and actual labels
pred, actual = ([] for i in range(2))

# Iterate over the test dataloader with a tqdm progress bar
for batch in tqdm(test_dataloader, desc="Extracting Test predictions"):    
    input_ids, attention_mask, pixel_values, labels = batch.get('input_ids'), batch.get('attention_mask'), batch.get('pixel_values'), batch['label']
    
    logits, _ = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), pixel_values=pixel_values.to(device))
    preds = torch.argmax(logits, dim=1)
    
    # Append predictions and labels to their respective lists
    pred.append(preds.cpu().numpy())
    actual.append(labels.cpu().numpy())

Extracting Test predictions: 100%|██████████| 210/210 [01:09<00:00,  3.03it/s]


In [60]:
test_pred_labels = [int(item) for array in pred for item in array]
test_actual_labels = [int(item) for array in actual for item in array]

In [61]:
with open('../error_analysis/multimodal_nofaceclassification_text_test_pred_labels.pkl', 'wb') as f:
    pickle.dump(test_pred_labels, f)
    
with open('../error_analysis/mulitmodal_nofaceclassification_text_test_act_labels.pkl', 'wb') as f:
    pickle.dump(test_actual_labels, f)

# Generating Retrieval Data for Multimodal Systems with Images fAces and no faces

In [78]:
model = load_e2e_ce_supcon_model(temp=0.5, fusion_technique="mean")
# Set the model to evaluation mode
model.eval()

# Move the model to the desired device
model = model.to(device)

In [83]:
def generate_faces_embeddings(model, city, dataloader):
    # Initialize lists to store embeddings
    all_multimodal_embeddings = []
    all_text_embeddings = []
    all_image_embeddings = []
    all_labels = []
    
    # Iterate through all batches in the dataloader with a progress bar
    for batch in tqdm(dataloader, desc=f"Extracting embeddings for {city}"):
        input_ids, attention_mask, pixel_values, labels = batch.get('input_ids'), batch.get('attention_mask'), batch.get('pixel_values'), batch['label']

        # Extract multimodal embeddings
        multimodal_embeddings = model.extract_embeddings(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), pixel_values=pixel_values.to(device))
        all_multimodal_embeddings.append(multimodal_embeddings.cpu().detach())

        # Extract text embeddings
        text_embeddings = model.extract_embeddings(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        all_text_embeddings.append(text_embeddings.cpu().detach())

        # Extract image embeddings
        image_embeddings = model.extract_embeddings(pixel_values=pixel_values.to(device))
        all_image_embeddings.append(image_embeddings.cpu().detach())
        
        # Extract labels
        all_labels.append(labels.cpu().detach())

    # Concatenate all embeddings into 2D tensors
    all_multimodal_embeddings = torch.cat(all_multimodal_embeddings, dim=0)
    all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
    all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    
    return all_multimodal_embeddings, all_text_embeddings, all_image_embeddings, all_labels

def generate_face_embeddings_for_city(model, city, mode="face"):    
    # %% Load your DataFrame
    pickled_dir = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/multimodal_baselines/"
    data_dir = os.path.join(args.data_dir, city + ".csv")
    args.image_dir = os.path.join("/workspace/persistent/HTClipper/data/IMAGES", city, "image", "image")
    df = pd.read_csv(data_dir)
    df['region'] = city

    df = map_images_with_text_for_clip_model(df)
    
    # Encode the labels
    label_encoder = LabelEncoder()
    df['VENDOR'] = label_encoder.fit_transform(df['VENDOR'])

    # Identify and remove classes with fewer than 2 instances
    # Since we use stratify during splitting, we should atleast have one training example in training and one in test dataset
    class_counts = df['VENDOR'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    df_filtered = df[df['VENDOR'].isin(valid_classes)]
    
    train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=args.seed, stratify=df_filtered['VENDOR'])

    # Replacing all the numbers in the training dataset with the letter "N"
    train_df['TEXT'] = train_df['TEXT'].apply(lambda x: re.sub(r'\d', 'N', str(x)))

    # Augment the training data by adding multiple entries for each image
    # train_df = augment_image_training_data(train_df)
    
    # Since these are the two models that performed individually on the text and image modalities, we establish them as benchmarks and
    # only run use them in our further experiments.
    text_tokenizer = AutoTokenizer.from_pretrained('johngiorgi/declutr-small')
    text_model = AutoModel.from_pretrained('johngiorgi/declutr-small')
    image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

    if mode == "face":
        test_df = test_df[test_df.FACES == "yes"]
    else:
        test_df = test_df[test_df.FACES == "no"]
    
    # Create the datasets and dataloaders
    train_dataset = MultimodalDataset(train_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=args.augment_data)
    # val_dataset = MultimodalDataset(val_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)
    test_dataset = MultimodalDataset(test_df, text_tokenizer, image_processor, label_encoder, image_dir=args.image_dir, augment=False)

    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)
    # val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4, pin_memory=True)
    
    file_dir = f"/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/error_analysis/multimodal_baseline/trained_declutr-vit/{mode}"
    Path(file_dir).mkdir(parents=True, exist_ok=True) 
    
    multimodal_embeddings, text_embeddings, image_embeddings, labels = generate_faces_embeddings(model, city, train_dataloader)
    label_filename = city + "_labels_train.pt"
    multimodal_data_filename = city + "_multimodaldata_train.pt"
    text_data_filename = city + "_textdata_train.pt"
    image_data_filename = city + "_imagedata_train.pt"
    
    
    torch.save(multimodal_embeddings, os.path.join(file_dir, multimodal_data_filename))
    torch.save(text_embeddings, os.path.join(file_dir, text_data_filename))
    torch.save(image_embeddings, os.path.join(file_dir, image_data_filename))
    torch.save(labels, os.path.join(file_dir, label_filename))
    
    multimodal_embeddings, text_embeddings, image_embeddings, labels = generate_faces_embeddings(model, city, test_dataloader)
    label_filename = city + "_labels_test.pt"
    multimodal_data_filename = city + "_multimodaldata_test.pt"
    text_data_filename = city + "_textdata_test.pt"
    image_data_filename = city + "_imagedata_test.pt"
    torch.save(multimodal_embeddings, os.path.join(file_dir, multimodal_data_filename))
    torch.save(text_embeddings, os.path.join(file_dir, text_data_filename))
    torch.save(image_embeddings, os.path.join(file_dir, image_data_filename))
    torch.save(labels, os.path.join(file_dir, label_filename))

In [84]:

for city in ["south", "midwest", "west", "northeast"]:
    print("-"*50 + city + "-"*50)
    generate_face_embeddings_for_city(model, city, mode="face")

--------------------------------------------------south--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for south: 100%|██████████| 410/410 [09:39<00:00,  1.41s/it]
Extracting embeddings for south: 100%|██████████| 50/50 [01:09<00:00,  1.39s/it]


--------------------------------------------------midwest--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for midwest: 100%|██████████| 229/229 [05:03<00:00,  1.33s/it]
Extracting embeddings for midwest: 100%|██████████| 32/32 [00:54<00:00,  1.70s/it]


--------------------------------------------------west--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for west: 100%|██████████| 87/87 [01:55<00:00,  1.32s/it]
Extracting embeddings for west: 100%|██████████| 13/13 [00:20<00:00,  1.61s/it]


--------------------------------------------------northeast--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for northeast: 100%|██████████| 93/93 [02:33<00:00,  1.65s/it]
Extracting embeddings for northeast: 100%|██████████| 15/15 [00:30<00:00,  2.05s/it]


In [85]:

for city in ["south", "midwest", "west", "northeast"]:
    print("-"*50 + city + "-"*50)
    generate_face_embeddings_for_city(model, city, mode="noface")

--------------------------------------------------south--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for south: 100%|██████████| 410/410 [03:43<00:00,  1.83it/s]
Extracting embeddings for south: 100%|██████████| 53/53 [01:30<00:00,  1.71s/it]


--------------------------------------------------midwest--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for midwest: 100%|██████████| 229/229 [02:10<00:00,  1.75it/s]
Extracting embeddings for midwest: 100%|██████████| 26/26 [00:44<00:00,  1.73s/it]


--------------------------------------------------west--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for west: 100%|██████████| 87/87 [00:52<00:00,  1.67it/s]
Extracting embeddings for west: 100%|██████████| 9/9 [00:20<00:00,  2.23s/it]


--------------------------------------------------northeast--------------------------------------------------


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting embeddings for northeast: 100%|██████████| 93/93 [00:50<00:00,  1.86it/s]
Extracting embeddings for northeast: 100%|██████████| 9/9 [00:18<00:00,  2.01s/it]
