In [1]:
# %% Loading libraries
import os
import sys
import argparse
import time
import datetime
import random
import pickle

from PIL import Image
from pathlib import Path
from collections import Counter
from collections import OrderedDict
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torchvision import transforms

from sklearn.model_selection import train_test_split

import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import WandbLogger

import timm
from timm import create_model
from timm.models.vision_transformer import VisionTransformer

# Custom library
sys.path.append('../process/')
from imageUtilities import load_images_and_labels
from loadData import ImageDataModule

sys.path.append('../architectures/')
from visionClassifierLayer import PreTrainedVisionModel

import warnings
warnings.filterwarnings('ignore')



In [2]:
# Simulate the command-line arguments (a notebook has no real argv to parse)
args_list = [
    '--model_name_or_path', 'vit_base_patch16_224',
    '--logged_entry_name', 'vgg16-seed:1111',
    '--data_dir', '/workspace/persistent/HTClipper/data/processed',
    '--data_type', 'all',
    '--city', 'south',
    '--save_dir', os.path.join(os.getcwd(), "../models/image-baselines"),
    '--batch_size', '32',
    '--nb_epochs', '40',
    '--patience', '3',
    '--seed', '1111',
    '--warmup_steps', '0',
    '--grad_steps', '4',
    '--learning_rate', '6e-4',
    '--train_data_percentage', '1.0',
    '--adam_epsilon', '1e-6',
    '--min_delta_change', '0.01',
    '--weight_decay', '0.01',
    '--augment_data', 'False',
    '--nb_augmented_samples', '5'
]


def str2bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, and any
    non-empty string (including ``'False'``) is truthy, so the flag could never
    be switched off. This converter accepts the usual spellings instead.

    Args:
        value: The raw argument (``str``), or an actual ``bool``.

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


# Create the argument parser
parser = argparse.ArgumentParser(description="Trains a image classifier to establish baselines for Authorship tasks on Backpage advertisements.")
parser.add_argument('--model_name_or_path', type=str, default="vit_base_patch16_224", help="Name of the model to be trained (can only be between distilbert-base-cased)")
parser.add_argument('--logged_entry_name', type=str, default="vgg16-seed:1111", help="Logged entry name visible on weights and biases")
parser.add_argument('--data_dir', type=str, default='/workspace/persistent/HTClipper/data/processed', help="""Data directory""")
parser.add_argument('--city', type=str, default='south', help="""Demography of data, can be only between chicago, atlanta, houston, dallas, detroit, ny, or sf""")
parser.add_argument('--data_type', type=str, default="all", help="can be faces for the dataset with human faces or nofaces for body parts dataset")
parser.add_argument('--save_dir', type=str, default=os.path.join(os.getcwd(), "../models/image-baselines"), help="""Directory for models to be saved""")
parser.add_argument('--batch_size', type=int, default=32, help="Batch Size")
parser.add_argument('--nb_epochs', type=int, default=40, help="Number of Epochs")
parser.add_argument('--patience', type=int, default=3, help="Patience for Early Stopping")
parser.add_argument('--seed', type=int, default=1111, help='Random seed value')
parser.add_argument('--warmup_steps', type=int, default=0, help="Warmup proportion")
parser.add_argument('--grad_steps', type=int, default=4, help="Gradient accumulating step")
parser.add_argument('--learning_rate', type=float, default=6e-4, help="learning rate")
parser.add_argument('--train_data_percentage', type=float, default=1.0, help="Percentage of training data to be used")
parser.add_argument('--adam_epsilon', type=float, default=1e-6, help="Epsilon value for adam optimizer")
parser.add_argument('--min_delta_change', type=float, default=0.01, help="Minimum change in delta in validation loss for Early Stopping")
parser.add_argument('--weight_decay', type=float, default=0.01, help="Weight decay")
# str2bool instead of bool: bool('False') is True, which silently enabled augmentation
parser.add_argument('--augment_data', type=str2bool, default=False, help='Enables data augmentation')
parser.add_argument('--nb_augmented_samples', type=int, default=5, help='Number of augmented samples to be generated')

# Parse the arguments using the simulated args_list
args = parser.parse_args(args_list)

In [3]:
# Reproducibility setup: process environment first, then cuDNN, then every RNG.
os.environ.update({
    'PYTHONHASHSEED': str(args.seed),
    "TOKENIZERS_PARALLELISM": "false",
    'CUDA_VISIBLE_DEVICES': "0",
})

# Deterministic cuDNN kernels, no benchmark-based algorithm selection
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

# Kept last: its return value (the seed) is what the cell displays
seed_everything(args.seed)

Seed set to 1111


1111

In [4]:
# Sanity-check the configuration before anything is written to disk
assert args.data_type in ["faces", "nofaces", "all"]
assert args.model_name_or_path in [
    'vgg16', 'vgg19',
    "resnet50", "resnet101", "resnet152",
    "mobilenet", "mobilenetv2",
    "densenet121", "densenet169",
    "efficientnet-b0", "efficientnet-b1", "efficientnet-b2", "efficientnet-b3",
    "efficientnet-b4", "efficientnet-b5", "efficientnet-b6", "efficientnet-b7",
    "efficientnetv2_rw_m", "efficientnetv2_rw_s", "efficientnetv2_rw_t",
    "convnext_tiny", "convnext_small", "convnext_base", "convnext_large", "convnext_xlarge",
    "vit_base_patch16_224", "vit_large_patch16_224", "vit_base_patch32_224", "vit_large_patch32_224",
    "inception_v3", "inception_resnet_v2",
]

# Output directory for this run: <save_dir>/<model>/<city>/<data_type>/seed:<seed>/lr-<lr>
directory = os.path.join(
    args.save_dir,
    args.model_name_or_path.split("/")[-1],
    args.city,
    args.data_type,
    f"seed:{args.seed}",
    f"lr-{args.learning_rate}",
)
os.makedirs(directory, exist_ok=True)
os.makedirs(args.save_dir, exist_ok=True)

In [5]:
def extract_cls_embeddings_and_labels_from_vit(model, dataloader):
    """Collect CLS-token embeddings and labels for every batch of ``dataloader``.

    The model is put into eval mode for the duration of the call so that
    dropout and similar layers cannot perturb the embeddings; whatever mode it
    was in beforehand is restored afterwards.

    Args:
        model: Module exposing ``forward_features(images)`` whose result is
            indexed as ``[:, 0, :]`` (first token of every sequence).
        dataloader: Iterable of ``(images, labels)`` tensor pairs.

    Returns:
        Tuple ``(embeddings, labels)`` of NumPy arrays, concatenated over all
        batches along the first axis.
    """
    device = next(model.parameters()).device  # run on whatever device the model lives on
    was_training = model.training
    model.eval()

    cls_embeddings = []
    all_labels = []
    try:
        with torch.no_grad():
            for images, labels in dataloader:
                features = model.forward_features(images.to(device))
                cls_embeddings.append(features[:, 0, :].cpu())  # CLS token is the first token
                all_labels.append(labels.cpu())
    finally:
        # Leave the caller's train/eval state exactly as it was found
        model.train(was_training)

    # Concatenate the per-batch results along the first dimension
    cls_embeddings = torch.cat(cls_embeddings, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    return cls_embeddings.numpy(), all_labels.numpy()

In [6]:
def extract_cls_embeddings_and_labels_from_convnext(model, dataloader, device=None):
    """Run ``model`` over ``dataloader`` and collect its outputs and the labels.

    Args:
        model: Module called as ``model(images)``; it is switched to eval mode.
        dataloader: Iterable of ``(images, labels)`` tensor pairs.
        device: Device the images are moved to. When ``None`` (the default) the
            device of the model's first parameter is used, falling back to CPU
            for parameter-free models. Passing ``"cuda"`` explicitly reproduces
            the previous hard-coded default.

    Returns:
        Tuple ``(embeddings, labels)`` of CPU tensors, concatenated over all
        batches along the first axis.
    """
    if device is None:
        # Follow the model instead of assuming a GPU is present
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    embeddings = []
    labels = []

    with torch.no_grad():
        for images, label in dataloader:
            outputs = model(images.to(device))
            embeddings.append(outputs.cpu())
            labels.append(label.cpu())

    embeddings = torch.cat(embeddings)
    labels = torch.cat(labels)
    return embeddings, labels

In [7]:
# CSV file name for every (location, data type) pair.
# Cities come in three flavours (faces / nofaces / all images); the four
# regions only have the combined "all" file.
file_paths = {
    city: {
        "faces": f"{city}_faces.csv",
        "nofaces": f"{city}_nofaces.csv",
        "all": f"{city}_images.csv",
    }
    for city in ("chicago", "dallas", "houston", "detroit", "atlanta", "sf", "ny")
}
file_paths.update(
    {
        region: {"all": f"{region}_images.csv"}
        for region in ("south", "midwest", "west", "northeast")
    }
)

def generate_embeddings(model, model_name):
    """Export train/test embeddings of a pretrained backbone for every region.

    For each of the regions ``south``, ``midwest``, ``west`` and ``northeast``
    the ``all`` CSV is read from ``args.data_dir``, vendors with a single ad
    are dropped, the remaining vendor ids are re-indexed from 0, the images are
    loaded with ``target_size=(224, 224)`` and split 80/20 with
    ``random_state=1111``. Embeddings and labels of both splits are written
    with ``torch.save``.

    Args:
        model: Feature extractor handed to the extraction function.
        model_name: Output sub-directory and file-name prefix.
            ``"pretrained_vit_patch16"`` selects the ViT CLS-token extractor;
            any other value selects the ConvNeXt extractor.

    Returns:
        None. Results are only written to disk.
    """
    all_cities = ["south", "midwest", "west", "northeast"]

    # The extractor depends only on model_name, so pick it once
    if model_name == "pretrained_vit_patch16":
        extract_fn = extract_cls_embeddings_and_labels_from_vit
    else:
        extract_fn = extract_cls_embeddings_and_labels_from_convnext

    # torch.save does not create missing directories, so make sure the target exists
    base_path = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/" + model_name
    os.makedirs(base_path, exist_ok=True)

    for city in tqdm(all_cities):
        for data_type in tqdm(["all"], leave=False):
            tqdm.write(f"Processing {city} - {data_type}")  # Print the current city and data type being processed
            # Construct the file path and read the CSV file
            file_path = os.path.join(args.data_dir, file_paths[city][data_type])
            df = pd.read_csv(file_path)

            # Remove vendors that have less than 2 ads; copy so the column
            # assignment below writes to an independent frame
            vendors_of_interest = {k: v for k, v in Counter(df.VENDOR).items() if v > 1}
            df = df[df['VENDOR'].isin(vendors_of_interest.keys())].copy()

            # Remap vendor ids to 0..n-1 in order of first appearance. Every
            # remaining vendor is a key of the dict, so map() leaves no gaps.
            vendor_to_idx_dict = {vendor: idx for idx, vendor in enumerate(df.VENDOR.unique())}
            df["VENDOR"] = df["VENDOR"].map(vendor_to_idx_dict)

            # Load and preprocess images
            images, labels = load_images_and_labels(df, target_size=(224, 224), augment=False,
                                                    num_augmented_samples=args.nb_augmented_samples)
            assert images.shape[0] == labels.shape[0]

            # Split data into training and test sets
            X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.20, random_state=1111)

            # The test split is passed twice - presumably as validation and
            # test data; confirm against ImageDataModule
            data_module = ImageDataModule(X_train, y_train, X_test, y_test, X_test, y_test,
                                          batch_size=args.batch_size, augment_data=False)

            # Setup the data module for training/validation and testing
            data_module.setup('fit')
            data_module.setup('test')

            # Extract embeddings and labels
            train_embeddings, train_labels = extract_fn(model, data_module.train_dataloader())
            test_embeddings, test_labels = extract_fn(model, data_module.test_dataloader())

            assert train_embeddings.shape[0] == train_labels.shape[0]
            assert test_embeddings.shape[0] == test_labels.shape[0]

            # Save the embeddings and labels to disk
            torch.save(train_embeddings, os.path.join(base_path, f"{model_name}_{city}_{data_type}_train_embeddings.pt"))
            torch.save(train_labels, os.path.join(base_path, f"{model_name}_{city}_{data_type}_train_labels.pt"))
            torch.save(test_embeddings, os.path.join(base_path, f"{model_name}_{city}_{data_type}_test_embeddings.pt"))
            torch.save(test_labels, os.path.join(base_path, f"{model_name}_{city}_{data_type}_test_labels.pt"))

# ViT Transformers

In [13]:
# Pretrained ViT-B/16 from timm; num_classes=0 removes the classifier nn.Linear
model = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=0)
# Move to the GPU and switch to inference mode
model = model.cuda().eval()

# Preprocessing config and (non-training) transform resolved for this model
data_config = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(is_training=False, **data_config)

INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/vit_base_patch16_224.augreg2_in21k_ft_in1k)
INFO:timm.models._hub:[timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


In [14]:
# Export the pretrained ViT embeddings; "pretrained_vit_patch16" selects the ViT extractor
# and is used as the output directory / file prefix
generate_embeddings(model, "pretrained_vit_patch16")

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
                                     
  0%|          | 0/4 [00:00<?, ?it/s][A

Processing south - all



100%|██████████| 1/1 [02:02<00:00, 122.76s/it][A
 25%|██▌       | 1/4 [02:02<06:08, 122.76s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                              
 25%|██▌       | 1/4 [02:02<06:08, 122.76s/it]

Processing midwest - all



100%|██████████| 1/1 [01:04<00:00, 64.39s/it][A
 50%|█████     | 2/4 [03:07<02:56, 88.43s/it] [A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                             
 50%|█████     | 2/4 [03:07<02:56, 88.43s/it]

Processing west - all



100%|██████████| 1/1 [00:26<00:00, 26.39s/it][A
 75%|███████▌  | 3/4 [03:33<01:00, 60.11s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                             
 75%|███████▌  | 3/4 [03:33<01:00, 60.11s/it]

Processing northeast - all



100%|██████████| 1/1 [00:27<00:00, 27.69s/it][A
100%|██████████| 4/4 [04:01<00:00, 60.32s/it][A


# Loading the Pre-trained embeddings

# ConvNext Small

In [34]:
# Pretrained ConvNeXt-Small from timm; num_classes=0 removes the classifier nn.Linear
model = timm.create_model('convnext_small', pretrained=True, num_classes=0)
# Move to the GPU and switch to inference mode
model = model.cuda().eval()

# Preprocessing config and (non-training) transform resolved for this model
data_config = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(is_training=False, **data_config)

INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/convnext_small.in12k_ft_in1k)
INFO:timm.models._hub:[timm/convnext_small.in12k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


In [15]:
# Export the pretrained ConvNeXt-Small embeddings.
# NOTE(review): generate_embeddings as defined in this notebook has no return
# statement, so `a` is always None. The recorded output below lists
# city/faces/nofaces splits, which the current region-only loop does not
# produce - it appears to come from an earlier version of the function.
a = generate_embeddings(model, "pretrained_convNext-s")

  0%|          | 0/7 [00:00<?, ?it/s]
                                     [A
  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing chicago - faces



                                     .56s/it][A
  0%|          | 0/7 [00:24<?, ?it/s]        
 33%|███▎      | 1/3 [00:24<00:49, 24.56s/it][A

Processing chicago - nofaces



                                     .62s/it][A
  0%|          | 0/7 [00:42<?, ?it/s]        
 67%|██████▋   | 2/3 [00:42<00:20, 20.62s/it][A

Processing chicago - all



100%|██████████| 3/3 [01:25<00:00, 31.02s/it][A
 14%|█▍        | 1/7 [01:25<08:34, 85.82s/it][A
                                             
 14%|█▍        | 1/7 [01:25<08:34, 85.82s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing dallas - faces



                                             [A
 14%|█▍        | 1/7 [01:42<08:34, 85.82s/it]
 33%|███▎      | 1/3 [00:16<00:33, 16.96s/it][A

Processing dallas - nofaces



                                             [A
 14%|█▍        | 1/7 [01:57<08:34, 85.82s/it]
 67%|██████▋   | 2/3 [00:31<00:15, 15.59s/it][A

Processing dallas - all



100%|██████████| 3/3 [01:01<00:00, 22.01s/it][A
 29%|██▊       | 2/7 [02:27<05:56, 71.36s/it][A
                                             
 29%|██▊       | 2/7 [02:27<05:56, 71.36s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing houston - faces



                                             [A
 29%|██▊       | 2/7 [02:48<05:56, 71.36s/it]
 33%|███▎      | 1/3 [00:21<00:42, 21.00s/it][A

Processing houston - nofaces



                                             [A
 29%|██▊       | 2/7 [03:07<05:56, 71.36s/it]
 67%|██████▋   | 2/3 [00:40<00:20, 20.16s/it][A

Processing houston - all



100%|██████████| 3/3 [01:20<00:00, 28.96s/it][A
 43%|████▎     | 3/7 [03:47<05:01, 75.31s/it][A
                                             
 43%|████▎     | 3/7 [03:47<05:01, 75.31s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing detroit - faces



                                             [A
 43%|████▎     | 3/7 [03:52<05:01, 75.31s/it]
 33%|███▎      | 1/3 [00:05<00:11,  5.65s/it][A

Processing detroit - nofaces



                                             [A
 43%|████▎     | 3/7 [03:59<05:01, 75.31s/it]
 67%|██████▋   | 2/3 [00:12<00:06,  6.27s/it][A

Processing detroit - all



100%|██████████| 3/3 [00:23<00:00,  8.60s/it][A
 57%|█████▋    | 4/7 [04:10<02:44, 54.95s/it][A
                                             
 57%|█████▋    | 4/7 [04:10<02:44, 54.95s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing atlanta - faces



                                             [A
 57%|█████▋    | 4/7 [04:25<02:44, 54.95s/it]
 33%|███▎      | 1/3 [00:14<00:29, 14.66s/it][A

Processing atlanta - nofaces



                                             [A
 57%|█████▋    | 4/7 [04:43<02:44, 54.95s/it]
 67%|██████▋   | 2/3 [00:32<00:16, 16.67s/it][A

Processing atlanta - all



100%|██████████| 3/3 [01:03<00:00, 23.33s/it][A
 71%|███████▏  | 5/7 [05:14<01:56, 58.22s/it][A
                                             
 71%|███████▏  | 5/7 [05:14<01:56, 58.22s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing sf - faces



                                             [A
 71%|███████▏  | 5/7 [05:28<01:56, 58.22s/it]
 33%|███▎      | 1/3 [00:13<00:26, 13.33s/it][A

Processing sf - nofaces



                                             [A
 71%|███████▏  | 5/7 [05:38<01:56, 58.22s/it]
 67%|██████▋   | 2/3 [00:23<00:11, 11.32s/it][A

Processing sf - all



100%|██████████| 3/3 [00:44<00:00, 15.99s/it][A
 86%|████████▌ | 6/7 [05:59<00:53, 53.66s/it][A
                                             
 86%|████████▌ | 6/7 [05:59<00:53, 53.66s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing ny - faces



                                             [A
 86%|████████▌ | 6/7 [06:14<00:53, 53.66s/it]
 33%|███▎      | 1/3 [00:14<00:28, 14.49s/it][A

Processing ny - nofaces



                                             [A
 86%|████████▌ | 6/7 [06:23<00:53, 53.66s/it]
 67%|██████▋   | 2/3 [00:23<00:11, 11.45s/it][A

Processing ny - all



100%|██████████| 3/3 [00:47<00:00, 17.18s/it][A
100%|██████████| 7/7 [06:47<00:00, 58.20s/it][A


# Trained models on chicago dataset

In [19]:
# Module-level buffers the forward hooks append to:
# one for generic layers, one for the ViT block
hidden_states, hidden_states_vit = [], []

def load_model(model_name, checkpoint_path, num_classes=1000):
    """Build a timm model and load weights from a local checkpoint file.

    A ``state_dict`` entry is used when the checkpoint has one; otherwise the
    checkpoint itself is treated as the state dict. A leading ``model.`` is
    stripped from every key. The classifier is then aligned with the
    checkpoint:

    * ``head.fc.weight`` / ``head.weight`` present: the final linear layer is
      rebuilt with the checkpoint's number of output rows;
    * the checkpoint lacks only ``head.*`` parameters (as in the
      "Missing key(s) ... head.weight, head.bias" failure): the classifier is
      removed with ``reset_classifier(0)`` so the backbone weights still load
      strictly. The returned model then has no classification head.

    Args:
        model_name: Architecture name passed to ``create_model``.
        checkpoint_path: Path of the checkpoint file.
        num_classes: Initial classifier size; replaced as described above.

    Returns:
        The model with weights loaded, in eval mode.

    Raises:
        RuntimeError: From ``load_state_dict`` when the keys still do not match.
    """
    model = create_model(model_name, pretrained=False, num_classes=num_classes)
    # Load onto the CPU so a checkpoint saved from a GPU also opens on CPU-only machines
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    state_dict = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint

    # Remove 'model.' prefix from keys if present
    prefix = 'model.'
    new_state_dict = OrderedDict(
        (k[len(prefix):] if k.startswith(prefix) else k, v)
        for k, v in state_dict.items()
    )

    # Modify the final fully connected layer to match the checkpoint's layer size
    if 'head.fc.weight' in new_state_dict:
        num_classes_checkpoint = new_state_dict['head.fc.weight'].shape[0]
        model.head.fc = torch.nn.Linear(model.head.fc.in_features, num_classes_checkpoint)
    elif 'head.weight' in new_state_dict:
        num_classes_checkpoint = new_state_dict['head.weight'].shape[0]
        model.head = torch.nn.Linear(model.head.in_features, num_classes_checkpoint)
    else:
        # Drop the classifier only when it is the sole thing the checkpoint lacks
        missing = [k for k in model.state_dict() if k not in new_state_dict]
        if missing and all(k.startswith('head.') for k in missing):
            model.reset_classifier(0)

    model.load_state_dict(new_state_dict)
    model.eval()
    return model

# Hook function to capture hidden states
def get_hidden_states(module, input, output):
    """Forward hook: append ``output`` to the global ``hidden_states`` list.

    ``module`` and ``input`` are part of the hook signature and are not used.
    """
    hidden_states.append(output)

# Hook function to capture hidden states for ViT model
def get_vit_hidden_states(module, input, output):
    """Forward hook: append ``output`` to the global ``hidden_states_vit`` list.

    ``module`` and ``input`` are part of the hook signature and are not used.
    """
    hidden_states_vit.append(output)
    
def extract_cls_embeddings_and_labels_from_convnext(model, dataloader):
    """Collect one embedding per image, plus labels, from a hooked model.

    Per batch, the most recent entry that the forward hooks stored in the
    global ``hidden_states`` list is used: a 4-D tensor is averaged over
    dimensions 2 and 3, anything else is taken as is. When no hook captured
    anything, the model output itself is the embedding.

    Args:
        model: Module to run; switched to eval mode.
        dataloader: Iterable of ``(inputs, targets)`` tensor pairs.

    Returns:
        Tuple ``(embeddings, labels)`` of CPU tensors concatenated over batches.
    """
    device = next(model.parameters()).device
    model.eval()
    pooled_batches, label_batches = [], []
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        hidden_states.clear()  # keep only what this batch's forward pass captures
        with torch.no_grad():
            outputs = model(inputs)
            if not hidden_states:
                features = outputs
            elif hidden_states[-1].ndim == 4:
                # Mean pooling over the spatial dimensions
                features = hidden_states[-1].mean(dim=[2, 3])
            else:
                # Already flattened, use directly
                features = hidden_states[-1]
            pooled_batches.append(features.cpu())
            label_batches.append(targets.cpu())
    return torch.cat(pooled_batches), torch.cat(label_batches)

def extract_cls_embeddings_and_labels_from_vit(model, dataloader):
    """Collect CLS-token embeddings and labels from a (hooked) ViT model.

    Per batch, the most recent entry that the forward hook stored in the
    global ``hidden_states_vit`` list is used; when no hook captured anything
    the model output is used instead. A 3-D tensor is reduced to its first
    token (``[:, 0, :]``); a tensor of any other rank is taken as is, so a
    model that already returns one vector per image no longer raises an
    ``IndexError`` in the fallback path.

    Args:
        model: Module to run; switched to eval mode.
        dataloader: Iterable of ``(inputs, targets)`` tensor pairs.

    Returns:
        Tuple ``(embeddings, labels)`` of CPU tensors concatenated over batches.
    """
    embeddings = []
    labels = []
    device = next(model.parameters()).device
    model.eval()
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        hidden_states_vit.clear()  # Clear hidden states for each batch
        with torch.no_grad():
            outputs = model(inputs)

        tokens = hidden_states_vit[-1] if hidden_states_vit else outputs
        if tokens.ndim == 3:
            cls_token_embedding = tokens[:, 0, :]  # CLS token is the first token
        else:
            cls_token_embedding = tokens  # already one vector per image

        embeddings.append(cls_token_embedding.cpu())
        # Labels are only collected, so they never need to visit the device
        labels.append(targets.cpu())
    return torch.cat(embeddings), torch.cat(labels)

In [12]:
# CSV file name for every (location, data type) pair.
# Cities come in three flavours (faces / nofaces / all images); the four
# regions only have the combined "all" file.
file_paths = {
    city: {
        "faces": f"{city}_faces.csv",
        "nofaces": f"{city}_nofaces.csv",
        "all": f"{city}_images.csv",
    }
    for city in ("chicago", "dallas", "houston", "detroit", "atlanta", "sf", "ny")
}
file_paths.update(
    {
        region: {"all": f"{region}_images.csv"}
        for region in ("south", "midwest", "west", "northeast")
    }
)

# Regions whose embeddings are exported
_EMBEDDING_CITIES = ("south", "midwest", "west", "northeast")

# Root directory of the exported embeddings
_EMBEDDINGS_ROOT = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/"

# model_save_dir values handled by the ViT extractor in generate_embeddings
_VIT_SAVE_DIRS = (
    "trained_vit_patch16_chicago", "trained_contra_vit_patch16_all", "trained_contra_vit_patch16_chicago",
    "trained_vit_contraonly_chicago", "trained_vit_contraonly_all", "trained_vit_contraonly_chicago_temp:0.3",
    "trained_vit_contramixed", "trained_vit_contraonly", "trained_vit_patch16", "trained_vit_contramix_CE+triplets",
    "trained_vit_contramix_triplets",
)

# The semi-supervised variant additionally accepts the triplet-only model
_SEMI_SUPERVISED_VIT_SAVE_DIRS = _VIT_SAVE_DIRS + ("trained_vit_tripletonly",)

# model_save_dir values handled by the ConvNeXt extractor
_CONVNEXT_SAVE_DIRS = ("trained_convNext-s_all", "trained_convNext-s_all/all")


def _data_types_for(trained_on):
    """Return the list of data types to export for a model trained on ``trained_on``."""
    if trained_on in ("faces", "nofaces", "all"):
        return [trained_on]
    raise Exception("Implementation only carried out for faces, nofaces, and all datasets.")


def _build_data_module(city, data_type):
    """Read one CSV, keep vendors with at least two ads, and return a set-up data module."""
    # Construct the file path and read the CSV file
    file_path = os.path.join(args.data_dir, file_paths[city][data_type])
    df = pd.read_csv(file_path)

    # Remove vendors that have less than 2 ads; copy so the column assignment
    # below writes to an independent frame
    vendors_of_interest = {k: v for k, v in Counter(df.VENDOR).items() if v > 1}
    df = df[df['VENDOR'].isin(vendors_of_interest.keys())].copy()

    # Remap vendor ids to 0..n-1 in order of first appearance. Every remaining
    # vendor is a key of the dict, so map() leaves no gaps.
    vendor_to_idx_dict = {vendor: idx for idx, vendor in enumerate(df.VENDOR.unique())}
    df["VENDOR"] = df["VENDOR"].map(vendor_to_idx_dict)

    # Load and preprocess images
    images, labels = load_images_and_labels(df, target_size=(224, 224), augment=False,
                                            num_augmented_samples=args.nb_augmented_samples)
    assert images.shape[0] == labels.shape[0]

    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.20, random_state=1111)

    # The test split is passed twice - presumably as validation and test data;
    # confirm against ImageDataModule
    data_module = ImageDataModule(X_train, y_train, X_test, y_test, X_test, y_test,
                                  batch_size=args.batch_size, augment_data=False)

    # Setup the data module for training/validation and testing
    data_module.setup('fit')
    data_module.setup('test')
    return data_module


def _export_embeddings(model, model_save_dir, trained_on, vit_save_dirs, backbone_attr=None):
    """Shared implementation behind both public ``generate_embeddings*`` functions.

    ``backbone_attr`` names the attribute of ``model`` that holds the ViT
    (``None`` means ``model`` itself); it is only read in the ViT branch.
    """
    data_types = _data_types_for(trained_on)

    # Validate the model name and register the hooks once, before any data is
    # loaded. Registering inside the loop stacked one more hook per iteration.
    if model_save_dir in vit_save_dirs:
        backbone = model if backbone_attr is None else getattr(model, backbone_attr)
        hook_handles = [backbone.blocks[-1].register_forward_hook(get_vit_hidden_states)]
        extract_fn = extract_cls_embeddings_and_labels_from_vit
    elif model_save_dir in _CONVNEXT_SAVE_DIRS:
        hook_handles = [
            layer.register_forward_hook(get_hidden_states)
            for _, layer in model.named_modules()
            if isinstance(layer, torch.nn.Sequential)
        ]
        extract_fn = extract_cls_embeddings_and_labels_from_convnext
    else:
        raise Exception("Script to be extended for other models. ")

    try:
        for city in tqdm(_EMBEDDING_CITIES):
            for data_type in tqdm(data_types, leave=False):
                tqdm.write(f"Processing {city} - {data_type}")  # Print the current city and data type being processed
                data_module = _build_data_module(city, data_type)

                # Drop activations captured during earlier forward passes
                hidden_states.clear()
                hidden_states_vit.clear()

                # Extract embeddings and labels
                train_embeddings, train_labels = extract_fn(model, data_module.train_dataloader())
                test_embeddings, test_labels = extract_fn(model, data_module.test_dataloader())

                assert train_embeddings.shape[0] == train_labels.shape[0]
                assert test_embeddings.shape[0] == test_labels.shape[0]

                # Save the embeddings and labels to disk
                # NOTE(review): a model_save_dir containing "/" also puts a
                # separator into the file name - confirm this is intended
                base_path = _EMBEDDINGS_ROOT + model_save_dir + "/" + data_type
                os.makedirs(base_path, exist_ok=True)
                for suffix, payload in (
                    ("train_embeddings", train_embeddings),
                    ("train_labels", train_labels),
                    ("test_embeddings", test_embeddings),
                    ("test_labels", test_labels),
                ):
                    torch.save(payload, os.path.join(base_path, f"{model_save_dir}_{city}_{data_type}_{suffix}.pt"))
    finally:
        # Always detach the hooks so repeated calls do not accumulate them on the model
        for handle in hook_handles:
            handle.remove()


def generate_embeddings(model, model_save_dir, trained_on):
    """Export train/test embeddings of a fine-tuned model for every region.

    Args:
        model: timm model; for ViT names the hook is attached to
            ``model.blocks[-1]``, for ConvNeXt names to every
            ``torch.nn.Sequential`` sub-module.
        model_save_dir: Selects the extractor and names the output directory
            and file prefix.
        trained_on: ``"faces"``, ``"nofaces"`` or ``"all"``; the single data
            type that is exported.

    Returns:
        None. Results are only written to disk.

    Raises:
        Exception: For an unsupported ``trained_on`` or ``model_save_dir``
            (now raised before any data is loaded).
        KeyError: When ``file_paths`` has no entry for a region / data type
            pair; the regional entries only define ``"all"``.
    """
    _export_embeddings(model, model_save_dir, trained_on, _VIT_SAVE_DIRS)


def generate_embeddings_for_semi_supervised_models(model, model_save_dir, trained_on):
    """Same export as ``generate_embeddings`` for wrapper models.

    The ViT is expected under ``model.model`` (the hook goes on
    ``model.model.blocks[-1]``), and ``"trained_vit_tripletonly"`` is accepted
    as an additional ViT name. Arguments, return value and exceptions are
    otherwise identical to ``generate_embeddings``.
    """
    _export_embeddings(model, model_save_dir, trained_on, _SEMI_SUPERVISED_VIT_SAVE_DIRS,
                       backbone_attr="model")

In [29]:
# Load your trained models
# Generate embeddings for both models
# model = load_model('convnext_small', '/workspace/persistent/HTClipper/models/image-baselines/convnext_small/all/faces/seed:1111/lr-0.0001-all-FacesImages/final_model.ckpt')
# generate_embeddings(model, "trained_convNext-s_all", "faces")

# model = load_model('convnext_small', '/workspace/persistent/HTClipper/models/image-baselines/convnext_small/all/nofaces/seed:1111/lr-0.0001-all-NoFacesImages/final_model.ckpt')
# generate_embeddings(model, "trained_convNext-s_all", "nofaces")

# model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/image-baselines/vit_base_patch16_224/chicago/faces/seed:1111/lr-0.0001/final_model.ckpt')
# generate_embeddings(model, "trained_vit_patch16_chicago", "faces")

In [None]:
# Fine-tuned ViT-B/16 (checkpoint directory suffix "CE+SupCon"); exported under the
# "trained_vit_patch16" name for the "all" data type
model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/grouped-and-masked/image-baselines/vit_base_patch16_224/south/all/seed:1111/lr-0.0001-CE+SupCon/final_model.ckpt')
generate_embeddings(model, "trained_vit_patch16", "all")

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
                                     
  0%|          | 0/4 [00:00<?, ?it/s][A

Processing south - all


In [None]:
# Load the semi-supervised contrastive ViT (SupCon-only run) from its Lightning
# checkpoint and export its embeddings.
# NOTE(review): `sys` is already imported in the first cell, and that cell appends
# '../architectures/' while this one appends '../../architectures/' -- confirm which
# relative path actually holds visionContraLayer and move the import to the top cell.
import sys
sys.path.append('../../architectures/')
from visionContraLayer import SemiConstrativeVisionModel
model = SemiConstrativeVisionModel.load_from_checkpoint(
    checkpoint_path="/workspace/persistent/HTClipper/models/grouped-and-masked/image-baselines/contra-learn/semi-supervised/vit_base_patch16_224/south/all/seed:1111/lr-0.0001-SupCon/final_model.ckpt",
    model_name="vit_base_patch16_224",  # Extra keyword arguments are forwarded alongside the checkpoint; add others here if the model requires them
    num_training_steps=200,
)
# `generate_embeddings_for_semi_supervised_models` is defined outside this cell.
generate_embeddings_for_semi_supervised_models(model, "trained_vit_contraonly", "all")

INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/vit_base_patch16_224.augreg2_in21k_ft_in1k)
INFO:timm.models._hub:[timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
  0%|          | 0/4 [00:00<?, ?it/s]
                                     [A
  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

Processing south - all


In [None]:
# Same export as the SupCon-only cell, but for the triplet-only checkpoint.
# Depends on `SemiConstrativeVisionModel` having been imported by an earlier cell.
# NOTE(review): the saved output of this cell is a RuntimeError reporting missing
# "head.weight" / "head.bias" keys for VisionTransformer, so as recorded this cell
# does not run to completion -- investigate before relying on its embeddings.
model = SemiConstrativeVisionModel.load_from_checkpoint(
    checkpoint_path="/workspace/persistent/HTClipper/models/grouped-and-masked/image-baselines/contra-learn/semi-supervised/vit_base_patch16_224/south/all/seed:1111/lr-0.0001-triplet/final_model.ckpt",
    model_name="vit_base_patch16_224",  # Extra keyword arguments are forwarded alongside the checkpoint; add others here if the model requires them
    num_training_steps=200,
)
generate_embeddings_for_semi_supervised_models(model, "trained_vit_tripletonly", "all")

RuntimeError: Error(s) in loading state_dict for VisionTransformer:
	Missing key(s) in state_dict: "head.weight", "head.bias". 

# Contrastive models

In [None]:
# Export embeddings for the contra-learn checkpoint stored under lr-0.0001-CE+triplet
# (south/all, seed 1111). `load_model` and `generate_embeddings` are defined outside this cell.
model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/grouped-and-masked/image-baselines/contra-learn/vit_base_patch16_224/south/all/seed:1111/lr-0.0001-CE+triplet/final_model.ckpt')
generate_embeddings(model, "trained_vit_contramix_CE+triplets", "all")

In [None]:
# Export embeddings for the contra-learn checkpoint stored under lr-0.0001-triplet
# (south/all, seed 1111). `load_model` and `generate_embeddings` are defined outside this cell.
model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/grouped-and-masked/image-baselines/contra-learn/vit_base_patch16_224/south/all/seed:1111/lr-0.0001-triplet/final_model.ckpt')
generate_embeddings(model, "trained_vit_contramix_triplets", "all")

# ViT model trained on all datasets

In [15]:
# Export "faces" embeddings from the checkpoint under all/faces
# (lr-0.0001-all-FacesImages). The recorded output shows this call iterating over
# seven cities (chicago, dallas, houston, detroit, atlanta, sf, ny).
model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/image-baselines/vit_base_patch16_224/all/faces/seed:1111/lr-0.0001-all-FacesImages/final_model.ckpt')
generate_embeddings(model, "trained_vit_patch16_all", "faces")

  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
                                     
  0%|          | 0/7 [00:00<?, ?it/s][A

Processing chicago - faces



100%|██████████| 1/1 [08:03<00:00, 483.09s/it][A
 14%|█▍        | 1/7 [08:03<48:18, 483.10s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                              
 14%|█▍        | 1/7 [08:03<48:18, 483.10s/it]

Processing dallas - faces



100%|██████████| 1/1 [03:00<00:00, 180.63s/it][A
 29%|██▊       | 2/7 [11:03<25:25, 305.18s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                              
 29%|██▊       | 2/7 [11:03<25:25, 305.18s/it]

Processing houston - faces



100%|██████████| 1/1 [03:52<00:00, 232.36s/it][A
 43%|████▎     | 3/7 [14:56<18:07, 271.93s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                              
 43%|████▎     | 3/7 [14:56<18:07, 271.93s/it]

Processing detroit - faces



100%|██████████| 1/1 [00:57<00:00, 57.74s/it][A
 57%|█████▋    | 4/7 [15:53<09:22, 187.38s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                              
 57%|█████▋    | 4/7 [15:53<09:22, 187.38s/it]

Processing atlanta - faces



100%|██████████| 1/1 [02:51<00:00, 171.89s/it][A
 71%|███████▏  | 5/7 [18:45<06:03, 181.79s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                              
 71%|███████▏  | 5/7 [18:45<06:03, 181.79s/it]

Processing sf - faces



100%|██████████| 1/1 [02:44<00:00, 164.04s/it][A
 86%|████████▌ | 6/7 [21:29<02:55, 175.76s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
                                              
 86%|████████▌ | 6/7 [21:29<02:55, 175.76s/it]

Processing ny - faces



100%|██████████| 1/1 [03:08<00:00, 188.71s/it][A
100%|██████████| 7/7 [24:38<00:00, 211.22s/it][A


In [None]:
# Export "nofaces" embeddings from the checkpoint under all/nofaces
# (lr-0.0001-all-NoFacesImages); same save-name prefix as the "faces" export.
model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/image-baselines/vit_base_patch16_224/all/nofaces/seed:1111/lr-0.0001-all-NoFacesImages/final_model.ckpt')
generate_embeddings(model, "trained_vit_patch16_all", "nofaces")

  0%|          | 0/7 [00:00<?, ?it/s]
                                     [A
  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

Processing chicago - nofaces


In [11]:
# Export embeddings from the checkpoint under all/all (lr-0.0001-all-allImages).
# NOTE(review): the save name contains a "/" ("trained_vit_patch16_all/all"); if
# generate_embeddings uses it as a filename prefix this yields a nested path --
# confirm that is intended.
# NOTE(review): this cell's execution count (11) is lower than those of the cells
# above it, so the notebook was not run top to bottom.
model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/image-baselines/vit_base_patch16_224/all/all/seed:1111/lr-0.0001-all-allImages/final_model.ckpt')
generate_embeddings(model, "trained_vit_patch16_all/all", "all")

  0%|          | 0/1 [00:00<?, ?it/s]
                                     [A
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

Processing ny - faces



                                     7.87s/it][A
  0%|          | 0/1 [03:57<?, ?it/s]         
 33%|███▎      | 1/3 [03:57<07:55, 237.87s/it][A

Processing ny - nofaces



                                     4.12s/it][A
  0%|          | 0/1 [05:33<?, ?it/s]         
 67%|██████▋   | 2/3 [05:33<02:34, 154.12s/it][A

Processing ny - all



100%|██████████| 3/3 [10:12<00:00, 211.19s/it][A
100%|██████████| 1/1 [10:12<00:00, 612.48s/it][A


# Getting the false positive and true positive retrieval results

In [21]:
def extract_cls_embeddings_and_labels_from_vit(model, dataloader):
    """Collect the first-token (CLS) embedding and the label of every sample.

    Args:
        model: Module exposing ``forward_features(inputs)``; its output is indexed
            as ``[:, 0, :]``, i.e. the first token of each sequence is taken as the
            CLS embedding. The module is switched to eval mode and left that way.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor batches.

    Returns:
        tuple: ``(embeddings, labels)`` as CPU tensors, concatenated along dim 0
        in the order the batches were yielded.

    Raises:
        RuntimeError: If ``dataloader`` yields no batches.
    """
    # Run on the device that holds the model; a parameter-free module falls back
    # to CPU instead of failing with a bare StopIteration.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    embeddings, labels = [], []
    with torch.no_grad():
        for inputs, targets in dataloader:
            token_features = model.forward_features(inputs.to(device))
            # CLS token is the first token of the sequence.
            embeddings.append(token_features[:, 0, :].cpu())
            # Labels never pass through the model, so skip the device round trip.
            labels.append(targets.cpu())

    if not embeddings:
        # torch.cat rejects an empty list; fail with an actionable message instead.
        raise RuntimeError("dataloader yielded no batches; no embeddings to extract")
    return torch.cat(embeddings), torch.cat(labels)

In [27]:
def generate_face_embeddings(
    model,
    model_name,
    mode="face",
    cities=None,
    output_root="/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/error_analysis/vision_baselines",
):
    """Save train/test CLS embeddings per region, with the test split filtered by IF_FACE.

    For every region the CSV at ``args.data_dir / file_paths[city]["all"]`` is read,
    vendors with fewer than two rows are dropped, vendor ids are re-indexed to
    0..N-1 in order of first appearance, and the data is split 80/20 into
    train/test and then 95/5 into train/val (stratified, ``random_state=1111``).
    Only the test split is filtered by the ``IF_FACE`` column.

    Relies on the notebook-level globals ``args`` (``data_dir``, ``batch_size``,
    ``augment_data``, ``nb_augmented_samples``) and ``file_paths``.

    Args:
        model: Model passed to ``extract_cls_embeddings_and_labels_from_vit``.
        model_name: Used in the output directory (``trained_{model_name}``) and as
            the filename prefix.
        mode: ``"face"`` keeps test rows with ``IF_FACE == "yes"``; any other
            value keeps rows with ``IF_FACE == "no"``. Also used as the name of
            the output sub-directory.
        cities: Optional iterable of region keys into ``file_paths``. Defaults to
            ``["south", "midwest", "west", "northeast"]``.
        output_root: Directory under which ``trained_{model_name}/{mode}`` is created.

    Returns:
        None. Writes ``{model_name}_{city}_{train,test}_{embeddings,labels}.pt``
        files with ``torch.save``.
    """
    all_cities = ["south", "midwest", "west", "northeast"] if cities is None else list(cities)
    # Only "face" selects face images; every other mode value selects non-face images.
    wanted_face_flag = "yes" if mode == "face" else "no"

    # The output directory does not depend on the city, so create it once.
    file_dir = f"{output_root}/trained_{model_name}/{mode}"
    Path(file_dir).mkdir(parents=True, exist_ok=True)

    for city in tqdm(all_cities):
        for data_type in tqdm(["all"], leave=False):
            tqdm.write(f"Processing {city} - {data_type}")  # Print the current city and data type being processed
            # Construct the file path and read the CSV file
            file_path = os.path.join(args.data_dir, file_paths[city][data_type])
            df = pd.read_csv(file_path)

            # Removing vendors that have less than 2 ads. Take an explicit copy so
            # the column assignment below does not write into a slice of the raw frame.
            vendor_counts = df["VENDOR"].value_counts()
            recurring_vendors = vendor_counts.index[vendor_counts > 1]
            df = df[df["VENDOR"].isin(recurring_vendors)].copy()

            # Remapping new vendor ids: consecutive integers in order of first appearance.
            df["VENDOR"] = pd.factorize(df["VENDOR"])[0]

            train_df, test_df = train_test_split(df, test_size=0.20, random_state=1111, stratify=df['VENDOR'])
            train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=1111, stratify=train_df['VENDOR'])

            # Train and validation images are loaded unfiltered.
            train_images, train_labels = load_images_and_labels(train_df, target_size=(224, 224), augment=False,
                                                     num_augmented_samples=args.nb_augmented_samples)

            val_images, val_labels = load_images_and_labels(val_df, target_size=(224, 224), augment=False,
                                                     num_augmented_samples=args.nb_augmented_samples)

            # Only the test split is restricted to face / non-face images.
            test_df = test_df[test_df['IF_FACE'] == wanted_face_flag]

            test_images, test_labels = load_images_and_labels(test_df, target_size=(224, 224), augment=False, num_augmented_samples=args.nb_augmented_samples)
            data_module = ImageDataModule(train_images, train_labels, val_images, val_labels, test_images, test_labels, batch_size=args.batch_size, augment_data=args.augment_data)
            # Setup the data module for training/validation and testing
            # NOTE(review): only the 'fit' stage is set up although test_dataloader()
            # is used below -- confirm ImageDataModule builds the test set in 'fit'.
            data_module.setup('fit')
            # data_module.setup('test')

            # Extract embeddings and labels
            train_embeddings, train_labels = extract_cls_embeddings_and_labels_from_vit(model, data_module.train_dataloader())
            test_embeddings, test_labels = extract_cls_embeddings_and_labels_from_vit(model, data_module.test_dataloader())

            assert train_embeddings.shape[0] == train_labels.shape[0]
            assert test_embeddings.shape[0] == test_labels.shape[0]

            # Save the embeddings and labels to disk
            torch.save(train_embeddings, os.path.join(file_dir, f"{model_name}_{city}_train_embeddings.pt"))
            torch.save(train_labels, os.path.join(file_dir, f"{model_name}_{city}_train_labels.pt"))
            torch.save(test_embeddings, os.path.join(file_dir, f"{model_name}_{city}_test_embeddings.pt"))
            torch.save(test_labels, os.path.join(file_dir, f"{model_name}_{city}_test_labels.pt"))

In [25]:
# Load the contra-learn CE+SupCon checkpoint (south/all, seed 1111) and call
# .eval() on the returned model; the face / no-face export cells below use it.
# NOTE(review): this cell's execution count (25) precedes that of the
# generate_face_embeddings definition (27) -- re-run top to bottom to confirm order.
model = load_model('vit_base_patch16_224', '/workspace/persistent/HTClipper/models/grouped-and-masked/image-baselines/contra-learn/vit_base_patch16_224/south/all/seed:1111/lr-0.0001-CE+SupCon/final_model.ckpt').eval()

In [None]:
# Export embeddings with the test split restricted to rows where IF_FACE == "yes".
generate_face_embeddings(model, "vit_patch16", mode="face")

  0%|          | 0/4 [00:00<?, ?it/s]
                                     [A
  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

Processing south - all



100%|██████████| 1/1 [17:35<00:00, 1055.16s/it][A
 25%|██▌       | 1/4 [17:35<52:45, 1055.16s/it][A
                                               
 25%|██▌       | 1/4 [17:35<52:45, 1055.16s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Processing midwest - all


In [None]:
# Export embeddings with the test split restricted to rows where IF_FACE == "no"
# (any mode other than "face" selects this branch).
generate_face_embeddings(model, "vit_patch16", mode="noface")