In [1]:
import os
import numpy as np

import torch
from torch.nn.functional import cosine_similarity

from tqdm import tqdm

import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_embeddings(model_name, city):
    """
    Loads the pretrained train and test embeddings of one region and stacks them.

    Args:
    - model_name: "declutr" selects the pretrained DeCLUTR embedding files; any other
      value falls back to the pretrained ViT-patch16 embedding files.
    - city: Region identifier that is embedded in the file names (e.g. "south").

    Returns:
    - embeddings: Tensor holding the train rows followed by the test rows.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Only the embedding files are read here; the label files that sit in the same
    # directories are not needed by the callers of this function.
    if model_name == "declutr":
        emb_dir = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/pretrained_declutr/"
        train_data_filename = "pretrained_checkpoint_" + model_name + "_" + city + "_data_train.pt"
        test_data_filename = "pretrained_checkpoint_" + model_name + "_" + city + "_data_test.pt"
    else:
        emb_dir = "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/pretrained_vit_patch16/"
        train_data_filename = "pretrained_vit_patch16_" + city + "_all_train_embeddings.pt"
        test_data_filename = "pretrained_vit_patch16_" + city + "_all_test_embeddings.pt"

    train_emb = torch.load(os.path.join(emb_dir, train_data_filename), map_location=device)
    test_emb = torch.load(os.path.join(emb_dir, test_data_filename), map_location=device)

    # torch.as_tensor accepts both NumPy arrays and tensors, whereas torch.from_numpy
    # raises a TypeError when the file already contains a Tensor.
    embeddings = torch.cat((torch.as_tensor(train_emb), torch.as_tensor(test_emb)), dim=0)
    return embeddings

In [3]:
def calculate_similarity_matrix(embeddings1, embeddings2, batch_size=512):
    """
    Calculates the cosine similarity between two sets of embeddings in a batched manner.
    Args:
    - embeddings1, embeddings2: Tensors containing embeddings (size [N, d] and [M, d]).
    - batch_size: Size of the batch for batched similarity calculation.

    Returns:
    - similarity_matrix: Tensor of cosine similarities of size [N, M], moved to the CPU.
      When N or M is 0 an empty [N, M] tensor is returned.
    """
    num_rows, num_cols = embeddings1.size(0), embeddings2.size(0)
    if num_rows == 0 or num_cols == 0:
        # torch.cat raises on an empty list of blocks, so build the empty result directly
        return embeddings1.new_empty((num_rows, num_cols)).cpu()

    row_blocks = []
    # Outer loop over embeddings1 (with a tqdm progress bar)
    for i in tqdm(range(0, num_rows, batch_size), desc="Processing embeddings1 batches"):
        # [b1, 1, d] so that it broadcasts against the [1, b2, d] blocks of embeddings2
        batch1 = embeddings1[i:i + batch_size].unsqueeze(1)

        # Inner loop over embeddings2 (no progress bar)
        col_blocks = [
            cosine_similarity(batch1, embeddings2[j:j + batch_size].unsqueeze(0), dim=2)
            for j in range(0, num_cols, batch_size)
        ]

        # Concatenate along columns for the current batch of embeddings1
        row_blocks.append(torch.cat(col_blocks, dim=1))

    # Concatenate all batches along rows to form the full similarity matrix
    similarity_matrix = torch.cat(row_blocks, dim=0)
    return similarity_matrix.cpu()

# DeCLUTR model

In [33]:
# DeCLUTR embeddings (train + test) of every region, keyed by the region's display name
datasets = {
    region_name: load_embeddings("declutr", region_key)
    for region_name, region_key in (
        ("South", "south"),
        ("West", "west"),
        ("Northeast", "northeast"),
        ("Midwest", "midwest"),
    )
}

In [34]:
# Function to calculate the average similarity between two sets of embeddings
def calculate_average_similarity(embeddings1, embeddings2, batch_size=512):
    """
    Computes the mean cosine similarity over every (row of embeddings1, row of embeddings2) pair.

    The pairwise similarities are evaluated block by block and accumulated as a running
    sum and element count, so the result equals the mean of the full [N, M] similarity
    matrix even when the last batch is smaller than batch_size. Averaging the per-batch
    means instead would give a small trailing batch the same weight as a full one.

    Args:
    - embeddings1, embeddings2: Tensors containing embeddings (size [N, d] and [M, d]).
    - batch_size: Number of rows taken from each input per block.

    Returns:
    - Mean pairwise cosine similarity as a float; nan when either input has no rows.
    """
    total = 0.0
    count = 0
    for i in tqdm(range(0, embeddings1.size(0), batch_size), desc="Processing batch"):
        # [b1, 1, d] so that it broadcasts against the [1, b2, d] blocks of embeddings2
        batch1 = embeddings1[i:i + batch_size].unsqueeze(1)
        for j in range(0, embeddings2.size(0), batch_size):
            batch2 = embeddings2[j:j + batch_size].unsqueeze(0)
            sim = cosine_similarity(batch1, batch2, dim=2)
            # Accumulate in float64 to limit rounding error over many blocks
            total += sim.sum(dtype=torch.float64).item()
            count += sim.numel()

    if count == 0:
        return float("nan")
    return total / count

In [35]:
# Nested dictionary of pairwise average similarities, one inner dictionary per region
regions = list(datasets)
similarity_matrix = {region: {} for region in regions}

for i, region1 in enumerate(regions):
    for j, region2 in enumerate(regions):
        if j < i:
            continue  # Already filled in through the symmetric assignment below
        if j == i:
            similarity_matrix[region1][region2] = 1.0  # Similarity with itself
            continue
        # Each unordered pair is computed once and mirrored into both entries
        similarity = calculate_average_similarity(datasets[region1], datasets[region2])
        similarity_matrix[region1][region2] = similarity
        similarity_matrix[region2][region1] = similarity

# Dense 2D numpy array for the heatmap; rows and columns follow the order of `regions`
matrix = np.array([[similarity_matrix[row][col] for col in regions] for row in regions])

# Min-max normalized copy of the similarity matrix
normalized_matrix = (matrix - matrix.min()) / (matrix.max() - matrix.min())

Processing batch: 100%|██████████| 28/28 [00:52<00:00,  1.87s/it]
Processing batch: 100%|██████████| 28/28 [00:42<00:00,  1.52s/it]
Processing batch: 100%|██████████| 28/28 [02:16<00:00,  4.89s/it]
Processing batch: 100%|██████████| 7/7 [00:09<00:00,  1.37s/it]
Processing batch: 100%|██████████| 7/7 [00:31<00:00,  4.54s/it]
Processing batch: 100%|██████████| 6/6 [00:25<00:00,  4.18s/it]


In [37]:
# Mask the lower triangle by position (strictly below the diagonal) so that a genuine 0.0
# in the upper triangle is still displayed instead of being hidden by a value-based mask
masked_matrix = np.where(np.tril(np.ones(matrix.shape, dtype=bool), k=-1), np.nan, matrix)

# Generate heatmap with Plotly, including text annotations for each cell
fig = go.Figure(data=go.Heatmap(
    z=masked_matrix,
    x=regions,
    y=regions,
    colorscale="RdYlGn",
    # `titlefont` is deprecated; the title font is set through the nested `title` dict
    colorbar=dict(title=dict(text=" ", font=dict(size=20)), tickfont=dict(size=20)),
    zmin=0,
    zmax=1,
    text=[[f"{val:.2f}" if not np.isnan(val) else "" for val in row] for row in masked_matrix],  # Two decimals; masked cells stay blank
    texttemplate="%{text}",
    textfont=dict(size=20),  # Increase font size for the values
    showscale=True
))

# Update layout: axis titles, enlarged fonts, no grid lines and no tick marks
fig.update_layout(
    xaxis=dict(
        title=dict(text="Region", font=dict(size=20)),  # Axis title with increased font size
        tickfont=dict(size=20),  # Increase font size of the tick labels
        tickmode="array",
        tickvals=list(range(len(regions))),
        ticktext=regions,
        showgrid=False,
        ticks=""
    ),
    yaxis=dict(
        title=dict(text="Region", font=dict(size=20)),  # Axis title with increased font size
        tickfont=dict(size=20),  # Increase font size of the tick labels
        tickmode="array",
        tickvals=list(range(len(regions))),
        ticktext=regions,
        showgrid=False,
        ticks=""
    ),
    width=700,
    height=600
)

# Render the figure
import plotly.io as pio
pio.renderers.default = 'iframe'

fig.show()

# ViT-patch16

In [None]:
# ViT-patch16 embeddings (train + test) of every region, keyed by the region's display name
datasets = {
    region_name: load_embeddings("vit", region_key)
    for region_name, region_key in (
        ("South", "south"),
        ("West", "west"),
        ("Northeast", "northeast"),
        ("Midwest", "midwest"),
    )
}

In [None]:
# Nested dictionary of pairwise average similarities, one inner dictionary per region
regions = list(datasets)
similarity_matrix = {region: {} for region in regions}

for i, region1 in enumerate(regions):
    for j, region2 in enumerate(regions):
        if j < i:
            continue  # Already filled in through the symmetric assignment below
        if j == i:
            similarity_matrix[region1][region2] = 1.0  # Similarity with itself
            continue
        # Each unordered pair is computed once and mirrored into both entries
        similarity = calculate_average_similarity(datasets[region1], datasets[region2])
        similarity_matrix[region1][region2] = similarity
        similarity_matrix[region2][region1] = similarity

# Dense 2D numpy array for the heatmap; rows and columns follow the order of `regions`
matrix = np.array([[similarity_matrix[row][col] for col in regions] for row in regions])

# Min-max normalized copy of the similarity matrix
normalized_matrix = (matrix - matrix.min()) / (matrix.max() - matrix.min())

In [32]:
# Mask the lower triangle by position (strictly below the diagonal) so that a genuine 0.0
# in the upper triangle is still displayed instead of being hidden by a value-based mask
masked_matrix = np.where(np.tril(np.ones(matrix.shape, dtype=bool), k=-1), np.nan, matrix)

# Generate heatmap with Plotly, including text annotations for each cell
fig = go.Figure(data=go.Heatmap(
    z=masked_matrix,
    x=regions,
    y=regions,
    colorscale="PuBuGn",
    # `titlefont` is deprecated; the title font is set through the nested `title` dict
    colorbar=dict(title=dict(text=" ", font=dict(size=20)), tickfont=dict(size=20)),
    zmin=0,
    zmax=1,
    text=[[f"{val:.2f}" if not np.isnan(val) else "" for val in row] for row in masked_matrix],  # Two decimals; masked cells stay blank
    texttemplate="%{text}",
    textfont=dict(size=20),  # Increase font size for the values
    showscale=True
))

# Update layout: axis titles, enlarged fonts, no grid lines and no tick marks
fig.update_layout(
    xaxis=dict(
        title=dict(text="Region", font=dict(size=20)),  # Axis title with increased font size
        tickfont=dict(size=20),  # Increase font size of the tick labels
        tickmode="array",
        tickvals=list(range(len(regions))),
        ticktext=regions,
        showgrid=False,
        ticks=""
    ),
    yaxis=dict(
        title=dict(text="Region", font=dict(size=20)),  # Axis title with increased font size
        tickfont=dict(size=20),  # Increase font size of the tick labels
        tickmode="array",
        tickvals=list(range(len(regions))),
        ticktext=regions,
        showgrid=False,
        ticks=""
    ),
    width=700,
    height=600
)

# Render the figure
import plotly.io as pio
pio.renderers.default = 'iframe'

fig.show()

# Finding the number of vendors shared between datasets

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read each regional CSV, keep the five columns of interest and drop duplicated rows
south_df, midwest_df, west_df, northeast_df = (
    pd.read_csv(csv_path)[['TEXT', 'IMAGES', 'VENDOR', 'FACES', 'CITY']].drop_duplicates()
    for csv_path in (
        "../data/processed/south.csv",
        "../data/processed/midwest.csv",
        "../data/processed/west.csv",
        "../data/processed/northeast.csv",
    )
)

In [None]:
# Distinct VENDOR values of every regional frame, as plain Python lists
south_vendors, midwest_vendors, west_vendors, northeast_vendors = (
    list(frame.VENDOR.unique())
    for frame in (south_df, midwest_df, west_df, northeast_df)
)

In [28]:
import numpy as np
import plotly.graph_objects as go

# List of all vendor lists and their labels (same order in both)
vendor_lists = [south_vendors, midwest_vendors, west_vendors, northeast_vendors]
regions = ["South", "Midwest", "West", "Northeast"]

# Deduplicate every list once instead of rebuilding the sets for each pair
vendor_sets = [set(vendors) for vendors in vendor_lists]

# Matrix of normalized pairwise overlaps; cells that are never filled stay NaN (blank in the plot)
normalized_common_elements_matrix = np.full((len(vendor_lists), len(vendor_lists)), np.nan)

# Calculate normalized intersections and populate the upper triangle of the matrix
for i in range(len(vendor_sets)):
    # Self-intersection: normalized to 1 since it's fully contained
    normalized_common_elements_matrix[i, i] = 1.0
    for j in range(i + 1, len(vendor_sets)):
        # Pairwise intersection normalized by the number of distinct vendors in the second list
        common_elements = vendor_sets[i] & vendor_sets[j]
        if not vendor_sets[j]:
            continue  # An empty second region would divide by zero; leave the cell as NaN
        normalized_count = len(common_elements) / len(vendor_sets[j])
        normalized_common_elements_matrix[i, j] = normalized_count

# Create a heatmap with Plotly
fig = go.Figure(data=go.Heatmap(
    z=normalized_common_elements_matrix,
    x=regions,
    y=regions,
    colorscale="earth",
    # `titlefont` is deprecated; the title font is set through the nested `title` dict
    colorbar=dict(title=dict(text=" ", font=dict(size=20)), tickfont=dict(size=20)),
    zmin=0,
    zmax=1,  # Since the normalized values range from 0 to 1
    text=[[f"{val:.2f}" if not np.isnan(val) else "" for val in row] for row in normalized_common_elements_matrix],
    texttemplate="<b>%{text}</b>",  # Bold the text
    textfont=dict(size=20),  # Increase font size for the values
    showscale=True
))

# Update layout for a clean look
fig.update_layout(
    xaxis=dict(
        title=dict(text="Region", font=dict(size=20)),  # Axis title with increased font size
        tickfont=dict(size=20),  # Increase font size of the tick labels
        tickmode="array",
        tickvals=list(range(len(regions))),
        ticktext=regions,
        showgrid=False,
        ticks=""
    ),
    yaxis=dict(
        title=dict(text="Region", font=dict(size=20)),  # Axis title with increased font size
        tickfont=dict(size=20),  # Increase font size of the tick labels
        tickmode="array",
        tickvals=list(range(len(regions))),
        ticktext=regions,
        showgrid=False,
        ticks=""
    ),
    width=600,
    height=600
)

fig.show()

In [17]:
import numpy as np
import plotly.graph_objects as go

# List of vendor lists for comparison with south_vendors (South itself comes first)
comparison_lists = [south_vendors, midwest_vendors, west_vendors, northeast_vendors]
regions = ["South", "Midwest", "West", "Northeast"]

# Deduplicate the reference region once instead of on every iteration
south_vendor_set = set(south_vendors)

# Normalized overlap of south_vendors with each list, in the order of `regions`
normalized_values = []

for i, comparison_list in enumerate(comparison_lists):
    if i == 0:  # Self-intersection for South
        normalized_values.append(1.0)  # Fully contained, so set to 1
        continue
    comparison_set = set(comparison_list)
    common_elements = south_vendor_set.intersection(comparison_set)
    # Normalize by the number of distinct vendors in the compared list; an empty list yields NaN
    normalized_count = len(common_elements) / len(comparison_set) if comparison_set else float("nan")
    normalized_values.append(normalized_count)

# Convert normalized_values to a 2D array for the heatmap (to display vertically)
normalized_values_matrix = np.array([normalized_values]).T

# Create a single-column heatmap with Plotly
fig = go.Figure(data=go.Heatmap(
    z=normalized_values_matrix,
    x=["South"],  # Single column for South comparisons
    y=regions,
    colorscale="earth",
    colorbar=dict(title="Common Vendors"),
    zmin=0,
    zmax=1,  # Since normalized values range from 0 to 1
    showscale=True
))

# Add annotations for each cell to display normalized values
annotations = []
for i, region in enumerate(regions):
    value = normalized_values[i]
    annotations.append(dict(
        x="South",
        y=region,
        text=f"{value:.2f}",  # Display value with two decimal places
        showarrow=False,
        # The previous conditional chose "black" on both branches, so the color is simply fixed
        font=dict(color="black")
    ))

# Update layout with annotations
fig.update_layout(
    # title="Normalized Common Elements with South Vendors",
    annotations=annotations,
    xaxis=dict(title="", tickmode="array", tickvals=[0], ticktext=["South"], showgrid=False, ticks=""),
    yaxis=dict(title="Region", tickmode="array", tickvals=list(range(len(regions))), ticktext=regions, showgrid=False),
    width=300,
    height=500
)

fig.show()

# Trying to get performance of common and unique vendors

In [7]:
def load_embedddings_for_e2e(model_name, city, mode="text"):
    """
    Loads the train and test embeddings and labels of one region and concatenates them.

    Args:
    - model_name: Key selecting the embedding directory; one of "declutr-vit",
      "CE-SupCon-mean-0.5", "declutr-vit-face" or "declutr-vit-noface".
    - city: Region identifier used as the file-name prefix.
    - mode: "text", "image" or "multimodal"; part of the embedding file names and,
      for "declutr-vit" only, of the label file names as well.

    Returns:
    - (embeddings, labels): train entries followed by test entries, loaded onto the CPU.

    Raises:
    - AssertionError: if mode is not one of the three supported values.
    - ValueError: if model_name has no directory mapping.
    """
    assert mode in ["text", "image", "multimodal"]

    # Directory that holds the saved tensors of each supported model
    model_dirs = {
        "declutr-vit": "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/trained_declutr_vit/CE+SupCon",
        "CE-SupCon-mean-0.5": "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/multimodal_baselines/E2E/CE-SupCon-mean-0.5",
        "declutr-vit-face": "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/error_analysis/multimodal_baseline/trained_declutr-vit/face",
        "declutr-vit-noface": "/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/error_analysis/multimodal_baseline/trained_declutr-vit/noface",
    }

    emb_dir = model_dirs.get(model_name)
    if emb_dir is None:
        raise ValueError(f"Model '{model_name}' not implemented")

    # Only "declutr-vit" stores a separate label file per mode
    labels_per_mode = model_name == "declutr-vit"
    train_labels_name = f"{city}_labels_{mode}_train.pt" if labels_per_mode else f"{city}_labels_train.pt"
    test_labels_name = f"{city}_labels_{mode}_test.pt" if labels_per_mode else f"{city}_labels_test.pt"

    def _load(filename):
        # Everything is mapped onto the CPU regardless of where it was saved from
        return torch.load(os.path.join(emb_dir, filename), map_location=torch.device('cpu'))

    train_emb = _load(f"{city}_{mode}data_train.pt")
    train_labels = _load(train_labels_name)
    test_emb = _load(f"{city}_{mode}data_test.pt")
    test_labels = _load(test_labels_name)

    return torch.cat((train_emb, test_emb), dim=0), torch.cat((train_labels, test_labels), dim=0)

In [8]:
# Only the labels of the south split are kept; the embeddings are discarded
_, labels1 = load_embedddings_for_e2e("CE-SupCon-mean-0.5", "south")

In [9]:
# Only the labels of the west split are kept; the embeddings are discarded
_, labels2 = load_embedddings_for_e2e("CE-SupCon-mean-0.5", "west")

In [10]:
# Display the label tensor loaded for the south split
labels1

tensor([ 725,  170, 1454,  ...,  923,  923,  923])

In [11]:
# Distinct label values of each tensor, as Python sets
set1 = set(labels1.tolist())
set2 = set(labels2.tolist())

# Labels that occur in both tensors
common_elements = set1 & set2

# Labels that occur in only one of the two tensors
unique_to_tensor1 = set1.difference(set2)
unique_to_tensor2 = set2.difference(set1)

# Tensor form of the three groups
common_tensor, unique_tensor1, unique_tensor2 = (
    torch.tensor(list(group))
    for group in (common_elements, unique_to_tensor1, unique_to_tensor2)
)

# Output results
print("Common elements:", common_tensor)
print("Unique to tensor1:", unique_tensor1)
print("Unique to tensor2:", unique_tensor2)

Common elements: tensor([   0,    1,    2,    3,    7,    8,   14,   15,   16,   17,   19,   20,
          21,   23,   24,   26,   28,   29,   31,   34,   35,   38,   41,   43,
          44,   45,   46,   47,   48,   49,   55,   57,   60,   61,   62,   65,
          66,   68,   69,   71,   73,   77,   78,   80,   81,   84,   85,   86,
          87,   88,   89,   91,   92,   93,   94,   95,   96,   97,   98,   99,
         100,  102,  104,  107,  108,  109,  110,  111,  114,  115,  122,  123,
         125,  126,  127,  128,  129,  130,  549,  551,  553,  561,  570,  576,
         585,  589,  594,  595,  596,  598,  604,  613,  620,  621,  624,  628,
         639,  640,  643,  644,  652,  654,  655,  658,  659,  660,  663,  669,
         671,  678,  683,  701,  702,  706,  712,  716,  722,  736,  739,  752,
         758,  763,  773,  775,  779,  782,  786,  788,  791,  793,  796,  797,
         802,  804,  806,  807,  808,  824,  826,  842,  844,  846,  849,  851,
         852,  857,  86