In [None]:
import contextlib
import json
import os
from importlib import reload
from itertools import product as iter_product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset

import src, src.debias, src.models, src.ranking, src.datasets, src.data_utils

# Choose the compute device. With more than one GPU the user is asked for an
# index; otherwise index 0 is used. Without CUDA the notebook falls back to the
# CPU after an explicit confirmation.
n_cuda_devices = torch.cuda.device_count()
if n_cuda_devices > 1:
    use_device_id = int(input(f"Choose cuda index, from [0-{torch.cuda.device_count() - 1}]: ").strip())
    # Fail here with a clear message rather than later inside a CUDA call.
    if not 0 <= use_device_id < n_cuda_devices:
        raise ValueError(
            f"cuda index {use_device_id} is out of range, expected 0-{n_cuda_devices - 1}"
        )
else:
    use_device_id = 0

cuda_available = torch.cuda.is_available()
use_device = "cuda:" + str(use_device_id) if cuda_available else "cpu"
if not cuda_available:
    input("CUDA isn't available, so using cpu. Please press any key to confirm this isn't an error: \n")
print("Using device", use_device)
# set_device() raises on a machine without CUDA, so it must only run on the
# GPU path; the CPU fallback above would otherwise crash right after the prompt.
if cuda_available:
    torch.cuda.set_device(use_device_id)


# Load the JSON file describing the stored training runs. It is indexed below
# by run id (a string key), and each entry provides at least 'epoch' and 'step'.
with open(src.PATHS.TRAINED_MODELS.METADATA, mode="r") as _runs_metafile:
    runs_metadata = json.load(_runs_metafile)

clip_arch = "openai/CLIP/ViT-B/16"

# Single source of truth for the run being analysed: the same id selects the
# metadata entry and appears in the checkpoint file name, so the two cannot
# drift apart when switching runs.
run_id = "91"
run_metadata = runs_metadata[run_id]
model_save_name = (
    f"best_ndkl_oai-clip-vit-b-16_neptune_run_OXVLB-{run_id}"
    f"_model_e{run_metadata['epoch']}_step_{run_metadata['step']}.pt"
)
n_debias_tokens = 2

# Build the DebiasCLIP model and restore the trained weights.
# torch.cuda.device() cannot be entered on a machine without CUDA, so the CPU
# path uses a no-op context instead of crashing.
if torch.cuda.is_available():
    _device_ctx = torch.cuda.device(use_device_id)
else:
    _device_ctx = contextlib.nullcontext()

with _device_ctx:
    model, preprocess, tokenizer, model_alias = src.models.DebiasCLIP.from_cfg(src.Dotdict({
        "CLIP_ARCH": clip_arch, "DEVICE": use_device, "num_debias_tokens": n_debias_tokens
    }))
    # NOTE(review): torch.load unpickles the file; only point this at
    # checkpoints from a trusted source.
    _state_dict = torch.load(
        os.path.join(src.PATHS.TRAINED_MODELS.BASE, model_save_name),
        map_location=use_device,
    )
    # strict=True: every key in the checkpoint must match the model exactly.
    model.load_state_dict(_state_dict, strict=True)
    model = model.eval().to(use_device)

In [None]:
from tqdm import tqdm, trange
# First token id the tokenizer emits for an empty prompt. Presumably this is the
# start-of-text marker, so the ids below it are the ordinary vocabulary
# entries -- TODO confirm against the tokenizer.
max_tok = tokenizer("")[0][0].item()
offset = 0

# Fetch the input embedding of every token id in [offset, max_tok) with one
# batched lookup instead of one call per id.
# Assumes token_embedding is an embedding-table lookup that accepts a 1-D
# tensor of ids (the original called it with 0-d id tensors) -- TODO confirm.
# no_grad keeps the result free of autograd history so that later
# `.cpu().numpy()` conversions cannot fail on a tensor that requires grad.
with torch.no_grad():
    _token_ids = torch.arange(offset, max_tok, device=use_device)
    embeddings = model.clip.token_embedding(_token_ids)

In [None]:

# Rank vocabulary tokens by closeness to each learned debias token.
use_cos_dist = False
# How many nearest vocabulary tokens to keep per debias token.
n_closest = 1000

db_embeddings = model.debias_tokens.weight.detach()
if use_cos_dist:
    # Dot product of unit-normalised vectors, i.e. a cosine *similarity*:
    # larger values mean closer.
    _embeddings = embeddings / torch.norm(embeddings, dim=-1, keepdim=True)
    _db_embeddings = db_embeddings / torch.norm(db_embeddings, dim=-1, keepdim=True)
    dists = _db_embeddings @ _embeddings.T
else:
    # Euclidean distance: smaller values mean closer.
    dists = torch.cdist(db_embeddings, embeddings, p=2)

# One batched top-k over the token axis. `largest` flips with the metric so the
# closest entries are selected in both modes. k is clamped so a vocabulary
# smaller than n_closest does not raise.
_k = min(n_closest, dists.shape[-1])
_closest = dists.topk(_k, dim=-1, largest=use_cos_dist).indices.cpu()
# Shift column positions back to real token ids; one list of ids per debias token.
closest_inxs = (_closest + offset).tolist()


In [None]:

from clip import simple_tokenizer
# Decoder used to turn token ids back into their text pieces.
raw_tkz = simple_tokenizer.SimpleTokenizer()

# For each debias token, walk its nearest vocabulary ids in ranked order and
# print the first `max_alpha` that are purely alphabetic (id 0 is always
# allowed through), together with the corresponding entry of `dists`.
max_alpha = 10
for inx, tops in enumerate(closest_inxs):
    print(f"For debias token {inx}:")
    n_alpha = 0
    for top in tops:
        if n_alpha >= max_alpha:
            break
        inv = raw_tkz.decode([top])
        # NOTE(review): if decode() renders an end-of-word marker as a trailing
        # space, isalpha() rejects whole-word tokens -- confirm this is intended.
        if top != 0 and not inv.isalpha():
            continue
        print(f"token: {inv}, dist: {dists[inx][top-offset].item():.4f}")
        n_alpha += 1

In [None]:

import time
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Project the token embeddings onto their first three principal components.
pca = PCA(n_components=3)
# detach() first so the numpy conversion also works when the embeddings still
# carry autograd history (the debias weights are detached the same way earlier).
pca_res = pca.fit_transform(embeddings.detach().cpu().numpy())
# Decoded text of every token id in [offset, max_tok), in the same order as the
# rows of `embeddings`.
token_words = [raw_tkz.decode([i]) for i in range(offset, max_tok)]

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:

# One row per token, one column per principal component.
cols = ["x", "y", "z"]
df = pd.DataFrame(data=pca_res, columns=cols)

# Scatter of the first two components. No `hue` is mapped, so the former
# `palette` / `legend` arguments had no effect (recent seaborn warns that the
# palette is ignored) and are dropped. The very low alpha is kept so that
# density remains visible with this many points.
fig, ax = plt.subplots()
sns.scatterplot(x="x", y="y", data=df, alpha=0.01, ax=ax)
ax.set(
    title="Token embeddings: first two principal components",
    xlabel="PC 1",
    ylabel="PC 2",
)
plt.show()

In [None]:

# 2-D t-SNE of the token embeddings. t-SNE is stochastic, so the seed is fixed
# to make the layout reproducible across kernel restarts.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=0)
# detach() so the numpy conversion works even if the embeddings require grad.
tsne_results = tsne.fit_transform(embeddings.detach().cpu().numpy())

In [None]:

# Attach the two t-SNE coordinates to the per-token frame (plain column
# assignment, so re-running this cell is idempotent).
df['tsne-2d-one'] = tsne_results[:,0]
df['tsne-2d-two'] = tsne_results[:,1]

# No `hue` is mapped, so the former `palette` / `legend` arguments were ignored
# (recent seaborn warns about the unused palette) and are dropped.
fig, ax = plt.subplots(figsize=(16, 10))
sns.scatterplot(x="tsne-2d-one", y="tsne-2d-two", data=df, alpha=0.3, ax=ax)
ax.set(title="Token embeddings: t-SNE projection")
plt.show()