In [1]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [2]:
import numpy as np
import pandas as pd
import math
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Cosine similarity function
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Computed as the dot product divided by the product of the two
    Euclidean norms. No guard is applied for zero-norm inputs, so the
    division is left to NumPy in that case.
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return numerator / denominator

# Entropy function
def entropy(vec1, vec2):
    """Shannon entropy (base 2) of the normalised, clipped dot product.

    The dot product of the inputs is floored at 1e-12 (removing negative
    values and avoiding ``log(0)``), divided by its own sum, and then fed
    through ``-sum(p * log2(p))``.

    NOTE(review): for two 1-D vectors ``np.dot`` produces a single scalar,
    so the normalised value is 1 and the result is always 0. Confirm
    whether an element-wise product was intended.
    """
    # Floor at a tiny positive value so the logarithm below is defined.
    weights = np.clip(np.dot(vec1, vec2), 1e-12, None)
    probabilities = weights / np.sum(weights)
    return -np.sum(probabilities * np.log2(probabilities))

# Mean metric calculation function
def calculate_mean_metric(selected_embedding, embeddings, metric='cosine'):
    """Average a pairwise metric between one embedding and all the others.

    Parameters
    ----------
    selected_embedding : array-like
        The reference vector.
    embeddings : iterable of array-like
        Candidate vectors. Any entry that is element-wise equal to
        ``selected_embedding`` (``np.array_equal``) is skipped, which also
        excludes exact duplicates of the reference, not just the reference
        itself.
    metric : {'cosine', 'entropy'}, default 'cosine'
        Which pairwise function to apply.

    Returns
    -------
    float
        Mean of the metric over the compared vectors, or NaN when there is
        nothing to compare against.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    # Validate up front: previously an unknown metric left the per-pair value
    # unassigned and surfaced as an UnboundLocalError inside the loop.
    if metric == 'cosine':
        metric_fn = cosine_similarity
    elif metric == 'entropy':
        metric_fn = entropy
    else:
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'cosine' or 'entropy'"
        )

    similarities = [
        metric_fn(selected_embedding, embedding)
        for embedding in embeddings
        if not np.array_equal(embedding, selected_embedding)
    ]

    # np.mean([]) would also give NaN but emits a RuntimeWarning; be explicit.
    if not similarities:
        return np.nan
    return np.mean(similarities)

# Bias measurement function
def bias_measurement(D, e):
    """Log-ratio of the mean cosine metric for ``e`` against chance level.

    ``D`` is the collection of embeddings and ``e`` the embedding being
    scored. Chance level is taken as ``1 / len(D)``; the returned value is
    the natural log of (mean cosine metric of ``e`` versus ``D``) divided
    by that chance level.
    """
    # Chance level is the reciprocal of the number of classes.
    chance_level = 1 / len(D)
    ratio = calculate_mean_metric(e, D, metric='cosine') / chance_level
    return np.log(ratio)

# Clustering functions
def initialize_centroids_knexus(X, K):
    """Pick ``K`` distinct rows of ``X`` at random as the starting centroids.

    Row indices are drawn without replacement from NumPy's global random
    state, so the result depends on that state.
    """
    n_points = X.shape[0]
    chosen_rows = np.random.choice(n_points, K, replace=False)
    return X[chosen_rows]

def assign_clusters_knexus(X, centroids):
    """Label every row of ``X`` with the index of its nearest centroid.

    Distance is the Euclidean norm of the difference; on a tie the lowest
    centroid index wins (``np.argmin`` behaviour). Returns an array with
    one label per row of ``X``.
    """
    nearest = [
        np.argmin(np.array([np.linalg.norm(point - center) for center in centroids]))
        for point in X
    ]
    return np.array(nearest)

def update_centroids_knexus(X, clusters, K):
    """Recompute each centroid as the mean of the rows assigned to it.

    Returns a ``(K, X.shape[1])`` array. A cluster with no assigned rows
    keeps the all-zero row it was initialised with here.
    """
    n_features = X.shape[1]
    centroids = np.zeros((K, n_features))
    for k in range(K):
        members = X[clusters == k]
        if len(members) == 0:
            # Nothing assigned to this cluster: leave its row at zero.
            continue
        centroids[k] = np.mean(members, axis=0)
    return centroids

def knexus(X, K, max_iters=100):
    """Cluster the rows of ``X`` into ``K`` groups with Lloyd-style iterations.

    Parameters
    ----------
    X : np.ndarray
        Data matrix, one point per row.
    K : int
        Number of clusters / centroids.
    max_iters : int, default 100
        Maximum number of centroid updates. With ``max_iters <= 0`` the
        initial centroids and their assignments are returned unchanged.

    Returns
    -------
    (centroids, clusters)
        ``centroids`` has one row per cluster; ``clusters`` holds, for each
        row of ``X``, the index of its nearest returned centroid.

    Notes
    -----
    Initial centroids come from ``initialize_centroids_knexus``, which draws
    from NumPy's global random state; seed it for reproducible results.
    """
    centroids = initialize_centroids_knexus(X, K)
    # Assign before the loop so `clusters` is always defined, even when
    # max_iters <= 0 (previously an UnboundLocalError).
    clusters = assign_clusters_knexus(X, centroids)

    for _ in range(max_iters):
        new_centroids = update_centroids_knexus(X, clusters, K)

        # A cluster that received no points has no mean; keep its previous
        # centroid rather than letting it collapse to the origin.
        empty = ~np.isin(np.arange(K), clusters)
        new_centroids[empty] = centroids[empty]

        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids
        # Re-assign against the centroids we are about to keep, so the
        # returned labels always match the returned centroids, including
        # when the iteration budget runs out.
        clusters = assign_clusters_knexus(X, centroids)

    return centroids, clusters

def main(embeddings, sample_counts, num_clusters):
    """Score every class for bias, cluster the classes, and pick one per cluster.

    Parameters
    ----------
    embeddings : dict
        Maps class name to its embedding vector. Key order defines the
        class order used throughout.
    sample_counts : mapping
        Maps class name to its sample count; must contain every key of
        ``embeddings``.
    num_clusters : int
        Number of clusters passed to ``knexus``.

    Returns
    -------
    (cluster_results_df_sorted, representative_classes)
        ``cluster_results_df_sorted`` has one row per class with columns
        ``Class``, ``Bias Value``, ``Cluster``, ``Sample Count``, ordered by
        cluster (ascending) then sample count (descending).
        ``representative_classes`` has one row per non-empty cluster, the
        class with the largest sample count in that cluster, with columns
        ``Cluster``, ``Class``, ``Bias Value``, ``Sample Count``.
    """
    classes = list(embeddings.keys())
    embedding_values = np.array(list(embeddings.values()))

    # Bias of each class embedding relative to the full set.
    bias_values = [bias_measurement(embedding_values, embedding) for embedding in embedding_values]

    # Only the per-class labels are needed here; the centroids are discarded.
    _, clusters = knexus(embedding_values, num_clusters)

    cluster_results_df = pd.DataFrame({
        'Class': classes,
        'Bias Value': bias_values,
        'Cluster': clusters,
        'Sample Count': [sample_counts[cls] for cls in classes]
    })

    # Within each cluster the most-sampled class comes first.
    cluster_results_df_sorted = cluster_results_df.sort_values(
        by=['Cluster', 'Sample Count'], ascending=[True, False]
    ).reset_index(drop=True)

    # Take the first *row* of each cluster. GroupBy.first() would instead take
    # the first non-null value per column, which can stitch together values
    # from different classes when a bias value is NaN.
    representative_classes = (
        cluster_results_df_sorted
        .drop_duplicates(subset='Cluster', keep='first')
        [['Cluster', 'Class', 'Bias Value', 'Sample Count']]
        .reset_index(drop=True)
    )

    return cluster_results_df_sorted, representative_classes

In [4]:
# Sentence-embedding model and the Kinetics-400 training annotations.
model = SentenceTransformer('all-mpnet-base-v2')
df = pd.read_csv('https://s3.amazonaws.com/kinetics/400/annotations/train.csv')

# Encode each distinct label once and key the vectors by label text.
labels = df['label'].unique().tolist()
embeddings = model.encode(labels)
embeddings_dict = dict(zip(labels, embeddings))

# Number of annotation rows per label.
sample_count_dict = dict(df['label'].value_counts())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# knexus picks its starting centroids with np.random.choice, so seed NumPy's
# global RNG to make the clustering repeatable on Restart & Run All.
SEED = 42
NUM_CLUSTERS = 150
np.random.seed(SEED)

cluster_results_df_sorted, representative_classes = main(embeddings_dict, sample_count_dict, NUM_CLUSTERS)
display(cluster_results_df_sorted)
display(representative_classes)

Unnamed: 0,Class,Bias Value,Cluster,Sample Count
0,exercising with an exercise ball,4.091587,0,290
1,exercising arm,4.331620,0,267
2,ripping paper,4.288185,1,456
3,rock scissors paper,4.197625,1,274
4,air drumming,4.621850,2,983
...,...,...,...,...
395,weaving basket,4.262149,147,593
396,folding clothes,4.207457,147,545
397,shredding paper,4.261618,147,253
398,water sliding,4.226221,148,270


Unnamed: 0,Cluster,Class,Bias Value,Sample Count
0,0,exercising with an exercise ball,4.091587,290
1,1,ripping paper,4.288185,456
2,2,air drumming,4.621850,983
3,3,cooking chicken,3.855179,850
4,4,country line dancing,4.330269,865
...,...,...,...,...
145,145,making pizza,4.154789,997
146,146,frying vegetables,3.727410,458
147,147,folding paper,4.097067,790
148,148,water sliding,4.226221,270


In [6]:
# Write both result tables to Excel workbooks (row index omitted).
for frame, filename in (
    (cluster_results_df_sorted, 'cluster_results_df_sorted.xlsx'),
    (representative_classes, 'representative_classes.xlsx'),
):
    frame.to_excel(filename, index=False)