In [17]:
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt

import numpy as np

import random

Clustering code:

In [18]:
def generate_random_vectors(n_vectors=100, n_dims=4, low=1, high=10, seed=None):
    """Build a matrix of random integer vectors to stand in for image embeddings.

    Called with no arguments this produces 100 vectors of 4 components, each
    component drawn uniformly from the integers 1..10.

    Args:
        n_vectors: Number of vectors (rows) to generate.
        n_dims: Number of components (columns) per vector.
        low: Smallest value a component may take (inclusive).
        high: Largest value a component may take (inclusive).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global generator is
            left untouched. When ``None``, the module-level ``random``
            generator is used (and can be seeded with ``random.seed``).

    Returns:
        numpy.ndarray of integers with shape ``(n_vectors, n_dims)``.
    """
    rng = random if seed is None else random.Random(seed)

    # Values are drawn row by row, left to right.
    rows = [
        [rng.randint(low, high) for _ in range(n_dims)]
        for _ in range(n_vectors)
    ]

    # The reshape keeps the result 2D even when no rows are requested.
    return np.array(rows, dtype=int).reshape(n_vectors, n_dims)


In [19]:
# Stand-in data: a matrix of embedded feature vectors, one row per image.
embeddings = generate_random_vectors()

# Fit DBSCAN on the rows of 'embeddings'.
#   eps         -> neighbourhood radius (Euclidean distance here)
#   min_samples -> neighbours required for a point to count as a core point
# NOTE(review): generate_random_vectors() yields integer coordinates, so two
# distinct rows are at least 1.0 apart; with eps=0.5 only exact duplicates can
# be neighbours. Revisit eps when switching to real embeddings.
dbscan = DBSCAN(metric='euclidean', min_samples=5, eps=0.5).fit(embeddings)

# One label per row of 'embeddings'; -1 marks noise/outliers.
clusters = dbscan.labels_

Clustering visualization:

In [20]:
# Visualise the DBSCAN result in two dimensions.
# Inputs from earlier cells:
#   'embeddings' - 2D array, one embedded feature vector per row
#   'clusters'   - DBSCAN label for each row (-1 for noise)

# Reduce dimensionality to 2D using t-SNE. t-SNE is stochastic, so the seed is
# fixed to make the figure reproducible across runs.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Plot the data points, colour-coded by cluster assignment.
fig, ax = plt.subplots(figsize=(8, 6))
for cluster_label in np.unique(clusters):  # sorted labels -> stable legend order
    mask = clusters == cluster_label
    if cluster_label == -1:
        # Noise points are drawn in black.
        style = {"color": "black", "label": "Noise"}
    else:
        style = {"label": f"Cluster {cluster_label}"}
    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1], **style)
ax.set_title('DBSCAN Clustering Visualization with t-SNE')
ax.set_xlabel('t-SNE Dimension 1')
ax.set_ylabel('t-SNE Dimension 2')
ax.legend()
plt.show()

AttributeError: 'NoneType' object has no attribute 'split'

Compute similarities within cluster:

In [None]:
# Inputs from earlier cells:
#   'embeddings' - 2D array, one embedded feature vector per row
#   'clusters'   - DBSCAN label for each row (-1 for noise)

# Compute pairwise cosine similarities within each cluster.
# Labels are visited in ascending order so the list order is deterministic.
similarities_within_clusters = []
for cluster_label in np.unique(clusters):
    if cluster_label == -1:
        # -1 marks noise, not a cluster: those points are unrelated outliers,
        # so pooling them into one similarity matrix would be misleading.
        continue
    cluster_indices = (clusters == cluster_label)
    cluster_embeddings = embeddings[cluster_indices]
    pairwise_similarities = cosine_similarity(cluster_embeddings)
    similarities_within_clusters.append(pairwise_similarities)

# 'similarities_within_clusters' holds one square similarity matrix per real
# cluster, ordered by ascending cluster label (noise excluded).

Top 10 similar images:

In [None]:


# Assuming 'embeddings_with_links' is a 2D numpy array where each row contains the embedded feature vector of an image and its corresponding link
# 'specific_image_index' is the index of the specific image for which you want to find similar images
# 'k' is the number of similar images to retrieve

def top_k_similar_images(embeddings_with_links, specific_image_index, k=10):
    """Return the links of the ``k`` images most similar to a given image.

    Similarity is the cosine similarity between embedded feature vectors.

    Args:
        embeddings_with_links: 2D numpy array with one row per image, where
            column 0 holds the image's embedded feature vector and column 1
            holds its link.
        specific_image_index: Row index of the query image. Negative indices
            count from the end, as in normal Python indexing.
        k: Maximum number of similar images to return. Fewer are returned when
            there are not enough other images; values below 0 are treated as 0.

    Returns:
        list: Links of the most similar images, most similar first. The query
        image itself is never included.

    Raises:
        IndexError: If ``specific_image_index`` is out of range.
    """
    n_images = len(embeddings_with_links)
    # Normalises negative indices and raises IndexError when out of range.
    query_index = range(n_images)[specific_image_index]

    # Column 0 is a 1D column of per-image vectors; cosine_similarity needs a
    # 2D (n_images, n_features) numeric matrix, so stack the vectors first.
    all_embeddings = np.vstack(list(embeddings_with_links[:, 0])).astype(float)
    specific_embedding = all_embeddings[query_index].reshape(1, -1)

    # Compute cosine similarities between the specific image and all images.
    similarities = cosine_similarity(specific_embedding, all_embeddings)[0]

    # Rank by descending similarity, then drop the query by its index: with
    # duplicates or ties the query is not guaranteed to be ranked first.
    ranked_indices = np.argsort(-similarities, kind="stable")
    ranked_indices = ranked_indices[ranked_indices != query_index]
    similar_image_indices = ranked_indices[:max(k, 0)]

    # Retrieve links of the top k similar images.
    similar_image_links = embeddings_with_links[similar_image_indices, 1]

    return similar_image_links.tolist()

# Example usage:
# Replace 'embeddings_with_links' and QUERY_IMAGE_INDEX with your actual data.
# Replace TOP_K with the desired number of similar images.
# NOTE(review): 'embeddings_with_links' is not defined in the cells shown here;
# it must be created before this cell is run.
QUERY_IMAGE_INDEX = 0
TOP_K = 10

similar_images = top_k_similar_images(
    embeddings_with_links, specific_image_index=QUERY_IMAGE_INDEX, k=TOP_K
)

# Report the number actually returned, which can be smaller than TOP_K.
print(f"Top {len(similar_images)} similar images:")
for idx, link in enumerate(similar_images, start=1):
    print(f"{idx}. {link}")