In [1]:
import os

from pyseter import sort
from sklearn.metrics.pairwise import cosine_similarity

# Location of the saved feature file, relative to the notebook's working directory.
feature_dir = 'working_dir/features'
out_path = f'{feature_dir}/features.npy'

# load_features hands back the filenames together with the feature array.
filenames, feature_array = sort.load_features(out_path)

# Dense pairwise cosine similarity between every pair of rows in feature_array.
similarity_scores = cosine_similarity(feature_array)


In [16]:
import networkx as nx
import numpy as np
from networkx.algorithms.community import louvain_communities
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import normalize

# Louvain is randomized; pin the seed so a re-run yields the same partition.
LOUVAIN_SEED = 42

# 1. L2 normalize
X = normalize(feature_array, norm='l2')

# 2. Build k-NN graph (cosine = euclidean on normalized vectors)
k = 15
A = kneighbors_graph(X, n_neighbors=k, metric='euclidean', mode='distance')

# 3. Convert distances to similarities (for normalized vectors, dist^2 = 2 - 2*cos)
A.data = 1 - (A.data ** 2) / 2

# 4. Mutual k-NN: keep edge only if both directions exist.
#    A missing direction is an implicit 0 in the sparse matrix, so the
#    element-wise min of a positive similarity and 0 is 0 (edge dropped).
A_mutual = A.minimum(A.T)

# 5. Build networkx graph (every row becomes a node, even if it has no edges)
G = nx.from_scipy_sparse_array(A_mutual, edge_attribute='weight')

# 6. Drop every edge whose weight is <= 0: non-positive similarities are not
#    zeroed by the sparse min above and must not count as links.
G.remove_edges_from([(u, v) for u, v, d in G.edges(data=True) if d['weight'] <= 0])

# 7. Cluster with Louvain (louvain_communities ships with networkx 2.7+)
communities = louvain_communities(
    G, weight='weight', resolution=1.0, seed=LOUVAIN_SEED
)

# Convert the list of node sets into one integer label per row of X
# (-1 is the placeholder for any node that ends up in no community).
labels = np.full(X.shape[0], -1)
for i, comm in enumerate(communities):
    for node in comm:
        labels[node] = i

sort.report_cluster_results(labels)

Found 49 clusters.
Largest cluster has 123 images.


In [9]:
# Cluster the dense cosine-similarity matrix with pyseter's NetworkCluster,
# passing 0.55 as match_threshold, then print the summary report.
# NOTE(review): presumably pairs scoring below the threshold are not linked;
# confirm against the pyseter documentation.
nc = sort.NetworkCluster(match_threshold=0.55)
results = nc.cluster_images(similarity_scores)
cluster_idx = results.cluster_idx
sort.report_cluster_results(cluster_idx)

Following clusters may contain false positives:
['ID_0001', 'ID_0006', 'ID_0008', 'ID_0021', 'ID_0110']
Found 208 clusters.
Largest cluster has 128 images.
