In [1]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from scipy.spatial.distance import cdist 

In [2]:
from sklearn.datasets import load_digits

In [70]:
from tqdm import tqdm

In [3]:
digits = load_digits(n_class=2) # just take zeros and ones from dataset

In [4]:
# want to make digits a simple array of images
images = digits.images

In [5]:
type(images)

numpy.ndarray

In [6]:
images.shape

(360, 8, 8)

In [14]:
images = images.reshape(-1,64)

In [16]:
images.shape

(360, 64)

In [7]:
labels = digits.target

In [111]:
def agg_clus(data_matrix, n_clusters=2):
    
    # cluster assignment is a list with each index representing an assignment 
    cluster_assignment = np.arange(data_matrix.shape[0])
    
    pbar = tqdm(total = data_matrix.shape[0]-n_clusters, position=0, leave=True)
    while np.unique(cluster_assignment).shape[0] > n_clusters: # np.unique() removes duplicates from array
        used_labels = np.unique(cluster_assignment)
        # compute the distance betweeen each cluster
        # init the size of the cluster to cluster distances to be a dxd matrix with d = #clusters
        c2c_distances = np.zeros((used_labels.shape[0], used_labels.shape[0]))
        
        for i in range(used_labels.shape[0]):
            for j in range(used_labels.shape[0]):
                # get all features from examples that match cluster assignment
                cluster_A_pts = data_matrix[cluster_assignment == used_labels[i], :]
                cluster_B_pts = data_matrix[cluster_assignment == used_labels[j], :]
                # note that we can use something other than min for our distance measure
                if i != j:
                    c2c_distances[i][j] = cluster_distance = cdist(cluster_A_pts, cluster_B_pts).min()
                else: 
                    c2c_distances[i][j] = np.inf # we dont want our 2 closest arrays to be the same 
                
                
        # merge the two nearest clusters 
        min_value = np.amin(c2c_distances)
        
        # closest will output 2 pairs with each pair being the reverse of the cluster indicies
        closest = np.where(c2c_distances == min_value)
        # print('closest pair: ', closest[0])
        # print(used_labels[closest[0][1]], used_labels[closest[0][0]])
            
        cluster_assignment[cluster_assignment == used_labels[closest[1][0]]] = used_labels[closest[0][0]] 
        pbar.update(1)
    pbar.close()
    return cluster_assignment
        

In [117]:
# small test:
assignments = agg_clus(images[:100])

100%|███████████████████████████████████████████| 98/98 [00:06<00:00, 15.90it/s]


In [118]:
score = labels[:100][assignments == labels[:100]].shape[0] / assignments.shape[0]

In [119]:
score

1.0

In [120]:
# full test:
assignments = agg_clus(images)
score = labels[assignments == labels].shape[0] / assignments.shape[0]
score

100%|█████████████████████████████████████████| 358/358 [04:45<00:00,  1.25it/s]


1.0