### Hierarchical Clustering:

#### Input: 
- data: a `m` by `n` array with data and features
- edges: an adjacency list that represents the graph

#### General algo:
1. For each features:
    2. For each data point:
        3. Calculate the cost
4. Pick the split that has the least cost
5. Recurse on the resulting two sub-data arrays

#### Calcuating the cost:
1. running_cost := 0
1. best_cost := infinity
2. cost_array := \[ \]
3. For edge(a,b) in edges:
    4. cost_array\[a\] += 1
    5. cost_array\[b\] -= 1
6. For i in len(cost_array):
    7. running_cost += cost_array\[i\]
    8. if running_cost < best_cost:
        9. best_cost = running_cost
        10. split_index = i
11. return i

In [55]:
# Testing framework
import random

class Data:
    
    def __init__(self, size, degree, fill=True):
        self.degree = degree
        self.size = size
        self.data = [i for i in range(size)]
        self.edges = [[] for i in range(size)]
        self.dist = [0 for i in range(size)]
        if fill:
            self.fill_edges()
    
    # Randomly creates graph
    def fill_edges(self):
        for i in range(self.degree*self.size):
            edge = random.sample(range(self.size), 2)
            self.edges[edge[0]].append(edge)
            self.edges[edge[1]].append(edge)
    
    # Creates graph from list of edges
    def fill_edges(self, e):
        for edge in e:
            self.edges[edge[0]].append(edge)
            self.edges[edge[1]].append(edge)
    
    # O(n) space algorithm
    # Does not require symmetrical adjacency list
    def split(self):
        self.dist = [0 for i in range(self.size)]
        for i in range(self.size):
            for e in self.edges[i]:
                self.dist[min(e)] += 1
                self.dist[max(e)] -= 1
        index = 0
        min_cost = self.size*self.degree
        running_cost = 0
        weighted_cost = 0
        # We can skip the last index b/c it's not a real cut
        for i in range(self.size-1):
            running_cost += self.dist[i]
            weighted_cost = running_cost/((i+1)*(self.size-i-1))
            if weighted_cost < min_cost:
                index = i
                min_cost = weighted_cost
        return (index, min_cost)
    
    # O(1) space algorithm
    # Only works if the adjacency list is symmetrical
    def split2(self):
        index = 0
        min_cost = self.size*self.degree
        running_cost = 0
        weighted_cost = 0
        for i in range(self.size-1):
            for e in self.edges[i]:
                if i == min(e):
                    running_cost += 1
                else:
                    running_cost -= 1
            weighted_cost = running_cost/((i+1)*(self.size-i-1))
            if weighted_cost < min_cost:
                index = i
                min_cost = weighted_cost
        return (index, min_cost)

In [56]:
# Run tests

# Simple case
d1 = Data(5, 1, fill=False)
e = [[0,1], [0,2], [3,4]]
d1.fill_edges(e)
assert(d1.split2() == (2, 0))
assert(d1.split()[0] == d1.split2()[0])

# Harder case
d2 = Data(5, 1, fill=False)
e = [[0,1], [0,2], [1,3], [2,4]]
d2.fill_edges(e)
assert(d2.split2() == (3, 0.25))
assert(d2.split()[0] == d2.split2()[0])

# Null case
d3 = Data(10, 1, fill=False)
assert(d3.split2() == (0, 0))
assert(d3.split()[0] == d3.split2()[0])

In [57]:
d1 = Data(5, 1, fill=False)
e = [[0,1], [0,2], [3,4]]
d1.fill_edges(e)
print(d1.edges)

[[[0, 1], [0, 2]], [[0, 1]], [[0, 2]], [[3, 4]], [[3, 4]]]


In [58]:
d1.split()

(2, 0.0)

In [59]:
d1.dist = [0 for i in range(d1.size)]
for i in range(d1.size):
    for e in d1.edges[i]:
        d1.dist[min(e)] += 1
        d1.dist[max(e)] -= 1

In [60]:
d1.dist

[4, -2, -2, 2, -2]

### Implementation with Animals w/ Attributes

The animals dataset consists of 50 animals with 85 features

In [61]:
import numpy as np
import pandas as pd

# Set up classes
with open('Animals_with_Attributes2/classes.txt') as f:
    content = f.readlines()
animals = [np.array(x.strip().split()) for x in content]
animal_ids = [int(x[0]) for x in animals]
animal_names = [x[1].replace("+"," ") for x in animals]

# Set up features list
with open('Animals_with_Attributes2/predicates.txt') as f:
    content = f.readlines()
features = [np.array(x.strip().split()) for x in content]

# Set up features weights
with open('Animals_with_Attributes2/predicate-matrix-continuous.txt') as f:
    content = f.readlines()
animals_data = [[float(y) for y in np.array(x.strip().split())] for x in content]

In [62]:
# Create pandas Dataframe
animals = pd.DataFrame(animals_data)
animals.insert(0, 'id', pd.Series(animal_ids))
animals.insert(1, 'name', pd.Series(animal_names))

In [63]:
from sklearn.neighbors import NearestNeighbors

# graph creation with KNN 
# generates a matrix of 1's & 0's where the 1's denotes an edge 
# between two animals (currently can have an edge to itself)
neigh = NearestNeighbors(n_neighbors=6)  
neigh.fit(np.array(animals_data), animal_ids)  
graph = neigh.kneighbors_graph(animals_data).toarray()

In [64]:
#sanity check - see if animal is close to the other 5
for n in range(3):
    print("Animal Name: "+str(animal_names[n]))
    print("Neighbors: ")
    for i, x in enumerate(graph[n]):
        if x:
            print(animal_names[int(i)])
    print('------')

Animal Name: antelope
Neighbors: 
antelope
horse
moose
giraffe
buffalo
deer
------
Animal Name: grizzly bear
Neighbors: 
grizzly bear
german shepherd
wolf
bobcat
lion
polar bear
------
Animal Name: killer whale
Neighbors: 
killer whale
blue whale
humpback whale
seal
walrus
dolphin
------


In [65]:
#generate a list of edges from adjacency matrix
edges = set()
for row in range(len(graph)):
    for col in range(len(graph[row])):
        if graph[row][col] and row != col:
            edges.add((min(row, col), max(row, col)))
edges = list(edges)
edges.sort()
print(edges)

[(0, 6), (0, 15), (0, 30), (0, 36), (0, 37), (0, 39), (1, 7), (1, 31), (1, 40), (1, 42), (1, 44), (2, 8), (2, 17), (2, 23), (2, 46), (2, 49), (3, 11), (3, 26), (3, 34), (3, 35), (3, 43), (4, 5), (4, 7), (4, 9), (4, 32), (4, 45), (5, 9), (5, 25), (5, 32), (5, 38), (5, 43), (5, 44), (5, 45), (6, 20), (6, 30), (6, 37), (6, 39), (6, 48), (7, 9), (7, 12), (7, 14), (7, 19), (7, 21), (7, 31), (7, 34), (7, 40), (7, 42), (7, 44), (7, 45), (8, 17), (8, 23), (8, 46), (8, 49), (9, 21), (9, 32), (9, 34), (9, 45), (10, 11), (10, 25), (10, 33), (10, 43), (10, 47), (11, 25), (11, 26), (11, 28), (11, 29), (11, 33), (11, 34), (11, 43), (11, 47), (12, 14), (12, 31), (12, 40), (12, 42), (13, 18), (13, 20), (13, 27), (13, 36), (13, 41), (14, 31), (14, 40), (14, 42), (15, 20), (15, 36), (15, 39), (15, 48), (16, 19), (16, 24), (16, 26), (16, 29), (16, 34), (16, 47), (17, 23), (17, 46), (17, 49), (18, 20), (18, 27), (18, 41), (18, 48), (19, 24), (19, 36), (19, 38), (19, 42), (20, 22), (20, 27), (20, 36), (20,

In [66]:
class Hierarchy_Node:
    def __init__(self, edges, size):
        self.edges = edges
        self.size = size
        self.left = None
        self.right = None

def find_cut(tuples, idx):
    lo = 0
    hi = len(tuples) - 1
    mid = int((lo + hi) / 2)
    while lo < hi - 1:
        if tuples[mid][0] < idx:
            lo = mid
            mid = int((lo + hi) / 2)
        elif tuples[mid][0] > idx:
            hi = mid
            mid = int((lo + hi) / 2)
        else:
            while mid < len(tuples) and tuples[mid][0] == idx:
                mid += 1
            return mid - 1
    if tuples[lo][0] == idx:
        return lo
    elif tuples[hi][0] == idx:
        return hi
    return -1

In [67]:
import queue

root = Hierarchy_Node(edges, len(animals_data))
q = queue.Queue(maxsize=len(animals_data))
q.put(root)
while not q.empty():
    hn = q.get()
    if hn.size == 1:
        continue
    data = Data(len(animals_data), 5, False)
    data.fill_edges(hn.edges)
    (idx, cost) = data.split()
    cut = find_cut(hn.edges, idx)
    q.put(Hierarchy_Node(hn.edges[:(cut + 1)], idx + 1))
    q.put(Hierarchy_Node(hn.edges[(cut + 1):], hn.size - idx - 1))

IndexError: list index out of range