### Hierarchical Clustering:

#### Input: 
- data: an `m` by `n` array of data points and their features
- edges: an adjacency list that represents the graph

#### General algo:
1. For each feature:
    2. For each data point:
        3. Calculate the cost
4. Pick the split that has the least cost
5. Recurse on the resulting two sub-data arrays

#### Calculating the cost:
1. running_cost := 0
1. best_cost := infinity
2. cost_array := \[0, 0, ..., 0\] (one zero per data point)
3. For edge(a,b) in edges:
    4. cost_array\[a\] += 1
    5. cost_array\[b\] -= 1
6. For i in range(len(cost_array)):
    7. running_cost += cost_array\[i\]
    8. if running_cost < best_cost:
        9. best_cost = running_cost
        10. split_index = i
11. return split_index

In [108]:
# Testing framework
import random

class Data:
    """Toy graph over the points ``0 .. size-1`` used to test the split search.

    Edges are ``[a, b]`` pairs of point indices stored in per-point lists.
    ``split()`` scans every cut between consecutive points and returns the
    one with the lowest normalised crossing-edge count.
    """

    def __init__(self, size, degree, fill=True):
        """Create ``size`` points and, optionally, random edges between them.

        Args:
            size: Number of data points; points are labelled ``0 .. size-1``.
            degree: ``fill_edges`` draws ``degree * size`` random edges.
            fill: When true, populate ``edges`` by calling ``fill_edges()``.
        """
        self.degree = degree
        self.size = size
        self.data = list(range(size))
        # edges[i] holds the [a, b] pairs registered under point i.
        self.edges = [[] for _ in range(size)]
        # Difference array used by split(); rebuilt on every split() call.
        self.dist = [0] * size
        if fill:
            self.fill_edges()

    def fill_edges(self):
        """Add ``degree * size`` random edges between distinct points.

        Each edge is a two-element list and the same list object is appended
        to the edge lists of both of its endpoints.

        Raises:
            ValueError: From ``random.sample`` when ``size`` is less than 2.
        """
        for _ in range(self.degree * self.size):
            edge = random.sample(range(self.size), 2)
            self.edges[edge[0]].append(edge)
            self.edges[edge[1]].append(edge)

    def split(self):
        """Return ``(index, cost)`` for the cheapest cut of the point order.

        A cut at ``index`` separates points ``0 .. index`` from points
        ``index+1 .. size-1``. Its cost is the number of tallied edges that
        cross the cut divided by the product of the two side sizes,
        ``(index + 1) * (size - index - 1)``. Ties keep the lowest index.

        Returns:
            A ``(index, cost)`` tuple. When there are fewer than two points
            no cut exists and ``(0, float("inf"))`` is returned.
        """
        # Rebuild the difference array from scratch so that calling split()
        # more than once does not accumulate the previous call's tallies.
        self.dist = [0] * self.size

        # NOTE(review): every entry of every per-point list is tallied, so an
        # edge registered under both endpoints (as fill_edges does) is counted
        # twice. When all edges are registered twice every cost doubles and
        # the chosen index is unaffected.
        for incident in self.edges:
            for e in incident:
                # +1 where the edge starts, -1 where it ends: the prefix sum
                # at i is then the number of edges crossing the cut after i.
                self.dist[min(e)] += 1
                self.dist[max(e)] -= 1

        index = 0
        # Start from infinity: a finite seed such as size * degree can be
        # smaller than every real cut cost and would then be reported as the
        # minimum even though no cut achieves it.
        min_cost = float("inf")
        running_cost = 0
        # We can skip the last index b/c it's not a real cut
        for i in range(self.size - 1):
            running_cost += self.dist[i]
            weighted_cost = running_cost / ((i + 1) * (self.size - i - 1))
            if weighted_cost < min_cost:
                index = i
                min_cost = weighted_cost
        return (index, min_cost)

In [112]:
# Run tests

# Case 1: points {0, 1, 2} and {3, 4} share no edge, so the cut after
# index 2 is crossed by nothing and costs 0.
d1 = Data(5, 1, fill=False)
d1.edges[0] = [[0, 1], [0, 2]]
d1.edges[3] = [[3, 4]]
assert d1.split() == (2, 0)

# Case 2: edges 0-1, 0-2, 1-3 and 2-4. The cheapest cut is after index 3,
# crossed only by edge 2-4: 1 / (4 * 1) = 0.25.
d2 = Data(5, 1, fill=False)
d2.edges[0] = [[0, 1], [0, 2]]
d2.edges[1] = [[1, 3]]
d2.edges[2] = [[2, 4]]
assert d2.split() == (3, 0.25)