### Hierarchical Clustering:

#### Input: 
- data: an `m` by `n` array of `m` data points with `n` features each
- edges: an adjacency list that represents the graph

#### General algo:
1. For each feature:
    2. For each data point:
        3. Calculate the cost
4. Pick the split that has the least cost
5. Recurse on the resulting two sub-data arrays

#### Calculating the cost:
1. running_cost := 0
2. best_cost := infinity
3. cost_array := array of zeros, one entry per data point
4. For edge(a,b) in edges:
    5. cost_array\[a\] += 1
    6. cost_array\[b\] -= 1
7. For i in range(len(cost_array)):
    8. running_cost += cost_array\[i\]
    9. if running_cost < best_cost:
        10. best_cost = running_cost
        11. split_index = i
12. return split_index

In [68]:
# Testing framework
import random

class Data:
    """Toy graph over the points ``0 .. size-1`` used to test split finding.

    Attributes:
        degree: number of random edges generated per point when filling.
        size:   number of points.
        data:   the point ids ``[0, 1, ..., size-1]``.
        edges:  ``edges[i]`` lists every edge (a 2-element sequence) that has
                ``i`` as an endpoint, so each edge is stored under both of its
                endpoints.
        dist:   scratch array written by ``split``.
    """

    def __init__(self, size, degree, fill=True):
        self.degree = degree
        self.size = size
        self.data = list(range(size))
        self.edges = [[] for _ in range(size)]
        self.dist = [0] * size
        if fill:
            self.fill_edges()

    def fill_edges(self, e=None):
        """Add edges to the graph.

        Args:
            e: iterable of edges, each a 2-element sequence of point ids.
               When omitted, ``degree * size`` random edges between distinct
               points are generated instead.

        The two former ``fill_edges`` definitions are merged here: in the
        original the second definition shadowed the first, so the random
        fill performed by ``__init__`` raised ``TypeError``.
        """
        if e is None:
            e = [random.sample(range(self.size), 2)
                 for _ in range(self.degree * self.size)]
        for edge in e:
            # Store the edge under both endpoints (symmetric adjacency list).
            self.edges[edge[0]].append(edge)
            self.edges[edge[1]].append(edge)

    # O(n) space algorithm
    # Does not require symmetrical adjacency list
    def split(self):
        """Return ``(index, cost)`` of the cheapest cut after position ``index``.

        Every stored edge adds +1 at its lower endpoint and -1 at its higher
        endpoint; the prefix sum up to ``i`` is then the number of stored
        edges crossing the cut between ``i`` and ``i+1``. That count is
        divided by ``(i+1) * (size-i-1)``, the product of the two side sizes.

        Note: ``fill_edges`` stores each edge under both endpoints, so every
        edge is visited twice here and the reported cost is twice the one
        reported by ``split2``; the chosen index is unaffected.
        """
        self.dist = [0] * self.size
        for adjacent in self.edges:
            for e in adjacent:
                self.dist[min(e)] += 1
                self.dist[max(e)] -= 1
        index = 0
        # Start from infinity: size*degree is not an upper bound on the cost
        # when edges are supplied explicitly (e.g. degree == 0).
        min_cost = float("inf")
        running_cost = 0
        # We can skip the last index b/c it's not a real cut
        for i in range(self.size - 1):
            running_cost += self.dist[i]
            weighted_cost = running_cost / ((i + 1) * (self.size - i - 1))
            if weighted_cost < min_cost:
                index = i
                min_cost = weighted_cost
        return (index, min_cost)

    # O(1) space algorithm
    # Only works if the adjacency list is symmetrical
    def split2(self):
        """Return ``(index, cost)`` of the cheapest cut, using O(1) extra space.

        Walks the points in order; an edge seen from its lower endpoint opens
        (+1) and the same edge seen from its higher endpoint closes (-1), so
        each edge must be listed under both endpoints. The running count is
        weighted by ``(i+1) * (size-i-1)`` exactly as in ``split``.
        """
        index = 0
        # See split(): infinity is the only safe initial bound.
        min_cost = float("inf")
        running_cost = 0
        for i in range(self.size - 1):
            for e in self.edges[i]:
                if i == min(e):
                    running_cost += 1
                else:
                    running_cost -= 1
            weighted_cost = running_cost / ((i + 1) * (self.size - i - 1))
            if weighted_cost < min_cost:
                index = i
                min_cost = weighted_cost
        return (index, min_cost)

In [69]:
# Run tests

# Simple case: edges only inside {0,1,2} and inside {3,4},
# so the cut after index 2 crosses nothing.
d1 = Data(5, 1, fill=False)
e = [[0, 1], [0, 2], [3, 4]]
d1.fill_edges(e)
assert d1.split2() == (2, 0)
assert d1.split()[0] == d1.split2()[0]

# Harder case: every cut crosses at least one edge.
d2 = Data(5, 1, fill=False)
e = [[0, 1], [0, 2], [1, 3], [2, 4]]
d2.fill_edges(e)
assert d2.split2() == (3, 0.25)
assert d2.split()[0] == d2.split2()[0]

# Null case: no edges at all, so the first cut already has cost 0.
d3 = Data(10, 1, fill=False)
assert d3.split2() == (0, 0)
assert d3.split()[0] == d3.split2()[0]

In [70]:
# Rebuild the simple-case graph and print its adjacency structure:
# edges[i] holds every edge that has point i as an endpoint.
d1 = Data(5, 1, fill=False)
e = [[0,1], [0,2], [3,4]]
d1.fill_edges(e)
print(d1.edges)

[[[0, 1], [0, 2]], [[0, 1]], [[0, 2]], [[3, 4]], [[3, 4]]]


In [71]:
# Evaluate the O(n)-space split on d1; the result is an (index, cost) tuple.
d1.split()

(2, 0.0)

In [72]:
# Re-run the first half of Data.split by hand to inspect the delta array.
d1.dist = [0 for i in range(d1.size)]
for i in range(d1.size):
    for e in d1.edges[i]:
        # +1 where the edge starts (lower endpoint), -1 where it ends.
        # Each edge is listed under both endpoints, so it is counted twice.
        d1.dist[min(e)] += 1
        d1.dist[max(e)] -= 1

In [73]:
# Display the per-index deltas computed in the previous cell.
d1.dist

[4, -2, -2, 2, -2]

### Implementation with Animals w/ Attributes

The animals dataset consists of 50 animals with 85 features

*Note*: I'm still unsure about the best representation for the animal data. A pandas DataFrame makes some queries faster, but maybe we should just use a dict in which each key is the animal name.

In [1]:
import numpy as np
import pandas as pd

# Class list: each line is split on whitespace; the first token is read as the
# integer id and the second as the name ('+' in a name is turned into a space).
with open('Animals_with_Attributes2/classes.txt') as f:
    content = f.readlines()
animals = [np.array(line.strip().split()) for line in content]
animal_ids = [int(fields[0]) for fields in animals]
animal_names = [fields[1].replace("+", " ") for fields in animals]

# Feature list: one whitespace-split token array per line of predicates.txt.
with open('Animals_with_Attributes2/predicates.txt') as f:
    content = f.readlines()
features = [np.array(line.strip().split()) for line in content]

# Feature weights: one row of floats per line of the continuous predicate matrix.
with open('Animals_with_Attributes2/predicate-matrix-continuous.txt') as f:
    content = f.readlines()
animals_data = [
    list(map(float, np.array(line.strip().split())))
    for line in content
]

# DataFrame view: the feature columns, preceded by 'id' and 'name' columns.
# (This rebinds `animals`, which held the raw class tokens above.)
animals = pd.DataFrame(animals_data)
animals.insert(0, 'id', pd.Series(animal_ids))
animals.insert(1, 'name', pd.Series(animal_names))

# Name -> feature row lookup for easy access.
animals_dict = {
    name: animals_data[position]
    for position, name in enumerate(animal_names)
}

In [2]:
from sklearn.neighbors import NearestNeighbors

# Build the neighbor graph with k-nearest-neighbors.
# `graph` is a dense 0/1 matrix in which a 1 at [row][col] marks an edge from
# animal `row` to animal `col` (an animal can currently have an edge to itself).
neigh = NearestNeighbors(n_neighbors=6)
neigh.fit(animals_data)
graph = neigh.kneighbors_graph(animals_data).toarray()

In [6]:
# Disabled sanity check: the code below sits inside a string literal, so it is
# not executed. It would print the neighbors of the first three animals.
'''
#sanity check - see if animal is close to the other 5
for n in range(3):
    print("Animal Name: "+str(animal_names[n]))
    print("Neighbors: ")
    for i, x in enumerate(graph[n]):
        if x:
            print(animal_names[int(i)])
    print('------')
'''

'\n#sanity check - see if animal is close to the other 5\nfor n in range(3):\n    print("Animal Name: "+str(animal_names[n]))\n    print("Neighbors: ")\n    for i, x in enumerate(graph[n]):\n        if x:\n            print(animal_names[int(i)])\n    print(\'------\')\n'

In [3]:
# Generate an adjacency list from the adjacency matrix:
# edges[name] is a list of (name, neighbor_name) tuples.
#
# A k-nearest-neighbors graph is directed: "a lists b" does not imply
# "b lists a". The split code in this notebook (Data.split2, Hcluster) needs a
# symmetric adjacency list, so an edge is recorded for both endpoints whenever
# either direction is present in the matrix. Self-edges (row == col) are dropped.
edges = {}
for n in animal_names:
    edges[n] = []

for row in range(len(graph)):
    for col in range(len(graph[row])):
        if row != col and (graph[row][col] or graph[col][row]):
            n1 = animal_names[row]
            n2 = animal_names[col]
            edges[n1].append((n1, n2))

In [74]:
# Testing out recursive algorithm

# Sample input for the recursive tree builder: the integers 0 through 9.
arr = list(range(10))

def split1(arr):
    """Placeholder split: use element 1 as the pivot.

    Returns a tuple ``(pivot, left, right)`` where ``pivot`` is ``arr[1]``,
    ``left`` is the slice before it and ``right`` is the slice after it.
    """
    pivot = arr[1]
    left, right = arr[:1], arr[2:]
    return pivot, left, right

class Node:
    """Binary-tree node holding ``data`` plus optional ``left``/``right`` children."""

    def __init__(self, data):
        self.data = data
        self.left = None
        self.right = None

    def print(self):
        """Print the subtree rooted at this node in breadth-first order.

        Each node's ``data`` is written followed by a space; a newline is
        written whenever the traversal moves down to a deeper level.
        """
        # Local import keeps this notebook cell self-contained.
        from collections import deque

        # deque gives O(1) pops from the front; list.pop(0) is O(n) each time.
        pending = deque([(self, 0)])
        level = 0
        while pending:
            curr, curr_level = pending.popleft()
            if curr_level > level:
                print('\n', end='')
                level = curr_level
            print(curr.data, end=' ')
            if curr.left:
                pending.append((curr.left, curr_level + 1))
            if curr.right:
                pending.append((curr.right, curr_level + 1))
        

def makeTree(arr):
    """Recursively build a binary tree from ``arr`` using ``split1``.

    An empty input yields ``None`` and a single element yields a leaf node.
    Otherwise ``split1`` supplies the root value and the two sub-arrays that
    become the left and right subtrees.
    """
    size = len(arr)
    if size == 0:
        return None
    if size == 1:
        return Node(arr[0])

    pivot, lower, upper = split1(arr)
    root = Node(pivot)
    root.left, root.right = makeTree(lower), makeTree(upper)
    return root

In [44]:
# Build the demo tree from arr and print it one level per line.
n1 = makeTree(arr)
n1.print()

1 
0 3 
2 5 
4 7 
6 9 
8 

# Actually writing the code
Will now write a wrapper class that accepts the animals data and will perform clustering

In [42]:
import math

class Node:
    """Tree node: a payload with left and right child slots."""

    def __init__(self, data):
        self.data = data
        # Both children start out unset.
        self.left = self.right = None

class Hcluster:
    """Work-in-progress hierarchical clustering over a neighbor graph."""

    # data is the matrix of all our data points
    # edges is a symmetric adjacency list using some sort of id
    # table is our data indexed by the id for O(1) lookup
    def __init__(self, data, edges, table):
        self.data = data
        self.edges = edges
        self.table = table

    def split(self, data, feature):
        """Trace the running cut cost along one feature (debugging aid).

        Points are ordered by ``self.table[id][feature]``, ties broken by id.
        Walking that order, each edge of the current point adds 1 when the
        neighbor sorts after the point and subtracts 1 when it sorts before,
        so with a symmetric adjacency list the running total after a point is
        the number of edges crossing the cut placed right after it.

        Args:
            data: currently unused; the points are taken from ``self.table``
                (see the TODO below).
            feature: index into each point's feature vector.

        Returns:
            None. The sorted points and the running cost after every edge and
            every point are printed to stdout.
        """
        running_cost = 0

        # TODO: need to change this line later
        # One sort on (feature value, id) gives the same order as sorting by
        # id and then stable-sorting by feature value.
        sorted_data = sorted(
            self.table.items(),
            key=lambda entry: (entry[1][feature], entry[0]),
        )

        for node, values in sorted_data:
            print(node, values[feature])
        print()

        # Visit every point. A fixed count of 50 used to be hard-coded here,
        # which raised IndexError for smaller tables and ignored the tail of
        # larger ones.
        for start_node, start_values in sorted_data:
            start_val = start_values[feature]
            for e in self.edges[start_node]:
                end_node = e[1]
                end_val = self.table[end_node][feature]

                if start_val < end_val:
                    running_cost += 1
                elif start_val > end_val:
                    running_cost -= 1
                else:
                    # Equal feature values: fall back to the id order, which
                    # matches the tie-break used for sorting.
                    if start_node < end_node:
                        running_cost += 1
                    else:
                        running_cost -= 1
                print(running_cost, start_node, start_val, end_node, end_val)
            print(running_cost, '\n')

        # TODO: track the minimum weighted cost and its index, then
        # return (index, sorted_data[0:index], sorted_data[index+1:])

    def makeTree(self, data):
        """Build the cluster tree for ``data``. Not implemented yet."""
        pass

In [44]:
# Wrap the animals DataFrame, the edge lists and the name -> features dict,
# then trace the running cut cost along feature 0.
tree = Hcluster(animals, edges, animals_dict)
tree.split(tree.data, 0)

antelope -1.0
deer 0.0
fox 0.0
lion 1.88
elephant 2.5
hippopotamus 4.77
giraffe 6.11
rhinoceros 9.75
polar bear 10.0
collie 10.13
dolphin 10.22
moose 10.24
squirrel 10.56
blue whale 12.92
bobcat 16.13
mouse 18.37
walrus 18.84
beaver 19.38
persian cat 19.38
pig 21.52
humpback whale 24.01
rabbit 26.49
weasel 30.57
sheep 32.36
chihuahua 32.63
spider monkey 36.04
mole 39.05
grizzly bear 39.25
leopard 40.88
hamster 41.38
tiger 42.47
german shepherd 43.54
wolf 43.74
ox 43.99
horse 44.9
buffalo 45.37
otter 46.81
chimpanzee 47.51
rat 50.13
cow 55.31
siamese cat 56.21
gorilla 63.37
raccoon 63.57
dalmatian 69.58
giant panda 76.85
seal 81.96
killer whale 83.4
zebra 85.04
skunk 87.99
bat 91.55

1 antelope -1.0 horse 44.9
2 antelope -1.0 moose 10.24
3 antelope -1.0 giraffe 6.11
4 antelope -1.0 buffalo 45.37
5 antelope -1.0 deer 0.0
5 

4 deer 0.0 antelope -1.0
5 deer 0.0 horse 44.9
6 deer 0.0 moose 10.24
7 deer 0.0 giraffe 6.11
8 deer 0.0 cow 55.31
8 

9 fox 0.0 german shepherd 43.54
10 fox 0.0 sia