In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors


# Set up classes
with open('Animals_with_Attributes2/classes.txt') as f:
    content = f.readlines()
animals = [np.array(x.strip().split()) for x in content]
animal_ids = [int(x[0]) for x in animals]
animal_names = [x[1].replace("+"," ") for x in animals]

# Set up features list
with open('Animals_with_Attributes2/predicates.txt') as f:
    content = f.readlines()
features = [np.array(x.strip().split()) for x in content]

# Set up features weights
with open('Animals_with_Attributes2/predicate-matrix-continuous.txt') as f:
    content = f.readlines()
animals_data = [[float(y) for y in np.array(x.strip().split())] for x in content]

# Create pandas Dataframe
animals = pd.DataFrame(animals_data)
animals.insert(0, 'id', pd.Series(animal_ids))
animals.insert(1, 'name', pd.Series(animal_names))

# Create dictionary for easy access
animals_dict = {}
for i in range(len(animal_names)):
    animals_dict[animal_names[i]] = animals_data[i]

# graph creation with KNN 
# generates a matrix of 1's & 0's where the 1's denotes an edge 
# between two animals (currently can have an edge to itself)
neigh = NearestNeighbors(n_neighbors=6).fit(animals_data)  
graph = neigh.kneighbors_graph(animals_data).toarray()

#generate a list of edges from adjacency matrix
edges = {}
for n in animal_names:
    edges[n] = []
    
for row in range(len(graph)):
    for col in range(len(graph[row])):
        if graph[row][col] and row != col:
            n1 = animal_names[row]
            n2 = animal_names[col]
            edges[n1].append((n1, n2))
            edges[n2].append((n2, n1))

for e in edges:
    edges[e] = set(edges[e])

In [50]:
import math
import copy

class Node:
    def __init__(self, name, data):
        self.name = name
        self.data = data
        self.left = None
        self.right = None

class Hcluster:

    # data is the matrix of all our data points
    # edges is a symmetric adjacency list using some sort of id
    # table is our data indexed by the id for O(1) lookup
    def __init__(self, data, edges, table, num_features):
        self.data = data
        self.edges = edges
        self.table = table
        self.num_features = num_features
    
    
    
    def split(self, data):
        min_index = 0
        min_cost = math.inf
        min_feature = 0
        sorted_data = copy.copy(data)
        
        # Find best feature to split:
        for feature in range(self.num_features):            
            running_cost = 0
            weighted_cost = 0

            # Sort by name first and then value
            sorted_data.sort(key=lambda x: x[0])
            sorted_data.sort(key=lambda x: x[1][feature])

            sorted_dict = {}
            for i in range(len(sorted_data)):
                sorted_dict[sorted_data[i][0]] = i
                
            for i in range(len(sorted_data)-1):
                start_edge = sorted_data[i][0]
                for e in self.edges[start_edge]:
                    end_edge = e[1]
                    if sorted_dict[start_edge] < sorted_dict[end_edge]:
                        running_cost += 1
                    else:
                        running_cost -= 1
                weighted_cost = running_cost/((i+1)*(len(sorted_data)-i-1))
                if weighted_cost < min_cost:
                    min_index = i
                    min_cost = weighted_cost
                    min_feature = feature
        
        # Re-sort along best feature and split
        sorted_data.sort(key=lambda x: x[0])
        sorted_data.sort(key=lambda x: x[1][min_feature])
        cut_node = Node(sorted_data[min_index][0], sorted_data[min_index][1])
        return(cut_node, sorted_data[0:i], sorted_data[i+1:])
        
        
        
    def makeTreeHelper(self, data):
        if len(data) == 0:
            return None
        if len(data) == 1:
            return Node(data)
        else:
            center, left, right = self.split(data)
            center.left = self.makeTreeHelper(left)
            center.right = self.makeTreeHelper(right)
            return center
    
    
    
    def makeTree(self):
        sorted_data = [[a, self.table[a]] for a in self.table.keys()]
        self.makeTreeHelper(sorted_data)

In [52]:
c = Hcluster(animals, edges, animals_dict, 85)
center, left, right = c.split([[a, c.table[a]] for a in c.table.keys()])

In [53]:
c.split(left)

KeyError: 'killer whale'