In [13]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors


# Class list: token 0 of each line is the numeric id, token 1 is the
# animal name with '+' standing in for spaces.
with open('Animals_with_Attributes2/classes.txt') as f:
    content = list(f)
class_rows = [line.split() for line in content]
animal_ids = [int(row[0]) for row in class_rows]
animal_names = [row[1].replace("+", " ") for row in class_rows]

# Feature (predicate) list: each line is kept as an array of its tokens.
with open('Animals_with_Attributes2/predicates.txt') as f:
    content = list(f)
features = [np.array(line.split()) for line in content]

# Feature weights: one row of floats per line of the continuous matrix file.
with open('Animals_with_Attributes2/predicate-matrix-continuous.txt') as f:
    content = list(f)
animals_data = [list(map(float, line.split())) for line in content]

# DataFrame view: weight columns, preceded by the 'id' and 'name' columns.
animals = pd.DataFrame(animals_data)
animals.insert(loc=0, column='id', value=pd.Series(animal_ids))
animals.insert(loc=1, column='name', value=pd.Series(animal_names))

# Name -> weight row lookup (rows are paired with names by position).
animals_dict = {name: animals_data[pos] for pos, name in enumerate(animal_names)}

# Build the k-nearest-neighbour graph over the animals' weight rows.
# The dense result is a matrix of 1's and 0's in which a 1 denotes an edge
# between two animals. Because the fitted points are also the query points,
# an animal can currently end up with an edge to itself.
neigh = NearestNeighbors(n_neighbors=6)
neigh.fit(animals_data)
graph = neigh.kneighbors_graph(animals_data).toarray()

# Turn the adjacency matrix into a per-animal edge list keyed by name.
edges = {name: [] for name in animal_names}

for row, neighbours in enumerate(graph):
    for col, linked in enumerate(neighbours):
        # Skip absent edges and self-loops on the diagonal.
        if row == col or not linked:
            continue
        n1, n2 = animal_names[row], animal_names[col]
        # Record the edge on both endpoints so the list is symmetric even
        # when the neighbour relation itself is one-directional.
        edges[n1].append((n1, n2))
        edges[n2].append((n2, n1))

# Recording both directions can add the same pair twice; sets drop duplicates.
for name, pairs in edges.items():
    edges[name] = set(pairs)

In [21]:
import math

class Node:
    """Binary-tree node: a payload plus left and right child links."""

    def __init__(self, data):
        """Store `data` and start with both children unset (None)."""
        self.data = data
        self.left = self.right = None

class Hcluster:
    """Hierarchical clustering over a neighbour graph (work in progress).

    Constructor arguments are stored unchanged on the instance:
      data  -- the matrix of all our data points
      edges -- a symmetric adjacency list keyed by some sort of id; each
               entry is a collection of (id, neighbour_id) pairs
      table -- the data indexed by id for O(1) lookup
    """

    def __init__(self, data, edges, table):
        self.data, self.edges, self.table = data, edges, table

    def split(self, data, feature):
        """Find the cheapest place to cut the ids when ordered by one feature.

        Every id in `self.table` is ordered by its value at position
        `feature`, with ties broken by id. The ids are then moved one at a
        time to the low side of the cut while a running count is adjusted:
        +1 for each of the id's edges whose other end sits later in the
        order, -1 otherwise. After each move the count is divided by
        (size of low side) * (size of high side).

        NOTE: `data` is currently ignored; the ids always come from
        `self.table` (see the TODO below).

        Returns a tuple `(index, min_cost)`: the position in the sorted
        order of the last id on the low side of the cheapest cut, and that
        cut's normalised cost. With fewer than two ids no cut is evaluated
        and `(0, math.inf)` is returned.
        """
        # TODO: change this line later
        entries = [(key, self.table[key]) for key in self.table.keys()]

        # Two stable passes: order by name, then by feature value, so that
        # equal feature values stay in name order.
        by_name = sorted(entries, key=lambda entry: entry[0])
        ordered = sorted(by_name, key=lambda entry: entry[1][feature])

        rank = {entry[0]: pos for pos, entry in enumerate(ordered)}
        total = len(ordered)

        best_index, best_cost = 0, math.inf
        cut_size = 0
        # The last position is excluded: it would leave the high side empty.
        for pos in range(total - 1):
            node = ordered[pos][0]
            for edge in self.edges[node]:
                cut_size += 1 if rank[node] < rank[edge[1]] else -1
            normalised = cut_size / ((pos + 1) * (total - pos - 1))
            # Strict comparison keeps the earliest position on ties.
            if normalised < best_cost:
                best_index, best_cost = pos, normalised
        return best_index, best_cost

    def makeTree(self, data):
        """Placeholder: tree construction is not implemented yet."""
        return None

In [26]:
# Build the clusterer from the animals DataFrame, the per-name edge sets and
# the name -> weight-row lookup.
c = Hcluster(animals, edges, animals_dict)
# Evaluate the cut along feature 0. The result is an (index, min_cost) tuple
# and is shown as the cell output. The `animals` argument is not used by
# split at the moment; it reads the ids from the lookup table instead.
c.split(animals, 0)

(8, 0.1002710027100271)