In [178]:
import math
import random
import numpy as np
from sklearn import datasets
from sklearn import cluster

In [179]:
%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [180]:
%%cython -a

import cython
import numpy as np
cimport numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def _nn_search_cython(int q, np.ndarray[list, ndim = 1] d, int N):
    """
    Find the candidate nearest-neighbour set for point q.

    Every bucket in ``d`` that contains ``q`` contributes all of its members.
    The result is the sorted, de-duplicated union of those members with ``q``
    itself and every value outside ``[0, N]`` removed.

    q: index of the query point
    d: 1-D object array whose entries are buckets (lists of point indices)
    N: largest index that may be returned (inclusive)
    returns: 1-D integer ndarray in ascending order; empty when q shares no
             bucket with any other point
    """
    cdef np.ndarray P
    cdef int m, i
    cdef list members = []

    m = d.shape[0]
    for i in range(m):
        if q in d[i]:
            members.extend(d[i])

    # Build the array once from the gathered members. The previous seed,
    # np.ndarray([q]), allocated q *uninitialised* slots instead of an array
    # holding q, so arbitrary memory contents could end up in the result.
    P = np.unique(np.asarray(members, dtype=int))
    P = P[(P != q) & (P >= 0) & (P <= N)]
    return P

def nn_search_cython(q, d, N):
    """Plain-Python entry point that forwards to the typed ``_nn_search_cython``."""
    neighbours = _nn_search_cython(q, d, N)
    return neighbours

In [181]:
import numba
from numba import jit
from functools import reduce

In [182]:
from ipyparallel import Client
# Connect to the ipyparallel cluster (presumably one must already be running,
# e.g. started with `ipcluster start` — TODO confirm the expected setup).
rc = Client()
# View over the client's engines; `dv` is used below as the `@dv.parallel`
# decorator that turns a function into one with a parallel `.map`.
dv = rc[:]

In [183]:
def get_input(x):
    """
    Pair every element of x with its position.

    x: iterable of values (hash values of the points, in point order)
    returns: list of [value, index] two-element lists
    """
    return [[value, index] for index, value in enumerate(x)]

def chainHash(InputList, Leafs):
    """
    Group point indices into buckets keyed by their hash value.

    InputList: iterable of [hash_value, point_index] pairs (see get_input)
    Leafs: sequence of leaf nodes, indexable by point_index
    returns: dict mapping ``"%s" % hash_value`` to a list of point indices

    The first point seen for a hash value opens the bucket. A later point is
    added only if it has no parent yet, or if its cluster root (as reported by
    find_parentid) differs from the root of every point already in the bucket,
    so each existing cluster is represented at most once per bucket.
    """
    res = {}
    for tup in InputList:
        # Use the same string key for lookup and storage. Testing the raw
        # value while storing under its string form made every non-string
        # hash value look "new" and reset its bucket each time.
        key = "%s" % tup[0]
        index = tup[1]
        if key not in res:
            res[key] = [index]
            continue
        leaf = Leafs[index]
        # Short-circuit: an unparented leaf is always accepted, so the roots
        # of the bucket members are only looked up when actually needed.
        if leaf.parent is None or find_parentid(leaf) not in [
            find_parentid(Leafs[s]) for s in res[key]
        ]:
            res[key].append(index)
    return res

def find_parentid(Node):
    """
    Return the id of the cluster that Node belongs to.

    Follows ``parent`` links up to the root of Node's tree. Internal (cluster)
    nodes carry negative ids, so a negative root id is returned as the cluster
    id. A root with a non-negative id is a data point that is not part of any
    cluster, and None is returned for it.

    Implemented as a loop so that deep hierarchies do not hit the recursion
    limit and so that a non-negative root id yields None at any depth (the
    recursive version compared ``None >= 0`` in that case).
    """
    root = Node
    while root.parent is not None:
        root = root.parent
    return root.id if root.id < 0 else None

def find_parentNode(Node):
    """Return the root of the tree containing Node (Node itself if it has no parent)."""
    current = Node
    while current.parent is not None:
        current = current.parent
    return current

def euler_distance(point1, point2):
    """
    Euclidean (L2) distance between two points.

    input: point1, point2: list or array of numbers with matching shapes
    output: float
    """
    # Convert first so that plain Python lists work as documented;
    # subtracting two lists directly raises TypeError.
    return np.linalg.norm(np.asarray(point1) - np.asarray(point2))


def flatten(xs, acc=None):
    """
    Flatten one level of nesting: concatenate the sequences in xs into a list.

    xs: iterable of sequences (e.g. a list of lists)
    acc: unused; kept only so existing calls that pass it keep working
    returns: a new list with the items of every sequence, in order;
             an empty list when xs is empty
    """
    # Function-scope import keeps this block self-contained.
    from itertools import chain
    # Linear-time concatenation; unlike reduce(+) it also handles empty input.
    return list(chain.from_iterable(xs))

@dv.parallel(block = True)
def change_unary2(x):
    """
    Encode a sequence of numbers as one unary bit string.

    Each number contributes an 11-character group: ``11 - int(11 - num)`` ones
    followed by ``int(11 - num)`` zeros. The groups are concatenated in order.
    """
    pieces = []
    for value in x:
        zeros = int(11 - value)
        pieces.append("1" * (11 - zeros) + "0" * zeros)
    return ''.join(pieces)

In [184]:
class Nodes(object):
    """A node of the cluster tree: an id, an optional parent and a list of children."""
    def __init__(self,id):
        """
        :param id: identifier stored on the node
        """
        self.id = id
        self.children = []
        self.parent = None
    def add_leaf(self, leaf):
        """Append leaf to the children unless it is already one of them."""
        if leaf in self.children:
            return
        self.children.append(leaf)
    def set_parent(self, node):
        """Record node as the parent; an existing parent is left untouched."""
        if self.parent is None:
            self.parent = node
    def show_childrenid(self):
        """Return the ids of the direct children, in insertion order."""
        return [child.id for child in self.children]
    def display(self,depth):
        """Print this subtree, one id per line, indenting by two more dashes per level."""
        prefix = '-'*depth
        print (prefix + "  " +str(self.id))
        next_depth = depth+2
        for child in self.children:
            child.display(next_depth)
    
class Leafs(Nodes):
    """A tree node that carries a data vector and refuses to take children."""
    def __init__(self,id, vec):
        """
        :param id: identifier stored on the node
        :param vec: data vector stored on the node
        """
        super(Leafs, self).__init__(id)
        self.vec = vec
    def add_leaf(self,leaf):
        """Always raises: a leaf cannot have children."""
        raise Exception("Leaf nodes can't insert catalog")
    def set_parent(self, node):
        """Attach the leaf to node; raises if the leaf already has a parent."""
        if self.parent is None:
            self.parent = node
        else:
            raise Exception("It has a parent already")

In [185]:
class LSH(object):
    """Bit-sampling locality-sensitive hashing over unary-encoded points."""
    def __init__(self, k, l, C, d):
        """
        k: number of sampled bits
        l: number of hash functions
        C: a constant (number of unary bits per attribute)
        d: number of attributes
        """
        assert l > 0
        assert k > 0
        self.k = k
        self.l = l
        self.C = C
        self.d = d
        self.I = []

    def creat_I(self):
        """
        create l distinct hash functions

        Each hash function is a sorted list of k distinct bit positions drawn
        from range(C*d). Functions already present in self.I are kept, so
        calling this again on a filled instance is a no-op.

        Raises ValueError when l distinct functions cannot exist, i.e. when
        l exceeds the number of k-subsets of the C*d bit positions.
        """
        n_bits = self.C * self.d
        if self.k > n_bits:
            raise ValueError(
                "cannot sample k=%d bits from only %d positions" % (self.k, n_bits))
        # Binomial coefficient via factorials (works on every Python 3).
        n_choices = math.factorial(n_bits) // (
            math.factorial(self.k) * math.factorial(n_bits - self.k))
        if self.l > n_choices:
            # Without this check the loop below would never terminate.
            raise ValueError(
                "only %d distinct hash functions exist, but l=%d were requested"
                % (n_choices, self.l))
        while len(self.I) < self.l:
            temp = sorted(random.sample(range(n_bits), self.k))
            if temp not in self.I:
                self.I.append(temp)

    def change_unary(self, x):
        """
        change the list into unary expression
        x: list[1*d]

        Each number contributes C characters: ``C - int(C - num)`` ones
        followed by ``int(C - num)`` zeros.

        This is a plain method: wrapping an instance method in
        ``@dv.parallel`` replaces it with an unbound parallel function that
        never receives ``self``, so the decorated form could not be called.
        """
        pieces = []
        for num in x:
            zeros = int(self.C - num)
            pieces.append("1" * (self.C - zeros) + "0" * zeros)
        return ''.join(pieces)

    def get_h_value(self, v, fun_I):
        """Return the characters of bit string v at the positions listed in fun_I, joined."""
        bits = np.array(list(v))
        return ''.join(bits[fun_I])

    def hash_table(self, data):
        """
        Hash every row of data with every hash function.

        Each row of the result corresponds to the values of one hash function.

        The unary encoding is done by the module-level ``change_unary2``,
        which uses a fixed width of 11 bits per attribute, so the sampled
        positions only line up with the encoding when C == 11.
        """
        _n_rows, _n_cols = np.shape(data)  # fails fast unless data is 2-D
        v_table = np.array(change_unary2.map(data))
        self.creat_I()
        h_table = [[self.get_h_value(v, fun_I) for v in v_table]
                   for fun_I in self.I]
        return np.array(h_table)

    def get_buckets(self, Leafs, h_table):
        """Build one bucket dict (see chainHash) per row of h_table."""
        return [chainHash(get_input(row), Leafs) for row in h_table]

In [202]:
class Hierarchical(object):
    """Agglomerative clustering that uses LSH buckets to find merge candidates."""
    def __init__(self):
        self.labels = None      # per-point cluster id, filled in by fit()
        self.Nodes = []         # internal nodes; the node with id -k sits at index k-1
        self.point_num = 0      # number of points passed to fit()

    def _join_under_new_node(self, child1, child2):
        """Create a new internal node with the next negative id and make both children its members."""
        new_node = Nodes(id = -len(self.Nodes)-1)
        for child in (child1, child2):
            new_node.add_leaf(child)
            child.set_parent(new_node)
        self.Nodes.append(new_node)

    def merge_nodes(self, node1, node2):
        """
        Merge the clusters that node1 and node2 belong to.

        - both already clustered, same root: nothing to do, returns 1
        - both already clustered, different roots: the two roots are joined
          under a new internal node
        - exactly one clustered: the other node is added directly to that
          cluster's root
        - neither clustered: both are joined under a new internal node

        returns: 1 if the nodes were already in the same cluster, else 0
        """
        has_parent1 = node1.parent is not None
        has_parent2 = node2.parent is not None
        if has_parent1 and has_parent2:
            if find_parentid(node1) == find_parentid(node2):
                return 1
            self._join_under_new_node(find_parentNode(node1), find_parentNode(node2))
        elif has_parent1:
            root = find_parentNode(node1)
            root.add_leaf(node2)
            node2.set_parent(root)
        elif has_parent2:
            root = find_parentNode(node2)
            root.add_leaf(node1)
            node1.set_parent(root)
        else:
            self._join_under_new_node(node1, node2)
        return 0

    def fit(self, x, R, A, C, l, k=10, max_R=20):
        """
        x:raw data, m*n
        R: initial merge radius
        A: factor the radius is multiplied by after every round; the loop
           only reaches max_R if this is greater than 1
        C: constant passed to LSH (bits per attribute)
        l: number of hash functions per round
        k: number of sampled bits per hash function (default 10)
        max_R: the rounds stop once the radius reaches this value (default 20)

        Each round hashes all points with fresh LSH functions, looks up for
        every point the points sharing a bucket with it, and merges pairs
        whose distance falls in the current band: ``d <= R`` in the first
        round, ``R/A < d <= R`` afterwards. On return ``self.labels[i]`` is
        the root cluster id of point i (None if it was never merged).
        """
        leafs = [Leafs(vec=v, id=i) for i,v in enumerate(x)]
        distances = {}
        self.point_num, future_num = np.shape(x)
        self.labels = [ -1 ] * self.point_num
        currentNo = self.point_num
        first_round = True
        while currentNo > 1 and R < max_R:
            ls = LSH(k, l, C, d = future_num)
            h_table = ls.hash_table(x)
            tables = ls.get_buckets(leafs, h_table)
            bucket_lists = [members for table in tables for members in table.values()]
            # Fill an object array element by element. np.array() on ragged
            # lists raises on recent NumPy, and on equal-length lists it
            # silently builds a 2-D array instead of a 1-D array of buckets.
            w = np.empty(len(bucket_lists), dtype=object)
            for idx, members in enumerate(bucket_lists):
                w[idx] = members
            for p in range(self.point_num):
                # The largest valid point index is point_num - 1.
                P = nn_search_cython(p, w, self.point_num - 1).astype(int)
                for q in P:
                    # Distance is symmetric, so cache each pair once.
                    d_key = (p, q) if p < q else (q, p)
                    if d_key not in distances:
                        distances[d_key] = euler_distance(leafs[p].vec, leafs[q].vec)
                    dist = distances[d_key]
                    if first_round:
                        in_band = dist <= R
                    else:
                        in_band = (dist <= R) and (dist > R/A)
                    if in_band and self.merge_nodes(leafs[p], leafs[q]) == 0:
                        currentNo -= 1
            first_round = False
            R = R*A
        self.labels = [find_parentid(leaf) for leaf in leafs]

    def display_depth(self, depth):
        """Print the subtree under the most recently created internal node."""
        self.Nodes[-1].display(depth)


# real-world data example 1

In [489]:
from sklearn import datasets
from sklearn import cluster
# Load the Iris data set bundled with scikit-learn and keep its feature matrix.
iris = datasets.load_iris()
test = iris.data

In [490]:
# pandas is imported here because the notebook's own `import pandas as pd`
# cell only appears further down; without this the cell fails on a fresh
# top-to-bottom run.
import pandas as pd
d = pd.DataFrame(test)

In [491]:
# Min-max scale every column to the range [0, 1].
d = (d - d.min())/(d.max() - d.min())

In [492]:
# Back to a plain NumPy array, the input format used for Hierarchical.fit below.
test = d.values

In [510]:
%%time
# LSH-based hierarchical clustering of the scaled Iris data.
# R is the starting merge radius and is multiplied by A after every round;
# l is the number of hash functions; C = 11 matches the fixed unary width
# used by change_unary2.
lsh1 = Hierarchical()
lsh1.fit(test, R =0.2, A =1.5, C = 11, l = 30)

CPU times: user 1 s, sys: 8 ms, total: 1.01 s
Wall time: 1.03 s


In [511]:
# Print the tree under the most recently created internal node (lsh1.Nodes[-1]);
# negative ids are internal nodes, non-negative ids are data points.
lsh1.display_depth(0)

  -11
--  -10
----  -8
------  -7
--------  100
--------  104
--------  115
--------  120
--------  136
--------  140
--------  143
--------  144
--------  148
--------  102
--------  105
--------  107
------  -6
--------  -5
----------  57
----------  60
--------  -4
----------  -3
------------  53
------------  59
------------  62
------------  69
------------  80
------------  81
------------  82
------------  89
------------  90
------------  92
------------  94
------------  55
----------  -2
------------  50
------------  51
------------  52
------------  58
------------  65
------------  75
------------  76
------------  77
------------  86
------------  54
------------  56
------------  61
------------  63
------------  70
------------  74
------------  78
------------  85
------------  91
------------  97
------------  127
------------  133
------------  138
------------  129
------------  71
------------  72
------------  73
------------  83
------------  123
------------  12

In [527]:
def find_children(q, temp=None):
    """
    Collect the ids of all data points below internal node ``lsh1.Nodes[q]``.

    q: index into lsh1.Nodes (the node with id -k is stored at index k-1)
    temp: list the ids are appended to; a fresh list is created when omitted
    returns: the list of non-negative (data point) ids found in the subtree
    """
    # A fresh list per top-level call: the old mutable default was shared by
    # every call, so results of separate calls accumulated in one list.
    if temp is None:
        temp = []
    for child in lsh1.Nodes[q].children:
        if child.id >= 0:
            temp.append(child.id)
        else:
            # Pass the accumulator down explicitly instead of relying on the
            # shared default argument.
            find_children(-child.id - 1, temp)
    return temp
a1 = find_children(0)

In [528]:
def find_children(q, temp=None):
    """
    Collect the ids of all data points below internal node ``lsh1.Nodes[q]``.

    q: index into lsh1.Nodes (the node with id -k is stored at index k-1)
    temp: list the ids are appended to; a fresh list is created when omitted
    returns: the list of non-negative (data point) ids found in the subtree
    """
    # A fresh list per top-level call: the old mutable default was shared by
    # every call, so results of separate calls accumulated in one list.
    if temp is None:
        temp = []
    for child in lsh1.Nodes[q].children:
        if child.id >= 0:
            temp.append(child.id)
        else:
            # Pass the accumulator down explicitly instead of relying on the
            # shared default argument.
            find_children(-child.id - 1, temp)
    return temp
a2 = find_children(3)

In [446]:
# Number of samples in the Iris feature matrix.
len(test)

150

In [529]:
# Label vector with one entry per sample (150 = len(test)); 0 marks points
# that fall in neither of the two selected subtrees.
x1 = np.zeros(150)

In [529]:
# Points found under lsh1.Nodes[0] get label 1.
x1[a1] = 1

In [531]:
# Points found under lsh1.Nodes[3] get label 2.
x1[a2] = 2

In [532]:
# Show the labels as integers for comparison with the scikit-learn result below.
x1.astype(int)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0,
       0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2])

In [518]:
from sklearn.cluster import AgglomerativeClustering 
# Reference result: scikit-learn agglomerative clustering with Ward linkage
# and 3 clusters on the same scaled Iris data.
model1 = AgglomerativeClustering(n_clusters = 3, linkage = 'ward')
model1.fit(test)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=3,
            pooling_func=<function mean at 0x7f242804bc80>)

In [533]:
# Reference cluster labels; the label numbers are arbitrary, so compare the
# grouping with x1 rather than the label values.
np.array(model1.labels_)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2,
       2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])

# real-world data example 2

In [188]:
import pandas as pd

In [189]:
# Load the second data set from a CSV file given by a relative path.
testdata = pd.read_csv("testdata.csv")

In [190]:
# Keep the columns at positions 1..4 as a NumPy array.
testdata = testdata.iloc[:,1:5].values

In [191]:
# Scale all values by 10.
# NOTE(review): this cell rebinds its own input, so running it twice scales
# the data twice.
testdata = testdata * 10

In [208]:
%%time
# LSH-based hierarchical clustering of the second data set.
# R is the starting merge radius and is multiplied by A after every round;
# l is the number of hash functions; C = 11 matches the fixed unary width
# used by change_unary2.
lsh2 = Hierarchical()
lsh2.fit(testdata, R =1.5, A =1.5, C = 11, l = 30)

CPU times: user 3.42 s, sys: 36 ms, total: 3.45 s
Wall time: 3.51 s


In [238]:
def find_children(q, temp=None):
    """
    Collect the ids of all data points below internal node ``lsh2.Nodes[q]``.

    q: index into lsh2.Nodes (the node with id -k is stored at index k-1)
    temp: list the ids are appended to; a fresh list is created when omitted
    returns: the list of non-negative (data point) ids found in the subtree
    """
    # A fresh list per top-level call: the old mutable default was shared by
    # every call, so results of separate calls accumulated in one list.
    if temp is None:
        temp = []
    for child in lsh2.Nodes[q].children:
        if child.id >= 0:
            temp.append(child.id)
        else:
            # Pass the accumulator down explicitly instead of relying on the
            # shared default argument.
            find_children(-child.id - 1, temp)
    return temp
c1 = find_children(10)

In [239]:
def find_children(q, temp=None):
    """
    Collect the ids of all data points below internal node ``lsh2.Nodes[q]``.

    q: index into lsh2.Nodes (the node with id -k is stored at index k-1)
    temp: list the ids are appended to; a fresh list is created when omitted
    returns: the list of non-negative (data point) ids found in the subtree
    """
    # A fresh list per top-level call: the old mutable default was shared by
    # every call, so results of separate calls accumulated in one list.
    if temp is None:
        temp = []
    for child in lsh2.Nodes[q].children:
        if child.id >= 0:
            temp.append(child.id)
        else:
            # Pass the accumulator down explicitly instead of relying on the
            # shared default argument.
            find_children(-child.id - 1, temp)
    return temp
c2 = find_children(22)

In [209]:
# Print the tree under the most recently created internal node (lsh2.Nodes[-1]);
# negative ids are internal nodes, non-negative ids are data points.
lsh2.display_depth(0)

  -25
--  -11
----  -10
------  -9
--------  -8
----------  10
----------  11
--------  -3
----------  2
----------  5
----------  16
----------  17
----------  18
----------  22
----------  28
----------  38
----------  50
----------  55
----------  56
----------  61
----------  65
----------  66
----------  83
----------  89
----------  95
----------  96
----------  97
----------  98
----------  109
----------  117
----------  119
----------  122
----------  124
----------  131
----------  133
----------  140
----------  144
----------  146
----------  6
----------  24
----------  30
----------  143
----------  29
----------  35
----------  91
----------  92
----------  101
--------  62
--------  69
--------  86
--------  120
--------  136
--------  13
------  -2
--------  1
--------  14
--------  19
--------  53
--------  58
--------  70
--------  82
--------  94
--------  116
--------  127
------  46
------  99
------  102
------  110
------  134
------  76
------  132
------  147


In [234]:
# Label vector of length 431 (presumably the number of rows in testdata —
# TODO confirm); 0 marks points in neither selected subtree.
x = np.zeros(431)

In [236]:
# Points found under lsh2.Nodes[10] get label 1.
x[c1]=1

In [240]:
# Points found under lsh2.Nodes[22] get label 2.
x[c2]=2

In [245]:
# Show the labels as integers for comparison with the scikit-learn result below.
x.astype(int)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [242]:
from sklearn.cluster import AgglomerativeClustering 
# Reference result: scikit-learn agglomerative clustering with Ward linkage
# and 3 clusters on the same scaled data.
model = AgglomerativeClustering(n_clusters = 3, linkage = 'ward')
model.fit(testdata) 

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=3,
            pooling_func=<function mean at 0x7f242804bc80>)

In [244]:
# Reference cluster labels; the label numbers are arbitrary, so compare the
# grouping with x rather than the label values.
np.array(model.labels_)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,