# kd-tree implementation for KNN

Reference: [github](https://github.com/Vectorized/Python-KD-Tree)

Step 1: Import packages.

In [1]:
import heapq
import itertools
import random
import time
from operator import itemgetter

import cv2
import numpy as np
import pandas as pd
import tqdm
from sklearn.cross_validation import train_test_split  # NOTE(review): removed in scikit-learn 0.20; newer releases expose train_test_split in sklearn.model_selection



Step 2: Prepare data (extract HOG features).

In [2]:
# Load the training CSV; the first row of the file is the header.
raw_data = pd.read_csv('../data/train.csv', header=0)
data = raw_data.values
imgs = data[0:, 1:] # in each row, the first column is the label and the remaining columns are the image data
imgs = imgs.astype(np.uint8)
labels = data[:, 0]
# One 324-value feature row per image (matches the (324,) shape printed below).
features = np.zeros((imgs.shape[0], 324))

# HOG descriptor configured from an XML file stored next to the data.
hog = cv2.HOGDescriptor('../data/hog.xml')
for index in range(len(imgs)):
    # Each flat row is reshaped to a 28x28 image before computing its descriptor.
    features[index] = hog.compute(imgs[index].reshape(28, 28)).squeeze()

print(features[0].shape)

# Use 2/3 of the data as the training set and 1/3 as the test set.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=23323)

# Keep only the first 500 test samples (presumably to bound the search time -- confirm).
x_test = x_test[:500]
y_test = y_test[:500]

(324,)


Step 3: Build model.

In [3]:
class KNN(object):
    '''
        k-nearest-neighbor classifier that can search the training data either
        by brute force ('naive') or through a kd-tree ('kdtree').
    '''

    def __init__(self, x_train, y_train):
        '''
            x_train: (num_data, num_dims)
            y_train: (num_data, )
            self.xy: (num_data, num_dims + 1), features followed by the label
        '''
        self.xy = np.zeros((x_train.shape[0], x_train.shape[1] + 1), dtype=float)
        self.xy[:, :-1] = x_train
        self.xy[:, -1] = y_train
        self.num_dims = x_train.shape[1]
        self.kdtree = self.create_kdtree(self.xy)
        self.num_classes = len(set(y_train))

    def euclidean_distance(self, point1, point2):
        '''
            Return the Euclidean (L2) distance between two equally shaped arrays.
        '''
        return np.sqrt(np.sum((point1 - point2) ** 2))

    def get_neighbors_naive(self, x_test, k):
        '''
            Brute-force search: measure the distance from every query to every
            training row and keep the k closest.
            x_test: (num_queries, num_dims)
            return: one list per query of (distance, row) tuples, nearest
                    first, where row is the training row as a list of
                    num_dims features followed by the label (the same layout
                    the kd-tree search returns, so predict() works for both).
        '''
        neighbors = list()
        for feature in tqdm.tqdm(x_test):
            # Keep the label column in the returned row: predict() reads the
            # label from the last element of each neighbor.
            candidates = ((self.euclidean_distance(xy[:-1], feature), list(xy)) for xy in self.xy)
            # Rank on the distance only, so ties never compare the rows.
            neighbors.append(heapq.nsmallest(k, candidates, key=itemgetter(0)))
        return neighbors

    def create_kdtree(self, points, i=0):
        '''
            Recursively build a kd-tree.
            points: sequence of rows shaped (num_dims + 1), label last
            i: index of the feature used to split at this level; it cycles
               through the num_dims features and never reaches the label
            return: None for no points, otherwise a tuple
                    (left_subtree, right_subtree, row) where row is the
                    median along feature i.
        '''
        if len(points) == 1:
            return (None, None, points[0])
        elif len(points) == 0:
            return None
        points = sorted(points, key=lambda x: x[i])
        middle = len(points) // 2
        i = (i + 1) % self.num_dims
        return (self.create_kdtree(points[:middle], i), self.create_kdtree(points[middle + 1:], i), points[middle])

    def _search_kdtree(self, node, point, k, i, heap, tiebreak):
        '''
            Recursive kd-tree descent that updates heap in place.
            heap is a max-heap on distance, stored as
            (-distance, tiebreak, row) entries; the unique tiebreak integer
            guarantees that two entries with equal distance are never
            ordered by comparing their numpy rows (which would raise).
        '''
        if node is None:
            return
        left_node, right_node, curr_xy = node
        dist = self.euclidean_distance(point, curr_xy[:-1])
        dx = point[i] - curr_xy[i]

        # Offer the current row to the k-best list.
        if len(heap) < k:
            heapq.heappush(heap, (-dist, next(tiebreak), curr_xy))
        elif dist < -heap[0][0]:
            heapq.heappushpop(heap, (-dist, next(tiebreak), curr_xy))

        i = (i + 1) % self.num_dims
        if dx < 0:
            near_node, far_node = left_node, right_node
        else:
            near_node, far_node = right_node, left_node

        # Descend first into the side of the splitting plane holding the query.
        self._search_kdtree(near_node, point, k, i, heap, tiebreak)

        # Visit the other side when it can still contribute: either fewer than
        # k neighbors have been found so far, or the splitting plane is closer
        # than the current k-th best distance.
        if len(heap) < k or abs(dx) < -heap[0][0]:
            self._search_kdtree(far_node, point, k, i, heap, tiebreak)

    def get_neighbors_kdtree_single(self, node, point, k, i=0, heap=None):
        '''
            Find the k nearest training rows to one query point.
            node: kd-tree (sub)tree as produced by create_kdtree
            point: (num_dims)
            i: index of the feature that node splits on
            heap: optional accumulator of (-distance, tiebreak, row) entries;
                  callers normally leave it as None
            return: None when node is None; an empty list when k <= 0;
                    otherwise a list of (distance, row) tuples, nearest
                    first, where row is a list of num_dims features followed
                    by the label.
        '''
        if node is None:
            return None
        if k <= 0:
            return []
        if heap is None:
            heap = list()
        self._search_kdtree(node, point, k, i, heap, itertools.count(len(heap)))
        # Format the result once per query instead of at every tree node.
        return [(-entry[0], list(entry[-1])) for entry in sorted(heap, key=lambda x: -1 * x[0])]

    def get_neighbors_kdtree(self, x_test, k):
        '''
            Run the kd-tree search for every row of x_test.
            return: one neighbor list per query (see get_neighbors_kdtree_single).
        '''
        res = list()
        for feature in tqdm.tqdm(x_test):
            res.append(self.get_neighbors_kdtree_single(self.kdtree, feature, k))
        return res

    def get_neighbors(self, x_test, k, mode='kdtree'):
        '''
            Dispatch to the requested search strategy.
            mode: 'naive' or 'kdtree'
            raises: ValueError for any other mode
        '''
        if mode == 'naive':
            return self.get_neighbors_naive(x_test, k)
        elif mode == 'kdtree':
            return self.get_neighbors_kdtree(x_test, k)
        raise ValueError("unknown mode {!r}; expected 'naive' or 'kdtree'".format(mode))

    def predict(self, x_test, k, mode='kdtree'):
        '''
            Predict a label for every row of x_test by majority vote among
            its k nearest neighbors. When several labels share the highest
            count, the one encountered first (i.e. belonging to the nearest
            such neighbor) wins.
            return: list of labels; they are floats because they are read
                    back from the float array self.xy.
        '''
        neighbors = self.get_neighbors(x_test, k, mode)
        y_predicted = list()
        for query_neighbors in neighbors:
            y_count = dict()
            for dist, xy in query_neighbors:
                y = xy[-1]
                y_count[y] = y_count.get(y, 0) + 1
            # max() keeps the first of equally frequent labels, matching a
            # stable descending sort on the counts.
            y_pred = max(y_count.items(), key=itemgetter(1))[0]
            y_predicted.append(y_pred)

        return y_predicted

Step 4: Get neighbors with naive KNN and kd-tree KNN, then predict labels.

*Test Neighbors*

Define a helper function that prints only the distances of the neighbors.

In [4]:
def print_dist(neighbors):
    '''
        Print one line per query: its position in neighbors followed by the
        list of that query's neighbor distances (the rows are left out).
        neighbors: iterable of [(distance, row), ...] lists
    '''
    position = 0
    for group in neighbors:
        distances = []
        for distance, _row in group:
            distances.append(distance)
        print(position, distances)
        position += 1

**1. naive knn**

In [5]:
# Build the model (the constructor also builds the kd-tree), then time only
# the brute-force neighbor search for k = 8.
knn = KNN(x_train, y_train)
t1 = time.time()
neighbors1 = knn.get_neighbors(x_test, 8, mode='naive')
t2 = time.time()
print('time cost:{}s'.format(t2 - t1))
# A step of 1000 over the 500 test queries selects only query 0.
print_dist(neighbors1[::1000])

100%|██████████| 500/500 [07:56<00:00,  1.09it/s]

time cost:476.6258080005646s
0 [0.9376415984996561, 0.9816037981401295, 1.0399242574178866, 1.0446794591555788, 1.0469156624331093, 1.053519790052552, 1.065016312611107, 1.0923577771706792]





**2. kdtree knn**

In [6]:
# Rebuild the model and time only the kd-tree neighbor search for k = 8.
knn = KNN(x_train, y_train)
t1 = time.time()
neighbors2 = knn.get_neighbors(x_test, 8, mode='kdtree')
t2 = time.time()
print('time cost:{}s'.format(t2 - t1)) # in the recorded run this is slower than the naive search; possibly due to the overhead of the recursive Python calls (not verified)
# A step of 1000 over the 500 test queries selects only query 0.
print_dist(neighbors2[::1000])

100%|██████████| 500/500 [25:36<00:00,  3.06s/it]

time cost:1536.5977761745453s
0 [0.9376415984996561, 0.9816037981401295, 1.0399242574178866, 1.0446794591555788, 1.0469156624331093, 1.053519790052552, 1.065016312611107, 1.0923577771706792]





*Test label prediction*

In [7]:
# Rebuild the model and time label prediction (kd-tree search, k = 8).
knn = KNN(x_train, y_train)
t1 = time.time()
y_predicted = knn.predict(x_test, 8, 'kdtree')
t2 = time.time()
print('time cost:{}s'.format(t2 - t1))
# Convert the predicted labels to int before comparing them with y_test.
y_predicted = list(map(int, y_predicted))
# Accuracy: fraction of test samples whose predicted label equals the true label.
print(sum(np.array(y_predicted) == np.array(y_test)) / len(y_predicted))

100%|██████████| 500/500 [25:14<00:00,  3.11s/it]

time cost:1514.2618918418884s
0.978



