##### Task 1 - Write a program to compute and report the MBR for the POI dataset D

In [1]:
# Import library
from csv import reader
from collections import defaultdict
import numpy as np
import random
import time

In [2]:
# Create a class Rectangle for all rectangles in this assignment
# Include MBR, grids in EXCELL & quad-tree and query windows
class Rectangle:
    def __init__(self, cx, cy, hw, hh, bucket = None):
        '''
        cx: x-coordinate of centre
        cy: y-coordinate of centre
        hw: half width of rectangle
        hh: half height of rectangle
        bucket: store points
        '''
        self.cx, self.cy, self.hw, self.hh = cx, cy, hw, hh
        self.west = cx - hw
        self.east = cx + hw
        self.north = cy + hh
        self.south = cy - hh
        self.result = []
        self.cells_searched = 0
        self.points_searched = 0
        if bucket != None:
            self.bucket = bucket
        else:
            self.bucket = []
    
    def get(self):
        '''
        Return the 4 edges of Rectangle
        '''
        return self.west, self.east, self.south, self.north
    
    def contains(self, point):
        """
        Check if a point is inside this Rectangle
        """
        global mbr
        if (self.west <= point[0] < self.east and self.south <= point[1] < self.north):
            return True
        elif (point[0] == mbr.east == self.east and self.south <= point[1] < self.north):
            return True
        elif (self.west <= point[0] < self.east and point[1] == mbr.north == self.north):
            return True
        elif (point[0] == mbr.east == self.east and point[1] == mbr.north == self.north):
            return True
        else:
            return False

    def overlaps(self, obj):
        """
        Check if another Rectangle intersects with this Rectangle
        """
        if obj.west > self.east or obj.east < self.west or \
           obj.north < self.south or obj.south > self.north:
            result = False
        else:
            result = True
        return result

In [3]:
def data_transform_rectangle(data):
    '''
    Extract the MBR features from dataset
    Return:
    cx: x-coordinate of centre
    cy: y-coordinate of centre
    hw: half width of rectangle
    hh: half height of rectangle
    '''
    x_low = min(data[:, 0])
    x_high = max(data[:, 0])
    y_low = min(data[:, 1])
    y_high = max(data[:, 1])
    cx = (x_low + x_high) / 2
    cy = (y_low + y_high) / 2
    hw = (x_high - x_low) / 2
    hh = (y_high - y_low) / 2
    return cx, cy, hw, hh

In [20]:
# Get the MBR for the POI dataset D
data = np.genfromtxt('AllPOI Simplified.csv', dtype="float", delimiter=',', encoding="utf-8-sig")
cx, cy, hw, hh = data_transform_rectangle(data)
mbr = Rectangle(cx, cy, hw, hh)
a, b, c, d = mbr.get()
print('MBR minimum of x-coordinate:', a)
print('MBR maximum of x-coordinate:', b)
print('MBR minimum of y-coordinate:', c)
print('MBR maximum of y-coordinate:', d)

MBR minimum of x-coordinate: 115.43504300000001
MBR maximum of x-coordinate: 117.488743
MBR minimum of y-coordinate: 39.439945
MBR maximum of y-coordinate: 41.029975


##### Task 2 - Create an in-memory index for D - EXCELL

##### Task 2(1): Write a program to create an EXCELL index

In [5]:
class EXCELL:
    def __init__(self, MBR, points, start_N, bucket_size):
        '''
        MBR: definition of space
        points: all points in the space
        start_N: number of cells at the start of iteration
        bucket_size: maximum capacity of each bucket

        cells_summary: summary of counting number of points inside cells
        grids: list of all grids (class Rectangle) in the space
        N: the minimum number of N that fulfils the bucket size
        '''
        self.MBR = MBR
        self.points = points
        self.start_N = start_N
        self.bucket_size = bucket_size
        self.cells_summary = dict()
        self.grids = list()
        self.N = None

    def get_N(self):
        '''
        Get the minimum number of N that fulfils the bucket size
        by looping through the N from user-defined start_N, which is 1 in this assignment
        '''
        N = self.start_N  # Initialize number of cells on each axis
        stop_split = False  # Initialize splitting status
        while stop_split == False:
            self.cells = dict()
            x_interval = (self.MBR.east-self.MBR.west)/N
            y_interval = (self.MBR.north-self.MBR.south)/N
            for i in range(N):
                for j in range(N):
                    self.cells[(i, j)] = []
            for point in self.points:
                x = point[0]
                y = point[1]
                # Bound the points on the edge to the grid: avoid error
                x_coor = (x-self.MBR.west)//x_interval
                if x_coor < 0:
                    x_coor = 0
                y_coor = (y-self.MBR.south)//y_interval
                if y_coor < 0:
                    y_coor = 0
                if x == self.MBR.east:
                    self.cells[(x_coor-1, y_coor)].append(point)
                elif y == self.MBR.north:
                    self.cells[(x_coor, y_coor-1)].append(point)
                else:
                    self.cells[(x_coor, y_coor)].append(point)

            # Check the number of points in cells
            length = []
            for key, value in self.cells.items():
                length.append(len(value))
            check_length = np.array(length)
            # Check if the number of points in all cells is under 256
            stop_split = ((check_length <= self.bucket_size).sum()
                          == check_length.size)
            # Jump out of loop or split more
            if stop_split == True:
                self.N = N
                break
            else:
                N += 1
        return N

    def get_grids(self):
        '''
        Transform the dictionary key with correct data structure (Rectangle)
        Save to self.grids as a list of grids
        '''
        structured_cells = defaultdict(list)
        for key, value in self.cells.items():
            half_w = self.MBR.hw / self.N
            half_h = self.MBR.hh / self.N
            cx = self.MBR.west + half_w + 2*half_w * key[0]
            cy = self.MBR.south + half_h + 2*half_h * key[1]
            new_key = Rectangle(cx, cy, half_w, half_h, value)
            structured_cells[new_key] = value
        for rect in structured_cells.keys():
            self.grids.append(rect)
    
    def get_summary(self):
        '''
        Compute the summary for Task 2(1)
        '''
        cells_summary = defaultdict(int)
        for i in range(self.N):
            for j in range(self.N):
                number_of_points = len(self.cells[i, j])
                if number_of_points == 0:
                    cells_summary['0'] += 1
                elif 1 <= number_of_points <= 25:
                    cells_summary['1-25'] += 1
                elif 26 <= number_of_points <= 239:
                    cells_summary['26-239'] += 1
                elif 240 <= number_of_points <= 255:
                    cells_summary['240-255'] += 1
                elif number_of_points == 256:
                    cells_summary['256'] += 1
                else:
                    print('Number of points exceeds buscket size of a cell.')
        self.cells_summary = dict(cells_summary)
        return self.cells_summary

In [6]:
# Create EXCELL index and report the value of n
start_n = 1
max_bucket_size = 256
excell = EXCELL(mbr, data, start_n, max_bucket_size)
start_time = time.time()
print('n:', excell.get_N())
get_N_time = time.time() - start_time
print('Time used to get N:', get_N_time)

n: 272
Time used to get N: 297.3092770576477


In [23]:
# Report the number of cells containing different amount of points
excell.get_grids()
summary = excell.get_summary()
cells_sum = 0
for key, value in summary.items():
    print('Number of cells containing {} points:'.format(key), value)
    cells_sum += value
print('Total number of cells in quad-tree:', cells_sum)

Number of cells containing 0 points: 62197
Number of cells containing 1-25 points: 9979
Number of cells containing 26-239 points: 1801
Number of cells containing 240-255 points: 7
Total number of cells in quad-tree: 73984


##### Task 2(2): Write a program to perform window queries

In [8]:
def excell_window_query(window, space_grids):
    '''
    Return
    points_inside: Number of points inside query window
    cells_searched: Number of index cells searched
    points_searched: Number of points searched
    '''
    points_inside, cells_searched, points_searched = 0, 0, 0
    for grid in space_grids:
        if window.overlaps(grid):
            cells_searched += 1
            for point in grid.bucket:
                points_searched += 1
                if window.contains(point):
                    points_inside += 1
    return points_inside, cells_searched, points_searched

##### Task 2(3): Generate 10 random window queries, search and report

In [9]:
def gen_rand_window(mbr, window_number):
    '''
    mbr: definition of space
    window_number: generate n windows defined by user
    '''
    window = []
    while len(window) < window_number:
        tmp = None
        # Bound the windows within MBR
        cx = random.uniform(mbr.west, mbr.east)
        cy = random.uniform(mbr.south, mbr.north)
        hw = min(cx - mbr.west, mbr.east - cx) * random.uniform(0, 1)
        hh = min(cy - mbr.south, mbr.north - cy) * random.uniform(0, 1)
        tmp = Rectangle(cx, cy, hw, hh)
        window.append(tmp)
    return window

In [10]:
# Generate random windows
window_number = 10
random_window = gen_rand_window(mbr, window_number)
random_window

[<__main__.Rectangle at 0xb259b74a30>,
 <__main__.Rectangle at 0xb259b74d30>,
 <__main__.Rectangle at 0xb259b74b20>,
 <__main__.Rectangle at 0xb259b74d60>,
 <__main__.Rectangle at 0xb259b74ac0>,
 <__main__.Rectangle at 0xb259b74f70>,
 <__main__.Rectangle at 0xb259b74e50>,
 <__main__.Rectangle at 0xb259b74eb0>,
 <__main__.Rectangle at 0xb259b749a0>,
 <__main__.Rectangle at 0xb259b74a90>]

In [11]:
# Perform window query
count = 1
for rw in random_window:
    start_time = time.time()
    a, b, c = excell_window_query(rw, excell.grids)
    query_time = time.time() - start_time
    print('Window', count)
    print('Time used to query:', query_time)
    print('Number of points inside:', a)
    print('Number of index cells searched:', b)
    print('Number of points searched:', c)
    print('')
    count += 1

Window 1
Time used to query: 0.06204080581665039
Number of points inside: 723
Number of index cells searched: 183
Number of points searched: 938

Window 2
Time used to query: 0.38059568405151367
Number of points inside: 157354
Number of index cells searched: 18542
Number of points searched: 157887

Window 3
Time used to query: 0.046880245208740234
Number of points inside: 0
Number of index cells searched: 18
Number of points searched: 0

Window 4
Time used to query: 0.29958152770996094
Number of points inside: 102788
Number of index cells searched: 12180
Number of points searched: 105510

Window 5
Time used to query: 0.06250548362731934
Number of points inside: 4745
Number of index cells searched: 1740
Number of points searched: 5066

Window 6
Time used to query: 0.0625007152557373
Number of points inside: 192
Number of index cells searched: 495
Number of points searched: 227

Window 7
Time used to query: 0.04687690734863281
Number of points inside: 208
Number of index cells searched: 

##### Task 3 - Create an in-memory index for quad-tree decompaction

##### Task 3(1): Write a program to create an index following quad-tree decompaction

##### Task 3(2): Write a program to perform window queries

In [12]:
# Create a class Node for all nodes in quad-tree
class Node:
    def __init__(self, space, depth, bucket_size):
        '''
        self.space: Rectangle of the node
        self.depth: depth in the tree
        self.bucket_size: maximum capacity of the bucket (i.e. 256 in this assignment)
        self.nw, self.ne, self.sw, self.se: children of node
        self.have_children: boolean to judge this node having children or not
        '''
        self.space = space
        self.depth = depth
        self.bucket_size = bucket_size
        self.nw = None
        self.ne = None
        self.sw = None
        self.se = None
        self.have_children = False
    
    def split(self):
        '''
        Split the node, re-allocate the points to children and compute m
        '''
        global minlevel
        cx, cy = self.space.cx, self.space.cy
        hw, hh = self.space.hw / 2, self.space.hh / 2
        self.nw = Node(Rectangle(cx - hw, cy + hh, hw, hh), self.depth + 1, self.bucket_size)
        self.ne = Node(Rectangle(cx + hw, cy + hh, hw, hh), self.depth + 1, self.bucket_size)
        self.sw = Node(Rectangle(cx - hw, cy - hh, hw, hh), self.depth + 1, self.bucket_size)
        self.se = Node(Rectangle(cx + hw, cy - hh, hw, hh), self.depth + 1, self.bucket_size)
        for child in [self.nw, self.ne, self.sw, self.se]:
            for point in self.space.bucket:
                if child.space.contains(point):
                    child.space.bucket.append(point)
        minlevel = max(minlevel, self.depth + 1)
        self.space.bucket = []
        self.have_children = True
    
    def insert(self, point):
        '''
        Insert a point to node
        '''
        if self.space.contains(point):
            if self.have_children == True:
                self.nw.insert(point)
                self.ne.insert(point)
                self.sw.insert(point)
                self.se.insert(point)
            elif len(self.space.bucket) < self.bucket_size:
                self.space.bucket.append(point)
            else:
                self.split()
                self.nw.insert(point)
                self.ne.insert(point)
                self.sw.insert(point)
                self.se.insert(point)
    
    def count_bucket(self, count_list):
        '''
        Count the number of points in the grid
        '''
        if self.have_children == True:
            self.nw.count_bucket(count_list)
            self.ne.count_bucket(count_list)
            self.sw.count_bucket(count_list)
            self.se.count_bucket(count_list)
        else:
            count_list.append(len(self.space.bucket))
        return count_list
    
    # Part of Task 3(2)
    def query(self, window):
        '''
        Carry out window query for each node
        Store the result into window node
        '''
        if window.overlaps(self.space):
            if self.have_children == True:
                self.nw.query(window)
                self.ne.query(window)
                self.sw.query(window)
                self.se.query(window)
            else:
                window.cells_searched += 1
                for point in self.space.bucket:
                    window.points_searched += 1
                    if window.contains(point):
                        window.result.append(point)

In [13]:
# Create a class QuadTree to build a quad-tree
class QuadTree:
    def __init__(self, space, depth, bucket_size):
        '''
        self.space: Rectangle of the whole sapce
        self.depth: starting depth of the tree
        self.bucket_size: maximum capacity of the bucket (i.e. 256 in this assignment)
        self.root: transform the root into class Node
        '''
        self.space = space
        self.depth = depth
        self.bucket_size = bucket_size
        self.root = Node(self.space, self.depth, self.bucket_size)
    
    def add_points(self, points):
        '''
        Add points into the tree
        '''
        for point in points:
            self.root.insert(point)

    def get_summary(self):
        '''
        Compute the summary for Task 3(1)
        '''
        cells_summary = defaultdict(int)
        cur = self.root
        sum_list = cur.count_bucket(list())
        for i in sum_list:
            if i == 0:
                cells_summary['0'] += 1
            elif 1 <= i <= 25:
                cells_summary['1-25'] += 1
            elif 26 <= i <= 239:
                cells_summary['26-239'] += 1
            elif 240 <= i <= 255:
                cells_summary['240-255'] += 1
            elif i == 256:
                cells_summary['256'] += 1
            else:
                print('Number of points exceeds buscket size of a cell.')
        return dict(cells_summary)
    
    # Part of Task 3(2)
    def window_query(self, window):
        '''
        Return
        points_inside: Number of points inside query window
        cells_searched: Number of index cells searched
        points_searched: Number of points searched
        '''
        cur = self.root
        cur.query(window)
        return window.result, window.cells_searched, window.points_searched

In [21]:
# Create quad-tree index and report the value of m
start_depth = 0
max_bucket_size = 256
minlevel = 1
start_time = time.time()
quadtree = QuadTree(mbr, start_depth, max_bucket_size)
quadtree.add_points(data)
qtree_time = time.time() - start_time
print('m:', minlevel)
print('Time used to build quad-tree:', qtree_time)

m: 9
Time used to build quad-tree: 12.975633382797241


In [22]:
# Report the number of cells containing different amount of points
summary = quadtree.get_summary()
reorder_list = ['0', '1-25', '26-239', '240-255', '256']
reordered_summary = {k: summary[k] for k in reorder_list}
cells_sum = 0
for key, value in reordered_summary.items():
    print('Number of cells containing {} points:'.format(key), value)
    cells_sum += value
print('Total number of cells in quad-tree:', cells_sum)

Number of cells containing 0 points: 34
Number of cells containing 1-25 points: 99
Number of cells containing 26-239 points: 1611
Number of cells containing 240-255 points: 28
Number of cells containing 256 points: 2
Total number of cells in quad-tree: 1774


##### Task 3(3): Use the same 10 random window queries as Task 2(2), search and report

In [16]:
# Perform window query
count = 1
for rw in random_window:
    start_time = time.time()
    a, b, c = quadtree.window_query(rw)
    query_time = time.time() - start_time
    print('Window', count)
    print('Time used to query:', query_time)
    print('Number of points inside:', len(a))
    print('Number of index cells searched:', b)
    print('Number of points searched:', c)
    print('')
    count += 1

Window 1
Time used to query: 0.006003618240356445
Number of points inside: 723
Number of index cells searched: 16
Number of points searched: 1310

Window 2
Time used to query: 0.35767388343811035
Number of points inside: 157354
Number of index cells searched: 1539
Number of points searched: 162027

Window 3
Time used to query: 0.0
Number of points inside: 0
Number of index cells searched: 1
Number of points searched: 0

Window 4
Time used to query: 0.250014066696167
Number of points inside: 102788
Number of index cells searched: 1007
Number of points searched: 108220

Window 5
Time used to query: 0.024188756942749023
Number of points inside: 4745
Number of index cells searched: 51
Number of points searched: 5080

Window 6
Time used to query: 0.004002094268798828
Number of points inside: 192
Number of index cells searched: 11
Number of points searched: 1182

Window 7
Time used to query: 0.0020012855529785156
Number of points inside: 208
Number of index cells searched: 6
Number of points