In [None]:
from sklearn.cluster import KMeans
import numpy as np

In [None]:
import heapq
from collections import deque

class PriorityQueue:
    """
    Minimal min-priority queue built on top of ``heapq``.

    Entries are stored as ``(priority, insertion_index, item)`` tuples, so the
    smallest priority is served first and entries with equal priority come out
    in insertion order without the items themselves ever being compared.
    """

    def __init__(self):
        self.count = 0  # Insertion index; doubles as the tie-breaker
        self.items = []  # Underlying binary heap

    def __bool__(self):
        # Truthy exactly while at least one entry is still queued.
        return len(self.items) > 0

    def push(self, priority, item):
        """
        Queue an item.

        :param priority: Sort key; smaller values are popped first.
        :param item: The payload to store.
        """
        index = self.count
        heapq.heappush(self.items, (priority, index, item))
        self.count = index + 1

    def pop(self):
        """
        Remove the entry with the smallest priority.

        :return: A ``(priority, item)`` pair.
        """
        entry = heapq.heappop(self.items)
        return entry[0], entry[2]
class LimitedSet:
    """
    Bounded collection that keeps only the ``k`` closest candidates.

    Candidates live in a max-heap keyed on distance (stored negated, because
    ``heapq`` is a min-heap), so the farthest kept candidate sits at the heap
    root and can be evicted cheaply. A companion set provides membership tests
    and guarantees that each value is kept at most once.

    Values must be hashable; NumPy arrays are converted to tuples first.
    """

    def __init__(self, k: int):
        """
        :param k: Maximum number of candidates to keep.
        """
        # Heap of (-distance, insertion_index, value) entries. The insertion
        # index breaks distance ties so the values are never compared.
        self.pq = []
        self.items = set()  # Values currently kept, for uniqueness checks
        self.k = k
        self._counter = 0  # Next insertion index

    @staticmethod
    def _normalize(value):
        """Return a hashable form of ``value`` (NumPy arrays become tuples)."""
        if isinstance(value, np.ndarray):
            return tuple(value)
        return value

    def get_max_distance(self):
        """
        Get the largest distance among the kept candidates.

        :return: The largest kept distance, or infinity when nothing is kept.
        """
        # The heap stores negated distances, so its root is the farthest entry.
        return -self.pq[0][0] if self.pq else float('inf')

    def add(self, distance, value):
        """
        Add a candidate; it is kept only if it is among the k closest.

        A value that is already kept is ignored, whatever distance is given.

        :param distance: Distance associated with the candidate.
        :param value: The candidate itself.
        """
        value = self._normalize(value)
        if value in self.items:
            return  # Ignore duplicates

        heapq.heappush(self.pq, (-distance, self._counter, value))
        self._counter += 1
        self.items.add(value)

        if len(self.pq) > self.k:
            # Over capacity: drop the farthest candidate (the heap root).
            _, _, evicted = heapq.heappop(self.pq)
            self.items.discard(evicted)

    def get_sorted_neighbors(self):
        """
        Return the kept candidates ordered by increasing distance.

        :return: A list of ``(distance, value)`` pairs.
        """
        pairs = [(-negated, value) for negated, _, value in self.pq]
        pairs.sort(key=lambda pair: pair[0])
        return pairs

    def __iter__(self):
        """
        Iterate over ``(distance, value)`` pairs, closest first.
        """
        return iter(self.get_sorted_neighbors())

    def discard(self, item):
        """
        Remove ``item`` if it is currently kept; do nothing otherwise.

        :param item: The value to remove.
        """
        item = self._normalize(item)
        if item in self.items:
            self.items.remove(item)
            # Rebuild the heap without the removed value.
            self.pq = [entry for entry in self.pq if entry[2] != item]
            heapq.heapify(self.pq)

    def limit(self):
        """
        Return the current pruning bound.

        :return: Infinity while fewer than ``k`` candidates are kept,
                 otherwise the largest kept distance.
        """
        # Drop any heap entries whose value is no longer tracked.
        while self.pq and self.pq[0][2] not in self.items:
            heapq.heappop(self.pq)
        if not self.pq or len(self.items) < self.k:
            return float('inf')
        return -self.pq[0][0]


In [None]:
class Node:
    """
    Base class shared by all M-tree nodes.

    A node is described by a routing object and a covering radius; the
    distance helpers below combine the tree's distance function with that
    radius to produce bounds.
    """

    def __init__(self, tree, routing_object=None):
        """
        Initialize the base Node class.

        :param tree: The MTree instance this node belongs to; it must expose
                     a ``distance_function`` attribute.
        :param routing_object: The routing object of the node (may be None
                               until it is computed).
        """
        self.tree = tree  # Reference to the M-tree instance
        self.routing_object = routing_object  # The routing object, or None
        self.covering_radius = 0  # r(Oi), the radius covering the subtree
        self.parent = None  # Reference to the parent node
        self.parent_distance = None  # d(Oi, P(Oi)) - not defined for the root
        self.distance_function = self.tree.distance_function

    def distance(self, item):
        """
        Calculate the distance from this node's routing object to ``item``.

        When ``item`` is a Node, its covering radius is added to the distance
        between the two routing objects, so the result also accounts for the
        extent of that node. For any other value the plain distance is
        returned.

        :param item: The other node or value to calculate the distance to.
        :return: The calculated distance.
        """
        if isinstance(item, Node):
            return (
                self.distance_function(self.routing_object, item.routing_object)
                + item.covering_radius
            )
        return self.distance_function(self.routing_object, item)

    def min_distance(self, value):
        """
        Calculate the minimum distance from this node to a value.

        This is the distance to the routing object minus the covering radius,
        clamped at zero.

        :param value: The value to calculate the minimum distance to.
        :return: The calculated minimum distance.
        """
        return max(0, self.distance_function(self.routing_object, value) - self.covering_radius)

    def max_distance(self, value):
        """
        Calculate the maximum distance from this node to a value.

        This is the distance to the routing object plus the covering radius.

        :param value: The value to calculate the maximum distance to.
        :return: The calculated maximum distance.
        """
        return self.distance_function(self.routing_object, value) + self.covering_radius

    def is_root(self):
        """
        :return: True when this node has no parent.
        """
        # Identity test: `== None` would dispatch to the parent's __eq__.
        return self.parent is None
class DataNode(Node):
    """
    Leaf node of the M-tree. Stores database objects.
    """

    def __init__(self, tree, value):
        """
        Create a data node wrapping a single stored value.

        :param tree: The MTree instance this data node belongs to.
        :param value: The feature value stored in this data node; it is also
                      used as the node's routing object.
        """
        # The base initialiser already records `tree` and sets `parent` to None.
        super().__init__(tree=tree, routing_object=value)
        self.value = value  # The feature value (Oj)
        self.oid = id(value)  # Object identifier (oid(Oj))
        self.distance_from_parent = 0  # d(Oj, P(Oj))

    def __repr__(self):
        """
        Represent the data node as a string.
        """
        # NOTE(review): "enty" looks like a typo for "entry"; left unchanged
        # because this text is runtime output.
        return f"DataNode(enty={self.value})"

    def __iter__(self):
        # A data node holds exactly one object.
        yield self.routing_object

    def __contains__(self, item):
        return item == self.routing_object





class InternalNode(Node):
    """
    Node that groups child nodes (data nodes or other internal nodes).

    The routing object is the mean of the children's routing objects and the
    covering radius is the largest ``self.distance(child)``, i.e. the distance
    to a child's routing object plus that child's own covering radius.
    """

    def __init__(self, tree, children) -> None:
        """
        Initialize the InternalNode class.

        :param tree: The MTree instance this internal node belongs to; it
                     must expose ``node_capacity``.
        :param children: A list of child nodes, or another InternalNode whose
                         content should be taken over.
        """
        super().__init__(tree=tree, routing_object=None)
        self.children = []
        self.capacity = self.tree.node_capacity
        self.set_children(children)

    def __repr__(self):
        return f"<{repr(self.routing_object)}, r={repr(self.covering_radius)}, {repr(self.children)}>"

    def _recompute_bounds(self):
        """
        Recompute the routing object and covering radius from the children.

        Both values are derived from scratch so the radius is always measured
        from the current routing object. Does nothing when there are no
        children.
        """
        if not self.children:
            return
        self.routing_object = np.mean(
            [child.routing_object for child in self.children], axis=0
        )
        self.covering_radius = max(self.distance(child) for child in self.children)

    def set_children(self, children):
        """
        Set the children of this internal node and calculate the covering radius.

        An empty ``children`` leaves the node untouched. When ``children`` is
        itself an InternalNode, this node takes over its routing object,
        covering radius and children instead of wrapping it.

        :param children: A list of child nodes, or an InternalNode to absorb.
        """
        if not children:
            return

        if isinstance(children, InternalNode):
            self.routing_object = children.routing_object
            self.covering_radius = children.covering_radius
            # Copy the list so the two nodes never share one mutable list.
            self.children = list(children.children)
            for child in self.children:
                child.parent = self
            return

        self.children = list(children)
        for child in self.children:
            child.parent = self
        self._recompute_bounds()

    def add_child(self, child):
        """
        Add a child node to this internal node. If the node is already at
        capacity, it is split instead.

        :param child: The child node to be added.
        """
        if len(self.children) >= self.capacity:
            self.split(child)
        else:
            self._add_child(child)

    def _add_child(self, child):
        """
        Attach ``child`` without any capacity check and refresh the bounds.

        :param child: The child node to attach.
        """
        child.parent = self
        self.children.append(child)
        # The mean moves when a child is added, so the radius has to be
        # re-measured against every child, not only the new one.
        self._recompute_bounds()

    def insert(self, value):
        """
        Insert a value into this node, possibly triggering a recursive descent and split.

        :param value: The value to be inserted.
        """
        if self.is_leaf():
            self.add_child(DataNode(self.tree, value))
            return

        # Grow the radius so this node covers the value, then descend into
        # the child whose region is closest to it.
        self.covering_radius = max(self.covering_radius, self.distance(value))
        best_child = min(self.children, key=lambda c: c.min_distance(value))
        best_child.insert(value)

    def is_leaf(self):
        """
        :return: True when every child is a DataNode (also true when there
                 are no children).
        """
        return all(isinstance(child, DataNode) for child in self.children)

    def split(self, new_node):
        """
        Split this node because ``new_node`` does not fit.

        The current children plus ``new_node`` are divided into two groups.
        This node keeps the first group, a new sibling receives the second,
        and the sibling is handed to the parent (or a new root is created
        when this node has no parent). A group consisting of a single
        InternalNode is absorbed rather than wrapped in another level.

        :param new_node: The node that triggered the split.
        """
        first_group, second_group = self.promote_and_partition(new_node)

        if len(first_group) == 1 and isinstance(first_group[0], InternalNode):
            absorbed = first_group[0]
            self.children = list(absorbed.children)
            self.routing_object = absorbed.routing_object
            self.covering_radius = absorbed.covering_radius
        else:
            self.children = first_group
            self._recompute_bounds()
        for child in self.children:
            child.parent = self

        if len(second_group) == 1 and isinstance(second_group[0], InternalNode):
            sibling = InternalNode(tree=self.tree, children=second_group[0])
        else:
            sibling = InternalNode(tree=self.tree, children=second_group)
        sibling.parent = self.parent

        if self.parent is not None:
            self.parent.add_child(sibling)
        else:
            # Splitting the root: grow the tree by one level.
            new_root = InternalNode(tree=self.tree, children=[self, sibling])
            self.tree.root = new_root
            self.parent = new_root
            sibling.parent = new_root

    def promote_and_partition(self, node):
        """
        Choose how to divide the children plus ``node`` into two groups.

        Every pair of candidates is tried as group seeds; each remaining
        candidate joins the seed it is strictly closer to (ties go to the
        second seed). The pair with the smallest sum of group radii wins
        (mRAD policy).

        :param node: The extra node to place.
        :return: Two lists of nodes, each starting with its seed.
        :raises ValueError: If no pair of candidates could be selected.
        """
        candidates = self.children + [node]

        best_pair = None
        min_sum_radii = float('inf')

        for i, a in enumerate(candidates):
            for b in candidates[i + 1:]:
                a_list, b_list = [], []
                r_a = r_b = 0

                for item in candidates:
                    if item is a or item is b:
                        continue
                    # Each distance is evaluated once and reused for the radius.
                    d_a = a.distance(item)
                    d_b = b.distance(item)
                    if d_a < d_b:
                        a_list.append(item)
                        r_a = max(r_a, d_a)
                    else:
                        b_list.append(item)
                        r_b = max(r_b, d_b)

                if r_a + r_b < min_sum_radii:
                    min_sum_radii = r_a + r_b
                    best_pair = (a, b, a_list, b_list)

        if best_pair is None:
            raise ValueError(
                "cannot partition: need at least two candidates with finite distances"
            )

        a, b, a_list, b_list = best_pair
        return [a] + a_list, [b] + b_list

    def __iter__(self):
        for node in self.children:
            yield from node

    def __contains__(self, item):
        return any(item in node for node in self.children)

In [None]:
class MTree:
    """
    Metric tree over arbitrary values compared through a distance function.

    The tree can be filled incrementally with :meth:`insert` or bulk-loaded
    with :meth:`build_with_clustering`, and queried with :meth:`knn`.
    """

    def __init__(self, values=(), *, node_capacity=2, distance_function=None):
        """
        Initialize the MTree.

        :param values: An iterable of values to be inserted into the MTree.
        :param node_capacity: Capacity of the nodes in the MTree.
        :param distance_function: A function to calculate the distance between
                                  two values; defaults to ``abs(a - b)``.
        """
        self.distance_function = distance_function or (lambda a, b: abs(a - b))
        self.node_capacity = node_capacity
        self.length = 0
        self.root = None  # Root will be dynamically created as needed.

        for value in values:
            self.insert(value)

    def insert(self, value):
        """
        Insert a value into the MTree.

        :param value: The value to be inserted.
        """
        self.length += 1

        if self.root is None:
            self.root = InternalNode(tree=self, children=[DataNode(self, value)])
        else:
            self.root.insert(value)
            # If the old root ended up under a new parent, follow it up.
            if self.root.parent is not None:
                self.root = self.root.parent

    def __len__(self):
        """
        Return the number of elements in the MTree.
        """
        return self.length

    def __contains__(self, item):
        """
        Check if an item is in the MTree.

        :param item: The value to check.
        :return: True if the item is in the MTree, False otherwise.
        """
        if self.root is None:
            return False
        return item in self.root

    def __repr__(self):
        """
        Return a string representation of the MTree.
        """
        return repr(self.root) if self.root else "<empty tree>"

    def __iter__(self):
        """
        Iterate over the values in the MTree.

        :yield: Each value stored in the MTree.
        """
        if self.root is not None:
            yield from self.root

    def build_with_clustering(self, data):
        """
        Build the MTree using divisive clustering.

        Any existing content is replaced. Points are recursively split with
        2-means until every group fits into a node; the resulting leaf nodes
        are then grouped the same way, level by level, until one root remains.

        :param data: A list of data points to cluster and insert into the tree.
        :raises ValueError: If ``node_capacity`` is too small to build a tree
                            for the given data.
        """
        def divisive_clustering(points, max_capacity):
            """
            Split ``points`` until no group exceeds ``max_capacity``.

            :param points: List of data points or InternalNodes to cluster.
            :param max_capacity: Maximum number of members per cluster.
            :return: A list of clusters, each a list of members.
            """
            clusters = [points]
            while any(len(cluster) > max_capacity for cluster in clusters):
                new_clusters = []
                for cluster in clusters:
                    if len(cluster) <= max_capacity:
                        new_clusters.append(cluster)
                        continue
                    # Internal nodes are clustered by their routing objects.
                    routing_objects = [
                        member.routing_object if isinstance(member, InternalNode) else member
                        for member in cluster
                    ]
                    kmeans = KMeans(n_clusters=2, random_state=42)
                    labels = kmeans.fit_predict(np.array(routing_objects))
                    cluster1 = [m for m, label in zip(cluster, labels) if label == 0]
                    cluster2 = [m for m, label in zip(cluster, labels) if label == 1]
                    if not cluster1 or not cluster2:
                        # k-means put everything on one side (e.g. identical
                        # points); halve the cluster so the loop still ends.
                        half = len(cluster) // 2
                        cluster1, cluster2 = cluster[:half], cluster[half:]
                    new_clusters.extend([cluster1, cluster2])
                clusters = new_clusters
            return clusters

        def calculate_centroid(cluster):
            """
            Calculate the centroid of a cluster of raw points.

            :param cluster: List of points (each a sequence of coordinates).
            :return: The centroid as a tuple holding the mean of each dimension.
            """
            return tuple(np.mean(np.array(cluster), axis=0))

        data = list(data)
        if not data:
            # Nothing to index: leave an empty tree instead of failing.
            self.root = None
            self.length = 0
            return
        if self.node_capacity < 1:
            raise ValueError("node_capacity must be at least 1")

        # Leaf level: one internal node per cluster of raw points.
        nodes = []
        for cluster in divisive_clustering(data, self.node_capacity):
            centroid = calculate_centroid(cluster)
            covering_radius = max(self.distance_function(centroid, point) for point in cluster)
            leaf = InternalNode(tree=self, children=[DataNode(self, value) for value in cluster])
            leaf.routing_object = centroid
            leaf.covering_radius = covering_radius
            nodes.append(leaf)

        if len(nodes) > 1 and self.node_capacity < 2:
            # With capacity 1 every parent would hold one child, so the
            # grouping loop below could never reduce the node count.
            raise ValueError("node_capacity must be at least 2 to group several nodes")

        # Upper levels: group nodes until a single root remains.
        while len(nodes) > 1:
            new_nodes = []
            for cluster in divisive_clustering(nodes, self.node_capacity):
                if len(cluster) == 1:
                    # A lone node is passed up unchanged instead of being wrapped.
                    parent_node = cluster[0]
                else:
                    centroid = np.mean(
                        np.array([node.routing_object for node in cluster]), axis=0
                    )
                    covering_radius = max(
                        self.distance_function(centroid, node.routing_object) + node.covering_radius
                        for node in cluster
                    )
                    parent_node = InternalNode(tree=self, children=cluster)
                    parent_node.routing_object = centroid
                    parent_node.covering_radius = covering_radius
                new_nodes.append(parent_node)
            nodes = new_nodes

        self.root = nodes[0]
        self.length = len(data)

    def batch_knn_search(self, query_points, k):
        """
        Perform k-NN search for multiple query points and organize results by query point.

        :param query_points: List of query points to search for
        :param k: Number of nearest neighbors to find per query point
        :return: List of (query_point, neighbors) pairs
        """
        return [(query_point, self.knn(query_point, k)) for query_point in query_points]

    def knn(self, query_object, k):
        """
        Find the k nearest neighbors of the given query_object in the MTree.

        :param query_object: The object to query for nearest neighbors.
        :param k: The number of nearest neighbors to find.
        :return: A list of up to k stored values, closest first.
        """
        if k <= 0 or self.root is None:
            return []

        # Best candidates so far. Internal nodes are held here as placeholders
        # (keyed by their maximum distance) until they are expanded.
        nearest_neighbors = LimitedSet(k)
        priority_queue = PriorityQueue()
        priority_queue.push(self.root.min_distance(query_object), self.root)

        while priority_queue:
            _, node = priority_queue.pop()
            # The node is about to be expanded, so its placeholder goes away.
            nearest_neighbors.discard(node)

            if not isinstance(node, InternalNode):
                continue

            query_to_node = node.distance(query_object)
            for child in node.children:
                # First test: bound derived from the parent's routing object.
                parent_bound = (
                    abs(query_to_node - node.distance(child.routing_object))
                    - child.covering_radius
                )
                if parent_bound <= nearest_neighbors.limit():
                    child_min = child.min_distance(query_object)
                    if child_min <= nearest_neighbors.limit():
                        if isinstance(child, InternalNode):
                            priority_queue.push(child_min, child)
                        nearest_neighbors.add(child.max_distance(query_object), child)

        return [entry[1].value for entry in nearest_neighbors]

In [None]:
from math import sqrt
def euclidean_distance(a, b):
    """
    Compute the Euclidean distance between two points.

    Coordinates are paired positionally; if the points differ in length the
    extra coordinates of the longer one are ignored.

    :param a: Sequence of coordinates of the first point.
    :param b: Sequence of coordinates of the second point.
    :return: Euclidean distance between a and b.
    """
    squared_total = 0
    for coord_a, coord_b in zip(a, b):
        squared_total += (coord_a - coord_b) ** 2
    return sqrt(squared_total)

In [None]:
# Small 2-D sample used to try out the clustering-based build.
data_points = [
    (1, 5),
    (2, 10),
    (3, 7),
    (10, 20),
    (15, 25),
]

mtree1 = MTree(distance_function=euclidean_distance, node_capacity=2)
mtree1.build_with_clustering(data_points)


In [None]:
query_points = [(2, 8), (2, 1), (10, 10)]
k = 13
neighbors = []

# Run one k-NN search per query point and collect the result lists
for query_point in query_points:
    neighbor = mtree1.knn(query_point, k)
    neighbors.append(neighbor)

# Summary printed once, after the loop: `query_point` is therefore the last
# query, while `neighbors` holds the result lists of all queries.
print(f"Query Point: {query_point}")
print(f"{k} Nearest Neighbors: {neighbors}")
print(len(neighbors))

Query Point: (10, 10)
13 Nearest Neighbors: [[(3, 7), (2, 10), (1, 5), (10, 20), (15, 25)], [(1, 5), (3, 7), (2, 10), (10, 20), (15, 25)], [(3, 7), (2, 10), (10, 20), (1, 5), (15, 25)]]
3


In [None]:
from math import sqrt
def euclidean_distance(a, b):
    """
    Return the Euclidean distance between points a and b.

    Only coordinate pairs present in both points contribute to the result.

    :param a: Sequence of coordinates of the first point.
    :param b: Sequence of coordinates of the second point.
    :return: Euclidean distance between a and b.
    """
    squared_diffs = [(x - y) ** 2 for x, y in zip(a, b)]
    return sqrt(sum(squared_diffs))

In [None]:
# Colab-only: mount Google Drive at /content/drive so the files stored there
# can be read through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import h5py

# Location of the HDF5 file on the mounted Drive
file_path = '/content/drive/MyDrive/gist-960-euclidean.hdf5'

with h5py.File(file_path, 'r') as f:
    # Show which top-level keys the file exposes
    print("Keys in the HDF5 file:", list(f.keys()))

    # Read each dataset in full with `[:]` while the file is open
    train_data = f['train'][:]          # training vectors
    query_data = f['test'][:]           # query vectors
    ground_truth = f['neighbors'][:]    # ground-truth neighbors
    distance = f['distances'][:]        # ground-truth distances

# Display some details about the loaded arrays
print("Train data shape:", train_data.shape)
print("Query data shape:", query_data.shape)
print("Ground truth shape:", ground_truth.shape)
print("Distance shape:", distance.shape)


Keys in the HDF5 file: ['distances', 'neighbors', 'test', 'train']
Train data shape: (1000000, 960)
Query data shape: (1000, 960)
Ground truth shape: (1000, 100)
Distance shape: (1000, 100)


In [None]:
# Convert every training vector (one row of `train_data`) into a plain tuple.
# Iterating the rows directly avoids hard-coding the number of rows.
points = [tuple(row) for row in train_data]

In [None]:
!pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
import time
import sys
import os
import psutil
import numpy as np
from memory_profiler import profile
from scipy.spatial.distance import euclidean

def euclidean_distance(a, b):
    """
    Return the Euclidean distance between two points using SciPy.

    :param a: Coordinates of the first point (any array-like).
    :param b: Coordinates of the second point (any array-like).
    :return: Euclidean distance between a and b.
    """
    first = np.array(a)
    second = np.array(b)
    return euclidean(first, second)
def measure_mtree_performance(points, node_capacity=40):
    """
    Measure the time and memory usage for building an MTree

    Memory usage is the difference in the process's resident set size (RSS)
    taken before and after the build, so it also reflects anything else the
    process allocated or released in between.

    Args:
        points: List of points to insert into the tree
        node_capacity: Capacity of each node in the tree

    Returns:
        dict: Dictionary containing performance metrics, with keys
            'build_time_seconds', 'memory_usage_mb', 'tree_statistics'
            and 'mtree' (the tree that was built)
    """
    bytes_per_mb = 1024 * 1024
    process = psutil.Process()

    # Get initial memory usage
    initial_memory = process.memory_info().rss / bytes_per_mb

    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not disturbed by system clock adjustments.
    start_time = time.perf_counter()

    # Build the tree
    mtree = MTree(node_capacity=node_capacity, distance_function=euclidean_distance)
    mtree.build_with_clustering(points)

    # Calculate build time
    build_time = time.perf_counter() - start_time

    # Get final memory usage
    final_memory = process.memory_info().rss / bytes_per_mb
    memory_used = final_memory - initial_memory

    # Get tree statistics
    tree_stats = {
        'num_points': len(points),
        'node_capacity': node_capacity,
        'tree_depth': calculate_tree_depth(mtree.root),
        'num_nodes': count_nodes(mtree.root)
    }

    return {
        'build_time_seconds': build_time,
        'memory_usage_mb': memory_used,
        'tree_statistics': tree_stats,
        'mtree': mtree
    }

def calculate_tree_depth(node):
    """
    Calculate the depth of the tree rooted at ``node``.

    A data node counts as depth 1 and every internal node adds one level on
    top of its deepest child. An empty tree (``None``) has depth 0, and an
    internal node without children has depth 1.
    """
    if node is None:
        return 0
    if isinstance(node, DataNode):
        return 1
    # `default=0` keeps max() from failing on a node with no children.
    return 1 + max((calculate_tree_depth(child) for child in node.children), default=0)

def count_nodes(node):
    """
    Count the total number of nodes in the tree rooted at ``node``.

    Both internal nodes and data nodes are counted; an empty tree (``None``)
    contains 0 nodes.
    """
    if node is None:
        return 0
    if isinstance(node, DataNode):
        return 1
    return 1 + sum(count_nodes(child) for child in node.children)

def print_performance_report(metrics):
    """
    Print a formatted performance report.

    Args:
        metrics: Dictionary with 'build_time_seconds', 'memory_usage_mb' and
            a 'tree_statistics' dictionary holding 'num_points',
            'node_capacity', 'tree_depth' and 'num_nodes'.

    Ratios whose denominator is zero (a tree with no points or no nodes) are
    reported as "n/a" instead of raising ZeroDivisionError.
    """
    print("\n=== MTree Performance Report ===")
    print(f"\nBuild Time: {metrics['build_time_seconds']:.4f} seconds")
    print(f"Memory Usage: {metrics['memory_usage_mb']:.2f} MB")

    print("\nTree Statistics:")
    stats = metrics['tree_statistics']
    print(f"- Number of Points: {stats['num_points']}")
    print(f"- Node Capacity: {stats['node_capacity']}")
    print(f"- Tree Depth: {stats['tree_depth']}")
    print(f"- Total Nodes: {stats['num_nodes']}")

    # Calculate and print efficiency metrics
    print("\nEfficiency Metrics:")
    if stats['num_nodes']:
        points_per_node = stats['num_points'] / stats['num_nodes']
        print(f"- Points per Node: {points_per_node:.2f}")
    else:
        print("- Points per Node: n/a")
    if stats['num_points']:
        memory_per_point = metrics['memory_usage_mb'] / stats['num_points']
        print(f"- Memory per Point: {memory_per_point:.4f} MB")
    else:
        print("- Memory per Point: n/a")



    # Measure performance
metrics = measure_mtree_performance(points)

    # Print report
print_performance_report(metrics)


=== MTree Performance Report ===

Build Time: 1.1509 seconds
Memory Usage: 10.24 MB

Tree Statistics:
- Number of Points: 1000
- Node Capacity: 40
- Tree Depth: 3
- Total Nodes: 13

Efficiency Metrics:
- Points per Node: 76.92
- Memory per Point: 0.0102 MB


In [None]:
# Build a standalone tree over `points` (node capacity 40) using the
# clustering-based construction.
mtree = MTree(node_capacity=40,distance_function=euclidean_distance)
mtree.build_with_clustering(points)

