From 57ed531dcd22f12d9f48fe2a1766ad17d57a5cbd Mon Sep 17 00:00:00 2001 From: Luca Simi Date: Sun, 30 Jun 2024 21:27:37 +0200 Subject: [PATCH 1/5] Started to cythonize metrics --- pyproject.toml | 2 +- setup.py | 11 +++++++ src/tdamapper/proximity.py | 21 ++++++------ src/tdamapper/utils/cython/metrics.pyx | 45 ++++++++++++++++++++++++++ src/tdamapper/utils/metrics.py | 5 +++ src/tdamapper/utils/vptree.py | 3 +- src/tdamapper/utils/vptree_flat.py | 3 +- tests/test_core.py | 4 +-- tests/test_proximity.py | 8 +++-- tests/test_vptree.py | 25 ++++++++------ tests/test_vptree_bench.py | 10 +++--- 11 files changed, 107 insertions(+), 30 deletions(-) create mode 100644 setup.py create mode 100644 src/tdamapper/utils/cython/metrics.pyx create mode 100644 src/tdamapper/utils/metrics.py diff --git a/pyproject.toml b/pyproject.toml index 8fffc24..b2bc4e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=42", "wheel"] +requires = ["setuptools>=42", "wheel", "Cython"] build-backend = "setuptools.build_meta" [project] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..e127b30 --- /dev/null +++ b/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup +from setuptools.extension import Extension +from Cython.Build import cythonize + + + +setup( + ext_modules=cythonize([ + Extension("tdamapper.utils.cython.metrics", ["src/tdamapper/utils/cython/metrics.pyx"]) + ]) +) \ No newline at end of file diff --git a/src/tdamapper/proximity.py b/src/tdamapper/proximity.py index 6265808..614736a 100644 --- a/src/tdamapper/proximity.py +++ b/src/tdamapper/proximity.py @@ -13,6 +13,7 @@ import numpy as np +from tdamapper.utils.metrics import get_metric from tdamapper.utils.vptree_flat import VPTree as FVPT from tdamapper.utils.vptree import VPTree as VPT @@ -61,11 +62,6 @@ def _rho(x): return np.floor(x) + 0.5 -def _l_infty(x, y): - # in alternative: np.linalg.norm(x - y, ord=np.inf) - return np.max(np.abs(x - y)) - - 
class Proximity: """ Abstract interface for proximity functions. @@ -128,7 +124,8 @@ class BallProximity(Proximity): """ def __init__(self, radius, metric, flat=True): - self.__metric = lambda x, y: metric(x[1], y[1]) + _metric = get_metric(metric) + self.__metric = lambda x, y: _metric(x[1], y[1]) self.__radius = radius self.__data = None self.__vptree = None @@ -198,7 +195,8 @@ class KNNProximity(Proximity): def __init__(self, neighbors, metric, flat=True): self.__neighbors = neighbors - self.__metric = _pullback(lambda x: x[1], metric) + _metric = get_metric(metric) + self.__metric = _pullback(lambda x: x[1], _metric) self.__data = None self.__vptree = None self.__flat = flat @@ -271,8 +269,10 @@ def __init__(self, n_intervals, overlap_frac, flat=True): self.__minimum = None self.__maximum = None self.__delta = None - metric = _pullback(self._gamma_n, _l_infty) - self.__ball_proximity = BallProximity(self.__radius, metric, flat=flat) + _l_infty = get_metric('chebyshev') + #_l_infty = lambda x, y: np.max(np.abs(x - y)) + _metric = _pullback(self._gamma_n, _l_infty) + self.__ball_proximity = BallProximity(self.__radius, _metric, flat=flat) def _gamma_n(self, x): return self.__n_intervals * (x - self.__minimum) / self.__delta @@ -309,6 +309,9 @@ def fit(self, X): :return: The object itself. 
:rtype: self """ + #X = np.asarray(X) + X = np.array(X).reshape(len(X), -1) + print(X.shape) self._set_bounds(X) self.__ball_proximity.fit(X) return self diff --git a/src/tdamapper/utils/cython/metrics.pyx b/src/tdamapper/utils/cython/metrics.pyx new file mode 100644 index 0000000..c3a24f4 --- /dev/null +++ b/src/tdamapper/utils/cython/metrics.pyx @@ -0,0 +1,45 @@ +from libc.math cimport sqrt, pow, fabs + + +_EUCLIDEAN = 'euclidean' +_MINKOWSKI = 'minkowski' +_MINKOWSKI_P = 'p' +_CHEBYSHEV = 'chebyshev' + + +cpdef double chebyshev(const double[:] x, const double[:] y) nogil: + cdef double max_diff = 0.0 + cdef Py_ssize_t i, n = x.shape[0] + for i in range(n): + max_diff = max(max_diff, fabs(x[i] - y[i])) + return max_diff + + +cpdef double euclidean(double[:] x, double[:] y) nogil: + cdef double norm_squared = 0.0 + cdef Py_ssize_t i, n = x.shape[0] + for i in range(n): + norm_squared += pow(fabs(x[i] - y[i]), 2) + return sqrt(norm_squared) + + +cpdef double minkowski(int p, double[:] x, double[:] y) nogil: + cdef double norm_p = 0.0 + cdef Py_ssize_t i, n = x.shape[0] + for i in range(n): + norm_p += pow(fabs(x[i] - y[i]), p) + return pow(norm_p, 1.0 / p) + + +def get_metric(metric, **kwargs): + if callable(metric): + return metric + elif metric == _EUCLIDEAN: + return euclidean + elif metric == _MINKOWSKI: + p = kwargs.get(_MINKOWSKI_P, 2) + return lambda x, y: minkowski(p, x, y) + elif metric == _CHEBYSHEV: + return chebyshev + else: + raise ValueError('metric must be a string or callable') \ No newline at end of file diff --git a/src/tdamapper/utils/metrics.py b/src/tdamapper/utils/metrics.py new file mode 100644 index 0000000..e2bea8e --- /dev/null +++ b/src/tdamapper/utils/metrics.py @@ -0,0 +1,5 @@ +import tdamapper.utils.cython.metrics as cython_metrics + + +def get_metric(metric, **kwargs): + return cython_metrics.get_metric(metric, **kwargs) diff --git a/src/tdamapper/utils/vptree.py b/src/tdamapper/utils/vptree.py index c63f06c..fb84b37 100755 --- 
a/src/tdamapper/utils/vptree.py +++ b/src/tdamapper/utils/vptree.py @@ -1,6 +1,7 @@ """A class for fast knn and range searches, depending only on a given metric""" from random import randrange +from tdamapper.utils.cython.metrics import get_metric from tdamapper.utils.quickselect import quickselect_tuple from tdamapper.utils.heap import MaxHeap @@ -8,7 +9,7 @@ class VPTree: def __init__(self, distance, dataset, leaf_capacity=1, leaf_radius=0.0, pivoting=None): - self.__distance = distance + self.__distance = get_metric(distance) self.__dataset = [(0.0, x) for x in dataset] self.__leaf_capacity = leaf_capacity self.__leaf_radius = leaf_radius diff --git a/src/tdamapper/utils/vptree_flat.py b/src/tdamapper/utils/vptree_flat.py index a0b470b..d85faa2 100755 --- a/src/tdamapper/utils/vptree_flat.py +++ b/src/tdamapper/utils/vptree_flat.py @@ -1,6 +1,7 @@ """A class for fast knn and range searches, depending only on a given metric""" from random import randrange +from tdamapper.utils.cython.metrics import get_metric from tdamapper.utils.quickselect import quickselect_tuple from tdamapper.utils.heap import MaxHeap @@ -8,7 +9,7 @@ class VPTree: def __init__(self, distance, dataset, leaf_capacity=1, leaf_radius=0.0, pivoting=None): - self.__distance = distance + self.__distance = get_metric(distance) self.__dataset = [(0.0, x) for x in dataset] self.__leaf_capacity = leaf_capacity self.__leaf_radius = leaf_radius diff --git a/tests/test_core.py b/tests/test_core.py index 8e7fb9a..27916fa 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -4,13 +4,13 @@ import networkx as nx from sklearn.cluster import DBSCAN +from tdamapper.utils.cython.metrics import get_metric from tdamapper.core import MapperAlgorithm, mapper_connected_components, mapper_labels from tdamapper.cover import TrivialCover, BallCover from tdamapper.clustering import TrivialClustering -def dist(x, y): - return np.linalg.norm(x - y) +dist = 'euclidean' def dataset(dim=10, num=1000): diff --git 
a/tests/test_proximity.py b/tests/test_proximity.py index 7e84a48..f241640 100644 --- a/tests/test_proximity.py +++ b/tests/test_proximity.py @@ -44,7 +44,7 @@ def testCubicalProximity(self): p = 0.1 w = (M - m) / (n * (1.0 - p)) delta = p * w - data = list(range(m, M + 1)) + data = np.array([[x] for x in range(m, M + 1)]) prox = CubicalProximity(n_intervals=n, overlap_frac=p) prox.fit(data) for x in data: @@ -53,4 +53,8 @@ def testCubicalProximity(self): a_i = m + i * (w - delta) - delta / 2.0 b_i = m + (i + 1) * (w - delta) + delta / 2.0 expected = [y for y in data if y > a_i and y < b_i] - self.assertEqual(set(expected), set(result)) + for c in result: + self.assertTrue(c in expected) + for c in expected: + self.assertTrue(c in result) + #self.assertEqual(set(expected), set(result)) diff --git a/tests/test_vptree.py b/tests/test_vptree.py index 778b2b9..2277038 100644 --- a/tests/test_vptree.py +++ b/tests/test_vptree.py @@ -3,12 +3,12 @@ import numpy as np +from tdamapper.utils.cython.metrics import get_metric from tdamapper.utils.vptree import VPTree from tdamapper.utils.vptree_flat import VPTree as FlatVPTree -def distance(x, y): - return np.linalg.norm(x - y) +distance = 'euclidean' def dataset(dim=10, num=1000): @@ -25,19 +25,21 @@ def _testBallSearch(self, data, dist, vpt): for _ in range(len(data) // 10): point = random.choice(data) ball = vpt.ball_search(point, self.eps) - near = [y for y in data if dist(point, y) < self.eps] + d = get_metric(dist) + near = [y for y in data if d(point, y) < self.eps] for x in ball: - self.assertTrue(any(dist(x, y) == 0.0 for y in near)) + self.assertTrue(any(d(x, y) == 0.0 for y in near)) for x in near: - self.assertTrue(any(dist(x, y) == 0.0 for y in ball)) + self.assertTrue(any(d(x, y) == 0.0 for y in ball)) def _testKNNSearch(self, data, dist, vpt): for _ in range(len(data) // 10): point = random.choice(data) neigh = vpt.knn_search(point, self.neighbors) self.assertEqual(self.neighbors, len(neigh)) - dist_neigh = 
[dist(point, y) for y in neigh] - dist_data = [dist(point, y) for y in data] + d = get_metric(dist) + dist_neigh = [d(point, y) for y in neigh] + dist_data = [d(point, y) for y in data] dist_data.sort() dist_neigh.sort() self.assertEqual(0.0, dist_data[0]) @@ -46,9 +48,10 @@ def _testKNNSearch(self, data, dist, vpt): self.assertEqual(set(dist_neigh), set(dist_data[:self.neighbors])) def _testNNSearch(self, data, dist, vpt): + d = get_metric(dist) for val in data: neigh = vpt.knn_search(val, 1) - self.assertEqual(0.0, dist(val, neigh[0])) + self.assertEqual(0.0, d(val, neigh[0])) def _testVPTree(self, builder, data, dist): vpt = builder(dist, data, leaf_radius=self.eps, leaf_capacity=self.neighbors) @@ -67,8 +70,9 @@ def _testVPTree(self, builder, data, dist): def testVPTreeRefs(self): data = dataset() data_refs = list(range(len(data))) + d = get_metric(distance) def dist_refs(i, j): - return distance(data[i], data[j]) + return d(data[i], data[j]) self._testVPTree(VPTree, data_refs, dist_refs) def testVPTreeData(self): @@ -78,8 +82,9 @@ def testVPTreeData(self): def testFlatVPTreeRefs(self): data = dataset() data_refs = list(range(len(data))) + d = get_metric(distance) def dist_refs(i, j): - return distance(data[i], data[j]) + return d(data[i], data[j]) self._testVPTree(FlatVPTree, data_refs, dist_refs) def testFlatVPTreeData(self): diff --git a/tests/test_vptree_bench.py b/tests/test_vptree_bench.py index b6b12df..aea1f10 100644 --- a/tests/test_vptree_bench.py +++ b/tests/test_vptree_bench.py @@ -5,12 +5,12 @@ import numpy as np from sklearn.datasets import load_iris, load_breast_cancer, load_digits +from tdamapper.utils.cython.metrics import get_metric from tdamapper.utils.vptree import VPTree as VPT from tdamapper.utils.vptree_flat import VPTree as FVPT -def dist(x, y): - return np.linalg.norm(x - y) +dist = 'euclidean' def dataset(dim=10, num=1000): @@ -64,9 +64,10 @@ def _testBuild(self, data, name, builder): return vpt def _testBallSearchNaive(self, data, 
name): + d = get_metric(dist) t0 = time() for val in data: - neigh = [x for x in data if dist(val, x) <= self.eps] + neigh = [x for x in data if d(val, x) <= self.eps] t1 = time() self.logger.info(f'{name}: {t1 - t0}') @@ -78,9 +79,10 @@ def _testBallSearch(self, data, name, vpt): self.logger.info(f'{name}: {t1 - t0}') def _testKNNSearchNaive(self, data, name): + d = get_metric(dist) t0 = time() for val in data: - data.sort(key=lambda x: dist(x, val)) + data.sort(key=lambda x: d(x, val)) neigh = [x for x in data[:self.k]] t1 = time() self.logger.info(f'{name}: {t1 - t0}') From ce4cfbdac318ce0cc8ce8935b6932ec439d63d41 Mon Sep 17 00:00:00 2001 From: Luca Simi Date: Mon, 1 Jul 2024 07:49:07 +0200 Subject: [PATCH 2/5] Added cython artifacts to gitignore --- .gitignore | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.gitignore b/.gitignore index fd29d1a..abbd670 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ **/__pycache__ **/*.pyc +**/*.pyo **/*.csv **/*.egg-info **/.ipynb_checkpoints @@ -7,7 +8,15 @@ **/docs/build **/*.ipynb +**/*.c +**/*.cpp +**/*.so +**/*.pyd +**/*.dll +**/*.dylib + .coverage .vscode .idea dist +build \ No newline at end of file From 8734f83db35d83836fdbd51337a5a02e84f5c3d9 Mon Sep 17 00:00:00 2001 From: Luca Simi Date: Mon, 1 Jul 2024 08:31:15 +0200 Subject: [PATCH 3/5] Fixed tests --- src/tdamapper/proximity.py | 8 +++----- src/tdamapper/utils/cython/metrics.pyx | 2 +- tests/test_proximity.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/tdamapper/proximity.py b/src/tdamapper/proximity.py index 614736a..6255c51 100644 --- a/src/tdamapper/proximity.py +++ b/src/tdamapper/proximity.py @@ -309,11 +309,9 @@ def fit(self, X): :return: The object itself. 
:rtype: self """ - #X = np.asarray(X) - X = np.array(X).reshape(len(X), -1) - print(X.shape) - self._set_bounds(X) - self.__ball_proximity.fit(X) + XX = np.asarray(X).reshape(len(X), -1).astype(float) + self._set_bounds(XX) + self.__ball_proximity.fit(XX) return self def search(self, x): diff --git a/src/tdamapper/utils/cython/metrics.pyx b/src/tdamapper/utils/cython/metrics.pyx index c3a24f4..6613e81 100644 --- a/src/tdamapper/utils/cython/metrics.pyx +++ b/src/tdamapper/utils/cython/metrics.pyx @@ -7,7 +7,7 @@ _MINKOWSKI_P = 'p' _CHEBYSHEV = 'chebyshev' -cpdef double chebyshev(const double[:] x, const double[:] y) nogil: +cpdef double chebyshev(double[:] x, double[:] y) nogil: cdef double max_diff = 0.0 cdef Py_ssize_t i, n = x.shape[0] for i in range(n): diff --git a/tests/test_proximity.py b/tests/test_proximity.py index f241640..5071a37 100644 --- a/tests/test_proximity.py +++ b/tests/test_proximity.py @@ -44,7 +44,7 @@ def testCubicalProximity(self): p = 0.1 w = (M - m) / (n * (1.0 - p)) delta = p * w - data = np.array([[x] for x in range(m, M + 1)]) + data = list(range(m, M + 1)) prox = CubicalProximity(n_intervals=n, overlap_frac=p) prox.fit(data) for x in data: From 250078525654726ee3e8383bde9e04fbfbdc8cec Mon Sep 17 00:00:00 2001 From: Luca Simi Date: Mon, 1 Jul 2024 22:19:22 +0200 Subject: [PATCH 4/5] Added cythonized metrics. Added missing tests. 
Removed unused code --- setup.py | 8 ++--- src/tdamapper/proximity.py | 11 +++--- src/tdamapper/utils/cython/metrics.pyx | 25 ++++++++++++-- src/tdamapper/utils/metrics.py | 5 --- src/tdamapper/utils/quickselect.py | 48 +++++++++++++++----------- src/tdamapper/utils/vptree.py | 4 +-- src/tdamapper/utils/vptree_flat.py | 4 +-- tests/test_quickselect.py | 47 +++++++++++++++++++------ 8 files changed, 100 insertions(+), 52 deletions(-) delete mode 100644 src/tdamapper/utils/metrics.py diff --git a/setup.py b/setup.py index e127b30..f5b9e87 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,9 @@ from Cython.Build import cythonize - setup( ext_modules=cythonize([ - Extension("tdamapper.utils.cython.metrics", ["src/tdamapper/utils/cython/metrics.pyx"]) - ]) -) \ No newline at end of file + Extension( + "tdamapper.utils.cython.metrics", + ["src/tdamapper/utils/cython/metrics.pyx"]) + ])) diff --git a/src/tdamapper/proximity.py b/src/tdamapper/proximity.py index 6255c51..199dc37 100644 --- a/src/tdamapper/proximity.py +++ b/src/tdamapper/proximity.py @@ -13,7 +13,7 @@ import numpy as np -from tdamapper.utils.metrics import get_metric +from tdamapper.utils.cython.metrics import get_metric, chebyshev from tdamapper.utils.vptree_flat import VPTree as FVPT from tdamapper.utils.vptree import VPTree as VPT @@ -269,9 +269,7 @@ def __init__(self, n_intervals, overlap_frac, flat=True): self.__minimum = None self.__maximum = None self.__delta = None - _l_infty = get_metric('chebyshev') - #_l_infty = lambda x, y: np.max(np.abs(x - y)) - _metric = _pullback(self._gamma_n, _l_infty) + _metric = _pullback(self._gamma_n, chebyshev) self.__ball_proximity = BallProximity(self.__radius, _metric, flat=flat) def _gamma_n(self, x): @@ -296,6 +294,9 @@ def _set_bounds(self, data): delta = self.__maximum - self.__minimum self.__delta = np.maximum(eps, delta) + def _convert(self, X): + return np.asarray(X).reshape(len(X), -1).astype(float) + def fit(self, X): """ Train internal parameters. 
@@ -309,7 +310,7 @@ def fit(self, X): :return: The object itself. :rtype: self """ - XX = np.asarray(X).reshape(len(X), -1).astype(float) + XX = self._convert(X) self._set_bounds(XX) self.__ball_proximity.fit(XX) return self diff --git a/src/tdamapper/utils/cython/metrics.pyx b/src/tdamapper/utils/cython/metrics.pyx index 6613e81..58cac63 100644 --- a/src/tdamapper/utils/cython/metrics.pyx +++ b/src/tdamapper/utils/cython/metrics.pyx @@ -5,6 +5,7 @@ _EUCLIDEAN = 'euclidean' _MINKOWSKI = 'minkowski' _MINKOWSKI_P = 'p' _CHEBYSHEV = 'chebyshev' +_COSINE = 'cosine' cpdef double chebyshev(double[:] x, double[:] y) nogil: @@ -23,7 +24,7 @@ cpdef double euclidean(double[:] x, double[:] y) nogil: return sqrt(norm_squared) -cpdef double minkowski(int p, double[:] x, double[:] y) nogil: +cdef double _minkowski(int p, double[:] x, double[:] y) nogil: cdef double norm_p = 0.0 cdef Py_ssize_t i, n = x.shape[0] for i in range(n): @@ -31,6 +32,22 @@ cpdef double minkowski(int p, double[:] x, double[:] y) nogil: return pow(norm_p, 1.0 / p) +def minkowski(p): + return lambda x, y: _minkowski(p, x, y) + + +cpdef double cosine(double[:] x, double[:] y) nogil: + cdef double dot_product = 0.0 + cdef double norm_x = 0.0 + cdef double norm_y = 0.0 + cdef Py_ssize_t i, n = x.shape[0] + for i in range(n): + dot_product += x[i] * y[i] + norm_x += pow(x[i], 2) + norm_y += pow(y[i], 2) + return 1.0 - (dot_product / sqrt(norm_x * norm_y)) + + def get_metric(metric, **kwargs): if callable(metric): return metric @@ -38,8 +55,10 @@ def get_metric(metric, **kwargs): return euclidean elif metric == _MINKOWSKI: p = kwargs.get(_MINKOWSKI_P, 2) - return lambda x, y: minkowski(p, x, y) + return minkowski(p) elif metric == _CHEBYSHEV: return chebyshev + elif metric == _COSINE: + return cosine else: - raise ValueError('metric must be a string or callable') \ No newline at end of file + raise ValueError('metric must be a string or callable') diff --git a/src/tdamapper/utils/metrics.py 
b/src/tdamapper/utils/metrics.py deleted file mode 100644 index e2bea8e..0000000 --- a/src/tdamapper/utils/metrics.py +++ /dev/null @@ -1,5 +0,0 @@ -import tdamapper.utils.cython.metrics as cython_metrics - - -def get_metric(metric, **kwargs): - return cython_metrics.get_metric(metric, **kwargs) diff --git a/src/tdamapper/utils/quickselect.py b/src/tdamapper/utils/quickselect.py index b61c8a2..e50b872 100755 --- a/src/tdamapper/utils/quickselect.py +++ b/src/tdamapper/utils/quickselect.py @@ -1,44 +1,50 @@ -def partition(data, start, end, p, fun=lambda x: x): - higher = start - p_val = fun(p) - for j in range(start, end): - if fun(data[j]) < p_val: - data[higher], data[j] = data[j], data[higher] - higher += 1 - return higher +def __swap(arr, i, j): + arr[i], arr[j] = arr[j], arr[i] -def partition_tuple(data, start, end, p): +def partition(data, start, end, p_ord): higher = start - p_ord, _ = p for j in range(start, end): j_ord, _ = data[j] if j_ord < p_ord: - data[higher], data[j] = data[j], data[higher] + __swap(data, higher, j) higher += 1 return higher -def quickselect(data, start, end, k, fun=lambda x: x): +def quickselect(data, start, end, k): start_, end_, higher = start, end, None while higher != k + 1: - p = data[k] - data[start_], data[k] = data[k], data[start_] - higher = partition(data, start_ + 1, end_, p, fun) - data[start_], data[higher - 1] = data[higher - 1], data[start_] + p, _ = data[k] + __swap(data, start_, k) + higher = partition(data, start_ + 1, end_, p) + __swap(data, start_, higher - 1) if k <= higher - 1: end_ = higher else: start_ = higher -def quickselect_tuple(data, start, end, k): +def partition_tuple(data_ord, data_arr, start, end, p_ord): + higher = start + for j in range(start, end): + j_ord = data_ord[j] + if j_ord < p_ord: + __swap(data_arr, higher, j) + __swap(data_ord, higher, j) + higher += 1 + return higher + + +def quickselect_tuple(data_ord, data_arr, start, end, k): start_, end_, higher = start, end, None while higher != k 
+ 1: - p = data[k] - data[start_], data[k] = data[k], data[start_] - higher = partition_tuple(data, start_ + 1, end_, p) - data[start_], data[higher - 1] = data[higher - 1], data[start_] + p_ord = data_ord[k] + __swap(data_arr, start_, k) + __swap(data_ord, start_, k) + higher = partition_tuple(data_ord, data_arr, start_ + 1, end_, p_ord) + __swap(data_arr, start_, higher - 1) + __swap(data_ord, start_, higher - 1) if k <= higher - 1: end_ = higher else: diff --git a/src/tdamapper/utils/vptree.py b/src/tdamapper/utils/vptree.py index fb84b37..399583e 100755 --- a/src/tdamapper/utils/vptree.py +++ b/src/tdamapper/utils/vptree.py @@ -2,7 +2,7 @@ from random import randrange from tdamapper.utils.cython.metrics import get_metric -from tdamapper.utils.quickselect import quickselect_tuple +from tdamapper.utils.quickselect import quickselect from tdamapper.utils.heap import MaxHeap @@ -61,7 +61,7 @@ def _build_rec(self, start, end, update): if update: self._update(start, end) _, v_point = self.__dataset[start] - quickselect_tuple(self.__dataset, start + 1, end, mid) + quickselect(self.__dataset, start + 1, end, mid) v_radius, _ = self.__dataset[mid] if v_radius <= self.__leaf_radius: left = _Tree([x for _, x in self.__dataset[start:mid]]) diff --git a/src/tdamapper/utils/vptree_flat.py b/src/tdamapper/utils/vptree_flat.py index d85faa2..7c389e6 100755 --- a/src/tdamapper/utils/vptree_flat.py +++ b/src/tdamapper/utils/vptree_flat.py @@ -2,7 +2,7 @@ from random import randrange from tdamapper.utils.cython.metrics import get_metric -from tdamapper.utils.quickselect import quickselect_tuple +from tdamapper.utils.quickselect import quickselect from tdamapper.utils.heap import MaxHeap @@ -63,7 +63,7 @@ def _build_iter(self): mid = (end + start) // 2 self._update(start, end) _, v_point = self.__dataset[start] - quickselect_tuple(self.__dataset, start + 1, end, mid) + quickselect(self.__dataset, start + 1, end, mid) v_radius, _ = self.__dataset[mid] self.__dataset[start] = 
(v_radius, v_point) if end - mid > self.__leaf_capacity: diff --git a/tests/test_quickselect.py b/tests/test_quickselect.py index 770a299..dd4e20c 100755 --- a/tests/test_quickselect.py +++ b/tests/test_quickselect.py @@ -1,29 +1,56 @@ import unittest import random -from tdamapper.utils.quickselect import partition, quickselect +from tdamapper.utils.quickselect import ( + partition, + quickselect, + partition_tuple, + quickselect_tuple) class TestQuickSelect(unittest.TestCase): - def testPartition(self): + def test_partition(self): n = 1000 - arr = [random.randint(0, n - 1) for _ in range(n)] + arr = [(i, random.randint(0, n - 1)) for i in range(n)] for choice in range(n): h = partition(arr, 0, n, choice) for i in range(0, h): - self.assertTrue(arr[i] < choice) + self.assertTrue(arr[i][0] < choice) for i in range(h, n): - self.assertTrue(arr[i] >= choice) + self.assertTrue(arr[i][0] >= choice) - def testQuickSelect(self): + def test_quickselect(self): n = 1000 - arr = [random.randint(0, n - 1) for _ in range(n)] + arr = [(i, random.randint(0, n - 1)) for i in range(n)] for choice in range(n): quickselect(arr, 0, n, choice) - val = arr[choice] + val = arr[choice][0] for i in range(0, choice): - self.assertTrue(arr[i] <= val) + self.assertTrue(arr[i][0] <= val) for i in range(choice, n): - self.assertTrue(arr[i] >= val) + self.assertTrue(arr[i][0] >= val) + + def test_partition_tuple(self): + n = 1000 + arr_data = [random.randint(0, n - 1) for i in range(n)] + arr_ord = list(range(n)) + for choice in range(n): + h = partition_tuple(arr_ord, arr_data, 0, n, choice) + for i in range(0, h): + self.assertTrue(arr_ord[i] < choice) + for i in range(h, n): + self.assertTrue(arr_ord[i] >= choice) + + def test_quickselect_tuple(self): + n = 1000 + arr_data = [random.randint(0, n - 1) for i in range(n)] + arr_ord = list(range(n)) + for choice in range(n): + quickselect_tuple(arr_ord, arr_data, 0, n, choice) + val = arr_ord[choice] + for i in range(0, choice): + 
self.assertTrue(arr_ord[i] <= val) + for i in range(choice, n): + self.assertTrue(arr_ord[i] >= val) \ No newline at end of file From dd766f8b81f32dbfb8d3dbea08327a80eb2eefa9 Mon Sep 17 00:00:00 2001 From: Luca Simi Date: Mon, 1 Jul 2024 22:25:53 +0200 Subject: [PATCH 5/5] Minor fix --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index abbd670..6382793 100644 --- a/.gitignore +++ b/.gitignore @@ -18,5 +18,5 @@ .coverage .vscode .idea -dist -build \ No newline at end of file +dist/ +build/