Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
**/__pycache__
**/*.pyc
**/*.pyo
**/*.csv
**/*.egg-info
**/.ipynb_checkpoints
**/*.log
**/docs/build
**/*.ipynb

**/*.c
**/*.cpp
**/*.so
**/*.pyd
**/*.dll
**/*.dylib

.coverage
.vscode
.idea
dist
dist/
build/
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
requires = ["setuptools>=42", "wheel", "Cython"]
build-backend = "setuptools.build_meta"

[project]
Expand Down
11 changes: 11 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from setuptools import setup
from setuptools.extension import Extension
from Cython.Build import cythonize


# Cython extension providing the compiled distance functions; the module
# path and the .pyx source location are kept side by side for readability.
_METRICS_EXTENSION = Extension(
    "tdamapper.utils.cython.metrics",
    ["src/tdamapper/utils/cython/metrics.pyx"],
)

setup(ext_modules=cythonize([_METRICS_EXTENSION]))
24 changes: 13 additions & 11 deletions src/tdamapper/proximity.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import numpy as np

from tdamapper.utils.cython.metrics import get_metric, chebyshev
from tdamapper.utils.vptree_flat import VPTree as FVPT
from tdamapper.utils.vptree import VPTree as VPT

Expand Down Expand Up @@ -61,11 +62,6 @@ def _rho(x):
return np.floor(x) + 0.5


def _l_infty(x, y):
# in alternative: np.linalg.norm(x - y, ord=np.inf)
return np.max(np.abs(x - y))


class Proximity:
"""
Abstract interface for proximity functions.
Expand Down Expand Up @@ -128,7 +124,8 @@ class BallProximity(Proximity):
"""

def __init__(self, radius, metric, flat=True):
self.__metric = lambda x, y: metric(x[1], y[1])
_metric = get_metric(metric)
self.__metric = lambda x, y: _metric(x[1], y[1])
self.__radius = radius
self.__data = None
self.__vptree = None
Expand Down Expand Up @@ -198,7 +195,8 @@ class KNNProximity(Proximity):

def __init__(self, neighbors, metric, flat=True):
self.__neighbors = neighbors
self.__metric = _pullback(lambda x: x[1], metric)
_metric = get_metric(metric)
self.__metric = _pullback(lambda x: x[1], _metric)
self.__data = None
self.__vptree = None
self.__flat = flat
Expand Down Expand Up @@ -271,8 +269,8 @@ def __init__(self, n_intervals, overlap_frac, flat=True):
self.__minimum = None
self.__maximum = None
self.__delta = None
metric = _pullback(self._gamma_n, _l_infty)
self.__ball_proximity = BallProximity(self.__radius, metric, flat=flat)
_metric = _pullback(self._gamma_n, chebyshev)
self.__ball_proximity = BallProximity(self.__radius, _metric, flat=flat)

def _gamma_n(self, x):
    """
    Rescale ``x`` relative to the stored bounds.

    The stored minimum is subtracted from ``x``; the result is multiplied
    by the number of intervals and divided by the stored delta.

    :param x: The value to rescale.
    :return: ``n_intervals * (x - minimum) / delta``.
    """
    shifted = x - self.__minimum
    return self.__n_intervals * shifted / self.__delta
Expand All @@ -296,6 +294,9 @@ def _set_bounds(self, data):
delta = self.__maximum - self.__minimum
self.__delta = np.maximum(eps, delta)

def _convert(self, X):
    """
    Convert ``X`` into a two-dimensional array of floats.

    The first axis keeps ``len(X)`` entries; all remaining axes are
    flattened into the second one.

    :param X: A sized, array-like collection of points.
    :return: A float array of shape ``(len(X), -1)``.
    """
    n_rows = len(X)
    as_rows = np.asarray(X).reshape(n_rows, -1)
    return as_rows.astype(float)

def fit(self, X):
"""
Train internal parameters.
Expand All @@ -309,8 +310,9 @@ def fit(self, X):
:return: The object itself.
:rtype: self
"""
self._set_bounds(X)
self.__ball_proximity.fit(X)
XX = self._convert(X)
self._set_bounds(XX)
self.__ball_proximity.fit(XX)
return self

def search(self, x):
Expand Down
64 changes: 64 additions & 0 deletions src/tdamapper/utils/cython/metrics.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from libc.math cimport sqrt, pow, fabs


_EUCLIDEAN = 'euclidean'
_MINKOWSKI = 'minkowski'
_MINKOWSKI_P = 'p'
_CHEBYSHEV = 'chebyshev'
_COSINE = 'cosine'


cpdef double chebyshev(double[:] x, double[:] y) nogil:
    # Chebyshev distance: the largest absolute coordinate-wise difference
    # between x and y. The number of coordinates visited is x.shape[0].
    cdef double largest = 0.0
    cdef double gap
    cdef Py_ssize_t k
    cdef Py_ssize_t size = x.shape[0]
    for k in range(size):
        gap = fabs(x[k] - y[k])
        # Keep the running maximum; only a strictly larger gap replaces it.
        if gap > largest:
            largest = gap
    return largest


cpdef double euclidean(double[:] x, double[:] y) nogil:
    # Euclidean distance: square root of the sum of squared coordinate-wise
    # differences. The number of coordinates visited is x.shape[0].
    cdef double total = 0.0
    cdef Py_ssize_t k
    cdef Py_ssize_t size = x.shape[0]
    for k in range(size):
        total = total + pow(fabs(x[k] - y[k]), 2)
    return sqrt(total)


cdef double _minkowski(int p, double[:] x, double[:] y) nogil:
    # Minkowski distance of integer order p: the p-th root of the sum of
    # the p-th powers of the absolute coordinate-wise differences.
    # The number of coordinates visited is x.shape[0].
    cdef double total = 0.0
    cdef Py_ssize_t k
    cdef Py_ssize_t size = x.shape[0]
    for k in range(size):
        total = total + pow(fabs(x[k] - y[k]), p)
    return pow(total, 1.0 / p)


def minkowski(p):
    """
    Build a Minkowski distance function of order ``p``.

    :param p: The order forwarded to ``_minkowski``.
    :return: A callable taking ``(x, y)`` and returning
        ``_minkowski(p, x, y)``.
    """
    def _distance(x, y):
        return _minkowski(p, x, y)

    return _distance


cpdef double cosine(double[:] x, double[:] y) nogil:
    # Cosine distance: one minus the dot product of x and y divided by the
    # square root of the product of their squared norms.
    # The number of coordinates visited is x.shape[0].
    cdef double dot = 0.0
    cdef double sq_norm_x = 0.0
    cdef double sq_norm_y = 0.0
    cdef double similarity
    cdef Py_ssize_t k
    cdef Py_ssize_t size = x.shape[0]
    for k in range(size):
        dot = dot + x[k] * y[k]
        sq_norm_x = sq_norm_x + pow(x[k], 2)
        sq_norm_y = sq_norm_y + pow(y[k], 2)
    similarity = dot / sqrt(sq_norm_x * sq_norm_y)
    return 1.0 - similarity


def get_metric(metric, **kwargs):
    """
    Resolve a metric specification into a distance callable.

    :param metric: Either a callable, which is returned unchanged, or one
        of the strings ``'euclidean'``, ``'minkowski'``, ``'chebyshev'``,
        ``'cosine'``.
    :param kwargs: Optional metric parameters. Only ``p`` is read, and only
        for ``'minkowski'``; it defaults to 2.
    :return: A callable taking ``(x, y)`` and returning their distance.
    :raises ValueError: If ``metric`` is neither callable nor one of the
        supported names.
    """
    if callable(metric):
        return metric
    if metric == _EUCLIDEAN:
        return euclidean
    if metric == _MINKOWSKI:
        # ``minkowski`` is a factory that takes only the order ``p`` and
        # already returns the two-argument distance, so it must be called
        # with ``p`` alone rather than with ``(x, y, p=p)``.
        return minkowski(kwargs.get(_MINKOWSKI_P, 2))
    if metric == _CHEBYSHEV:
        return chebyshev
    if metric == _COSINE:
        return cosine
    raise ValueError('metric must be a string or callable')
48 changes: 27 additions & 21 deletions src/tdamapper/utils/quickselect.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,50 @@
def partition(data, start, end, p, fun=lambda x: x):
higher = start
p_val = fun(p)
for j in range(start, end):
if fun(data[j]) < p_val:
data[higher], data[j] = data[j], data[higher]
higher += 1
return higher
def __swap(arr, i, j):
    """
    Exchange the items at positions ``i`` and ``j`` of ``arr`` in place.

    :param arr: A mutable, indexable sequence.
    :param i: Index of the first item.
    :param j: Index of the second item.
    """
    at_j = arr[j]
    at_i = arr[i]
    arr[i] = at_j
    arr[j] = at_i


def partition_tuple(data, start, end, p):
def partition(data, start, end, p_ord):
higher = start
p_ord, _ = p
for j in range(start, end):
j_ord, _ = data[j]
if j_ord < p_ord:
data[higher], data[j] = data[j], data[higher]
__swap(data, higher, j)
higher += 1
return higher


def quickselect(data, start, end, k, fun=lambda x: x):
def quickselect(data, start, end, k):
start_, end_, higher = start, end, None
while higher != k + 1:
p = data[k]
data[start_], data[k] = data[k], data[start_]
higher = partition(data, start_ + 1, end_, p, fun)
data[start_], data[higher - 1] = data[higher - 1], data[start_]
p, _ = data[k]
__swap(data, start_, k)
higher = partition(data, start_ + 1, end_, p)
__swap(data, start_, higher - 1)
if k <= higher - 1:
end_ = higher
else:
start_ = higher


def quickselect_tuple(data, start, end, k):
def partition_tuple(data_ord, data_arr, start, end, p_ord):
    """
    Partition ``data_ord[start:end]`` in place around ``p_ord``.

    Entries of ``data_ord`` strictly less than ``p_ord`` are moved to the
    front of the range. Every exchange is mirrored on ``data_arr`` so the
    two sequences stay aligned.

    :param data_ord: Mutable sequence of ordering keys.
    :param data_arr: Mutable sequence of items parallel to ``data_ord``.
    :param start: First index of the range (inclusive).
    :param end: Last index of the range (exclusive).
    :param p_ord: The pivot key.
    :return: The index just past the last entry that is less than
        ``p_ord``.
    """
    boundary = start
    for idx in range(start, end):
        if not data_ord[idx] < p_ord:
            continue
        # Swap the items first, then the keys, at the same two positions.
        data_arr[boundary], data_arr[idx] = data_arr[idx], data_arr[boundary]
        data_ord[boundary], data_ord[idx] = data_ord[idx], data_ord[boundary]
        boundary += 1
    return boundary


def quickselect_tuple(data_ord, data_arr, start, end, k):
start_, end_, higher = start, end, None
while higher != k + 1:
p = data[k]
data[start_], data[k] = data[k], data[start_]
higher = partition_tuple(data, start_ + 1, end_, p)
data[start_], data[higher - 1] = data[higher - 1], data[start_]
p_ord = data_ord[k]
__swap(data_arr, start_, k)
__swap(data_ord, start_, k)
higher = partition_tuple(data_ord, data_arr, start_ + 1, end_, p_ord)
__swap(data_arr, start_, higher - 1)
__swap(data_ord, start_, higher - 1)
if k <= higher - 1:
end_ = higher
else:
Expand Down
7 changes: 4 additions & 3 deletions src/tdamapper/utils/vptree.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
"""A class for fast knn and range searches, depending only on a given metric"""
from random import randrange

from tdamapper.utils.quickselect import quickselect_tuple
from tdamapper.utils.cython.metrics import get_metric
from tdamapper.utils.quickselect import quickselect
from tdamapper.utils.heap import MaxHeap


class VPTree:

def __init__(self, distance, dataset, leaf_capacity=1, leaf_radius=0.0, pivoting=None):
self.__distance = distance
self.__distance = get_metric(distance)
self.__dataset = [(0.0, x) for x in dataset]
self.__leaf_capacity = leaf_capacity
self.__leaf_radius = leaf_radius
Expand Down Expand Up @@ -60,7 +61,7 @@ def _build_rec(self, start, end, update):
if update:
self._update(start, end)
_, v_point = self.__dataset[start]
quickselect_tuple(self.__dataset, start + 1, end, mid)
quickselect(self.__dataset, start + 1, end, mid)
v_radius, _ = self.__dataset[mid]
if v_radius <= self.__leaf_radius:
left = _Tree([x for _, x in self.__dataset[start:mid]])
Expand Down
7 changes: 4 additions & 3 deletions src/tdamapper/utils/vptree_flat.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
"""A class for fast knn and range searches, depending only on a given metric"""
from random import randrange

from tdamapper.utils.quickselect import quickselect_tuple
from tdamapper.utils.cython.metrics import get_metric
from tdamapper.utils.quickselect import quickselect
from tdamapper.utils.heap import MaxHeap


class VPTree:

def __init__(self, distance, dataset, leaf_capacity=1, leaf_radius=0.0, pivoting=None):
self.__distance = distance
self.__distance = get_metric(distance)
self.__dataset = [(0.0, x) for x in dataset]
self.__leaf_capacity = leaf_capacity
self.__leaf_radius = leaf_radius
Expand Down Expand Up @@ -62,7 +63,7 @@ def _build_iter(self):
mid = (end + start) // 2
self._update(start, end)
_, v_point = self.__dataset[start]
quickselect_tuple(self.__dataset, start + 1, end, mid)
quickselect(self.__dataset, start + 1, end, mid)
v_radius, _ = self.__dataset[mid]
self.__dataset[start] = (v_radius, v_point)
if end - mid > self.__leaf_capacity:
Expand Down
4 changes: 2 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import networkx as nx
from sklearn.cluster import DBSCAN

from tdamapper.utils.cython.metrics import get_metric
from tdamapper.core import MapperAlgorithm, mapper_connected_components, mapper_labels
from tdamapper.cover import TrivialCover, BallCover
from tdamapper.clustering import TrivialClustering


def dist(x, y):
return np.linalg.norm(x - y)
dist = 'euclidean'


def dataset(dim=10, num=1000):
Expand Down
6 changes: 5 additions & 1 deletion tests/test_proximity.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,8 @@ def testCubicalProximity(self):
a_i = m + i * (w - delta) - delta / 2.0
b_i = m + (i + 1) * (w - delta) + delta / 2.0
expected = [y for y in data if y > a_i and y < b_i]
self.assertEqual(set(expected), set(result))
for c in result:
self.assertTrue(c in expected)
for c in expected:
self.assertTrue(c in result)
#self.assertEqual(set(expected), set(result))
Loading