tests: fixed tests

Fixed the Python tests by computing the reference distances with the original C++ distance functions, which are now exposed through the bindings.
pablocael · Sep 13, 2023 · 241638c · 241638c
1 parent 024bf1c
commit 241638c
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 48 deletions.
diff --git a/pynear/__init__.py b/pynear/__init__.py
@@ -1,3 +1,4 @@
+from typing import List
 from typing import Tuple
 
 from _pynear import BKTreeBinaryIndex64
@@ -13,11 +14,39 @@
 from _pynear import VPTreeChebyshevIndex
 from _pynear import VPTreeL1Index
 from _pynear import VPTreeL2Index
+from _pynear import dist_chebyshev
+from _pynear import dist_hamming_64
+from _pynear import dist_hamming_128
+from _pynear import dist_hamming_256
+from _pynear import dist_hamming_512
+from _pynear import dist_l1
+from _pynear import dist_l2
 import numpy as np
 
 from ._version import __version__
 
 
+def dist_hamming(a: List, b: List):
+    """Hamming distance between two equal-length vectors, dispatched on length.
+
+    Lengths of 8, 16, 32 or 64 (bytes, per the error text) select the 64-,
+    128-, 256- or 512-bit function; anything else raises ValueError.
+    """
+    n_bytes = len(a)
+    if n_bytes != len(b):
+        raise ValueError(
+            "invalid data dimension: a and b dimensions must agree."
+        )
+    # Vector length in bytes -> fixed-width function covering that many bits.
+    by_width = {8: dist_hamming_64, 16: dist_hamming_128, 32: dist_hamming_256, 64: dist_hamming_512}
+    if n_bytes not in by_width:
+        raise ValueError(
+            "invalid data dimension: hamming distance only supports 64, 32, 16 or 8 bytes of data"
+        )
+    return by_width[n_bytes](a, b)
+
+
+
 class VPTreeBinaryIndex:
     def __init__(self) -> None:
         self._index = None

diff --git a/pynear/include/BuiltinSerializers.hpp b/pynear/include/BuiltinSerializers.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <vector>
 #include <cstring>
+#include <vector>
 
 namespace vptree {
 

diff --git a/pynear/src/PythonBindings.cpp b/pynear/src/PythonBindings.cpp
@@ -173,6 +173,14 @@ static const char *index_find_threshold = "Batch find all vectors below the dist
 static const char *index_values = "Return all stored vectors in arbitrary order";
 
 PYBIND11_MODULE(_pynear, m) {
+    m.def("dist_l2", dist_l2_f_avx2);
+    m.def("dist_l1", dist_l1_f_avx2);
+    m.def("dist_chebyshev", dist_chebyshev_f_avx2);
+    m.def("dist_hamming_64", dist_hamming_64);
+    m.def("dist_hamming_128", dist_hamming_128);
+    m.def("dist_hamming_256", dist_hamming_256);
+    m.def("dist_hamming_512", dist_hamming_512);
+
     py::class_<VPTreeNumpyAdapter<dist_l2_f_avx2>>(m, "VPTreeL2Index")
         .def(py::init<>())
         .def("set", &VPTreeNumpyAdapter<dist_l2_f_avx2>::set, index_set, py::arg("vectors"))

diff --git a/pynear/tests/VPTreeTests.cpp b/pynear/tests/VPTreeTests.cpp
@@ -360,36 +360,4 @@ TEST(VPTests, TestSearch) {
     diff = end - start;
 }
 
-TEST(VPTests, TestCornerCasesL1) {
-
-    std::vector<arrayf> points = {
-          {0.24747044, 0.10977995, 0.04395789, 0.37588218, 0.77715296, 0.38436773, 0.27868968, 0.44355425},
-          {0.40908694, 0.07170244, 0.76541245, 0.10503417, 0.48107386, 0.7900539, 0.93293387, 0.582928},
-          {0.34634387, 0.5111964, 0.69529665, 0.24239564, 0.14328131, 0.49494576, 0.81964535, 0.8323013},
-          {0.40923303, 0.9071538, 0.04779731, 0.4205647, 0.9884444, 0.6205023, 0.29096323, 0.29838845},
-          {0.7317226, 0.7195254, 0.15990016, 0.69135946, 0.8254121, 0.20821702, 0.90294975, 0.02925209}
-    };
-    std::vector<arrayf> queries = {
-        {0.5299074, 0.6855958, 0.42676213, 0.69523215, 0.4685414, 0.0975867, 0.8515448, 0.2583308}
-        /* {0.70882237 0.00969914 0.7337773  0.14389992 0.7006041  0.187069760.72513705 0.4052477 } */
-        /* {0.9641739  0.0330751  0.49499482 0.32284376 0.2969801  0.350194220.02012024 0.7615032 } */
-        /* {0.44070253 0.62186664 0.81927806 0.06221519 0.36935103 0.180103450.6288583  0.20059796} */
-        /* {0.3955885  0.7678838  0.83378315 0.69156003 0.90867287 0.78383080.84307    0.71617246} */
-        /* {0.5463595  0.15017477 0.51484144 0.46845767 0.46476486 0.5259280.83734906 0.9041701 } */
-        /* {0.80386406 0.55020994 0.24351802 0.7608507  0.00175726 0.592161830.1336592  0.28955624}, */
-        /* {0.5031839  0.09765117 0.4252744  0.9478887  0.02622282 0.354896220.00149701 0.01623238} */
-    };
-
-    std::vector<arrayf> expectedDistances = {{0.55482084, 0.9671766, 1.0687857},
-                                             {0.7633677, 0.9953769, 0.9990481},
-                                             {1.0608857, 1.160058, 0.9814076},
-                                             {1.1750504, 1.1985078, 1.2158847},
-                                             {1.1671909, 1.4311509, 1.4611306}};
-    std::vector<std::vector<int64_t>> expectedIndices = {{4, 2, 3}, {1, 2, 0}, {0, 2, 1}, {2, 1, 4}, {2, 1, 3}, {2, 1, 0}, {0, 3, 2}, {0, 4, 2}};
-
-    std::vector<vptree::VPTree<arrayf, float, dist_l1_f_avx2>::VPTreeSearchResultElement> results;
-    vptree::SerializableVPTree<arrayf, float, dist_l1_f_avx2, vptree::ndarraySerializer<float>, vptree::ndarrayDeserializer<float>> tree;
-    tree.set(points);
-    tree.searchKNN(queries, 3, results);
-}
 } // namespace vptree::tests
diff --git a/pynear/tests/test_vptree.py b/pynear/tests/test_vptree.py
@@ -3,17 +3,21 @@
 # Copyright 2021 Pablo Carneiro Elias
 #
 
-import os
-import sys
 from collections import Counter
 from functools import partial
+import os
+import sys
 from typing import Callable
 from typing import Tuple
 
 import numpy as np
 import pytest
 
 import pynear
+from pynear import dist_chebyshev
+from pynear import dist_hamming
+from pynear import dist_l1
+from pynear import dist_l2
 
 seed = os.environ.get("PYNEAR_TEST_SEED", None)
 if seed is not None:
@@ -23,23 +27,49 @@
 np.set_printoptions(threshold=sys.maxsize)
 
 def hamming_distance_pairwise(a: np.ndarray, b: np.ndarray) -> np.ndarray:
-    r = (1 << np.arange(8))[:, None, None, None]
-    return np.count_nonzero((np.bitwise_xor(a[:, None, :], b[None, :, :]) & r) != 0, axis=(0, -1))
+    """Pairwise Hamming distances between the rows of a and the rows of b.
+
+    Element [i][j] of the result is pynear.dist_hamming(a[i], b[j]); row
+    lengths must be ones that dist_hamming accepts.
+    """
 
+    # uint16, not uint8: 64-element rows can differ in up to 512 bits (> 255).
+    grid = [[dist_hamming(ai, bi) for bi in b] for ai in a]
+    return np.array(grid).astype(np.uint16)
 
 def euclidean_distance_pairwise(a: np.ndarray, b: np.ndarray) -> np.ndarray:
-    diff = b[None, :, :] - a[:, None, :]
-    return np.sqrt(np.sum(diff * diff, axis=-1))
+    """Pairwise L2 distances between the rows of a and the rows of b.
+
+    Element [i][j] of the result is pynear.dist_l2(a[i], b[j]); the nested
+    lists are handed to np.array, which infers the dtype.
+    """
+    table = []
+    for row in a:
+        table.append([dist_l2(row, other) for other in b])
+    return np.array(table)
 
 
 def manhattan_distance_pairwise(a: np.ndarray, b: np.ndarray) -> np.ndarray:
-    diff = b[None, :, :] - a[:, None, :]
-    return np.sum(np.abs(diff), axis=-1)
+    """Pairwise L1 distances between the rows of a and the rows of b.
+
+    Element [i][j] of the result is pynear.dist_l1(a[i], b[j]); the nested
+    lists are handed to np.array, which infers the dtype.
+    """
 
+    # One inner list per row of a, holding its distance to each row of b.
+    grid = [[dist_l1(ai, bi) for bi in b] for ai in a]
+    return np.array(grid)
 
 def chebyshev_distance_pairwise(a: np.ndarray, b: np.ndarray) -> np.ndarray:
-    diff = b[None, :, :] - a[:, None, :]
-    return np.max(np.abs(diff), axis=-1)
+    """Pairwise Chebyshev distances between the rows of a and the rows of b.
+
+    Element [i][j] of the result is pynear.dist_chebyshev(a[i], b[j]).
+    """
+    def against_all(row):
+        # Distances from one row of a to each row of b, in order.
+        return [dist_chebyshev(row, other) for other in b]
+
+    return np.array(list(map(against_all, a)))
 
 
 def test_empty_index():
@@ -79,8 +109,8 @@ def hamming_distance(a, b) -> np.ndarray:
         r = (1 << np.arange(8))[:, None]
         return np.count_nonzero((np.bitwise_xor(a, b) & r) != 0)
 
-    arr1 = np.random.randint(0, 10, (5, 4), dtype=np.uint8)
-    arr2 = np.random.randint(0, 10, (3, 4), dtype=np.uint8)
+    arr1 = np.random.randint(0, 10, (5, 8), dtype=np.uint8)
+    arr2 = np.random.randint(0, 10, (3, 8), dtype=np.uint8)
 
     truth = np.empty((arr1.shape[0], arr2.shape[0]), dtype=np.uint64)
     for i in range(arr1.shape[0]):
@@ -180,7 +210,11 @@ def test_k_equals_dataset(vptree_cls, exaustive_metric):
     vptree_indices, vptree_distances = vptree.searchKNN(queries, k)
 
     vptree_indices = np.array(vptree_indices, dtype=np.uint64)[:, ::-1]
-    vptree_distances = np.array(vptree_distances, dtype=np.float32)[:, ::-1]
+    vptree_distances = np.array(vptree_distances, dtype=np.float64)[:, ::-1]
+    dist_diff = vptree_distances - exaustive_distances 
+    ind_diff = vptree_indices - exaustive_indices
+    print(">>>>>>>>>>>>", dist_diff[dist_diff > 1e-7] )
+    print(">>>>>>>>>>>>", np.argwhere(ind_diff != 0))
 
     np.testing.assert_allclose(exaustive_distances, vptree_distances, rtol=1e-06)
     if _num_dups(exaustive_distances) == 0:
@@ -277,11 +311,11 @@ def test_query_larger_than_dataset(vptree_cls, exaustive_metric):
 
     k = 3
 
-    exaustive_indices, exaustive_distances = exaustive_metric(data, queries, k)
+    exaustive_indices, exaustive_distances = exaustive_metric(data, np.array([queries[0]]), k)
 
     vptree = vptree_cls()
     vptree.set(data)
-    vptree_indices, vptree_distances = vptree.searchKNN(queries, k)
+    vptree_indices, vptree_distances = vptree.searchKNN(np.array([queries[0]]), k)
 
     vptree_indices = np.array(vptree_indices, dtype=np.uint64)[:, ::-1]
     vptree_distances = np.array(vptree_distances, dtype=np.float32)[:, ::-1]