From cb7b398b04828273cc6a3da88e00fe8ef389da92 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 01:27:30 +0000 Subject: [PATCH 01/23] New methods loadIndexFromStream and saveIndexToStream expose de-/serialization logic of HierarchicalNSW class via std::i/ostream. --- hnswlib/hnswalg.h | 63 +++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 7d0eb443..d2c36f0c 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -26,6 +26,10 @@ namespace hnswlib { loadIndex(location, s, max_elements); } + HierarchicalNSW(SpaceInterface *s, std::istream & input, bool nmslib = false, size_t max_elements=0) { + loadIndexFromStream(input, s, max_elements); + } + HierarchicalNSW(SpaceInterface *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) : link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) { max_elements_ = max_elements; @@ -57,8 +61,6 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); - - //initializations for special treatment of the first node enterpoint_node_ = -1; maxlevel_ = -1; @@ -102,6 +104,8 @@ namespace hnswlib { double mult_, revSize_; int maxlevel_; + std::mutex global; + size_t ef_; VisitedListPool *visited_list_pool_; std::mutex cur_element_count_guard_; @@ -511,8 +515,6 @@ namespace hnswlib { return next_closest_entry_point; } - std::mutex global; - size_t ef_; void setEf(size_t ef) { ef_ = ef; @@ -598,10 +600,7 @@ namespace hnswlib { max_elements_=new_max_elements; } - - void saveIndex(const std::string &location) { - std::ofstream output(location, std::ios::binary); - std::streampos position; + void saveIndexToStream(std::ostream &output) { writeBinaryPOD(output, offsetLevel0_); writeBinaryPOD(output, max_elements_); @@ -626,17 +625,17 @@ namespace hnswlib { if (linkListSize) output.write(linkLists_[i], linkListSize); } - output.close(); - } - - void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { + } - std::ifstream input(location, std::ios::binary); - - if (!input.is_open()) - throw std::runtime_error("Cannot open file"); + void saveIndex(const std::string &location) { + std::ofstream output(location, std::ios::binary); + std::streampos position; + saveIndexToStream(output); + output.close(); + } + void loadIndexFromStream(std::istream & input, SpaceInterface *s, size_t max_elements_i=0) { // get file size: input.seekg(0,input.end); @@ -663,14 +662,12 @@ namespace hnswlib { readBinaryPOD(input, mult_); readBinaryPOD(input, ef_construction_); - data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); auto pos=input.tellg(); - /// Optional - check if index is ok: input.seekg(cur_element_count * size_data_per_element_,input.cur); @@ -696,15 +693,11 @@ namespace hnswlib { input.seekg(pos,input.beg); - data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); if (data_level0_memory_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); - - - size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -715,7 +708,6 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); - linkLists_ = (char **) malloc(sizeof(void *) * max_elements); if (linkLists_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); @@ -746,11 +738,22 @@ namespace hnswlib { has_deletions_=true; } - input.close(); return; } + + + void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { + std::ifstream input(location, std::ios::binary); + if (!input.is_open()) + throw std::runtime_error("Cannot open file"); + + loadIndexFromStream(input, s, max_elements_i); + input.close(); + return; + } + template std::vector getDataByLabel(labeltype label) { @@ -874,7 +877,7 @@ namespace hnswlib { for (auto&& cand : sCand) { if (cand == neigh) continue; - + dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_); if (candidates.size() < elementsToKeep) { candidates.emplace(distance, cand); @@ -1137,7 +1140,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { + if (has_deletions_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); } @@ -1186,19 +1189,19 @@ namespace hnswlib { std::unordered_set s; for (int j=0; j 0); - assert(data[j] < cur_element_count); + assert(data[j] < cur_element_count); assert (data[j] != i); inbound_connections_num[data[j]]++; s.insert(data[j]); connections_checked++; - + } assert(s.size() == size); } } if(cur_element_count > 1){ int min1=inbound_connections_num[0], max1=inbound_connections_num[0]; - for(int i=0; i < cur_element_count; i++){ + for(int i=0; i < cur_element_count; i++){ assert(inbound_connections_num[i] > 0); min1=std::min(inbound_connections_num[i],min1); max1=std::max(inbound_connections_num[i],max1); @@ -1206,7 +1209,7 @@ namespace hnswlib { std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n"; } std::cout << "integrity ok, checked " << connections_checked << " connections\n"; - + } }; From e161db863efba6e39c4f2cd5013a6de9f439fb12 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 01:28:27 +0000 Subject: [PATCH 02/23] Implement __getstate__ and __setstate__ to allow pickling of hnswlib.Index objects; add new properties to Index class: space_name, dim, max_elements, element_count, ef_construction, M, num_threads, ef. Properties num_threads and ef are read-write-able, other parameters are read-only. --- python_bindings/bindings.cpp | 176 +++++++++++++++++++++++------------ 1 file changed, 119 insertions(+), 57 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 1b88ca23..84839cd6 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -13,7 +13,7 @@ namespace py = pybind11; * only handles a subset of functionality (no reductions etc) * Process ids from start (inclusive) to end (EXCLUSIVE) * - * The method is borrowed from nmslib + * The method is borrowed from nmslib */ template inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { @@ -74,24 +74,24 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn template class Index { public: - Index(const std::string &space_name, const int dim) : - space_name(space_name), dim(dim) { - normalize=false; - if(space_name=="l2") { - l2space = new hnswlib::L2Space(dim); - } - else if(space_name=="ip") { - l2space = new hnswlib::InnerProductSpace(dim); - } - else if(space_name=="cosine") { - l2space = new hnswlib::InnerProductSpace(dim); - normalize=true; - } - appr_alg = NULL; - ep_added = true; - index_inited = false; - num_threads_default = std::thread::hardware_concurrency(); + Index(const std::string &space_name, const int dim) : + space_name(space_name), dim(dim) { + normalize=false; + if(space_name=="l2") { + l2space = new hnswlib::L2Space(dim); } + else if(space_name=="ip") { + l2space = new hnswlib::InnerProductSpace(dim); + } + else if(space_name=="cosine") { + l2space = new hnswlib::InnerProductSpace(dim); + normalize=true; + } + appr_alg = NULL; + ep_added = true; + index_inited = false; + num_threads_default = std::thread::hardware_concurrency(); + } void init_new_index(const size_t maxElements, const size_t M, const size_t efConstruction, const size_t random_seed) { if (appr_alg) { @@ -103,17 +103,12 @@ class Index { ep_added = false; } + + void set_ef(size_t ef) { appr_alg->ef_ = ef; } - size_t get_ef_construction() { - return appr_alg->ef_construction_; - } - - size_t get_M() { - return appr_alg->M_; - } void set_num_threads(int num_threads) { this->num_threads_default = num_threads; @@ -122,15 +117,28 @@ class Index { void saveIndex(const std::string &path_to_index) { appr_alg->saveIndex(path_to_index); } + void saveIndexToStream(std::ostream & output) const { + appr_alg->saveIndexToStream(output); + } + + void loadIndexFromStream(std::istream & input, size_t max_elements) { + if (appr_alg) { + std::cerr<<"Warning: Calling load_index from istream for an already inited index. Old index is being deallocated." << std::endl; + delete appr_alg; + } + appr_alg = new hnswlib::HierarchicalNSW(l2space, input, false, max_elements); + cur_l = appr_alg->cur_element_count; + } void loadIndex(const std::string &path_to_index, size_t max_elements) { - if (appr_alg) { - std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; - delete appr_alg; - } - appr_alg = new hnswlib::HierarchicalNSW(l2space, path_to_index, false, max_elements); - cur_l = appr_alg->cur_element_count; + if (appr_alg) { + std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; + delete appr_alg; + } + appr_alg = new hnswlib::HierarchicalNSW(l2space, path_to_index, false, max_elements); + cur_l = appr_alg->cur_element_count; } + void normalize_vector(float *data, float *norm_array){ float norm=0.0f; for(int i=0;i norm_array(dim); - if(normalize){ - normalize_vector(vector_data, norm_array.data()); - vector_data = norm_array.data(); - - } - appr_alg->addPoint((void *) vector_data, (size_t) id); - start = 1; - ep_added = true; + int start = 0; + if (!ep_added) { + size_t id = ids.size() ? ids.at(0) : (cur_l); + float *vector_data=(float *) items.data(0); + std::vector norm_array(dim); + if(normalize){ + normalize_vector(vector_data, norm_array.data()); + vector_data = norm_array.data(); } + appr_alg->addPoint((void *) vector_data, (size_t) id); + start = 1; + ep_added = true; + } py::gil_scoped_release l; if(normalize==false) { @@ -214,7 +221,7 @@ class Index { std::vector norm_array(num_threads * dim); ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) { // normalize vector: - size_t start_idx = threadId * dim; + size_t start_idx = threadId * dim; normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); size_t id = ids.size() ? ids.at(row) : (cur_l+row); @@ -370,7 +377,6 @@ class Index { std::string space_name; int dim; - bool index_inited; bool ep_added; bool normalize; @@ -386,31 +392,87 @@ class Index { } }; + + PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); py::class_>(m, "Index") .def(py::init(), py::arg("space"), py::arg("dim")) - .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, - py::arg("ef_construction")=200, py::arg("random_seed")=100) + .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) .def("add_items", &Index::addItems, py::arg("data"), py::arg("ids") = py::none(), py::arg("num_threads")=-1) .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) .def("get_ids_list", &Index::getIdsList) .def("set_ef", &Index::set_ef, py::arg("ef")) - .def("get_ef_construction", &Index::get_ef_construction) - .def("get_M", &Index::get_M) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def("get_max_elements", &Index::getMaxElements) - .def("get_current_count", &Index::getCurrentCount) - .def("__repr__", - [](const Index &a) { - return ""; - } - ); + .def_readonly("space_name", &Index::space_name) + .def_readonly("dim", &Index::dim) + .def_readwrite("num_threads", &Index::num_threads_default) + .def_property("ef", + [](const Index & index) { + return index.index_inited ? index.appr_alg->ef_ : 10; + }, + [](Index & index, const size_t ef_) { + if (index.index_inited) + index.appr_alg->ef_ = ef_; + else + throw std::runtime_error("must call init_index prior to setting ef parameter"); + }) + .def_property_readonly("max_elements", [](const Index & index) { + return index.index_inited ? index.appr_alg->max_elements_ : 0; + }) + .def_property_readonly("element_count", [](const Index & index) { + return index.index_inited ? index.appr_alg->cur_element_count : 0; + }) + .def_property_readonly("ef_construction", [](const Index & index) { + return index.index_inited ? index.appr_alg->ef_construction_ : 0; + }) + .def_property_readonly("M", [](const Index & index) { + return index.index_inited ? index.appr_alg->M_ : 0; + }) + .def("__getstate__", [](const Index & index) { // __getstate__ + std::stringstream output(std::stringstream::out|std::stringstream::binary); + + + if (index.index_inited) + index.saveIndexToStream(output); + + /* Return a tuple that fully encodes the state of the object */ + return py::make_tuple(index.space_name, index.dim, + index.index_inited, index.ep_added, + index.normalize, index.num_threads_default, + py::bytes(output.str()), + index.index_inited == false ? 10 : index.appr_alg->ef_, + index.index_inited == false ? 0 : index.appr_alg->max_elements_, + index.index_inited == false ? 0 : index.appr_alg->cur_element_count + ); + }) + .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ + if (t.size() != 10) + throw std::runtime_error("Invalid state!"); + + /* Invoke Index constructor (need to use in-place version) */ + new (&index) Index(t[0].cast(), t[1].cast()); + index.index_inited=t[2].cast(); + index.ep_added=t[3].cast(); + index.normalize=t[4].cast(); + index.num_threads_default=t[5].cast(); + + if (index.index_inited){ + std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); + index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state + index.appr_alg->ef_=(t[7].cast()); + } + + }) + .def("__repr__", [](const Index &a) { + return ""; + }); + return m.ptr(); } From e0eacad7d008a12c5e16d14a1a320b99352fca08 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 01:30:34 +0000 Subject: [PATCH 03/23] Verify knn_query results match before/after pickling hnswlib.Index objects; use brute-force knn search to verify knn_query gives recall of (almost) 100% --- python_bindings/tests/bindings_test_pickle.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 python_bindings/tests/bindings_test_pickle.py diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py new file mode 100644 index 00000000..0d57d946 --- /dev/null +++ b/python_bindings/tests/bindings_test_pickle.py @@ -0,0 +1,144 @@ +import unittest + +import numpy as np + + +def get_dist(metric, pt1, pt2): + if metric == 'l2': + return np.sum((pt1-pt2)**2) + elif metric == 'ip': + return 1. - np.sum(np.multiply(pt1,pt2)) + elif metric == 'cosine': + return 1. - np.sum(np.multiply(pt1,pt2)) / (np.sum(pt1**2) * np.sum(pt2**2))**.5 + +def brute_force_distances(metric, items, query_items, k): + dists=np.zeros((query_items.shape[0], items.shape[0])) + for ii in range(items.shape[0]): + for jj in range(query_items.shape[0]): + dists[jj,ii]=get_dist(metric, items[ii, :], query_items[jj, :]) + + labels = np.argsort(dists, axis=1) + dists = np.sort(dists, axis=1) + + + return labels[:,:k], dists[:,:k] + + +class PickleSelfTestCase(unittest.TestCase): + + def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): + brute_l, brute_d = brute_force_distances(metric, items, query_items, k) + err_total = 0 + for jj in range(query_items.shape[0]): + err = np.sum(np.isin(brute_l[jj, :], ann_l[jj, :], invert=True)) + if err > 0: + print(f"Warning: {err} labels are missing from ann results (k={k}, err_thresh={err_thresh})") + + if err > err_thresh: + err_total += 1 + + self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") + + wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) + if wrong_dists > 0: + dists_count=brute_d.shape[0]*brute_d.shape[1] + print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") + + self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + + def testPickle(self): + import hnswlib + import pickle + + ef_construction = 725 + M = 64 + ef = 725 + + num_elements = 5000 + num_test_elements = 100 + + num_threads = 4 + k = 15 + + label_err_thresh=5 ### max number of missing labels allowed per test item + item_err_thresh=5 ### max number of items allowed with incorrect labels + + dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max + ### number of value pairs that are allowed to be different in d1 and d2 + ### i.e., number of values that are (d1-d2)**2>1e-3 + + for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: + + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) + test_data = np.float32(np.random.random((num_test_elements, dim))) + + # Declaring index + p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip + print(f"Running pickle tests for {p}") + + p.num_threads=num_threads # by default using all available cores + + p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) + p0.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) + + p.ef=ef ### Note: ef parameter can be set only after calling p.init_index, + p0.ef=ef ### so we have to set p0.ef + + p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + ### add items to ann index p,p0,p1 + p.add_items(data) + p1.add_items(data) + p0.add_items(data) + + p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") + self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") + self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") + + ### Test if returned distances are same + l, d = p.knn_query(test_data, k=k) + l0, d0 = p0.knn_query(test_data, k=k) + l1, d1 = p1.knn_query(test_data, k=k) + l2, d2 = p2.knn_query(test_data, k=k) + + self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p and p0 must match") + self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") + self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") + + ### check if ann results match brute-force search + ### allow for 2 labels to be missing from ann results + self.check_ann_results(space, data, test_data, k, l, d, + err_thresh = label_err_thresh, + total_thresh = item_err_thresh, + dists_thresh = dists_err_thresh) + + self.check_ann_results(space, data, test_data, k, l2, d2, + err_thresh=label_err_thresh, + total_thresh=item_err_thresh, + dists_thresh=dists_err_thresh) + + ### Check ef parameter value + self.assertEqual(p.ef, ef, "incorrect value of p.ef") + self.assertEqual(p0.ef, ef, "incorrect value of p0.ef") + self.assertEqual(p2.ef, ef, "incorrect value of p2.ef") + self.assertEqual(p1.ef, ef, "incorrect value of p1.ef") + + ### Check M parameter value + self.assertEqual(p.M, M, "incorrect value of p.M") + self.assertEqual(p0.M, M, "incorrect value of p0.M") + self.assertEqual(p1.M, M, "incorrect value of p1.M") + self.assertEqual(p2.M, M, "incorrect value of p2.M") + + ### Check ef_construction parameter value + self.assertEqual(p.ef_construction, ef_construction, "incorrect value of p.ef_construction") + self.assertEqual(p0.ef_construction, ef_construction, "incorrect value of p0.ef_construction") + self.assertEqual(p1.ef_construction, ef_construction, "incorrect value of p1.ef_construction") + self.assertEqual(p2.ef_construction, ef_construction, "incorrect value of p2.ef_construction") + + +if __name__ == "__main__": + unittest.main() From ec4f4b1a89ca9043e7e90de598589fd97aff9be9 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 03:04:59 +0000 Subject: [PATCH 04/23] add documeentation --- README.md | 19 ++++++++++++++++++- python_bindings/bindings.cpp | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 559c5dfd..b2950508 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,17 @@ Index methods: * `get_current_count()` - returns the current number of element stored in the index - +Index properties: + +* `space` - name of the space (can be one of 'l2', 'ip', 'cosine'). This property is read-only. +* `dim` - dimensionality of the space. This property is read-only. +* `M` - parameter that defines the maximum number of outgoing connections in the graph. This property is read-only. +* `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. This property is read-only. +* `ef` - parameter controlling query time/accuracy trade-off. This property supports read and write operations. Note: setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. +* `num_threads` - number of threads used in `add_items` or `knn_query` by default. This property supports read and write operations. Calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. +* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). This property is read-only. +* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). This property is read-only. + @@ -84,6 +94,7 @@ Index methods: ```python import hnswlib import numpy as np +import pickle dim = 128 num_elements = 10000 @@ -106,6 +117,12 @@ p.set_ef(50) # ef should always be > k # Query dataset, k - number of closest elements (returns 2 numpy arrays) labels, distances = p.knn_query(data, k = 1) + +# Index objects support pickling: +p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p + +print(f"Index parameters: space={p_copy.space}, dim={p_copy.dim}, M={p_copy.M}, ef_construction={p_copy.ef_construction} ") +print(f" ef={p_copy.ef}, element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") ``` An example with updates after serialization/deserialization: diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 84839cd6..589f4024 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -410,7 +410,7 @@ PYBIND11_PLUGIN(hnswlib) { .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def_readonly("space_name", &Index::space_name) + .def_readonly("space", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) .def_property("ef", From a3646cc6e50dca51ffd5e85aafb4e776ee3185e1 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 15:10:52 +0000 Subject: [PATCH 05/23] clean-up readme --- README.md | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b2950508..1c4b8a3a 100644 --- a/README.md +++ b/README.md @@ -76,18 +76,27 @@ Index methods: * `get_current_count()` - returns the current number of element stored in the index -Index properties: +Read-only properties of Index class: -* `space` - name of the space (can be one of 'l2', 'ip', 'cosine'). This property is read-only. -* `dim` - dimensionality of the space. This property is read-only. -* `M` - parameter that defines the maximum number of outgoing connections in the graph. This property is read-only. -* `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. This property is read-only. -* `ef` - parameter controlling query time/accuracy trade-off. This property supports read and write operations. Note: setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. -* `num_threads` - number of threads used in `add_items` or `knn_query` by default. This property supports read and write operations. Calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. -* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). This property is read-only. -* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). This property is read-only. +* `space` - name of the space (can be one of "l2", "ip", or "cosine"). - +* `dim` - dimensionality of the space. + +* `M` - parameter that defines the maximum number of outgoing connections in the graph. + +* `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. + +* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). + +* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). + +Properties of Index class that support reading and writing: + +* `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. + +* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. + + #### Python bindings examples @@ -121,8 +130,12 @@ labels, distances = p.knn_query(data, k = 1) # Index objects support pickling: p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p -print(f"Index parameters: space={p_copy.space}, dim={p_copy.dim}, M={p_copy.M}, ef_construction={p_copy.ef_construction} ") -print(f" ef={p_copy.ef}, element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") +### Index parameters are exposed as class properties: +print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") +print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") +print(f"Index size and capacity: element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") +print(f"Search parameter: ef={p_copy.ef}") + ``` An example with updates after serialization/deserialization: From a1ba4e50c818bd3b0b8d63a0f3fe82eb08e6e281 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 15:15:15 +0000 Subject: [PATCH 06/23] clean-up readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c4b8a3a..aef43e09 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ p.set_ef(50) # ef should always be > k labels, distances = p.knn_query(data, k = 1) # Index objects support pickling: -p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p +p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip ### Index parameters are exposed as class properties: print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") From cf3846c150435c1d0d28c8a12ba165fd53c40030 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 15:22:58 +0000 Subject: [PATCH 07/23] clean-up readme --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index aef43e09..8722222d 100644 --- a/README.md +++ b/README.md @@ -86,15 +86,15 @@ Read-only properties of Index class: * `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction. -* `max_elements` - current capacity of the index (equivalent to `p.get_max_elements()`). +* `max_elements` - current capacity of the index. Equivalent to `p.get_max_elements()`. -* `element_count` - number of items in the index (equivalent to `p.get_current_count()`). +* `element_count` - number of items in the index. Equivalent to `p.get_current_count()`. Properties of Index class that support reading and writing: * `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. -* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. +* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to setting `p.num_threads=3`. @@ -133,8 +133,8 @@ p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle ### Index parameters are exposed as class properties: print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") -print(f"Index size and capacity: element_count={p_copy.element_count}, max_elements={p_copy.max_elements}") -print(f"Search parameter: ef={p_copy.ef}") +print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") +print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") ``` From 27471cd617cb4bebb4098dcd43890ef65419adb4 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 12 Oct 2020 16:03:05 +0000 Subject: [PATCH 08/23] clean-up readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8722222d..28accb84 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib. #### Short API description * `hnswlib.Index(space, dim)` creates a non-initialized index an HNSW in space `space` with integer dimension `dim`. -Index methods: +`hnswlib.Index` methods: * `init_index(max_elements, ef_construction = 200, M = 16, random_seed = 100)` initializes the index from with no elements. * `max_elements` defines the maximum number of elements that can be stored in the structure(can be increased/shrunk). * `ef_construction` defines a construction time/accuracy trade-off (see [ALGO_PARAMS.md](ALGO_PARAMS.md)). @@ -76,7 +76,7 @@ Index methods: * `get_current_count()` - returns the current number of element stored in the index -Read-only properties of Index class: +Read-only properties of `hnswlib.Index` class: * `space` - name of the space (can be one of "l2", "ip", or "cosine"). @@ -90,7 +90,7 @@ Read-only properties of Index class: * `element_count` - number of items in the index. Equivalent to `p.get_current_count()`. -Properties of Index class that support reading and writing: +Properties of `hnswlib.Index` that support reading and writing: * `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. From 4220956d9935847b4488673db5d204a5cf03386a Mon Sep 17 00:00:00 2001 From: Dmitriy Bespalov Date: Mon, 12 Oct 2020 13:00:35 -0400 Subject: [PATCH 09/23] Update bindings_test_pickle.py use 200 test items --- python_bindings/tests/bindings_test_pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 0d57d946..3d011726 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -55,7 +55,7 @@ def testPickle(self): ef = 725 num_elements = 5000 - num_test_elements = 100 + num_test_elements = 200 num_threads = 4 k = 15 From 72b650190e6c9faf536ca202e168eade4958c7c8 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 02:31:26 -0400 Subject: [PATCH 10/23] Revert "New methods loadIndexFromStream and saveIndexToStream expose de-/serialization logic of HierarchicalNSW class via std::i/ostream." This reverts commit cb7b398b04828273cc6a3da88e00fe8ef389da92. --- hnswlib/hnswalg.h | 63 ++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index d2c36f0c..7d0eb443 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -26,10 +26,6 @@ namespace hnswlib { loadIndex(location, s, max_elements); } - HierarchicalNSW(SpaceInterface *s, std::istream & input, bool nmslib = false, size_t max_elements=0) { - loadIndexFromStream(input, s, max_elements); - } - HierarchicalNSW(SpaceInterface *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) : link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) { max_elements_ = max_elements; @@ -61,6 +57,8 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); + + //initializations for special treatment of the first node enterpoint_node_ = -1; maxlevel_ = -1; @@ -104,8 +102,6 @@ namespace hnswlib { double mult_, revSize_; int maxlevel_; - std::mutex global; - size_t ef_; VisitedListPool *visited_list_pool_; std::mutex cur_element_count_guard_; @@ -515,6 +511,8 @@ namespace hnswlib { return next_closest_entry_point; } + std::mutex global; + size_t ef_; void setEf(size_t ef) { ef_ = ef; @@ -600,7 +598,10 @@ namespace hnswlib { max_elements_=new_max_elements; } - void saveIndexToStream(std::ostream &output) { + + void saveIndex(const std::string &location) { + std::ofstream output(location, std::ios::binary); + std::streampos position; writeBinaryPOD(output, offsetLevel0_); writeBinaryPOD(output, max_elements_); @@ -625,17 +626,17 @@ namespace hnswlib { if (linkListSize) output.write(linkLists_[i], linkListSize); } - - } - - void saveIndex(const std::string &location) { - std::ofstream output(location, std::ios::binary); - std::streampos position; - saveIndexToStream(output); output.close(); } - void loadIndexFromStream(std::istream & input, SpaceInterface *s, size_t max_elements_i=0) { + void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { + + + std::ifstream input(location, std::ios::binary); + + if (!input.is_open()) + throw std::runtime_error("Cannot open file"); + // get file size: input.seekg(0,input.end); @@ -662,12 +663,14 @@ namespace hnswlib { readBinaryPOD(input, mult_); readBinaryPOD(input, ef_construction_); + data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); auto pos=input.tellg(); + /// Optional - check if index is ok: input.seekg(cur_element_count * size_data_per_element_,input.cur); @@ -693,11 +696,15 @@ namespace hnswlib { input.seekg(pos,input.beg); + data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); if (data_level0_memory_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); + + + size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -708,6 +715,7 @@ namespace hnswlib { visited_list_pool_ = new VisitedListPool(1, max_elements); + linkLists_ = (char **) malloc(sizeof(void *) * max_elements); if (linkLists_ == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); @@ -738,19 +746,8 @@ namespace hnswlib { has_deletions_=true; } - - return; - } - - - - void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { - std::ifstream input(location, std::ios::binary); - if (!input.is_open()) - throw std::runtime_error("Cannot open file"); - - loadIndexFromStream(input, s, max_elements_i); input.close(); + return; } @@ -877,7 +874,7 @@ namespace hnswlib { for (auto&& cand : sCand) { if (cand == neigh) continue; - + dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_); if (candidates.size() < elementsToKeep) { candidates.emplace(distance, cand); @@ -1140,7 +1137,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { + if (has_deletions_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); } @@ -1189,19 +1186,19 @@ namespace hnswlib { std::unordered_set s; for (int j=0; j 0); - assert(data[j] < cur_element_count); + assert(data[j] < cur_element_count); assert (data[j] != i); inbound_connections_num[data[j]]++; s.insert(data[j]); connections_checked++; - + } assert(s.size() == size); } } if(cur_element_count > 1){ int min1=inbound_connections_num[0], max1=inbound_connections_num[0]; - for(int i=0; i < cur_element_count; i++){ + for(int i=0; i < cur_element_count; i++){ assert(inbound_connections_num[i] > 0); min1=std::min(inbound_connections_num[i],min1); max1=std::max(inbound_connections_num[i],max1); @@ -1209,7 +1206,7 @@ namespace hnswlib { std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n"; } std::cout << "integrity ok, checked " << connections_checked << " connections\n"; - + } }; From 3a62b41e11dade2eb846f72a8b0b80ae9217db15 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 13:27:32 -0400 Subject: [PATCH 11/23] use python's buffer protocol to avoid making copies of ann data (state of Index object) when calling Python <-> C++ --- python_bindings/bindings.cpp | 441 +++++++++++++++++++++++++++++------ 1 file changed, 373 insertions(+), 68 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 589f4024..33d71879 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -5,6 +5,8 @@ #include "hnswlib/hnswlib.h" #include #include +#include +#include namespace py = pybind11; @@ -71,6 +73,44 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } +// +// std::priority_queue> +// searchKnn(const void *query_data, size_t k) const { +// std::priority_queue> result; +// if (cur_element_count == 0) return result; +// +// tableint currObj = enterpoint_node_; +// dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); +// +// for (int level = maxlevel_; level > 0; level--) { +// bool changed = true; +// while (changed) { +// changed = false; +// unsigned int *data; +// +// data = (unsigned int *) get_linklist(currObj, level); +// int size = getListCount(data); +// metric_hops++; +// metric_distance_computations+=size; +// +// tableint *datal = (tableint *) (data + 1); +// for (int i = 0; i < size; i++) { +// tableint cand = datal[i]; +// if (cand < 0 || cand > max_elements_) +// throw std::runtime_error("cand error"); +// dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); +// +// if (d < curdist) { +// curdist = d; +// currObj = cand; +// changed = true; +// } +// } +// } +// } +// + + template class Index { public: @@ -91,6 +131,27 @@ class Index { ep_added = true; index_inited = false; num_threads_default = std::thread::hardware_concurrency(); + + default_ef=10; + + } + std::string space_name; + int dim; + size_t seed; + size_t default_ef; + + bool index_inited; + bool ep_added; + bool normalize; + int num_threads_default; + hnswlib::labeltype cur_l; + hnswlib::HierarchicalNSW *appr_alg; + hnswlib::SpaceInterface *l2space; + + ~Index() { + delete l2space; + if (appr_alg) + delete appr_alg; } void init_new_index(const size_t maxElements, const size_t M, const size_t efConstruction, const size_t random_seed) { @@ -101,11 +162,14 @@ class Index { appr_alg = new hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); index_inited = true; ep_added = false; + appr_alg->ef_ = default_ef; + seed=random_seed; } - void set_ef(size_t ef) { + default_ef=ef; + if (appr_alg) appr_alg->ef_ = ef; } @@ -117,18 +181,6 @@ class Index { void saveIndex(const std::string &path_to_index) { appr_alg->saveIndex(path_to_index); } - void saveIndexToStream(std::ostream & output) const { - appr_alg->saveIndexToStream(output); - } - - void loadIndexFromStream(std::istream & input, size_t max_elements) { - if (appr_alg) { - std::cerr<<"Warning: Calling load_index from istream for an already inited index. Old index is being deallocated." << std::endl; - delete appr_alg; - } - appr_alg = new hnswlib::HierarchicalNSW(l2space, input, false, max_elements); - cur_l = appr_alg->cur_element_count; - } void loadIndex(const std::string &path_to_index, size_t max_elements) { if (appr_alg) { @@ -261,6 +313,183 @@ class Index { return ids; } + inline void assert_true(bool expr, const std::string & msg) { + if (expr == false) + throw std::runtime_error("assert failed: "+msg); + return; + } + + + py::tuple getAnnData() const { + + unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; + unsigned int link_npy_size = appr_alg->cur_element_count * appr_alg->maxlevel_ * appr_alg->size_links_per_element_; + unsigned int link_npy_stride = appr_alg->maxlevel_ * appr_alg->size_links_per_element_; + + char* data_level0_npy = (char *) malloc(level0_npy_size); + char* link_list_npy = (char *) malloc(link_npy_size); + + memset(data_level0_npy, 0, level0_npy_size); + memset(link_list_npy, 0, link_npy_size); + + memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); + + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + if (linkListSize){ + memcpy(link_list_npy+(link_npy_stride * i), appr_alg->linkLists_[i], linkListSize); + // std::cout << linkListSize << " " << appr_alg->maxlevel_ << " " << appr_alg->element_levels_[i] << " generator: " << appr_alg->level_generator_ << std::endl; + } + } + + py::capsule free_when_done_l0(data_level0_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_ll(link_list_npy, [](void *f) { + delete[] f; + }); + + return py::make_tuple(appr_alg->offsetLevel0_, + appr_alg->max_elements_, + appr_alg->cur_element_count, + appr_alg->size_data_per_element_, + appr_alg->label_offset_, + appr_alg->offsetData_, + appr_alg->maxlevel_, + appr_alg->enterpoint_node_, + appr_alg->maxM_, + appr_alg->maxM0_, + appr_alg->M_, + appr_alg->mult_, + appr_alg->ef_construction_, + appr_alg->ef_, + appr_alg->has_deletions_, + appr_alg->size_links_per_element_, + appr_alg->label_lookup_, + appr_alg->element_levels_, + py::array_t( + {level0_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + data_level0_npy, // the data pointer + free_when_done_l0), + py::array_t( + {link_npy_size}, // shape + {sizeof(char)}, // C-style contiguous strides for double + link_list_npy, // the data pointer + free_when_done_ll) + ); + + } + + + py::tuple getIndexParams() const { + return py::make_tuple(py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), + index_inited == true ? getAnnData() : py::make_tuple()); + + } + + + static Index * createFromParams(const py::tuple t) { + py::tuple index_params=t[0].cast(); + py::tuple ann_params=t[1].cast(); + + auto space_name_=index_params[0].cast(); + auto dim_=index_params[1].cast(); + auto index_inited_=index_params[2].cast(); + + Index *new_index = new Index(index_params[0].cast(), index_params[1].cast()); + + new_index->seed = index_params[6].cast(); + + + if (index_inited_){ + //// hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); + new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, ann_params[1].cast(), ann_params[10].cast(), ann_params[12].cast(), new_index->seed); + new_index->cur_l = ann_params[2].cast(); + } + + new_index->index_inited = index_inited_; + new_index->ep_added=index_params[3].cast(); + new_index->num_threads_default=index_params[5].cast(); + new_index->default_ef=index_params[7].cast(); + + if (index_inited_) + new_index->setAnnData(ann_params); + + + return new_index; + } + + void setAnnData(const py::tuple t) { + assert_true(appr_alg->offsetLevel0_ == t[0].cast(), "Invalid value of offsetLevel0_ "); + assert_true(appr_alg->max_elements_ == t[1].cast(), "Invalid value of max_elements_ "); + + appr_alg->cur_element_count = t[2].cast(); + + assert_true(appr_alg->size_data_per_element_ == t[3].cast(), "Invalid value of size_data_per_element_ "); + assert_true(appr_alg->label_offset_ == t[4].cast(), "Invalid value of label_offset_ "); + assert_true(appr_alg->offsetData_ == t[5].cast(), "Invalid value of offsetData_ "); + + appr_alg->maxlevel_ = t[6].cast(); + appr_alg->enterpoint_node_ = t[7].cast(); + + assert_true(appr_alg->maxM_ == t[8].cast(), "Invalid value of maxM_ "); + assert_true(appr_alg->maxM0_ == t[9].cast(), "Invalid value of maxM0_ "); + assert_true(appr_alg->M_ == t[10].cast(), "Invalid value of M_ "); + assert_true(appr_alg->mult_ == t[11].cast(), "Invalid value of mult_ "); + assert_true(appr_alg->ef_construction_ == t[12].cast(), "Invalid value of ef_construction_ "); + + appr_alg->ef_ = t[13].cast(); + appr_alg->has_deletions_=t[14].cast(); + assert_true(appr_alg->size_links_per_element_ == t[15].cast(), "Invalid value of size_links_per_element_ "); + + auto label_lookup_dict = t[16].cast(); + auto element_levels_list = t[17].cast(); + auto data_level0_npy = t[18].cast>(); + auto link_list_npy = t[19].cast>(); + + for (auto el: label_lookup_dict){ + appr_alg->label_lookup_.insert( + std::make_pair( + el.first.cast(), + el.second.cast())); + } + + + int idx = 0; + for (auto el : element_levels_list){ + appr_alg->element_levels_[idx]=el.cast(); + idx++; + } + + + memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(), data_level0_npy.nbytes()); + + for (size_t i = 0; i < appr_alg->max_elements_; i++) { + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + if (linkListSize == 0) { + appr_alg->linkLists_[i] = nullptr; + } else { + appr_alg->linkLists_[i] = (char *) malloc(linkListSize); + if (appr_alg->linkLists_[i] == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); + + memcpy(appr_alg->linkLists_[i], (link_list_npy.data()+(appr_alg->maxlevel_ * appr_alg->size_links_per_element_ * i)), linkListSize); + + } + } + + + // TODO: use global lock for de-/serialization + // std::unique_lock templock(global); + // int maxlevelcopy = maxlevel_; + // if (curlevel <= maxlevelcopy) + // templock.unlock(); + + } + + py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); @@ -317,7 +546,7 @@ class Index { float *data= (float *) items.data(row); size_t start_idx = threadId * dim; - normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); + normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); std::priority_queue> result = appr_alg->searchKnn( (void *) (norm_array.data()+start_idx), k); @@ -374,22 +603,6 @@ class Index { return appr_alg->cur_element_count; } - std::string space_name; - int dim; - - bool index_inited; - bool ep_added; - bool normalize; - int num_threads_default; - hnswlib::labeltype cur_l; - hnswlib::HierarchicalNSW *appr_alg; - hnswlib::SpaceInterface *l2space; - - ~Index() { - delete l2space; - if (appr_alg) - delete appr_alg; - } }; @@ -397,7 +610,9 @@ class Index { PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); + // py::class_, std::shared_ptr >>(m, "Index") py::class_>(m, "Index") + .def(py::init(&Index::createFromParams), py::arg("params")) //createFromParams(const py::tuple t) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) @@ -410,18 +625,23 @@ PYBIND11_PLUGIN(hnswlib) { .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) - .def_readonly("space", &Index::space_name) + .def_readonly("space_name", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) .def_property("ef", [](const Index & index) { - return index.index_inited ? index.appr_alg->ef_ : 10; + return index.index_inited ? index.appr_alg->ef_ : index.default_ef; }, [](Index & index, const size_t ef_) { - if (index.index_inited) + // index.set_ef(ef_); + index.default_ef=ef_; + if (index.appr_alg) index.appr_alg->ef_ = ef_; - else - throw std::runtime_error("must call init_index prior to setting ef parameter"); + + // if (index.index_inited) + // index.appr_alg->ef_ = ef_; + // else + // throw std::runtime_error("must call init_index prior to setting ef parameter"); }) .def_property_readonly("max_elements", [](const Index & index) { return index.index_inited ? index.appr_alg->max_elements_ : 0; @@ -435,41 +655,126 @@ PYBIND11_PLUGIN(hnswlib) { .def_property_readonly("M", [](const Index & index) { return index.index_inited ? index.appr_alg->M_ : 0; }) - .def("__getstate__", [](const Index & index) { // __getstate__ - std::stringstream output(std::stringstream::out|std::stringstream::binary); - - - if (index.index_inited) - index.saveIndexToStream(output); - - /* Return a tuple that fully encodes the state of the object */ - return py::make_tuple(index.space_name, index.dim, - index.index_inited, index.ep_added, - index.normalize, index.num_threads_default, - py::bytes(output.str()), - index.index_inited == false ? 10 : index.appr_alg->ef_, - index.index_inited == false ? 0 : index.appr_alg->max_elements_, - index.index_inited == false ? 0 : index.appr_alg->cur_element_count - ); - }) - .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ - if (t.size() != 10) - throw std::runtime_error("Invalid state!"); - - /* Invoke Index constructor (need to use in-place version) */ - new (&index) Index(t[0].cast(), t[1].cast()); - index.index_inited=t[2].cast(); - index.ep_added=t[3].cast(); - index.normalize=t[4].cast(); - index.num_threads_default=t[5].cast(); - - if (index.index_inited){ - std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); - index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state - index.appr_alg->ef_=(t[7].cast()); - } + .def(py::pickle( + [](const Index &ind) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + return ind.getIndexParams(); + }, + [](py::tuple t) { // __setstate__ + if (t.size() != 2) + throw std::runtime_error("Invalid state!"); + return Index::createFromParams(t); + } + )) + + .def("check_integrity", [](const Index & index) { + index.appr_alg->checkIntegrity(); + std::cout<< index.default_ef << " " << index.appr_alg->ef_ << std::endl; + return index.appr_alg->ef_; + // return index.getIndexParams(); + // return index.appr_alg->element_levels_; + + // std::stringstream output(std::stringstream::out|std::stringstream::binary); + // + // .def("get_params", &Index::getIndexParams) + // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { + // + // if (index.index_inited) + // index.saveIndexToStream(output); + // + // /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(index.space_name, index.dim, + // index.index_inited, index.ep_added, + // index.normalize, index.num_threads_default, + // py::bytes(output.str()), + // index.index_inited == false ? 10 : index.appr_alg->ef_, + // index.index_inited == false ? 0 : index.appr_alg->max_elements_, + // index.index_inited == false ? 0 : index.appr_alg->cur_element_count + // ); }) + + + // .def(py::pickle( + // [](const Index & index) { // __getstate__ + // /* Return a tuple that fully encodes the state of the object */ + // return index.getIndexParams(); + // }, + // [](Index & index, py::tuple t) { // __setstate__ + // if (t.size() != 2) + // throw std::runtime_error("Invalid state!"); + // + // /* Invoke Index constructor (need to use in-place version) */ + // // py::tuple index_params = t[0].cast(); + // // Index new_index(index_params[0].cast(), index_params[1].cast()); + // index.setIndexParams(t); + // return index; + // + // /* Create a new C++ instance */ + // // Pickleable p(t[0].cast()); + // + // /* Assign any additional state */ + // // p.setExtra(t[1].cast()); + // + // // return p; + // } + // )) + + // .def("__getstate__", &Index::getIndexParams) // __getstate__ + // .def("__setstate__", &Index::setIndexParams) // __setstate__ + // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ + // py::tuple index_params = t[0].cast(); + // new (&index) Index(index_params[0].cast(), index_params[1].cast()); + // index.setIndexParams(t); + // return index; + // }) + // .def("__getstate__", [](const Index & index) { // __getstate__ + // return index.getIndexParams(); + // + // // std::stringstream output(std::stringstream::out|std::stringstream::binary); + // // + // // .def("get_params", &Index::getIndexParams) + // // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { + // // + // // if (index.index_inited) + // // index.saveIndexToStream(output); + // // + // // /* Return a tuple that fully encodes the state of the object */ + // // return py::make_tuple(index.space_name, index.dim, + // // index.index_inited, index.ep_added, + // // index.normalize, index.num_threads_default, + // // py::bytes(output.str()), + // // index.index_inited == false ? 10 : index.appr_alg->ef_, + // // index.index_inited == false ? 0 : index.appr_alg->max_elements_, + // // index.index_inited == false ? 0 : index.appr_alg->cur_element_count + // // ); + // }) + // .def("set_state", [](Index & index, py::tuple t) { // __setstate__ + // index.setIndexParams(t); + // }) + // + // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ + // // delete &index; + // /* Invoke Index constructor (need to use in-place version) */ + // // py::tuple index_params = t[0].cast(); + // // new (&index) Index(index_params[0].cast(), index_params[1].cast()); + // index.setIndexParams(t); + // // if (t.size() != 10) + // // throw std::runtime_error("Invalid state!"); + // // + // + // // index.index_inited=t[2].cast(); + // // index.ep_added=t[3].cast(); + // // index.normalize=t[4].cast(); + // // index.num_threads_default=t[5].cast(); + // // + // // if (index.index_inited){ + // // std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); + // // index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state + // // index.appr_alg->ef_=(t[7].cast()); + // // } + // + // }) .def("__repr__", [](const Index &a) { return ""; }); From fe6d2faaa734b57344240f607ef3a3c78e7731b7 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 13:29:40 -0400 Subject: [PATCH 12/23] replace tab characters with spaces --- hnswlib/hnswalg.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 7d0eb443..3e74d856 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -874,7 +874,7 @@ namespace hnswlib { for (auto&& cand : sCand) { if (cand == neigh) continue; - + dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_); if (candidates.size() < elementsToKeep) { candidates.emplace(distance, cand); @@ -1137,7 +1137,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { + if (has_deletions_) { top_candidates=searchBaseLayerST( currObj, query_data, std::max(ef_, k)); } @@ -1186,19 +1186,19 @@ namespace hnswlib { std::unordered_set s; for (int j=0; j 0); - assert(data[j] < cur_element_count); + assert(data[j] < cur_element_count); assert (data[j] != i); inbound_connections_num[data[j]]++; s.insert(data[j]); connections_checked++; - + } assert(s.size() == size); } } if(cur_element_count > 1){ int min1=inbound_connections_num[0], max1=inbound_connections_num[0]; - for(int i=0; i < cur_element_count; i++){ + for(int i=0; i < cur_element_count; i++){ assert(inbound_connections_num[i] > 0); min1=std::min(inbound_connections_num[i],min1); max1=std::max(inbound_connections_num[i],max1); @@ -1206,7 +1206,7 @@ namespace hnswlib { std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n"; } std::cout << "integrity ok, checked " << connections_checked << " connections\n"; - + } }; From c9fb60dde52649cfd143c8c5a72e6a09ddb32625 Mon Sep 17 00:00:00 2001 From: "Bespalov, Dmitriy (CORP)" Date: Fri, 23 Oct 2020 13:30:51 -0400 Subject: [PATCH 13/23] test each space (ip/cosine/l2) as a separate unittest --- python_bindings/tests/bindings_test_pickle.py | 241 ++++++++++-------- 1 file changed, 128 insertions(+), 113 deletions(-) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 3d011726..a6b74a9d 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -1,6 +1,8 @@ import unittest import numpy as np +import hnswlib +import pickle def get_dist(metric, pt1, pt2): @@ -24,120 +26,133 @@ def brute_force_distances(metric, items, query_items, k): return labels[:,:k], dists[:,:k] -class PickleSelfTestCase(unittest.TestCase): +def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): + brute_l, brute_d = brute_force_distances(metric, items, query_items, k) + err_total = 0 + for jj in range(query_items.shape[0]): + err = np.sum(np.isin(brute_l[jj, :], ann_l[jj, :], invert=True)) + if err > 0: + print(f"Warning: {err} labels are missing from ann results (k={k}, err_thresh={err_thresh})") - def check_ann_results(self, metric, items, query_items, k, ann_l, ann_d, err_thresh=0, total_thresh=0, dists_thresh=0): - brute_l, brute_d = brute_force_distances(metric, items, query_items, k) - err_total = 0 - for jj in range(query_items.shape[0]): - err = np.sum(np.isin(brute_l[jj, :], ann_l[jj, :], invert=True)) - if err > 0: - print(f"Warning: {err} labels are missing from ann results (k={k}, err_thresh={err_thresh})") - - if err > err_thresh: - err_total += 1 - - self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") - - wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) - if wrong_dists > 0: - dists_count=brute_d.shape[0]*brute_d.shape[1] - print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") - - self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") - - def testPickle(self): - import hnswlib - import pickle - - ef_construction = 725 - M = 64 - ef = 725 - - num_elements = 5000 - num_test_elements = 200 - - num_threads = 4 - k = 15 - - label_err_thresh=5 ### max number of missing labels allowed per test item - item_err_thresh=5 ### max number of items allowed with incorrect labels - - dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max - ### number of value pairs that are allowed to be different in d1 and d2 - ### i.e., number of values that are (d1-d2)**2>1e-3 - - for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: - - # Generating sample data - data = np.float32(np.random.random((num_elements, dim))) - test_data = np.float32(np.random.random((num_test_elements, dim))) - - # Declaring index - p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip - print(f"Running pickle tests for {p}") - - p.num_threads=num_threads # by default using all available cores - - p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index - p.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) - p0.init_index(max_elements = num_elements, ef_construction = ef_construction, M = M) - - p.ef=ef ### Note: ef parameter can be set only after calling p.init_index, - p0.ef=ef ### so we have to set p0.ef - - p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items - - ### add items to ann index p,p0,p1 - p.add_items(data) - p1.add_items(data) - p0.add_items(data) - - p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items - - self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") - self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") - self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") - - ### Test if returned distances are same - l, d = p.knn_query(test_data, k=k) - l0, d0 = p0.knn_query(test_data, k=k) - l1, d1 = p1.knn_query(test_data, k=k) - l2, d2 = p2.knn_query(test_data, k=k) - - self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p and p0 must match") - self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") - self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") - - ### check if ann results match brute-force search - ### allow for 2 labels to be missing from ann results - self.check_ann_results(space, data, test_data, k, l, d, - err_thresh = label_err_thresh, - total_thresh = item_err_thresh, - dists_thresh = dists_err_thresh) - - self.check_ann_results(space, data, test_data, k, l2, d2, - err_thresh=label_err_thresh, - total_thresh=item_err_thresh, - dists_thresh=dists_err_thresh) - - ### Check ef parameter value - self.assertEqual(p.ef, ef, "incorrect value of p.ef") - self.assertEqual(p0.ef, ef, "incorrect value of p0.ef") - self.assertEqual(p2.ef, ef, "incorrect value of p2.ef") - self.assertEqual(p1.ef, ef, "incorrect value of p1.ef") - - ### Check M parameter value - self.assertEqual(p.M, M, "incorrect value of p.M") - self.assertEqual(p0.M, M, "incorrect value of p0.M") - self.assertEqual(p1.M, M, "incorrect value of p1.M") - self.assertEqual(p2.M, M, "incorrect value of p2.M") - - ### Check ef_construction parameter value - self.assertEqual(p.ef_construction, ef_construction, "incorrect value of p.ef_construction") - self.assertEqual(p0.ef_construction, ef_construction, "incorrect value of p0.ef_construction") - self.assertEqual(p1.ef_construction, ef_construction, "incorrect value of p1.ef_construction") - self.assertEqual(p2.ef_construction, ef_construction, "incorrect value of p2.ef_construction") + if err > err_thresh: + err_total += 1 + + self.assertLessEqual( err_total, total_thresh, f"Error: knn_query returned incorrect labels for {err_total} items (k={k})") + + wrong_dists=np.sum(((brute_d- ann_d)**2.)>1e-3) + if wrong_dists > 0: + dists_count=brute_d.shape[0]*brute_d.shape[1] + print(f"Warning: {wrong_dists} ann distance values are different from brute-force values (total # of values={dists_count}, dists_thresh={dists_thresh})") + + self.assertLessEqual( wrong_dists, dists_thresh, msg=f"Error: {wrong_dists} ann distance values are different from brute-force values") + +def test_space_main(self, space, dim): + + # Generating sample data + data = np.float32(np.random.random((self.num_elements, dim))) + test_data = np.float32(np.random.random((self.num_test_elements, dim))) + + # Declaring index + p = hnswlib.Index(space=space, dim=dim) # possible options are l2, cosine or ip + print(f"Running pickle tests for {p}") + + p.num_threads=self.num_threads # by default using all available cores + + p0=pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) + p0.init_index(max_elements = self.num_elements, ef_construction = self.ef_construction, M = self.M) + + p.ef=self.ef + p0.ef=self.ef + + p1=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + ### add items to ann index p,p0,p1 + p.add_items(data) + p1.add_items(data) + p0.add_items(data) + + p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + + self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") + self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") + self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") + + ### Test if returned distances are same + l, d = p.knn_query(test_data, k=self.k) + l0, d0 = p0.knn_query(test_data, k=self.k) + l1, d1 = p1.knn_query(test_data, k=self.k) + l2, d2 = p2.knn_query(test_data, k=self.k) + + self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match") + self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") + self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") + + ### check if ann results match brute-force search + ### allow for 2 labels to be missing from ann results + check_ann_results(self, space, data, test_data, self.k, l, d, + err_thresh = self.label_err_thresh, + total_thresh = self.item_err_thresh, + dists_thresh = self.dists_err_thresh) + + check_ann_results(self, space, data, test_data, self.k, l2, d2, + err_thresh=self.label_err_thresh, + total_thresh=self.item_err_thresh, + dists_thresh=self.dists_err_thresh) + + ### Check ef parameter value + self.assertEqual(p.ef, self.ef, "incorrect value of p.ef") + self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef") + self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef") + self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef") + + ### Check M parameter value + self.assertEqual(p.M, self.M, "incorrect value of p.M") + self.assertEqual(p0.M, self.M, "incorrect value of p0.M") + self.assertEqual(p1.M, self.M, "incorrect value of p1.M") + self.assertEqual(p2.M, self.M, "incorrect value of p2.M") + + ### Check ef_construction parameter value + self.assertEqual(p.ef_construction, self.ef_construction, "incorrect value of p.ef_construction") + self.assertEqual(p0.ef_construction, self.ef_construction, "incorrect value of p0.ef_construction") + self.assertEqual(p1.ef_construction, self.ef_construction, "incorrect value of p1.ef_construction") + self.assertEqual(p2.ef_construction, self.ef_construction, "incorrect value of p2.ef_construction") + + + +class PickleUnitTests(unittest.TestCase): + + def setUp(self): + + self.ef_construction = 725 + self.M = 64 + self.ef = 725 + + self.num_elements = 5000 + self.num_test_elements = 200 + + self.num_threads = 4 + self.k = 25 + + self.label_err_thresh=5 ### max number of missing labels allowed per test item + self.item_err_thresh=5 ### max number of items allowed with incorrect labels + + self.dists_err_thresh=50 ### for two matrices, d1 and d2, dists_err_thresh controls max + ### number of value pairs that are allowed to be different in d1 and d2 + ### i.e., number of values that are (d1-d2)**2>1e-3 + + def testInnerProductSpace(self): + test_space_main(self, 'ip', 48) + + def testL2Space(self): + test_space_main(self, 'l2', 153) + + def testCosineSpace(self): + test_space_main(self, 'cosine', 512) + + # + # for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: + # test_space_main if __name__ == "__main__": From 3c4510db09677d97afd7b5b00deffc42496e21c4 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Sun, 25 Oct 2020 04:33:08 +0000 Subject: [PATCH 14/23] return array_t pointers --- hnswlib/hnswalg.h | 1 - python_bindings/bindings.cpp | 116 ++--------------------------------- 2 files changed, 4 insertions(+), 113 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 3e74d856..7c2c01c3 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -637,7 +637,6 @@ namespace hnswlib { if (!input.is_open()) throw std::runtime_error("Cannot open file"); - // get file size: input.seekg(0,input.end); std::streampos total_filesize=input.tellg(); diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 33d71879..f066368c 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -273,7 +273,7 @@ class Index { std::vector norm_array(num_threads * dim); ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) { // normalize vector: - size_t start_idx = threadId * dim; + size_t start_idx = threadId * dim; normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); size_t id = ids.size() ? ids.at(row) : (cur_l+row); @@ -339,7 +339,6 @@ class Index { unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; if (linkListSize){ memcpy(link_list_npy+(link_npy_stride * i), appr_alg->linkLists_[i], linkListSize); - // std::cout << linkListSize << " " << appr_alg->maxlevel_ << " " << appr_alg->element_levels_[i] << " generator: " << appr_alg->level_generator_ << std::endl; } } @@ -368,12 +367,12 @@ class Index { appr_alg->size_links_per_element_, appr_alg->label_lookup_, appr_alg->element_levels_, - py::array_t( + new py::array_t( {level0_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double data_level0_npy, // the data pointer free_when_done_l0), - py::array_t( + new py::array_t( {link_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double link_list_npy, // the data pointer @@ -546,7 +545,7 @@ class Index { float *data= (float *) items.data(row); size_t start_idx = threadId * dim; - normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); + normalize_vector((float *) items.data(row), (norm_array.data()+start_idx)); std::priority_queue> result = appr_alg->searchKnn( (void *) (norm_array.data()+start_idx), k); @@ -633,15 +632,9 @@ PYBIND11_PLUGIN(hnswlib) { return index.index_inited ? index.appr_alg->ef_ : index.default_ef; }, [](Index & index, const size_t ef_) { - // index.set_ef(ef_); index.default_ef=ef_; if (index.appr_alg) index.appr_alg->ef_ = ef_; - - // if (index.index_inited) - // index.appr_alg->ef_ = ef_; - // else - // throw std::runtime_error("must call init_index prior to setting ef parameter"); }) .def_property_readonly("max_elements", [](const Index & index) { return index.index_inited ? index.appr_alg->max_elements_ : 0; @@ -672,109 +665,8 @@ PYBIND11_PLUGIN(hnswlib) { index.appr_alg->checkIntegrity(); std::cout<< index.default_ef << " " << index.appr_alg->ef_ << std::endl; return index.appr_alg->ef_; - // return index.getIndexParams(); - // return index.appr_alg->element_levels_; - - // std::stringstream output(std::stringstream::out|std::stringstream::binary); - // - // .def("get_params", &Index::getIndexParams) - // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { - // - // if (index.index_inited) - // index.saveIndexToStream(output); - // - // /* Return a tuple that fully encodes the state of the object */ - // return py::make_tuple(index.space_name, index.dim, - // index.index_inited, index.ep_added, - // index.normalize, index.num_threads_default, - // py::bytes(output.str()), - // index.index_inited == false ? 10 : index.appr_alg->ef_, - // index.index_inited == false ? 0 : index.appr_alg->max_elements_, - // index.index_inited == false ? 0 : index.appr_alg->cur_element_count - // ); }) - - // .def(py::pickle( - // [](const Index & index) { // __getstate__ - // /* Return a tuple that fully encodes the state of the object */ - // return index.getIndexParams(); - // }, - // [](Index & index, py::tuple t) { // __setstate__ - // if (t.size() != 2) - // throw std::runtime_error("Invalid state!"); - // - // /* Invoke Index constructor (need to use in-place version) */ - // // py::tuple index_params = t[0].cast(); - // // Index new_index(index_params[0].cast(), index_params[1].cast()); - // index.setIndexParams(t); - // return index; - // - // /* Create a new C++ instance */ - // // Pickleable p(t[0].cast()); - // - // /* Assign any additional state */ - // // p.setExtra(t[1].cast()); - // - // // return p; - // } - // )) - - // .def("__getstate__", &Index::getIndexParams) // __getstate__ - // .def("__setstate__", &Index::setIndexParams) // __setstate__ - // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ - // py::tuple index_params = t[0].cast(); - // new (&index) Index(index_params[0].cast(), index_params[1].cast()); - // index.setIndexParams(t); - // return index; - // }) - // .def("__getstate__", [](const Index & index) { // __getstate__ - // return index.getIndexParams(); - // - // // std::stringstream output(std::stringstream::out|std::stringstream::binary); - // // - // // .def("get_params", &Index::getIndexParams) - // // .def("set_params", &Index::setIndexParams, py::arg("t"))// [](Index & index, py::tuple t) { - // // - // // if (index.index_inited) - // // index.saveIndexToStream(output); - // // - // // /* Return a tuple that fully encodes the state of the object */ - // // return py::make_tuple(index.space_name, index.dim, - // // index.index_inited, index.ep_added, - // // index.normalize, index.num_threads_default, - // // py::bytes(output.str()), - // // index.index_inited == false ? 10 : index.appr_alg->ef_, - // // index.index_inited == false ? 0 : index.appr_alg->max_elements_, - // // index.index_inited == false ? 0 : index.appr_alg->cur_element_count - // // ); - // }) - // .def("set_state", [](Index & index, py::tuple t) { // __setstate__ - // index.setIndexParams(t); - // }) - // - // .def("__setstate__", [](Index & index, py::tuple t) { // __setstate__ - // // delete &index; - // /* Invoke Index constructor (need to use in-place version) */ - // // py::tuple index_params = t[0].cast(); - // // new (&index) Index(index_params[0].cast(), index_params[1].cast()); - // index.setIndexParams(t); - // // if (t.size() != 10) - // // throw std::runtime_error("Invalid state!"); - // // - // - // // index.index_inited=t[2].cast(); - // // index.ep_added=t[3].cast(); - // // index.normalize=t[4].cast(); - // // index.num_threads_default=t[5].cast(); - // // - // // if (index.index_inited){ - // // std::stringstream input(t[6].cast(), std::stringstream::in|std::stringstream::binary); - // // index.loadIndexFromStream(input, t[8].cast()); // use max_elements from state - // // index.appr_alg->ef_=(t[7].cast()); - // // } - // - // }) .def("__repr__", [](const Index &a) { return ""; }); From 64c51548373011a1f48eadb6d6b28f93edc8cfd1 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Sun, 25 Oct 2020 16:10:17 +0000 Subject: [PATCH 15/23] expose static method of Index class as copy constructor in python --- python_bindings/bindings.cpp | 90 +++++++++++++----------------------- 1 file changed, 33 insertions(+), 57 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index f066368c..cd1d41f7 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -73,43 +73,6 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } -// -// std::priority_queue> -// searchKnn(const void *query_data, size_t k) const { -// std::priority_queue> result; -// if (cur_element_count == 0) return result; -// -// tableint currObj = enterpoint_node_; -// dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); -// -// for (int level = maxlevel_; level > 0; level--) { -// bool changed = true; -// while (changed) { -// changed = false; -// unsigned int *data; -// -// data = (unsigned int *) get_linklist(currObj, level); -// int size = getListCount(data); -// metric_hops++; -// metric_distance_computations+=size; -// -// tableint *datal = (tableint *) (data + 1); -// for (int i = 0; i < size; i++) { -// tableint cand = datal[i]; -// if (cand < 0 || cand > max_elements_) -// throw std::runtime_error("cand error"); -// dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); -// -// if (d < curdist) { -// curdist = d; -// currObj = cand; -// changed = true; -// } -// } -// } -// } -// - template class Index { @@ -321,10 +284,18 @@ class Index { py::tuple getAnnData() const { + std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; - unsigned int link_npy_size = appr_alg->cur_element_count * appr_alg->maxlevel_ * appr_alg->size_links_per_element_; - unsigned int link_npy_stride = appr_alg->maxlevel_ * appr_alg->size_links_per_element_; + unsigned int link_npy_size = 0; + std::vector link_npy_offsets(appr_alg->cur_element_count); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + link_npy_offsets[i]=link_npy_size; + if (linkListSize) + link_npy_size += linkListSize; + } char* data_level0_npy = (char *) malloc(level0_npy_size); char* link_list_npy = (char *) malloc(link_npy_size); @@ -338,7 +309,7 @@ class Index { for (size_t i = 0; i < appr_alg->cur_element_count; i++){ unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; if (linkListSize){ - memcpy(link_list_npy+(link_npy_stride * i), appr_alg->linkLists_[i], linkListSize); + memcpy(link_list_npy+link_npy_offsets[i], appr_alg->linkLists_[i], linkListSize); } } @@ -367,12 +338,12 @@ class Index { appr_alg->size_links_per_element_, appr_alg->label_lookup_, appr_alg->element_levels_, - new py::array_t( + py::array_t( {level0_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double data_level0_npy, // the data pointer free_when_done_l0), - new py::array_t( + py::array_t( {link_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double link_list_npy, // the data pointer @@ -401,7 +372,6 @@ class Index { new_index->seed = index_params[6].cast(); - if (index_inited_){ //// hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, ann_params[1].cast(), ann_params[10].cast(), ann_params[12].cast(), new_index->seed); @@ -420,7 +390,14 @@ class Index { return new_index; } + static Index * createFromIndex(const Index & index) { + return createFromParams(index.getIndexParams()); + } + void setAnnData(const py::tuple t) { + + std::unique_lock templock(appr_alg->global); + assert_true(appr_alg->offsetLevel0_ == t[0].cast(), "Invalid value of offsetLevel0_ "); assert_true(appr_alg->max_elements_ == t[1].cast(), "Invalid value of max_elements_ "); @@ -448,6 +425,8 @@ class Index { auto data_level0_npy = t[18].cast>(); auto link_list_npy = t[19].cast>(); + + for (auto el: label_lookup_dict){ appr_alg->label_lookup_.insert( std::make_pair( @@ -462,6 +441,15 @@ class Index { idx++; } + unsigned int link_npy_size = 0; + std::vector link_npy_offsets(appr_alg->cur_element_count); + + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; + link_npy_offsets[i]=link_npy_size; + if (linkListSize) + link_npy_size += linkListSize; + } memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(), data_level0_npy.nbytes()); @@ -474,18 +462,12 @@ class Index { if (appr_alg->linkLists_[i] == nullptr) throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); - memcpy(appr_alg->linkLists_[i], (link_list_npy.data()+(appr_alg->maxlevel_ * appr_alg->size_links_per_element_ * i)), linkListSize); + memcpy(appr_alg->linkLists_[i], link_list_npy.data()+link_npy_offsets[i], linkListSize); } } - // TODO: use global lock for de-/serialization - // std::unique_lock templock(global); - // int maxlevelcopy = maxlevel_; - // if (curlevel <= maxlevelcopy) - // templock.unlock(); - } @@ -609,9 +591,9 @@ class Index { PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); - // py::class_, std::shared_ptr >>(m, "Index") py::class_>(m, "Index") .def(py::init(&Index::createFromParams), py::arg("params")) //createFromParams(const py::tuple t) + .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) @@ -661,12 +643,6 @@ PYBIND11_PLUGIN(hnswlib) { } )) - .def("check_integrity", [](const Index & index) { - index.appr_alg->checkIntegrity(); - std::cout<< index.default_ef << " " << index.appr_alg->ef_ << std::endl; - return index.appr_alg->ef_; - }) - .def("__repr__", [](const Index &a) { return ""; }); From 7b445c8aee93b13166ecc6e6a6aa694ee65bdc6c Mon Sep 17 00:00:00 2001 From: dbespalov Date: Sun, 25 Oct 2020 23:41:37 +0000 Subject: [PATCH 16/23] do not waste space when returning serialized appr_alg->linkLists_ --- python_bindings/bindings.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index cd1d41f7..35783868 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -360,7 +360,7 @@ class Index { } - static Index * createFromParams(const py::tuple t) { + static Index * createFromParams(const py::tuple & t) { py::tuple index_params=t[0].cast(); py::tuple ann_params=t[1].cast(); @@ -394,7 +394,7 @@ class Index { return createFromParams(index.getIndexParams()); } - void setAnnData(const py::tuple t) { + void setAnnData(const py::tuple & t) { std::unique_lock templock(appr_alg->global); @@ -471,7 +471,7 @@ class Index { } - py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { + py::object knnQuery_return_numpy(py::object & input, size_t k = 1, int num_threads = -1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); @@ -636,7 +636,7 @@ PYBIND11_PLUGIN(hnswlib) { /* Return a tuple that fully encodes the state of the object */ return ind.getIndexParams(); }, - [](py::tuple t) { // __setstate__ + [](py::tuple & t) { // __setstate__ if (t.size() != 2) throw std::runtime_error("Invalid state!"); return Index::createFromParams(t); From c02f1dccbf121cf58cfade0477602a5120014071 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Mon, 26 Oct 2020 22:58:10 +0000 Subject: [PATCH 17/23] serialize element_lookup_ and element_level_ as array_t arrays; pass python types by value everywhere --- python_bindings/bindings.cpp | 88 ++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 35783868..25ac4bd1 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -299,12 +299,27 @@ class Index { char* data_level0_npy = (char *) malloc(level0_npy_size); char* link_list_npy = (char *) malloc(link_npy_size); + int* element_levels_npy = (int *) malloc(appr_alg->element_levels_.size()*sizeof(int)); + + // std::unordered_map label_lookup_; + hnswlib::labeltype* label_lookup_key_npy = (hnswlib::labeltype *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); + hnswlib::tableint* label_lookup_val_npy = (hnswlib::tableint *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); + + memset(label_lookup_key_npy, -1, appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); + memset(label_lookup_val_npy, -1, appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); + + size_t idx=0; + for ( auto it = appr_alg->label_lookup_.begin(); it != appr_alg->label_lookup_.end(); ++it ){ + label_lookup_key_npy[idx]= it->first; + label_lookup_val_npy[idx]= it->second; + idx++; + } memset(data_level0_npy, 0, level0_npy_size); memset(link_list_npy, 0, link_npy_size); memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); - + memcpy(element_levels_npy, appr_alg->element_levels_.data(), appr_alg->element_levels_.size() * sizeof(int)); for (size_t i = 0; i < appr_alg->cur_element_count; i++){ unsigned int linkListSize = appr_alg->element_levels_[i] > 0 ? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i] : 0; @@ -316,6 +331,15 @@ class Index { py::capsule free_when_done_l0(data_level0_npy, [](void *f) { delete[] f; }); + py::capsule free_when_done_lvl(element_levels_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_lb(label_lookup_key_npy, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_id(label_lookup_val_npy, [](void *f) { + delete[] f; + }); py::capsule free_when_done_ll(link_list_npy, [](void *f) { delete[] f; }); @@ -336,8 +360,21 @@ class Index { appr_alg->ef_, appr_alg->has_deletions_, appr_alg->size_links_per_element_, - appr_alg->label_lookup_, - appr_alg->element_levels_, + py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double + label_lookup_key_npy, // the data pointer + free_when_done_lb), + py::array_t( + {appr_alg->label_lookup_.size()}, // shape + {sizeof(hnswlib::tableint)}, // C-style contiguous strides for double + label_lookup_val_npy, // the data pointer + free_when_done_id), + py::array_t( + {appr_alg->element_levels_.size()}, // shape + {sizeof(int)}, // C-style contiguous strides for double + element_levels_npy, // the data pointer + free_when_done_lvl), py::array_t( {level0_npy_size}, // shape {sizeof(char)}, // C-style contiguous strides for double @@ -360,7 +397,7 @@ class Index { } - static Index * createFromParams(const py::tuple & t) { + static Index * createFromParams(const py::tuple t) { py::tuple index_params=t[0].cast(); py::tuple ann_params=t[1].cast(); @@ -394,7 +431,7 @@ class Index { return createFromParams(index.getIndexParams()); } - void setAnnData(const py::tuple & t) { + void setAnnData(const py::tuple t) { std::unique_lock templock(appr_alg->global); @@ -420,26 +457,23 @@ class Index { appr_alg->has_deletions_=t[14].cast(); assert_true(appr_alg->size_links_per_element_ == t[15].cast(), "Invalid value of size_links_per_element_ "); - auto label_lookup_dict = t[16].cast(); - auto element_levels_list = t[17].cast(); - auto data_level0_npy = t[18].cast>(); - auto link_list_npy = t[19].cast>(); - - + // std::unordered_map label_lookup_; + auto label_lookup_key_npy = t[16].cast >(); + auto label_lookup_val_npy = t[17].cast >(); + auto element_levels_npy = t[18].cast >(); + auto data_level0_npy = t[19].cast >(); + auto link_list_npy = t[20].cast >(); - for (auto el: label_lookup_dict){ - appr_alg->label_lookup_.insert( - std::make_pair( - el.first.cast(), - el.second.cast())); + for (size_t i = 0; i < appr_alg->cur_element_count; i++){ + if (label_lookup_val_npy.data()[i] < 0){ + throw std::runtime_error("internal id cannot be negative!"); + } + else{ + appr_alg->label_lookup_.insert(std::make_pair(label_lookup_key_npy.data()[i], label_lookup_val_npy.data()[i])); + } } - - int idx = 0; - for (auto el : element_levels_list){ - appr_alg->element_levels_[idx]=el.cast(); - idx++; - } + memcpy(appr_alg->element_levels_.data(), element_levels_npy.data(), element_levels_npy.nbytes()); unsigned int link_npy_size = 0; std::vector link_npy_offsets(appr_alg->cur_element_count); @@ -467,11 +501,9 @@ class Index { } } +} - } - - - py::object knnQuery_return_numpy(py::object & input, size_t k = 1, int num_threads = -1) { + py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); @@ -586,8 +618,6 @@ class Index { }; - - PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); @@ -636,7 +666,7 @@ PYBIND11_PLUGIN(hnswlib) { /* Return a tuple that fully encodes the state of the object */ return ind.getIndexParams(); }, - [](py::tuple & t) { // __setstate__ + [](py::tuple t) { // __setstate__ if (t.size() != 2) throw std::runtime_error("Invalid state!"); return Index::createFromParams(t); From 1f251021d6bb581ce1d7e9054de2bd54eadf37df Mon Sep 17 00:00:00 2001 From: dbespalov Date: Tue, 3 Nov 2020 05:47:46 +0000 Subject: [PATCH 18/23] warn that serialization is not thread safe with add_items --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 28accb84..4d74b003 100644 --- a/README.md +++ b/README.md @@ -92,9 +92,9 @@ Read-only properties of `hnswlib.Index` class: Properties of `hnswlib.Index` that support reading and writing: -* `ef` - parameter controlling query time/accuracy trade-off. Note that setting property `p.ef` prior to index initialization with `p.init_index(...)` will raise an error. +* `ef` - parameter controlling query time/accuracy trade-off. -* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to setting `p.num_threads=3`. +* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`. @@ -127,7 +127,9 @@ p.set_ef(50) # ef should always be > k # Query dataset, k - number of closest elements (returns 2 numpy arrays) labels, distances = p.knn_query(data, k = 1) -# Index objects support pickling: +# Index objects support pickling +# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method! +# Note: ef parameter is included in serialization; random number generator is initialized with random_seeed on Index load p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip ### Index parameters are exposed as class properties: From 1165370b76c4ddf2f67e3b0913f079386a34fc40 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Tue, 3 Nov 2020 05:48:44 +0000 Subject: [PATCH 19/23] warn that serialization is not thread safe with add_items; add todo block for serialization of random generators --- python_bindings/bindings.cpp | 41 +++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 25ac4bd1..38dcb18e 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -154,14 +154,14 @@ class Index { cur_l = appr_alg->cur_element_count; } - void normalize_vector(float *data, float *norm_array){ - float norm=0.0f; - for(int i=0;i items(input); @@ -185,7 +185,6 @@ class Index { throw std::runtime_error("wrong dimensionality of the vectors"); // avoid using threads when the number of searches is small: - if(rows<=num_threads*4){ num_threads=1; } @@ -284,6 +283,7 @@ class Index { py::tuple getAnnData() const { + /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; @@ -301,7 +301,6 @@ class Index { char* link_list_npy = (char *) malloc(link_npy_size); int* element_levels_npy = (int *) malloc(appr_alg->element_levels_.size()*sizeof(int)); - // std::unordered_map label_lookup_; hnswlib::labeltype* label_lookup_key_npy = (hnswlib::labeltype *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::labeltype)); hnswlib::tableint* label_lookup_val_npy = (hnswlib::tableint *) malloc(appr_alg->label_lookup_.size()*sizeof(hnswlib::tableint)); @@ -315,7 +314,6 @@ class Index { idx++; } - memset(data_level0_npy, 0, level0_npy_size); memset(link_list_npy, 0, link_npy_size); memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size); @@ -391,8 +389,12 @@ class Index { py::tuple getIndexParams() const { + /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ + /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ return py::make_tuple(py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), index_inited == true ? getAnnData() : py::make_tuple()); + /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + } @@ -407,10 +409,11 @@ class Index { Index *new_index = new Index(index_params[0].cast(), index_params[1].cast()); + /* TODO: deserialize state of random generators into new_index->level_generator_ and new_index->update_probability_generator_ */ + /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ new_index->seed = index_params[6].cast(); if (index_inited_){ - //// hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed); new_index->appr_alg = new hnswlib::HierarchicalNSW(new_index->l2space, ann_params[1].cast(), ann_params[10].cast(), ann_params[12].cast(), new_index->seed); new_index->cur_l = ann_params[2].cast(); } @@ -428,11 +431,14 @@ class Index { } static Index * createFromIndex(const Index & index) { - return createFromParams(index.getIndexParams()); + /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ + return createFromParams(index.getIndexParams()); } + void setAnnData(const py::tuple t) { - + /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ + std::unique_lock templock(appr_alg->global); assert_true(appr_alg->offsetLevel0_ == t[0].cast(), "Invalid value of offsetLevel0_ "); @@ -457,7 +463,6 @@ class Index { appr_alg->has_deletions_=t[14].cast(); assert_true(appr_alg->size_links_per_element_ == t[15].cast(), "Invalid value of size_links_per_element_ "); - // std::unordered_map label_lookup_; auto label_lookup_key_npy = t[16].cast >(); auto label_lookup_val_npy = t[17].cast >(); auto element_levels_npy = t[18].cast >(); @@ -622,8 +627,9 @@ PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); py::class_>(m, "Index") - .def(py::init(&Index::createFromParams), py::arg("params")) //createFromParams(const py::tuple t) - .def(py::init(&Index::createFromIndex), py::arg("index")) + .def(py::init(&Index::createFromParams), py::arg("params")) + /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ + .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) .def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1, py::arg("num_threads")=-1) @@ -664,6 +670,7 @@ PYBIND11_PLUGIN(hnswlib) { .def(py::pickle( [](const Index &ind) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ + /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ return ind.getIndexParams(); }, [](py::tuple t) { // __setstate__ From 2c040e67252ad771409eb2a03fa0dead4a0a1d56 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Tue, 3 Nov 2020 05:48:57 +0000 Subject: [PATCH 20/23] remove camel casing --- python_bindings/tests/bindings_test_pickle.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index a6b74a9d..6c3a826a 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -19,9 +19,8 @@ def brute_force_distances(metric, items, query_items, k): for jj in range(query_items.shape[0]): dists[jj,ii]=get_dist(metric, items[ii, :], query_items[jj, :]) - labels = np.argsort(dists, axis=1) - dists = np.sort(dists, axis=1) - + labels = np.argsort(dists, axis=1) # equivalent, but faster: np.argpartition(dists, range(k), axis=1) + dists = np.sort(dists, axis=1) # equivalent, but faster: np.partition(dists, range(k), axis=1) return labels[:,:k], dists[:,:k] @@ -141,19 +140,14 @@ def setUp(self): ### number of value pairs that are allowed to be different in d1 and d2 ### i.e., number of values that are (d1-d2)**2>1e-3 - def testInnerProductSpace(self): + def test_inner_product_space(self): test_space_main(self, 'ip', 48) - def testL2Space(self): + def test_l2_space(self): test_space_main(self, 'l2', 153) - def testCosineSpace(self): + def test_cosine_space(self): test_space_main(self, 'cosine', 512) - # - # for space,dim in [('ip', 48), ('l2', 152), ('cosine', 512)]: - # test_space_main - - if __name__ == "__main__": unittest.main() From 6298996c76c87f98d94aa2361636a001c31a3cf6 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Fri, 6 Nov 2020 03:39:49 +0000 Subject: [PATCH 21/23] add static const int data member to class Index that stores serialization version; serialization version is returned as the first element of the parameter tuple; serialization version must match when instantiating Index object from parameter tuple --- python_bindings/bindings.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 38dcb18e..e8a16512 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -74,6 +74,7 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn } + template class Index { public: @@ -96,8 +97,10 @@ class Index { num_threads_default = std::thread::hardware_concurrency(); default_ef=10; - } + + static const int ser_version = 1; // serialization version + std::string space_name; int dim; size_t seed; @@ -282,8 +285,8 @@ class Index { } - py::tuple getAnnData() const { - /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + py::tuple getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + std::unique_lock templock(appr_alg->global); unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_; @@ -391,7 +394,9 @@ class Index { py::tuple getIndexParams() const { /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ - return py::make_tuple(py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), + + return py::make_tuple(py::int_(Index::ser_version), // serialization version + py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), index_inited == true ? getAnnData() : py::make_tuple()); /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ @@ -400,8 +405,12 @@ class Index { static Index * createFromParams(const py::tuple t) { - py::tuple index_params=t[0].cast(); - py::tuple ann_params=t[1].cast(); + + if (py::int_(Index::ser_version) != t[0].cast()) // check serialization version + throw std::runtime_error("Serialization version mismatch!"); + + py::tuple index_params=t[1].cast(); + py::tuple ann_params=t[2].cast(); auto space_name_=index_params[0].cast(); auto dim_=index_params[1].cast(); @@ -623,6 +632,8 @@ class Index { }; + + PYBIND11_PLUGIN(hnswlib) { py::module m("hnswlib"); @@ -674,7 +685,7 @@ PYBIND11_PLUGIN(hnswlib) { return ind.getIndexParams(); }, [](py::tuple t) { // __setstate__ - if (t.size() != 2) + if (t.size() != 3) throw std::runtime_error("Invalid state!"); return Index::createFromParams(t); } From c8276d86982abd5983377f92ef3fab6b41e1bd6b Mon Sep 17 00:00:00 2001 From: dbespalov Date: Fri, 6 Nov 2020 04:32:13 +0000 Subject: [PATCH 22/23] add todo block to convert parameter tuple to dicts --- python_bindings/bindings.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index e8a16512..75a80381 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -393,12 +393,14 @@ class Index { py::tuple getIndexParams() const { /* TODO: serialize state of random generators appr_alg->level_generator_ and appr_alg->update_probability_generator_ */ - /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ + /* for full reproducibility / to avoid re-initializing generators inside Index::createFromParams */ return py::make_tuple(py::int_(Index::ser_version), // serialization version + + /* TODO: convert the following two py::tuple's to py::dict */ py::make_tuple(space_name, dim, index_inited, ep_added, normalize, num_threads_default, seed, default_ef), - index_inited == true ? getAnnData() : py::make_tuple()); - /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + index_inited == true ? getAnnData() : py::make_tuple()); /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + } @@ -409,8 +411,8 @@ class Index { if (py::int_(Index::ser_version) != t[0].cast()) // check serialization version throw std::runtime_error("Serialization version mismatch!"); - py::tuple index_params=t[1].cast(); - py::tuple ann_params=t[2].cast(); + py::tuple index_params=t[1].cast(); /* TODO: convert this py::tuple to py::dict */ + py::tuple ann_params=t[2].cast(); /* TODO: convert this py::tuple to py::dict */ auto space_name_=index_params[0].cast(); auto dim_=index_params[1].cast(); From 345f71da372a9b416bec0b7777d30aeaa5877e26 Mon Sep 17 00:00:00 2001 From: dbespalov Date: Fri, 6 Nov 2020 04:41:49 +0000 Subject: [PATCH 23/23] add todo block to convert parameter tuple to dicts --- python_bindings/bindings.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 75a80381..d9396247 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -285,7 +285,7 @@ class Index { } - py::tuple getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ + py::tuple getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */ std::unique_lock templock(appr_alg->global); @@ -411,8 +411,8 @@ class Index { if (py::int_(Index::ser_version) != t[0].cast()) // check serialization version throw std::runtime_error("Serialization version mismatch!"); - py::tuple index_params=t[1].cast(); /* TODO: convert this py::tuple to py::dict */ - py::tuple ann_params=t[2].cast(); /* TODO: convert this py::tuple to py::dict */ + py::tuple index_params=t[1].cast(); /* TODO: convert index_params from py::tuple to py::dict */ + py::tuple ann_params=t[2].cast(); /* TODO: convert ann_params from py::tuple to py::dict */ auto space_name_=index_params[0].cast(); auto dim_=index_params[1].cast(); @@ -421,7 +421,7 @@ class Index { Index *new_index = new Index(index_params[0].cast(), index_params[1].cast()); /* TODO: deserialize state of random generators into new_index->level_generator_ and new_index->update_probability_generator_ */ - /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ + /* for full reproducibility / state of generators is serialized inside Index::getIndexParams */ new_index->seed = index_params[6].cast(); if (index_inited_){ @@ -442,13 +442,13 @@ class Index { } static Index * createFromIndex(const Index & index) { - /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ + /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ return createFromParams(index.getIndexParams()); } void setAnnData(const py::tuple t) { - /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ + /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */ std::unique_lock templock(appr_alg->global); @@ -641,7 +641,7 @@ PYBIND11_PLUGIN(hnswlib) { py::class_>(m, "Index") .def(py::init(&Index::createFromParams), py::arg("params")) - /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ + /* WARNING: Index::createFromIndex is not thread-safe with Index::addItems */ .def(py::init(&Index::createFromIndex), py::arg("index")) .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &Index::init_new_index, py::arg("max_elements"), py::arg("M")=16, py::arg("ef_construction")=200, py::arg("random_seed")=100) @@ -683,7 +683,7 @@ PYBIND11_PLUGIN(hnswlib) { .def(py::pickle( [](const Index &ind) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ - /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ + /* WARNING: Index::getIndexParams is not thread-safe with Index::addItems */ return ind.getIndexParams(); }, [](py::tuple t) { // __setstate__