diff --git a/.travis.yml b/.travis.yml index 2b92b985..6b194926 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,7 @@ language: python matrix: include: - python: 3.6 + - python: 3.7 install: - | cd python_bindings @@ -12,4 +13,4 @@ install: script: - | cd python_bindings - python setup.py test \ No newline at end of file + python setup.py test diff --git a/ALGO_PARAMS.md b/ALGO_PARAMS.md index 14cec786..4585a82c 100644 --- a/ALGO_PARAMS.md +++ b/ALGO_PARAMS.md @@ -20,10 +20,10 @@ The range ```M```=12-48 is ok for the most of the use cases. When ```M``` is cha Nonetheless, ef and ef_construction parameters can be roughly estimated by assuming that ```M```*```ef_{construction}``` is a constant. -* ```ef_constrution``` - the parameter has the same meaning as ```ef```, but controls the index_time/index_accuracy. Bigger +* ```ef_construction``` - the parameter has the same meaning as ```ef```, but controls the index_time/index_accuracy. Bigger ef_construction leads to longer construction, but better index quality. At some point, increasing ef_construction does not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure a recall -for M nearest neighbor search when ```ef``` =```ef_constuction```: if the recall is lower than 0.9, than there is room +for M nearest neighbor search when ```ef``` =```ef_construction```: if the recall is lower than 0.9, then there is room for improvement. * ```num_elements``` - defines the maximum number of elements in the index. The index can be extened by saving/loading(load_index function has a parameter which defines the new maximum number of elements). diff --git a/README.md b/README.md index 8b371449..c79e24c1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # Hnswlib - fast approximate nearest neighbor search Header-only C++ HNSW implementation with python bindings. 
Paper code for the HNSW 200M SIFT experiment +**NEWS:** + +**Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!** + Highlights: 1) Lightweight, header-only, no dependencies other than C++ 11. 2) Interfaces for C++, python and R (https://github.com/jlmelville/rcpphnsw). @@ -26,7 +30,7 @@ Note that inner product is not an actual metric. An element can be closer to som For other spaces use the nmslib library https://github.com/nmslib/nmslib. -#### short API description +#### Short API description * `hnswlib.Index(space, dim)` creates a non-initialized index an HNSW in space `space` with integer dimension `dim`. Index methods: @@ -45,7 +49,7 @@ Index methods: * `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`. * `set_ef(ef)` - sets the query time accuracy/speed trade-off, defined by the `ef` parameter ( -[ALGO_PARAMS.md](ALGO_PARAMS.md)). +[ALGO_PARAMS.md](ALGO_PARAMS.md)). Note that the parameter is currently not saved along with the index, so you need to set it manually after loading. * `knn_query(data, k = 1, num_threads = -1)` make a batch query for `k` closests elements for each element of the * `data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`). @@ -59,11 +63,13 @@ Index methods: * `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying. -* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). +* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). Note that for cosine similarity it currently returns **normalized** vectors. * `get_ids_list()` - returns a list of all elements' ids. 
+* `get_max_elements()` - returns the current capacity of the index +* `get_current_count()` - returns the current number of elements stored in the index @@ -166,6 +172,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat ``` ### Bindings installation + +You can install from sources: ```bash apt-get install -y python-setuptools python-pip pip3 install pybind11 numpy setuptools @@ -173,6 +181,9 @@ cd python_bindings python3 setup.py install ``` +or you can install via pip: +`pip install hnswlib` + ### Other implementations * Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib * Faiss libary by facebook, uses own HNSW implementation for coarse quantization (python, C++): @@ -186,9 +197,13 @@ https://github.com/dbaranchuk/ivf-hnsw * Go implementation: https://github.com/Bithack/go-hnsw * Python implementation (as a part of the clustering code by by Matteo Dell'Amico): https://github.com/matteodellamico/flexible-clustering * Java implementation: https://github.com/jelmerk/hnswlib +* Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna * .Net implementation: https://github.com/microsoft/HNSW.Net +### Contributing to the repository +Contributions are highly welcome! +Please make pull requests against the `develop` branch. 
### 200M SIFT test reproduction To download and extract the bigann dataset: diff --git a/examples/example.py b/examples/example.py index b9d2ec64..a08955a1 100644 --- a/examples/example.py +++ b/examples/example.py @@ -45,7 +45,7 @@ # Serializing and deleting the index: index_path='first_half.bin' print("Saving index to '%s'" % index_path) -p.save_index("first_half.bin") +p.save_index(index_path) del p # Reiniting, loading the index diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index e8c24d96..5b1bd655 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -1,6 +1,8 @@ #pragma once #include #include +#include +#include namespace hnswlib { template @@ -20,6 +22,8 @@ namespace hnswlib { dist_func_param_ = s->get_dist_func_param(); size_per_element_ = data_size_ + sizeof(labeltype); data_ = (char *) malloc(maxElements * size_per_element_); + if (data_ == nullptr) + std::runtime_error("Not enough memory: BruteforceSearch failed to allocate data"); cur_element_count = 0; } @@ -35,22 +39,37 @@ namespace hnswlib { size_t data_size_; DISTFUNC fstdistfunc_; void *dist_func_param_; + std::mutex index_lock; std::unordered_map dict_external_to_internal; - void addPoint(void *datapoint, labeltype label) { - if(dict_external_to_internal.count(label)) - throw std::runtime_error("Ids have to be unique"); + void addPoint(const void *datapoint, labeltype label) { + + int idx; + { + std::unique_lock lock(index_lock); + + + + auto search=dict_external_to_internal.find(label); + if (search != dict_external_to_internal.end()) { + idx=search->second; + } + else{ + if (cur_element_count >= maxelements_) { + throw std::runtime_error("The number of elements exceeds the specified limit\n"); + } + idx=cur_element_count; + dict_external_to_internal[label] = idx; + cur_element_count++; + } + } + memcpy(data_ + size_per_element_ * idx + data_size_, &label, sizeof(labeltype)); + memcpy(data_ + size_per_element_ * idx, datapoint, data_size_); + - if (cur_element_count >= 
maxelements_) { - throw std::runtime_error("The number of elements exceeds the specified limit\n"); - }; - memcpy(data_ + size_per_element_ * cur_element_count + data_size_, &label, sizeof(labeltype)); - memcpy(data_ + size_per_element_ * cur_element_count, datapoint, data_size_); - dict_external_to_internal[label]=cur_element_count; - cur_element_count++; }; void removePoint(labeltype cur_external) { @@ -68,8 +87,10 @@ namespace hnswlib { } - std::priority_queue> searchKnn(const void *query_data, size_t k) const { + std::priority_queue> + searchKnn(const void *query_data, size_t k) const { std::priority_queue> topResults; + if (cur_element_count == 0) return topResults; for (int i = 0; i < k; i++) { dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_); topResults.push(std::pair(dist, *((labeltype *) (data_ + size_per_element_ * i + @@ -90,6 +111,24 @@ namespace hnswlib { return topResults; }; + template + std::vector> + searchKnn(const void* query_data, size_t k, Comp comp) { + std::vector> result; + if (cur_element_count == 0) return result; + + auto ret = searchKnn(query_data, k); + + while (!ret.empty()) { + result.push_back(ret.top()); + ret.pop(); + } + + std::sort(result.begin(), result.end(), comp); + + return result; + } + void saveIndex(const std::string &location) { std::ofstream output(location, std::ios::binary); std::streampos position; @@ -118,12 +157,13 @@ namespace hnswlib { dist_func_param_ = s->get_dist_func_param(); size_per_element_ = data_size_ + sizeof(labeltype); data_ = (char *) malloc(maxelements_ * size_per_element_); + if (data_ == nullptr) + std::runtime_error("Not enough memory: loadIndex failed to allocate data"); input.read(data_, maxelements_ * size_per_element_); input.close(); - return; } }; diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 41665595..afc1222d 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -61,6 +61,8 @@ namespace hnswlib { maxlevel_ = -1; linkLists_ = (char **) 
malloc(sizeof(void *) * max_elements_); + if (linkLists_ == nullptr) + throw std::runtime_error("Not enough memory: HierarchicalNSW failed to allocate linklists"); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); mult_ = 1 / log(1.0 * M_); revSize_ = 1.0 / mult_; @@ -131,7 +133,7 @@ namespace hnswlib { return return_label; } - inline labeltype setExternalLabel(tableint internal_id, labeltype label) const { + inline void setExternalLabel(tableint internal_id, labeltype label) const { memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label, sizeof(labeltype)); } @@ -150,7 +152,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> - searchBaseLayer(tableint ep_id, void *data_point, int layer) { + searchBaseLayer(tableint ep_id, const void *data_point, int layer) { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; vl_type visited_array_tag = vl->curV; @@ -197,7 +199,7 @@ namespace hnswlib { _mm_prefetch(getDataByInternalId(*(datal + 1)), _MM_HINT_T0); #endif - for (int j = 0; j < size; j++) { + for (size_t j = 0; j < size; j++) { tableint candidate_id = *(datal + j); // if (candidate_id == 0) continue; #ifdef USE_SSE @@ -275,7 +277,7 @@ namespace hnswlib { _mm_prefetch((char *) (data + 2), _MM_HINT_T0); #endif - for (int j = 1; j <= size; j++) { + for (size_t j = 1; j <= size; j++) { int candidate_id = *(data + j); // if (candidate_id == 0) continue; #ifdef USE_SSE @@ -371,7 +373,7 @@ namespace hnswlib { return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_); }; - void mutuallyConnectNewElement(void *data_point, tableint cur_c, + void mutuallyConnectNewElement(const void *data_point, tableint cur_c, std::priority_queue, std::vector>, CompareByFirst> top_candidates, int level) { @@ -484,6 +486,8 @@ namespace hnswlib { std::priority_queue> searchKnnInternal(void *query_data, int k) { + 
std::priority_queue> top_candidates; + if (cur_element_count == 0) return top_candidates; tableint currObj = enterpoint_node_; dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); @@ -510,8 +514,6 @@ namespace hnswlib { } } - - std::priority_queue> top_candidates; if (has_deletions_) { std::priority_queue> top_candidates1=searchBaseLayerST(currObj, query_data, ef_); @@ -546,12 +548,16 @@ namespace hnswlib { // Reallocate base layer char * data_level0_memory_new = (char *) malloc(new_max_elements * size_data_per_element_); + if (data_level0_memory_new == nullptr) + throw std::runtime_error("Not enough memory: resizeIndex failed to allocate base layer"); memcpy(data_level0_memory_new, data_level0_memory_,cur_element_count * size_data_per_element_); free(data_level0_memory_); data_level0_memory_=data_level0_memory_new; // Reallocate all other layers char ** linkLists_new = (char **) malloc(sizeof(void *) * new_max_elements); + if (linkLists_new == nullptr) + throw std::runtime_error("Not enough memory: resizeIndex failed to allocate other layers"); memcpy(linkLists_new, linkLists_,cur_element_count * sizeof(void *)); free(linkLists_); linkLists_=linkLists_new; @@ -595,6 +601,10 @@ namespace hnswlib { std::ifstream input(location, std::ios::binary); + if (!input.is_open()) + throw std::runtime_error("Cannot open file"); + + // get file size: input.seekg(0,input.end); std::streampos total_filesize=input.tellg(); @@ -625,16 +635,15 @@ namespace hnswlib { fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); - /// Legacy, check that everything is ok - - bool old_index=false; - auto pos=input.tellg(); + + + /// Optional - check if index is ok: + input.seekg(cur_element_count * size_data_per_element_,input.cur); for (size_t i = 0; i < cur_element_count; i++) { if(input.tellg() < 0 || input.tellg()>=total_filesize){ - old_index = true; - break; + throw std::runtime_error("Index seems to be corrupted or 
unsupported"); } unsigned int linkListSize; @@ -644,23 +653,23 @@ namespace hnswlib { } } - // check if file is ok, if not this is either corrupted or old index + // throw exception if it either corrupted or old index if(input.tellg()!=total_filesize) - old_index = true; + throw std::runtime_error("Index seems to be corrupted or unsupported"); - if (old_index) { - std::cerr << "Warning: loading of old indexes will be deprecated before 2019.\n" - << "Please resave the index in the new format.\n"; - } input.clear(); + + /// Optional check end + input.seekg(pos,input.beg); data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); + if (data_level0_memory_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); - if(old_index) - input.seekg(((max_elements_-cur_element_count) * size_data_per_element_), input.cur); + size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -674,6 +683,8 @@ namespace hnswlib { linkLists_ = (char **) malloc(sizeof(void *) * max_elements); + if (linkLists_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; ef_ = 10; @@ -688,9 +699,19 @@ namespace hnswlib { } else { element_levels_[i] = linkListSize / size_links_per_element_; linkLists_[i] = (char *) malloc(linkListSize); + if (linkLists_[i] == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); input.read(linkLists_[i], linkListSize); } } + + has_deletions_=false; + + for (size_t i = 0; i < cur_element_count; i++) { + if(isMarkedDeleted(i)) + has_deletions_=true; + } + input.close(); return; @@ -770,11 +791,11 @@ namespace hnswlib { *((unsigned short int*)(ptr))=*((unsigned short int *)&size); } - void addPoint(void *data_point, labeltype label) { + void addPoint(const 
void *data_point, labeltype label) { addPoint(data_point, label,-1); } - tableint addPoint(void *data_point, labeltype label, int level) { + tableint addPoint(const void *data_point, labeltype label, int level) { tableint cur_c = 0; { std::unique_lock lock(cur_element_count_guard_); @@ -788,6 +809,7 @@ namespace hnswlib { auto search = label_lookup_.find(label); if (search != label_lookup_.end()) { std::unique_lock lock_el(link_list_locks_[search->second]); + has_deletions_ = true; markDeletedInternal(search->second); } label_lookup_[label] = cur_c; @@ -818,6 +840,8 @@ namespace hnswlib { if (curlevel) { linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1); + if (linkLists_[cur_c] == nullptr) + throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist"); memset(linkLists_[cur_c], 0, size_links_per_element_ * curlevel + 1); } @@ -886,7 +910,11 @@ namespace hnswlib { return cur_c; }; - std::priority_queue> searchKnn(const void *query_data, size_t k) const { + std::priority_queue> + searchKnn(const void *query_data, size_t k) const { + std::priority_queue> result; + if (cur_element_count == 0) return result; + tableint currObj = enterpoint_node_; dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); @@ -925,18 +953,34 @@ namespace hnswlib { currObj, query_data, std::max(ef_, k)); top_candidates.swap(top_candidates1); } - std::priority_queue> results; while (top_candidates.size() > k) { top_candidates.pop(); } while (top_candidates.size() > 0) { std::pair rez = top_candidates.top(); - results.push(std::pair(rez.first, getExternalLabel(rez.second))); + result.push(std::pair(rez.first, getExternalLabel(rez.second))); top_candidates.pop(); } - return results; + return result; }; + template + std::vector> + searchKnn(const void* query_data, size_t k, Comp comp) { + std::vector> result; + if (cur_element_count == 0) return result; + + auto ret = searchKnn(query_data, k); + + while 
(!ret.empty()) { + result.push_back(ret.top()); + ret.pop(); + } + + std::sort(result.begin(), result.end(), comp); + + return result; + } }; diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 3ea73ef2..dbfb1656 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -24,12 +24,21 @@ #endif #include +#include #include namespace hnswlib { typedef size_t labeltype; + template + class pairGreater { + public: + bool operator()(const T& p1, const T& p2) { + return p1.first > p2.first; + } + }; + template static void writeBinaryPOD(std::ostream &out, const T &podRef) { out.write((char *) &podRef, sizeof(T)); @@ -60,8 +69,11 @@ namespace hnswlib { template class AlgorithmInterface { public: - virtual void addPoint(void *datapoint, labeltype label)=0; + virtual void addPoint(const void *datapoint, labeltype label)=0; virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; + template + std::vector> searchKnn(const void*, size_t, Comp) { + } virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ } diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index 4d3ac69a..bc00af72 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -4,16 +4,19 @@ namespace hnswlib { static float - L2Sqr(const void *pVect1, const void *pVect2, const void *qty_ptr) { - //return *((float *)pVect2); + L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + float *pVect1 = (float *) pVect1v; + float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); + float res = 0; - for (unsigned i = 0; i < qty; i++) { - float t = ((float *) pVect1)[i] - ((float *) pVect2)[i]; + for (size_t i = 0; i < qty; i++) { + float t = *pVect1 - *pVect2; + pVect1++; + pVect2++; res += t * t; } return (res); - } #if defined(USE_AVX) @@ -49,10 +52,8 @@ namespace hnswlib { } _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - - return (res); -} + 
return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + } #elif defined(USE_SSE) @@ -62,12 +63,9 @@ namespace hnswlib { float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); float PORTABLE_ALIGN32 TmpRes[8]; - // size_t qty4 = qty >> 2; size_t qty16 = qty >> 4; const float *pEnd1 = pVect1 + (qty16 << 4); - // const float* pEnd2 = pVect1 + (qty4 << 2); - // const float* pEnd3 = pVect1 + qty; __m128 diff, v1, v2; __m128 sum = _mm_set1_ps(0); @@ -102,10 +100,24 @@ namespace hnswlib { diff = _mm_sub_ps(v1, v2); sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } + _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } +#endif - return (res); +#if defined(USE_SSE) || defined(USE_AVX) + static float + L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty16 = qty >> 4 << 4; + float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16); + float *pVect1 = (float *) pVect1v + qty16; + float *pVect2 = (float *) pVect2v + qty16; + + size_t qty_left = qty - qty16; + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + return (res + res_tail); } #endif @@ -119,10 +131,9 @@ namespace hnswlib { size_t qty = *((size_t *) qty_ptr); - // size_t qty4 = qty >> 2; - size_t qty16 = qty >> 2; + size_t qty4 = qty >> 2; - const float *pEnd1 = pVect1 + (qty16 << 2); + const float *pEnd1 = pVect1 + (qty4 << 2); __m128 diff, v1, v2; __m128 sum = _mm_set1_ps(0); @@ -136,9 +147,22 @@ namespace hnswlib { sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } - return (res); + static float + L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty4 
= qty >> 2 << 2; + + float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4); + size_t qty_left = qty - qty4; + + float *pVect1 = (float *) pVect1v + qty4; + float *pVect2 = (float *) pVect2v + qty4; + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + + return (res + res_tail); } #endif @@ -151,13 +175,14 @@ namespace hnswlib { L2Space(size_t dim) { fstdistfunc_ = L2Sqr; #if defined(USE_SSE) || defined(USE_AVX) - if (dim % 4 == 0) - fstdistfunc_ = L2SqrSIMD4Ext; if (dim % 16 == 0) fstdistfunc_ = L2SqrSIMD16Ext; - /*else{ - throw runtime_error("Data type not supported!"); - }*/ + else if (dim % 4 == 0) + fstdistfunc_ = L2SqrSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = L2SqrSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = L2SqrSIMD4ExtResiduals; #endif dim_ = dim; data_size_ = dim * sizeof(float); @@ -185,10 +210,6 @@ namespace hnswlib { int res = 0; unsigned char *a = (unsigned char *) pVect1; unsigned char *b = (unsigned char *) pVect2; - /*for (int i = 0; i < qty; i++) { - int t = int((a)[i]) - int((b)[i]); - res += t*t; - }*/ qty = qty >> 2; for (size_t i = 0; i < qty; i++) { @@ -241,4 +262,4 @@ namespace hnswlib { }; -} +} \ No newline at end of file diff --git a/python_bindings/MANIFEST.in b/python_bindings/MANIFEST.in new file mode 100644 index 00000000..5a480e4f --- /dev/null +++ b/python_bindings/MANIFEST.in @@ -0,0 +1 @@ +include hnswlib/*.h \ No newline at end of file diff --git a/python_bindings/Makefile b/python_bindings/Makefile new file mode 100644 index 00000000..02ec523b --- /dev/null +++ b/python_bindings/Makefile @@ -0,0 +1,14 @@ +pypi: dist + twine upload dist/* + +dist: + -rm dist/* + python3 setup.py sdist + +test: + python3 setup.py test + +clean: + rm -rf *.egg-info build dist var first_half.bin tests/__pycache__ hnswlib.cpython-36m-darwin.so + +.PHONY: dist \ No newline at end of file diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 1aa99ac6..ef1dc1d6 100644 --- a/python_bindings/bindings.cpp +++ 
b/python_bindings/bindings.cpp @@ -2,7 +2,7 @@ #include #include #include -#include "../hnswlib/hnswlib.h" +#include "hnswlib/hnswlib.h" #include #include @@ -244,9 +244,9 @@ class Index { return data; } - std::vector getIdsList() { + std::vector getIdsList() { - std::vector ids; + std::vector ids; for(auto kv : appr_alg->label_lookup_) { ids.push_back(kv.first); @@ -359,6 +359,14 @@ class Index { appr_alg->resizeIndex(new_size); } + size_t getMaxElements() const { + return appr_alg->max_elements_; + } + + size_t getCurrentCount() const { + return appr_alg->cur_element_count; + } + std::string space_name; int dim; @@ -397,6 +405,8 @@ PYBIND11_PLUGIN(hnswlib) { .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) + .def("get_max_elements", &Index::getMaxElements) + .def("get_current_count", &Index::getCurrentCount) .def("__repr__", [](const Index &a) { return ""; diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib new file mode 120000 index 00000000..236d6575 --- /dev/null +++ b/python_bindings/hnswlib @@ -0,0 +1 @@ +../hnswlib \ No newline at end of file diff --git a/python_bindings/setup.py b/python_bindings/setup.py index 66de1033..2e863c87 100644 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -4,7 +4,7 @@ import sys import setuptools -__version__ = '0.3' +__version__ = '0.3.4' source_files = ['bindings.cpp'] diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index b351935b..c1887bef 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -3,10 +3,12 @@ class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): + for idx in range(16): print("\n**** Index save-load test ****\n") import hnswlib import numpy as np - + + np.random.seed(idx) dim = 16 
num_elements = 10000 @@ -48,7 +50,7 @@ def testRandomSelf(self): self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) # Check that the returned element data is correct: - diff_with_gt_labels=np.max(np.abs(data1-items)) + diff_with_gt_labels=np.mean(np.abs(data1-items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # Serializing and deleting the index. @@ -81,7 +83,7 @@ def testRandomSelf(self): self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) # Check that the returned element data is correct: - diff_with_gt_labels=np.max(np.abs(data-items)) + diff_with_gt_labels=np.mean(np.abs(data-items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # deleting index. # Checking that all labels are returned correctly: @@ -95,8 +97,8 @@ def testRandomSelf(self): p.mark_deleted(l[0]) labels2, _ = p.knn_query(data2, k=1) items=p.get_items(labels2) - diff_with_gt_labels=np.max(np.abs(data2-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # console + diff_with_gt_labels=np.mean(np.abs(data2-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console labels1_after, _ = p.knn_query(data1, k=1) @@ -106,6 +108,18 @@ def testRandomSelf(self): self.assertTrue(False) print("All the data in data1 are removed") + # checking saving/loading index with elements marked as deleted + p.save_index("with_deleted.bin") + p = hnswlib.Index(space='l2', dim=dim) + p.load_index("with_deleted.bin") + p.set_ef(100) + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + if __name__ == "__main__": diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py index 1803178d..9411af64 100644 --- a/python_bindings/tests/bindings_test_resize.py +++ b/python_bindings/tests/bindings_test_resize.py @@ -3,11 +3,11 @@ class 
RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - + for idx in range(16): print("\n**** Index resize test ****\n") import hnswlib import numpy as np - + np.random.seed(idx) dim = 16 num_elements = 10000 @@ -29,9 +29,9 @@ def testRandomSelf(self): # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search - p.set_ef(100) + p.set_ef(20) - p.set_num_threads(4) # by default using all available cores + p.set_num_threads(idx%8) # by default using all available cores # We split the data in two batches: data1 = data[:num_elements // 2] @@ -43,7 +43,7 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - items=p.get_items(labels) + items=p.get_items(list(range(len(data1)))) # Check the recall: self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) @@ -62,7 +62,7 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) - items=p.get_items(labels) + items=p.get_items(list(range(num_elements))) # Check the recall: self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) diff --git a/sift_1b.cpp b/sift_1b.cpp index 522de5d2..273c9828 100644 --- a/sift_1b.cpp +++ b/sift_1b.cpp @@ -319,6 +319,7 @@ void sift_test1B() { #pragma omp parallel for for (int i = 1; i < vecsize; i++) { unsigned char mass[128]; + int j2=0; #pragma omp critical { @@ -332,6 +333,7 @@ void sift_test1B() { mass[j] = massb[j]; } j1++; + j2=j1; if (j1 % report_every == 0) { cout << j1 / (0.01 * vecsize) << " %, " << report_every / (1000.0 * 1e-6 * stopw.getElapsedTimeMicro()) << " kips " << " Mem: " @@ -339,7 +341,7 @@ void sift_test1B() { stopw.reset(); } } - appr_alg->addPoint((void *) (mass), (size_t) j1); + appr_alg->addPoint((void *) (mass), (size_t) j2); }