Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop condition #490

Merged
merged 15 commits into from
Nov 6, 2023
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,14 @@ jobs:
./example_mt_search
./example_mt_filter
./example_mt_replace_deleted
./example_multivector_search
./example_epsilon_search
./searchKnnCloserFirst_test
./searchKnnWithFilter_test
./multiThreadLoad_test
./multiThread_replace_test
./test_updates
./test_updates update
./multivector_search_test
./epsilon_search_test
shell: bash
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ var/
.vscode/
.vs/
**.DS_Store
*.pyc
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ if(HNSWLIB_EXAMPLES)
add_executable(example_search examples/cpp/example_search.cpp)
target_link_libraries(example_search hnswlib)

add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
target_link_libraries(example_epsilon_search hnswlib)

add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
target_link_libraries(example_multivector_search hnswlib)

add_executable(example_filter examples/cpp/example_filter.cpp)
target_link_libraries(example_filter hnswlib)

Expand All @@ -73,6 +79,12 @@ if(HNSWLIB_EXAMPLES)
target_link_libraries(example_mt_replace_deleted hnswlib)

# tests
add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
target_link_libraries(multivector_search_test hnswlib)

add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
target_link_libraries(epsilon_search_test hnswlib)

add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
* filtering during the search with a boolean function
* deleting the elements and reusing the memory of the deleted elements for newly added elements
* multithreaded usage
* multivector search
* epsilon search


### Bindings installation
Expand Down
6 changes: 5 additions & 1 deletion examples/cpp/EXAMPLES.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,8 @@ int main() {
Multithreaded examples:
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)

More examples:
* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp)
* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp)
65 changes: 65 additions & 0 deletions examples/cpp/example_epsilon_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#include "../../hnswlib/hnswlib.h"

// Distance type used to instantiate the index and the stop condition below.
using dist_t = float;

// Example: epsilon search.
// Builds an HNSW index over random vectors in an L2 space, then runs a few
// random queries through searchStopCondition() with an
// EpsilonSearchStopCondition and prints how many vectors each query returned.
int main() {
    const int dim = 16;                // Dimension of the elements
    const int max_elements = 10000;    // Maximum number of elements, should be known beforehand
    const int M = 16;                  // Tightly connected with internal dimensionality of the data
                                       // strongly affects the memory consumption
    const int ef_construction = 200;   // Controls index search speed/build speed tradeoff
    const int min_candidates = 100;    // Minimum number of candidates to search in the epsilon region
                                       // this parameter is similar to ef

    const int num_queries = 5;
    const float epsilon = 2.0f;        // Squared distance to query

    // Initialize the index
    hnswlib::L2Space space(dim);
    hnswlib::HierarchicalNSW<dist_t>* alg_hnsw =
        new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);

    // Generate random data (fixed seed keeps the example reproducible)
    std::mt19937 rng(47);
    std::uniform_real_distribution<> distrib_real;

    const size_t data_point_size = space.get_data_size();
    char* data = new char[data_point_size * max_elements];
    for (int i = 0; i < max_elements; i++) {
        float* point = reinterpret_cast<float*>(data + i * data_point_size);
        for (int j = 0; j < dim; j++) {
            point[j] = static_cast<float>(distrib_real(rng));
        }
    }

    // Add data to index; the label of each point is its position in `data`
    for (int i = 0; i < max_elements; i++) {
        const hnswlib::labeltype label = i;
        alg_hnsw->addPoint(data + i * data_point_size, label);
    }

    // Query random vectors. One buffer is reused for every query instead of
    // allocating and freeing a new one per iteration.
    char* query_data = new char[data_point_size];
    for (int i = 0; i < num_queries; i++) {
        float* query = reinterpret_cast<float*>(query_data);
        for (int j = 0; j < dim; j++) {
            query[j] = static_cast<float>(distrib_real(rng));
        }
        std::cout << "Query #" << i << "\n";
        // A fresh stop condition is created for each query, as in the original example.
        hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon, min_candidates);
        std::priority_queue<std::pair<float, hnswlib::labeltype>> result =
            alg_hnsw->searchStopCondition(query_data, max_elements, nullptr, &stop_condition);
        const size_t num_vectors = result.size();
        std::cout << "Found " << num_vectors << " vectors\n";
    }
    delete[] query_data;

    delete[] data;
    delete alg_hnsw;
    return 0;
}
81 changes: 81 additions & 0 deletions examples/cpp/example_multivector_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#include "../../hnswlib/hnswlib.h"

// Integer type of the document id attached to every stored vector.
using docidtype = unsigned int;
// Distance type used to instantiate the index and the stop condition below.
using dist_t = float;

int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff

int num_quries = 5;
dyashuni marked this conversation as resolved.
Show resolved Hide resolved
int ef_collection = 5; // Number of documents to search
docidtype min_doc_id = 0;
docidtype max_doc_id = 9;

// Initing index
hnswlib::MultiVectorL2Space<docidtype> space(dim);
hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
std::uniform_int_distribution<docidtype> distrib_docid(min_doc_id, max_doc_id);

size_t data_point_size = space.get_data_size();
char* data = new char[data_point_size * max_elements];
for (int i = 0; i < max_elements; i++) {
// set vector value
char* point_data = data + i * data_point_size;
for (int j = 0; j < dim; j++) {
char* vec_data = point_data + j * sizeof(float);
float value = distrib_real(rng);
*(float*)vec_data = value;
}
// set document id
docidtype doc_id = distrib_docid(rng);
space.set_doc_id(point_data, doc_id);
}

// Add data to index
std::unordered_map<hnswlib::labeltype, docidtype> label_docid_lookup;
for (int i = 0; i < max_elements; i++) {
hnswlib::labeltype label = i;
char* point_data = data + i * data_point_size;
alg_hnsw->addPoint(point_data, label);
label_docid_lookup[label] = space.get_doc_id(point_data);
}

// Query random vectors
size_t query_size = dim * sizeof(float);
for (int i = 0; i < num_quries; i++) {
char* query_data = new char[query_size];
for (int j = 0; j < dim; j++) {
size_t offset = j * sizeof(float);
char* vec_data = query_data + offset;
float value = distrib_real(rng);
*(float*)vec_data = value;
}
std::cout << "Query #" << i << "\n";
hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, dim);
std::priority_queue<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchStopCondition(query_data, ef_collection, nullptr, &stop_condition);
size_t num_vectors = result.size();

std::unordered_map<docidtype, size_t> doc_counter;
while (!result.empty()) {
hnswlib::labeltype label = result.top().second;
result.pop();
docidtype doc_id = label_docid_lookup[label];
doc_counter[doc_id] += 1;
}
std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n";
delete[] query_data;
}

delete[] data;
delete alg_hnsw;
return 0;
}