Skip to content
Permalink
Browse files

Fix load bugs/messages, update test, deprecate old indices (#148)

* temp debug state

* fix bug in loading index with deleted elements

* adjust condition in test

* add check for file existence

* cleanup
  • Loading branch information
yurymalkov committed Sep 16, 2019
1 parent b3671c5 commit c5c38f0cdc15fac182af0e4eda88e8d52b8103a3
Showing with 41 additions and 18 deletions.
  1. +23 −14 hnswlib/hnswalg.h
  2. +17 −3 python_bindings/tests/bindings_test_labels.py
  3. +1 −1 python_bindings/tests/bindings_test_resize.py
@@ -595,6 +595,10 @@ namespace hnswlib {

std::ifstream input(location, std::ios::binary);

if (!input.is_open())
throw std::runtime_error("Cannot open file");


// get file size:
input.seekg(0,input.end);
std::streampos total_filesize=input.tellg();
@@ -625,16 +629,15 @@ namespace hnswlib {
fstdistfunc_ = s->get_dist_func();
dist_func_param_ = s->get_dist_func_param();

/// Legacy, check that everything is ok

bool old_index=false;

auto pos=input.tellg();


/// Optional - check if index is ok:

input.seekg(cur_element_count * size_data_per_element_,input.cur);
for (size_t i = 0; i < cur_element_count; i++) {
if(input.tellg() < 0 || input.tellg()>=total_filesize){
old_index = true;
break;
throw std::runtime_error("Index seems to be corrupted or unsupported");
}

unsigned int linkListSize;
@@ -644,23 +647,21 @@ namespace hnswlib {
}
}

// check if file is ok; if not, it is either corrupted or an old index
// throw exception if it is either corrupted or an old index
if(input.tellg()!=total_filesize)
old_index = true;
throw std::runtime_error("Index seems to be corrupted or unsupported");

if (old_index) {
std::cerr << "Warning: loading of old indexes will be deprecated before 2019.\n"
<< "Please resave the index in the new format.\n";
}
input.clear();

/// Optional check end

input.seekg(pos,input.beg);


data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_);
input.read(data_level0_memory_, cur_element_count * size_data_per_element_);

if(old_index)
input.seekg(((max_elements_-cur_element_count) * size_data_per_element_), input.cur);



size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
@@ -691,6 +692,14 @@ namespace hnswlib {
input.read(linkLists_[i], linkListSize);
}
}

has_deletions_=false;

for (size_t i = 0; i < cur_element_count; i++) {
if(isMarkedDeleted(i))
has_deletions_=true;
}

input.close();

return;
@@ -3,10 +3,12 @@

class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(16):
print("\n**** Index save-load test ****\n")
import hnswlib
import numpy as np


np.random.seed(idx)
dim = 16
num_elements = 10000

@@ -95,8 +97,8 @@ def testRandomSelf(self):
p.mark_deleted(l[0])
labels2, _ = p.knn_query(data2, k=1)
items=p.get_items(labels2)
diff_with_gt_labels=np.max(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # console
diff_with_gt_labels=np.mean(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console


labels1_after, _ = p.knn_query(data1, k=1)
@@ -106,6 +108,18 @@ def testRandomSelf(self):
self.assertTrue(False)
print("All the data in data1 are removed")

# checking saving/loading index with elements marked as deleted
p.save_index("with_deleted.bin")
p = hnswlib.Index(space='l2', dim=dim)
p.load_index("with_deleted.bin")
p.set_ef(100)

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)



if __name__ == "__main__":
@@ -3,7 +3,7 @@

class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(32):
for idx in range(16):
print("\n**** Index resize test ****\n")
import hnswlib
import numpy as np

0 comments on commit c5c38f0

Please sign in to comment.
You can’t perform that action at this time.