Skip to content

Commit

Permalink
Merge branch 'rakri/multifilter_with_query_planning' of github.com:mi…
Browse files Browse the repository at this point in the history
…crosoft/DiskANN into rakri/multifilter_with_query_planning
  • Loading branch information
Suryansh Gupta committed May 6, 2024
2 parents 4e110dd + 3c66392 commit 582921f
Show file tree
Hide file tree
Showing 8 changed files with 188 additions and 86 deletions.
4 changes: 2 additions & 2 deletions apps/search_memory_index.cpp
Expand Up @@ -249,7 +249,7 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
{
std::ofstream query_stats_file;
query_stats_file.open(result_path_prefix + "_query_stats.txt");
query_stats_file << "cmps\tnum correct\tfilt time\tcmp time" << std::endl;
query_stats_file << "cmps\tnum correct\tfilt time\tcmp time\tlatency" << std::endl;
for (size_t i = 0; i < query_num; i++)
{
std::set<uint32_t> gt, res;
Expand Down Expand Up @@ -277,7 +277,7 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
}
}
query_stats_file << cmp_stats[i] << "\t" << cur_recall << "\t" << filter_match_time[i] << "\t"
<< dist_cmp_time[i] << "\t";
<< dist_cmp_time[i] << "\t" << latency_stats[i] << "\t";
for (auto const &r : res)
query_stats_file << r << " ";
query_stats_file << std::endl;
Expand Down
4 changes: 2 additions & 2 deletions include/cluster_store.h
Expand Up @@ -47,7 +47,7 @@ template <typename data_t> class AbstractClusterStore
// potentially after pre-processing the vectors if the metric deems so
// e.g., normalizing vectors for cosine distance over floating-point vectors
// useful for bulk or static index building.
virtual void assign_data_to_clusters(const data_t *vectors, std::vector<uint32_t> &ids) = 0;
virtual void assign_data_to_clusters( data_t *vectors, std::vector<uint32_t> &ids) = 0;

// operations on vectors
// like populate_data function, but over one vector at a time useful for
Expand Down Expand Up @@ -82,7 +82,7 @@ template <typename data_t> class InMemClusterStore : public AbstractClusterStore
// potentially after pre-processing the vectors if the metric deems so
// e.g., normalizing vectors for cosine distance over floating-point vectors
// useful for bulk or static index building.
virtual void assign_data_to_clusters(const data_t *vectors, std::vector<uint32_t> &ids) override;
virtual void assign_data_to_clusters(data_t *vectors, std::vector<uint32_t> &ids) override;

// operations on vectors
// like populate_data function, but over one vector at a time useful for
Expand Down
11 changes: 7 additions & 4 deletions include/index.h
Expand Up @@ -32,7 +32,7 @@
#define OVERHEAD_FACTOR 1.1
#define EXPAND_IF_FULL 0
#define DEFAULT_MAXC 750
#define INSTRUMENT true
//#define INSTRUMENT true

inline double time_to_intersect = 0.;
inline double time_to_cluster = 0.;
Expand Down Expand Up @@ -267,6 +267,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

void parse_label_file(const std::string &label_file, size_t &num_pts_labels);

void parse_sample_label_file(const std::string &label_file, size_t &num_samples);

std::vector<std::pair<LabelT, uint32_t>> sort_filter_counts(const std::vector<LabelT> &filter_label);

uint32_t sample_intersection(roaring::Roaring &intersection_bitmap, const std::vector<LabelT> &filter_label);
Expand Down Expand Up @@ -413,9 +415,10 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
tsl::robin_set<LabelT> _labels;
std::string _labels_file;
std::unordered_map<LabelT, uint32_t> _label_to_start_id;
std::unordered_map<LabelT, roaring::Roaring> _labels_to_points;
std::unordered_map<LabelT, roaring::Roaring> _labels_to_points_samples;
std::vector<std::unordered_map<LabelT, roaring::Roaring>> _clusters_to_labels_to_points;
std::vector<roaring::Roaring> _labels_to_points;
std::vector<roaring::Roaring> _labels_to_points_sample;
float _sample_prob = 0;
std::vector<std::vector<roaring::Roaring>> _clusters_to_labels_to_points;
std::unordered_map<uint32_t, uint32_t> _medoid_counts;
diskann::InMemClusterStore<T> *_ivf_clusters = nullptr;

Expand Down
14 changes: 13 additions & 1 deletion include/scratch.h
Expand Up @@ -38,6 +38,10 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>
{
return _L;
}
inline float* get_query_float()
{
return _aligned_query_float;
}
inline uint32_t get_R()
{
return _R;
Expand All @@ -62,6 +66,11 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>
{
return _best_l_nodes;
}
inline std::vector<uint32_t> &closest_clusters()
{
return _closest_clusters;
}

inline std::vector<float> &occlude_factor()
{
return _occlude_factor;
Expand Down Expand Up @@ -97,10 +106,10 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>

// Returns the reusable `_last_intersection` bitmap after wiping its contents.
// The wipe removes the closed range [minimum(), maximum()], i.e. the span that
// covers every value currently stored in the bitmap.
inline roaring::Roaring &get_valid_bitmap()
{
    const uint32_t lowest = _last_intersection.minimum();
    const uint32_t highest = _last_intersection.maximum();
    _last_intersection.removeRangeClosed(lowest, highest);
    return _last_intersection;
}


private:
uint32_t _L;
uint32_t _R;
Expand Down Expand Up @@ -141,6 +150,9 @@ template <typename T> class InMemQueryScratch : public AbstractScratch<T>
std::vector<uint32_t> _occlude_list_output;

roaring::Roaring _last_intersection;
// used to calculate the closest clusters during filtered search in a clustered index
std::vector<uint32_t> _closest_clusters;
float* _aligned_query_float;
};

//
Expand Down
27 changes: 19 additions & 8 deletions src/cluster_store.cpp
Expand Up @@ -40,16 +40,19 @@ template <typename data_t> uint32_t InMemClusterStore<data_t>::load(const std::s

_posting_lists.resize(this->_num_clusters);
for (unsigned i = 0; i < this->_num_clusters; i++) {
unsigned cur_count;
if (cur_count != 0)
non_empty_clusters.emplace_back(i);
unsigned cur_count = 0;

in.read((char *) &cur_count, sizeof(unsigned));

if (cur_count > 0) {
non_empty_clusters.emplace_back(i);
uint32_t* vals = new uint32_t[cur_count];
in.read((char *) vals, (uint64_t)cur_count * sizeof(unsigned));

_posting_lists[i] = RoaringIdList(cur_count, vals);
// roaring_bitmap_add_many((roaring_bitmap_t*)_posting_lists[i].get_bitmap(), cur_count, vals);
delete[] vals;
}

// roaring_bitmap_add_many((roaring_bitmap_t*)_posting_lists[i].get_bitmap(), cur_count, vals);
total_count += cur_count;
}
in.close();
Expand Down Expand Up @@ -102,16 +105,24 @@ template <typename data_t> size_t InMemClusterStore<data_t>::save(const std::str

// Installs a new set of cluster centroids and resets the posting lists.
//
//   clusters     - source buffer; num_clusters * _dim floats are read from it.
//   num_clusters - number of centroids to store; also becomes the number of
//                  (empty) posting lists after this call.
//
// NOTE(review): a buffer previously held in `_cluster_centroids` is not
// released here — confirm who owns it if this can be called more than once.
template <typename data_t> void InMemClusterStore<data_t>::add_cetroids(float *clusters, uint32_t num_clusters)
{
    this->_num_clusters = num_clusters;

    diskann::cout << "Set num clusters to " << num_clusters << ", and dim to " << this->_dim << std::endl;

    const uint64_t num_floats = static_cast<uint64_t>(num_clusters) * this->_dim;
    this->_cluster_centroids = new float[num_floats];
    // memcpy takes a BYTE count: it must be scaled by sizeof(float), otherwise
    // only a fraction of the centroid data is copied.
    std::memcpy(this->_cluster_centroids, clusters, num_floats * sizeof(float));

    // Start from one empty posting list per cluster.
    _posting_lists.clear();
    _posting_lists.resize(num_clusters);
}

template <typename data_t> void InMemClusterStore<data_t>::assign_data_to_clusters(const data_t *vectors, std::vector<uint32_t> &ids) {
template <typename data_t> void InMemClusterStore<data_t>::assign_data_to_clusters(data_t *vectors, std::vector<uint32_t> &ids) {
uint64_t num_pts = ids.size();
float* vectors_float = new float[num_pts*this->_dim];
float* vectors_float;
if (sizeof(data_t) != sizeof(float)) {
vectors_float = new float[num_pts*this->_dim];
diskann::convert_types<data_t, float>(vectors, vectors_float, num_pts, this->_dim);
} else {
vectors_float = (float*) vectors;
}

uint32_t* closest_centers = new uint32_t[num_pts];
math_utils::compute_closest_centers(vectors_float, num_pts, this->_dim, this->_cluster_centroids, this->_num_clusters, 1,
Expand Down

0 comments on commit 582921f

Please sign in to comment.