Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enhance: support sparse cardinal hnsw index #33656

Merged
merged 1 commit into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions internal/core/src/index/IndexFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@
return std::make_unique<VectorDiskAnnIndex<bin1>>(
index_type, metric_type, version, file_manager_context);
}
case DataType::VECTOR_SPARSE_FLOAT: {
return std::make_unique<VectorDiskAnnIndex<float>>(
index_type, metric_type, version, file_manager_context);

Check warning on line 218 in internal/core/src/index/IndexFactory.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/index/IndexFactory.cpp#L216-L218

Added lines #L216 - L218 were not covered by tests
}
default:
throw SegcoreError(
DataTypeInvalid,
Expand Down Expand Up @@ -328,6 +332,14 @@
space,
file_manager_context);
}
case DataType::VECTOR_SPARSE_FLOAT: {
return std::make_unique<VectorDiskAnnIndex<float>>(

Check warning on line 336 in internal/core/src/index/IndexFactory.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/index/IndexFactory.cpp#L335-L336

Added lines #L335 - L336 were not covered by tests
index_type,
metric_type,
version,
space,
file_manager_context);

Check warning on line 341 in internal/core/src/index/IndexFactory.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/index/IndexFactory.cpp#L341

Added line #L341 was not covered by tests
}
default:
throw SegcoreError(
DataTypeInvalid,
Expand Down
10 changes: 6 additions & 4 deletions internal/core/src/segcore/FieldIndexing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@
: FieldIndexing(field_meta, segcore_config),
built_(false),
sync_with_index_(false),
config_(std::make_unique<VecIndexConfig>(segment_max_row_count,
field_index_meta,
segcore_config,
SegmentType::Growing)) {
config_(std::make_unique<VecIndexConfig>(
segment_max_row_count,
field_index_meta,
segcore_config,
SegmentType::Growing,

Check warning on line 39 in internal/core/src/segcore/FieldIndexing.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/segcore/FieldIndexing.cpp#L39

Added line #L39 was not covered by tests
IsSparseFloatVectorDataType(field_meta.get_data_type()))) {
recreate_index();
}

Expand Down
15 changes: 10 additions & 5 deletions internal/core/src/segcore/IndexConfigGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ namespace milvus::segcore {
VecIndexConfig::VecIndexConfig(const int64_t max_index_row_cout,
const FieldIndexMeta& index_meta_,
const SegcoreConfig& config,
const SegmentType& segment_type)
: max_index_row_count_(max_index_row_cout), config_(config) {
const SegmentType& segment_type,
const bool is_sparse)
: max_index_row_count_(max_index_row_cout),
config_(config),
is_sparse_(is_sparse) {
origin_index_type_ = index_meta_.GetIndexType();
metric_type_ = index_meta_.GeMetricType();
// Currently for dense vector index, if the segment is growing, we use IVFCC
Expand All @@ -29,11 +32,15 @@ VecIndexConfig::VecIndexConfig(const int64_t max_index_row_cout,
// But for sparse vector index(INDEX_SPARSE_INVERTED_INDEX and
// INDEX_SPARSE_WAND), those indexes themselves can be used as the temp index
// type, so we can avoid the extra step of "release temp and load".
// When using HNSW(cardinal) for sparse, we use INDEX_SPARSE_INVERTED_INDEX
// as the growing index.

if (origin_index_type_ ==
knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX ||
origin_index_type_ == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
index_type_ = origin_index_type_;
} else if (is_sparse_) {
index_type_ = knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX;
} else {
index_type_ = support_index_types.at(segment_type);
}
Expand All @@ -58,9 +65,7 @@ VecIndexConfig::GetBuildThreshold() const noexcept {
// For sparse, do not impose a threshold and start using index with any
// number of rows. Unlike dense vector index, growing sparse vector index
// does not require a minimum number of rows to train.
if (origin_index_type_ ==
knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX ||
origin_index_type_ == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
if (is_sparse_) {
return 0;
}
assert(VecIndexConfig::index_build_ratio.count(index_type_));
Expand Down
5 changes: 4 additions & 1 deletion internal/core/src/segcore/IndexConfigGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ class VecIndexConfig {
VecIndexConfig(const int64_t max_index_row_count,
const FieldIndexMeta& index_meta_,
const SegcoreConfig& config,
const SegmentType& segment_type);
const SegmentType& segment_type,
const bool is_sparse);

int64_t
GetBuildThreshold() const noexcept;
Expand Down Expand Up @@ -72,6 +73,8 @@ class VecIndexConfig {

knowhere::MetricType metric_type_;

bool is_sparse_;

knowhere::Json build_params_;

knowhere::Json search_params_;
Expand Down
3 changes: 2 additions & 1 deletion internal/core/src/segcore/SegmentSealedImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1561,7 +1561,8 @@
new VecIndexConfig(row_count,
field_index_meta,
segcore_config_,
SegmentType::Sealed));
SegmentType::Sealed,

Check warning on line 1564 in internal/core/src/segcore/SegmentSealedImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/segcore/SegmentSealedImpl.cpp#L1564

Added line #L1564 was not covered by tests
is_sparse));
if (row_count < field_binlog_config->GetBuildThreshold()) {
return false;
}
Expand Down
71 changes: 56 additions & 15 deletions internal/core/src/storage/DiskFileManagerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,10 +452,18 @@

auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
auto local_data_path = storage::GenFieldRawDataPathPrefix(
local_chunk_manager, segment_id, field_id) +
"raw_data";
local_chunk_manager->CreateFile(local_data_path);
std::string local_data_path;
bool file_created = false;

Check warning on line 456 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L455-L456

Added lines #L455 - L456 were not covered by tests

auto init_file_info = [&](milvus::DataType dt) {
local_data_path = storage::GenFieldRawDataPathPrefix(

Check warning on line 459 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L458-L459

Added lines #L458 - L459 were not covered by tests
local_chunk_manager, segment_id, field_id) +
"raw_data";
if (dt == milvus::DataType::VECTOR_SPARSE_FLOAT) {
local_data_path += ".sparse_u32_f32";

Check warning on line 463 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L462-L463

Added lines #L462 - L463 were not covered by tests
}
local_chunk_manager->CreateFile(local_data_path);

Check warning on line 465 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L465

Added line #L465 was not covered by tests
};

// get batch raw data from s3 and write batch data to disk file
// TODO: load and write of different batches at the same time
Expand All @@ -473,17 +481,50 @@
for (int i = 0; i < batch_size; ++i) {
auto field_data = field_datas[i].get()->GetFieldData();
num_rows += uint32_t(field_data->get_num_rows());
AssertInfo(dim == 0 || dim == field_data->get_dim(),
"inconsistent dim value in multi binlogs!");
dim = field_data->get_dim();

auto data_size =
field_data->get_num_rows() * dim * sizeof(DataType);
local_chunk_manager->Write(local_data_path,
write_offset,
const_cast<void*>(field_data->Data()),
data_size);
write_offset += data_size;
auto data_type = field_data->get_data_type();
if (!file_created) {
init_file_info(data_type);
file_created = true;

Check warning on line 487 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L484-L487

Added lines #L484 - L487 were not covered by tests
}
if (data_type == milvus::DataType::VECTOR_SPARSE_FLOAT) {
dim = std::max(

Check warning on line 490 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L489-L490

Added lines #L489 - L490 were not covered by tests
dim,
(uint32_t)(
std::dynamic_pointer_cast<FieldData<SparseFloatVector>>(
field_data)
->Dim()));

Check warning on line 495 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L495

Added line #L495 was not covered by tests
auto sparse_rows =
static_cast<const knowhere::sparse::SparseRow<float>*>(
field_data->Data());
for (size_t i = 0; i < field_data->Length(); ++i) {
auto row = sparse_rows[i];
auto row_byte_size = row.data_byte_size();
uint32_t nnz = row.size();
local_chunk_manager->Write(local_data_path,

Check warning on line 503 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L498-L503

Added lines #L498 - L503 were not covered by tests
write_offset,
const_cast<uint32_t*>(&nnz),
sizeof(nnz));
write_offset += sizeof(nnz);
local_chunk_manager->Write(local_data_path,

Check warning on line 508 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L507-L508

Added lines #L507 - L508 were not covered by tests
write_offset,
row.data(),
row_byte_size);
write_offset += row_byte_size;

Check warning on line 512 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L512

Added line #L512 was not covered by tests
}
} else {
AssertInfo(dim == 0 || dim == field_data->get_dim(),

Check warning on line 515 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L515

Added line #L515 was not covered by tests
"inconsistent dim value in multi binlogs!");
dim = field_data->get_dim();

Check warning on line 517 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L517

Added line #L517 was not covered by tests

auto data_size =
field_data->get_num_rows() * dim * sizeof(DataType);
local_chunk_manager->Write(

Check warning on line 521 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L519-L521

Added lines #L519 - L521 were not covered by tests
local_data_path,
write_offset,
const_cast<void*>(field_data->Data()),

Check warning on line 524 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L524

Added line #L524 was not covered by tests
data_size);
write_offset += data_size;

Check warning on line 526 in internal/core/src/storage/DiskFileManagerImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/storage/DiskFileManagerImpl.cpp#L526

Added line #L526 was not covered by tests
}
}
};

Expand Down
Loading