Skip to content

Commit

Permalink
fix: fix binary vector data size (#33750)
Browse files Browse the repository at this point in the history
issue: #22837

- fix byte size wrong for binary vectors
- fix the expect/actual error msg

Signed-off-by: chasingegg <chao.gao@zilliz.com>
  • Loading branch information
chasingegg committed Jun 18, 2024
1 parent a789c60 commit 0d20303
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 23 deletions.
22 changes: 22 additions & 0 deletions internal/core/src/common/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,16 @@ GetDataTypeSize(DataType data_type, int dim = 1) {
}
}

template <typename T>
inline size_t
GetVecRowSize(int64_t dim) {
if constexpr (std::is_same_v<T, bin1>) {
return (dim / 8) * sizeof(bin1);
} else {
return dim * sizeof(T);
}
}

// TODO: use magic_enum when available
inline std::string
GetDataTypeName(DataType data_type) {
Expand Down Expand Up @@ -393,6 +403,18 @@ IndexIsSparse(const IndexType& index_type) {
index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND;
}

inline bool
IsFloatVectorMetricType(const MetricType& metric_type) {
return metric_type == knowhere::metric::L2 ||
metric_type == knowhere::metric::IP ||
metric_type == knowhere::metric::COSINE;
}

inline bool
IsBinaryVectorMetricType(const MetricType& metric_type) {
return !IsFloatVectorMetricType(metric_type);
}

// Plus 1 because we can't use greater(>) symbol
constexpr size_t REF_SIZE_THRESHOLD = 16 + 1;

Expand Down
4 changes: 2 additions & 2 deletions internal/core/src/index/IndexFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ IndexFactory::CreateVectorIndex(
index_type, metric_type, version, file_manager_context);
}
case DataType::VECTOR_BINARY: {
return std::make_unique<VectorMemIndex<uint8_t>>(
return std::make_unique<VectorMemIndex<bin1>>(
index_type, metric_type, version, file_manager_context);
}
case DataType::VECTOR_FLOAT16: {
Expand Down Expand Up @@ -360,7 +360,7 @@ IndexFactory::CreateVectorIndex(
create_index_info, file_manager_context, space);
}
case DataType::VECTOR_BINARY: {
return std::make_unique<VectorMemIndex<uint8_t>>(
return std::make_unique<VectorMemIndex<bin1>>(
create_index_info, file_manager_context, space);
}
case DataType::VECTOR_FLOAT16: {
Expand Down
14 changes: 14 additions & 0 deletions internal/core/src/index/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,20 @@ SetValueToConfig(Config& cfg, const std::string& key, const T value) {
cfg[key] = value;
}

template <typename T>
inline void
CheckMetricTypeSupport(const MetricType& metric_type) {
if constexpr (std::is_same_v<T, bin1>) {
AssertInfo(
IsBinaryVectorMetricType(metric_type),
"binary vector does not float vector metric type: " + metric_type);
} else {
AssertInfo(
IsFloatVectorMetricType(metric_type),
"float vector does not binary vector metric type: " + metric_type);
}
}

int64_t
GetDimFromConfig(const Config& config);

Expand Down
11 changes: 4 additions & 7 deletions internal/core/src/index/VectorDiskIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ VectorDiskAnnIndex<T>::VectorDiskAnnIndex(
const IndexVersion& version,
const storage::FileManagerContext& file_manager_context)
: VectorIndex(index_type, metric_type) {
CheckMetricTypeSupport<T>(metric_type);
file_manager_ =
std::make_shared<storage::DiskFileManagerImpl>(file_manager_context);
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
Expand Down Expand Up @@ -80,6 +81,7 @@ VectorDiskAnnIndex<T>::VectorDiskAnnIndex(
std::shared_ptr<milvus_storage::Space> space,
const storage::FileManagerContext& file_manager_context)
: space_(space), VectorIndex(index_type, metric_type) {
CheckMetricTypeSupport<T>(metric_type);
file_manager_ = std::make_shared<storage::DiskFileManagerImpl>(
file_manager_context, file_manager_context.space_);
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
Expand Down Expand Up @@ -316,7 +318,7 @@ VectorDiskAnnIndex<T>::BuildWithDataset(const DatasetPtr& dataset,
local_chunk_manager->Write(local_data_path, offset, &dim, sizeof(dim));
offset += sizeof(dim);

auto data_size = num * dim * sizeof(T);
size_t data_size = static_cast<size_t>(num) * milvus::GetVecRowSize<T>(dim);
auto raw_data = const_cast<void*>(milvus::GetDatasetTensor(dataset));
local_chunk_manager->Write(local_data_path, offset, raw_data, data_size);

Expand Down Expand Up @@ -448,12 +450,7 @@ VectorDiskAnnIndex<T>::GetVector(const DatasetPtr dataset) const {
auto tensor = res.value()->GetTensor();
auto row_num = res.value()->GetRows();
auto dim = res.value()->GetDim();
int64_t data_size;
if constexpr (std::is_same_v<T, bin1>) {
data_size = dim / 8 * row_num;
} else {
data_size = dim * row_num * sizeof(T);
}
int64_t data_size = milvus::GetVecRowSize<T>(dim) * row_num;
std::vector<uint8_t> raw_data;
raw_data.resize(data_size);
memcpy(raw_data.data(), tensor, data_size);
Expand Down
9 changes: 3 additions & 6 deletions internal/core/src/index/VectorMemIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ VectorMemIndex<T>::VectorMemIndex(
const IndexVersion& version,
const storage::FileManagerContext& file_manager_context)
: VectorIndex(index_type, metric_type) {
CheckMetricTypeSupport<T>(metric_type);
AssertInfo(!is_unsupported(index_type, metric_type),
index_type + " doesn't support metric: " + metric_type);
if (file_manager_context.Valid()) {
Expand Down Expand Up @@ -90,6 +91,7 @@ VectorMemIndex<T>::VectorMemIndex(
: VectorIndex(create_index_info.index_type, create_index_info.metric_type),
space_(space),
create_index_info_(create_index_info) {
CheckMetricTypeSupport<T>(create_index_info.metric_type);
AssertInfo(!is_unsupported(create_index_info.index_type,
create_index_info.metric_type),
create_index_info.index_type +
Expand Down Expand Up @@ -668,12 +670,7 @@ VectorMemIndex<T>::GetVector(const DatasetPtr dataset) const {
auto tensor = res.value()->GetTensor();
auto row_num = res.value()->GetRows();
auto dim = res.value()->GetDim();
int64_t data_size;
if constexpr (std::is_same_v<T, bin1>) {
data_size = dim / 8 * row_num;
} else {
data_size = dim * row_num * sizeof(T);
}
int64_t data_size = milvus::GetVecRowSize<T>(dim) * row_num;
std::vector<uint8_t> raw_data;
raw_data.resize(data_size);
memcpy(raw_data.data(), tensor, data_size);
Expand Down
6 changes: 3 additions & 3 deletions internal/core/src/storage/DiskFileManagerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ DiskFileManagerImpl::CacheRawDataToDisk(
field_data->FillFieldData(col_data);
dim = field_data->get_dim();
auto data_size =
field_data->get_num_rows() * index_meta_.dim * sizeof(DataType);
field_data->get_num_rows() * milvus::GetVecRowSize<DataType>(dim);
local_chunk_manager->Write(local_data_path,
write_offset,
const_cast<void*>(field_data->Data()),
Expand Down Expand Up @@ -516,8 +516,8 @@ DiskFileManagerImpl::CacheRawDataToDisk(std::vector<std::string> remote_files) {
"inconsistent dim value in multi binlogs!");
dim = field_data->get_dim();

auto data_size =
field_data->get_num_rows() * dim * sizeof(DataType);
auto data_size = field_data->get_num_rows() *
milvus::GetVecRowSize<DataType>(dim);
local_chunk_manager->Write(
local_data_path,
write_offset,
Expand Down
4 changes: 2 additions & 2 deletions internal/core/unittest/test_growing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ class GrowingTest
public:
void
SetUp() override {
auto index_type = std::get<0>(GetParam());
auto metric_type = std::get<1>(GetParam());
index_type = std::get<0>(GetParam());
metric_type = std::get<1>(GetParam());
if (index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT ||
index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC) {
data_type = DataType::VECTOR_FLOAT;
Expand Down
3 changes: 1 addition & 2 deletions internal/core/unittest/test_indexing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,8 +329,7 @@ class IndexTest : public ::testing::TestWithParam<Param> {
index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
is_sparse = true;
vec_field_data_type = milvus::DataType::VECTOR_SPARSE_FLOAT;
} else if (index_type == knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT ||
index_type == knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP) {
} else if (IsBinaryVectorMetricType(metric_type)) {
is_binary = true;
vec_field_data_type = milvus::DataType::VECTOR_BINARY;
} else {
Expand Down
2 changes: 1 addition & 1 deletion internal/proxy/validate_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ func (v *validateUtil) checkAligned(data []*schemapb.FieldData, schema *typeutil
}
errDimMismatch := func(fieldName string, dataDim int64, schemaDim int64) error {
msg := fmt.Sprintf("the dim (%d) of field data(%s) is not equal to schema dim (%d)", dataDim, fieldName, schemaDim)
return merr.WrapErrParameterInvalid(dataDim, schemaDim, msg)
return merr.WrapErrParameterInvalid(schemaDim, dataDim, msg)
}
for _, field := range data {
switch field.GetType() {
Expand Down

0 comments on commit 0d20303

Please sign in to comment.