diff --git a/internal/core/src/common/FieldDataInterface.h b/internal/core/src/common/FieldDataInterface.h index f5ce6a4299e4c..17916f08e6259 100644 --- a/internal/core/src/common/FieldDataInterface.h +++ b/internal/core/src/common/FieldDataInterface.h @@ -425,7 +425,7 @@ class FieldDataSparseVectorImpl } private: - int64_t vec_dim_; + int64_t vec_dim_ = 0; }; class FieldDataArrayImpl : public FieldDataImpl { diff --git a/internal/core/src/common/Types.h b/internal/core/src/common/Types.h index c0e742a031c46..ef7adc5872153 100644 --- a/internal/core/src/common/Types.h +++ b/internal/core/src/common/Types.h @@ -179,6 +179,12 @@ using IndexVersion = knowhere::IndexVersion; // TODO :: type define milvus index type(vector index type and scalar index type) using IndexType = knowhere::IndexType; +inline bool +IndexIsSparse(const IndexType& index_type) { + return index_type == knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX || + index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND; +} + // Plus 1 because we can't use greater(>) symbol constexpr size_t REF_SIZE_THRESHOLD = 16 + 1; diff --git a/internal/core/src/common/Utils.h b/internal/core/src/common/Utils.h index 690e08498f51a..4e4a0a3527ad3 100644 --- a/internal/core/src/common/Utils.h +++ b/internal/core/src/common/Utils.h @@ -241,23 +241,25 @@ SparseBytesToRows(const Iterable& rows) { return res; } -// SparseRowsToProto converts a vector of knowhere::sparse::SparseRow to +// SparseRowsToProto converts a list of knowhere::sparse::SparseRow to // a milvus::proto::schema::SparseFloatArray. The resulting proto is a deep copy -// of the source data. +// of the source data. source(i) returns the i-th row to be copied. inline void -SparseRowsToProto(const knowhere::sparse::SparseRow* source, - int64_t rows, - milvus::proto::schema::SparseFloatArray* proto) { +SparseRowsToProto( + const std::function*(size_t)>& + source, + int64_t rows, + milvus::proto::schema::SparseFloatArray* proto) { int64_t max_dim = 0; for (size_t i = 0; i < rows; ++i) { - if (source + i == nullptr) { + const auto* row = source(i); + if (row == nullptr) { // empty row proto->add_contents(); continue; } - auto& row = source[i]; - max_dim = std::max(max_dim, row.dim()); - proto->add_contents(row.data(), row.data_byte_size()); + max_dim = std::max(max_dim, row->dim()); + proto->add_contents(row->data(), row->data_byte_size()); } proto->set_dim(max_dim); } diff --git a/internal/core/src/common/VectorTrait.h b/internal/core/src/common/VectorTrait.h index 8062910ce2f43..67e92deb46e37 100644 --- a/internal/core/src/common/VectorTrait.h +++ b/internal/core/src/common/VectorTrait.h @@ -68,33 +68,6 @@ template constexpr bool IsSparse = std::is_same_v || std::is_same_v>; -template -struct EmbeddedTypeImpl; - -template -struct EmbeddedTypeImpl>> { - using type = T; -}; - -template -struct EmbeddedTypeImpl>> { - using type = std::conditional_t< - std::is_same_v, - float, - std::conditional_t< - std::is_same_v, - float16, - std::conditional_t< - std::is_same_v, - bfloat16, - std::conditional_t, - void, - uint8_t>>>>; -}; - -template -using EmbeddedType = typename EmbeddedTypeImpl::type; - struct FundamentalTag {}; struct StringTag {}; diff --git a/internal/core/src/index/VectorDiskIndex.cpp b/internal/core/src/index/VectorDiskIndex.cpp index 6fd2784a1ba29..d2c4ef3a03a2f 100644 --- a/internal/core/src/index/VectorDiskIndex.cpp +++ b/internal/core/src/index/VectorDiskIndex.cpp @@ -416,6 +416,11 @@ VectorDiskAnnIndex::HasRawData() const { template std::vector VectorDiskAnnIndex::GetVector(const DatasetPtr dataset) const { + auto index_type = GetIndexType(); + if (IndexIsSparse(index_type)) { + PanicInfo(ErrorCode::UnexpectedError, + "failed to get vector, index is sparse"); + } auto res = index_.GetVectorByIds(*dataset); if (!res.has_value()) { PanicInfo(ErrorCode::UnexpectedError, @@ -423,7 +428,6 @@ VectorDiskAnnIndex::GetVector(const DatasetPtr dataset) const { KnowhereStatusString(res.error()), res.what())); } - auto index_type = GetIndexType(); auto tensor = res.value()->GetTensor(); auto row_num = res.value()->GetRows(); auto dim = res.value()->GetDim(); diff --git a/internal/core/src/index/VectorDiskIndex.h b/internal/core/src/index/VectorDiskIndex.h index 47dd3d66639cb..b9d634cb5cd84 100644 --- a/internal/core/src/index/VectorDiskIndex.h +++ b/internal/core/src/index/VectorDiskIndex.h @@ -98,6 +98,12 @@ class VectorDiskAnnIndex : public VectorIndex { std::vector GetVector(const DatasetPtr dataset) const override; + std::unique_ptr[]> + GetSparseVector(const DatasetPtr dataset) const override { + PanicInfo(ErrorCode::Unsupported, + "get sparse vector not supported for disk index"); + } + void CleanLocalData() override; diff --git a/internal/core/src/index/VectorIndex.h b/internal/core/src/index/VectorIndex.h index 0dc15a65bdc2b..f3d5bccb53d6d 100644 --- a/internal/core/src/index/VectorIndex.h +++ b/internal/core/src/index/VectorIndex.h @@ -76,6 +76,9 @@ class VectorIndex : public IndexBase { virtual std::vector GetVector(const DatasetPtr dataset) const = 0; + virtual std::unique_ptr[]> + GetSparseVector(const DatasetPtr dataset) const = 0; + IndexType GetIndexType() const { return index_type_; diff --git a/internal/core/src/index/VectorMemIndex.cpp b/internal/core/src/index/VectorMemIndex.cpp index bb3b2ba2c2635..fababfb7e352c 100644 --- a/internal/core/src/index/VectorMemIndex.cpp +++ b/internal/core/src/index/VectorMemIndex.cpp @@ -491,7 +491,7 @@ VectorMemIndex::Build(const Config& config) { build_config.update(config); build_config.erase("insert_files"); build_config.erase(VEC_OPT_FIELDS); - if (GetIndexType().find("SPARSE") == std::string::npos) { + if (!IndexIsSparse(GetIndexType())) { int64_t total_size = 0; int64_t total_num_rows = 0; int64_t dim = 0; @@ -534,6 +534,7 @@ VectorMemIndex::Build(const Config& config) { AssertInfo(ptr, "failed to cast field data to sparse rows"); for (size_t i = 0; i < field_data->Length(); ++i) { // this does a deep copy of field_data's data. + AssertInfo(dim >= ptr[i].dim(), "bad dim"); vec[offset + i] = ptr[i]; } offset += field_data->Length(); @@ -636,12 +637,17 @@ VectorMemIndex::HasRawData() const { template std::vector VectorMemIndex::GetVector(const DatasetPtr dataset) const { + auto index_type = GetIndexType(); + if (IndexIsSparse(index_type)) { + PanicInfo(ErrorCode::UnexpectedError, + "failed to get vector, index is sparse"); + } + auto res = index_.GetVectorByIds(*dataset); if (!res.has_value()) { PanicInfo(ErrorCode::UnexpectedError, "failed to get vector, " + KnowhereStatusString(res.error())); } - auto index_type = GetIndexType(); auto tensor = res.value()->GetTensor(); auto row_num = res.value()->GetRows(); auto dim = res.value()->GetDim(); @@ -657,6 +663,21 @@ VectorMemIndex::GetVector(const DatasetPtr dataset) const { return raw_data; } +template +std::unique_ptr[]> +VectorMemIndex::GetSparseVector(const DatasetPtr dataset) const { + auto res = index_.GetVectorByIds(*dataset); + if (!res.has_value()) { + PanicInfo(ErrorCode::UnexpectedError, + "failed to get vector, " + KnowhereStatusString(res.error())); + } + // release and transfer ownership to the result unique ptr. + res.value()->SetIsOwner(false); + return std::unique_ptr[]>( + static_cast*>( + res.value()->GetTensor())); +} + template void VectorMemIndex::LoadFromFile(const Config& config) { diff --git a/internal/core/src/index/VectorMemIndex.h b/internal/core/src/index/VectorMemIndex.h index 287557a844407..cda1f2fc7c370 100644 --- a/internal/core/src/index/VectorMemIndex.h +++ b/internal/core/src/index/VectorMemIndex.h @@ -85,6 +85,9 @@ class VectorMemIndex : public VectorIndex { std::vector GetVector(const DatasetPtr dataset) const override; + std::unique_ptr[]> + GetSparseVector(const DatasetPtr dataset) const override; + BinarySet Upload(const Config& config = {}) override; diff --git a/internal/core/src/mmap/Column.h b/internal/core/src/mmap/Column.h index b8acfcc1e73f1..ffa3c2af7f98b 100644 --- a/internal/core/src/mmap/Column.h +++ b/internal/core/src/mmap/Column.h @@ -45,7 +45,9 @@ class ColumnBase { public: // memory mode ctor ColumnBase(size_t reserve, const FieldMeta& field_meta) - : type_size_(field_meta.get_sizeof()) { + : type_size_(datatype_is_sparse_vector(field_meta.get_data_type()) + ? 1 + : field_meta.get_sizeof()) { // simdjson requires a padding following the json data padding_ = field_meta.get_data_type() == DataType::JSON ? simdjson::SIMDJSON_PADDING @@ -55,7 +57,7 @@ class ColumnBase { return; } - cap_size_ = field_meta.get_sizeof() * reserve; + cap_size_ = type_size_ * reserve; // use anon mapping so we are able to free these memory with munmap only data_ = static_cast(mmap(nullptr, @@ -72,8 +74,10 @@ class ColumnBase { // mmap mode ctor ColumnBase(const File& file, size_t size, const FieldMeta& field_meta) - : type_size_(field_meta.get_sizeof()), - num_rows_(size / field_meta.get_sizeof()) { + : type_size_(datatype_is_sparse_vector(field_meta.get_data_type()) + ? 1 + : field_meta.get_sizeof()), + num_rows_(size / type_size_) { padding_ = field_meta.get_data_type() == DataType::JSON ? simdjson::SIMDJSON_PADDING : 0; diff --git a/internal/core/src/query/Plan.cpp b/internal/core/src/query/Plan.cpp index 87f14887535c0..087abe310476d 100644 --- a/internal/core/src/query/Plan.cpp +++ b/internal/core/src/query/Plan.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "Plan.h" +#include "common/Utils.h" #include "PlanProto.h" #include "generated/ShowPlanNodeVisitor.h" @@ -34,9 +35,8 @@ std::unique_ptr ParsePlaceholderGroup(const Plan* plan, const uint8_t* blob, const int64_t blob_len) { - namespace set = milvus::proto::common; auto result = std::make_unique(); - set::PlaceholderGroup ph_group; + milvus::proto::common::PlaceholderGroup ph_group; auto ok = ph_group.ParseFromArray(blob, blob_len); Assert(ok); for (auto& info : ph_group.placeholders()) { @@ -46,22 +46,26 @@ ParsePlaceholderGroup(const Plan* plan, auto field_id = plan->tag2field_.at(element.tag_); auto& field_meta = plan->schema_[field_id]; element.num_of_queries_ = info.values_size(); - AssertInfo(element.num_of_queries_, "must have queries"); - Assert(element.num_of_queries_ > 0); - element.line_sizeof_ = info.values().Get(0).size(); - if (field_meta.get_sizeof() != element.line_sizeof_) { - throw SegcoreError( - DimNotMatch, - fmt::format("vector dimension mismatch, expected vector " - "size(byte) {}, actual {}.", - field_meta.get_sizeof(), - element.line_sizeof_)); - } - auto& target = element.blob_; - target.reserve(element.line_sizeof_ * element.num_of_queries_); - for (auto& line : info.values()) { - Assert(element.line_sizeof_ == line.size()); - target.insert(target.end(), line.begin(), line.end()); + AssertInfo(element.num_of_queries_ > 0, "must have queries"); + if (info.type() == + milvus::proto::common::PlaceholderType::SparseFloatVector) { + element.sparse_matrix_ = SparseBytesToRows(info.values()); + } else { + auto line_size = info.values().Get(0).size(); + if (field_meta.get_sizeof() != line_size) { + throw SegcoreError( + DimNotMatch, + fmt::format("vector dimension mismatch, expected vector " + "size(byte) {}, actual {}.", + field_meta.get_sizeof(), + line_size)); + } + auto& target = element.blob_; + target.reserve(line_size * element.num_of_queries_); + for (auto& line : info.values()) { + Assert(line_size == line.size()); + target.insert(target.end(), line.begin(), line.end()); + } } result->emplace_back(std::move(element)); } diff --git a/internal/core/src/query/PlanImpl.h b/internal/core/src/query/PlanImpl.h index d015387f63d22..089902e95742f 100644 --- a/internal/core/src/query/PlanImpl.h +++ b/internal/core/src/query/PlanImpl.h @@ -64,19 +64,30 @@ struct Plan { struct Placeholder { std::string tag_; int64_t num_of_queries_; - int64_t line_sizeof_; + // TODO(SPARSE): add a dim_ field here, use the dim passed in search request + // instead of the dim in schema, since the dim of sparse float column is + // dynamic. This change will likely affect lots of code, thus I'll do it in + // a separate PR, and use dim=0 for sparse float vector searches for now. + + // only one of blob_ and sparse_matrix_ should be set. blob_ is used for + // dense vector search and sparse_matrix_ is for sparse vector search. aligned_vector blob_; + std::unique_ptr[]> sparse_matrix_; - template - const T* + const void* get_blob() const { - return reinterpret_cast(blob_.data()); + if (blob_.empty()) { + return sparse_matrix_.get(); + } + return blob_.data(); } - template - T* + void* get_blob() { - return reinterpret_cast(blob_.data()); + if (blob_.empty()) { + return sparse_matrix_.get(); + } + return blob_.data(); } }; diff --git a/internal/core/src/query/PlanNode.h b/internal/core/src/query/PlanNode.h index 69c970c314774..de39c0afd1370 100644 --- a/internal/core/src/query/PlanNode.h +++ b/internal/core/src/query/PlanNode.h @@ -67,6 +67,12 @@ struct BFloat16VectorANNS : VectorPlanNode { accept(PlanNodeVisitor&) override; }; +struct SparseFloatVectorANNS : VectorPlanNode { + public: + void + accept(PlanNodeVisitor&) override; +}; + struct RetrievePlanNode : PlanNode { public: void diff --git a/internal/core/src/query/PlanProto.cpp b/internal/core/src/query/PlanProto.cpp index d782ec160969c..50a4008eeca3a 100644 --- a/internal/core/src/query/PlanProto.cpp +++ b/internal/core/src/query/PlanProto.cpp @@ -217,6 +217,9 @@ ProtoParser::PlanNodeFromProto(const planpb::PlanNode& plan_node_proto) { } else if (anns_proto.vector_type() == milvus::proto::plan::VectorType::BFloat16Vector) { return std::make_unique(); + } else if (anns_proto.vector_type() == + milvus::proto::plan::VectorType::SparseFloatVector) { + return std::make_unique(); } else { return std::make_unique(); } diff --git a/internal/core/src/query/SearchBruteForce.cpp b/internal/core/src/query/SearchBruteForce.cpp index 4eb3b13eea271..ecb3b31dca7d0 100644 --- a/internal/core/src/query/SearchBruteForce.cpp +++ b/internal/core/src/query/SearchBruteForce.cpp @@ -36,7 +36,8 @@ CheckBruteForceSearchParam(const FieldMeta& field, "[BruteForceSearch] Data type isn't vector type"); bool is_float_data_type = (data_type == DataType::VECTOR_FLOAT || data_type == DataType::VECTOR_FLOAT16 || - data_type == DataType::VECTOR_BFLOAT16); + data_type == DataType::VECTOR_BFLOAT16 || + data_type == DataType::VECTOR_SPARSE_FLOAT); bool is_float_metric_type = IsFloatMetricType(metric_type); AssertInfo(is_float_data_type == is_float_metric_type, "[BruteForceSearch] Data type and metric type miss-match"); @@ -85,7 +86,24 @@ BruteForceSearch(const dataset::SearchDataset& dataset, sub_result.mutable_seg_offsets().resize(nq * topk); sub_result.mutable_distances().resize(nq * topk); - if (search_cfg.contains(RADIUS)) { + if (data_type == DataType::VECTOR_SPARSE_FLOAT) { + // TODO(SPARSE): support sparse brute force range search + AssertInfo(!search_cfg.contains(RADIUS) && !search_cfg.contains(RANGE_FILTER), + "sparse vector not support range search"); + base_dataset->SetIsSparse(true); + query_dataset->SetIsSparse(true); + auto stat = knowhere::BruteForce::SearchSparseWithBuf( + base_dataset, + query_dataset, + sub_result.mutable_seg_offsets().data(), + sub_result.mutable_distances().data(), + search_cfg, + bitset); + milvus::tracer::AddEvent("knowhere_finish_BruteForce_SearchWithBuf"); + if (stat != knowhere::Status::success) { + throw SegcoreError(KnowhereError, KnowhereStatusString(stat)); + } + } else if (search_cfg.contains(RADIUS)) { if (search_cfg.contains(RANGE_FILTER)) { CheckRangeSearchParam(search_cfg[RADIUS], search_cfg[RANGE_FILTER], @@ -195,6 +213,7 @@ BruteForceSearchIterators(const dataset::SearchDataset& dataset, base_dataset, query_dataset, search_cfg, bitset); break; default: + // TODO(SPARSE): support sparse brute force iterator PanicInfo(ErrorCode::Unsupported, "Unsupported dataType for chunk brute force iterator:{}", data_type); diff --git a/internal/core/src/query/SearchOnGrowing.cpp b/internal/core/src/query/SearchOnGrowing.cpp index 6d3e40ca5f55b..c2573195cec64 100644 --- a/internal/core/src/query/SearchOnGrowing.cpp +++ b/internal/core/src/query/SearchOnGrowing.cpp @@ -32,14 +32,18 @@ FloatSegmentIndexSearch(const segcore::SegmentGrowingImpl& segment, auto vecfield_id = info.field_id_; auto& field = schema[vecfield_id]; + auto is_sparse = field.get_data_type() == DataType::VECTOR_SPARSE_FLOAT; + // TODO(SPARSE): see todo in PlanImpl.h::PlaceHolder. + auto dim = is_sparse ? 0 : field.get_dim(); - AssertInfo(field.get_data_type() == DataType::VECTOR_FLOAT, - "[FloatSearch]Field data type isn't VECTOR_FLOAT"); + AssertInfo(field.get_data_type() == DataType::VECTOR_FLOAT || + field.get_data_type() == DataType::VECTOR_SPARSE_FLOAT, + "[FloatSearch]Field data type isn't VECTOR_FLOAT or VECTOR_SPARSE_FLOAT"); dataset::SearchDataset search_dataset{info.metric_type_, num_queries, info.topk_, info.round_decimal_, - field.get_dim(), + dim, query_data}; if (indexing_record.is_in(vecfield_id)) { const auto& field_indexing = @@ -48,8 +52,12 @@ FloatSegmentIndexSearch(const segcore::SegmentGrowingImpl& segment, auto indexing = field_indexing.get_segment_indexing(); SearchInfo search_conf = field_indexing.get_search_params(info); auto vec_index = dynamic_cast(indexing); - SearchOnIndex( - search_dataset, *vec_index, search_conf, bitset, search_result); + SearchOnIndex(search_dataset, + *vec_index, + search_conf, + bitset, + search_result, + is_sparse); } } @@ -76,7 +84,6 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment, AssertInfo(datatype_is_vector(data_type), "[SearchOnGrowing]Data type isn't vector type"); - auto dim = field.get_dim(); auto topk = info.topk_; auto metric_type = info.metric_type_; auto round_decimal = info.round_decimal_; @@ -87,6 +94,10 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment, segment, info, query_data, num_queries, bitset, search_result); } else { SubSearchResult final_qr(num_queries, topk, metric_type, round_decimal); + // TODO(SPARSE): see todo in PlanImpl.h::PlaceHolder. + auto dim = field.get_data_type() == DataType::VECTOR_SPARSE_FLOAT + ? 0 + : field.get_dim(); dataset::SearchDataset search_dataset{ metric_type, num_queries, topk, round_decimal, dim, query_data}; std::shared_lock read_chunk_mutex( diff --git a/internal/core/src/query/SearchOnIndex.cpp b/internal/core/src/query/SearchOnIndex.cpp index f0a992ae92ce8..45de711f6c357 100644 --- a/internal/core/src/query/SearchOnIndex.cpp +++ b/internal/core/src/query/SearchOnIndex.cpp @@ -18,12 +18,14 @@ SearchOnIndex(const dataset::SearchDataset& search_dataset, const index::VectorIndex& indexing, const SearchInfo& search_conf, const BitsetView& bitset, - SearchResult& search_result) { + SearchResult& search_result, + bool is_sparse) { auto num_queries = search_dataset.num_queries; auto dim = search_dataset.dim; auto metric_type = search_dataset.metric_type; auto dataset = knowhere::GenDataSet(num_queries, dim, search_dataset.query_data); + dataset->SetIsSparse(is_sparse); if (!PrepareVectorIteratorsFromIndex(search_conf, num_queries, dataset, diff --git a/internal/core/src/query/SearchOnIndex.h b/internal/core/src/query/SearchOnIndex.h index 6a388951ae9bb..3913cd3cd442c 100644 --- a/internal/core/src/query/SearchOnIndex.h +++ b/internal/core/src/query/SearchOnIndex.h @@ -24,6 +24,7 @@ SearchOnIndex(const dataset::SearchDataset& search_dataset, const index::VectorIndex& indexing, const SearchInfo& search_conf, const BitsetView& bitset, - SearchResult& search_result); + SearchResult& search_result, + bool is_sparse = false); } // namespace milvus::query diff --git a/internal/core/src/query/SearchOnSealed.cpp b/internal/core/src/query/SearchOnSealed.cpp index 83330ce39f6e0..8bc806062a4d2 100644 --- a/internal/core/src/query/SearchOnSealed.cpp +++ b/internal/core/src/query/SearchOnSealed.cpp @@ -34,8 +34,9 @@ SearchOnSealedIndex(const Schema& schema, auto field_id = search_info.field_id_; auto& field = schema[field_id]; - // Assert(field.get_data_type() == DataType::VECTOR_FLOAT); - auto dim = field.get_dim(); + auto is_sparse = field.get_data_type() == DataType::VECTOR_SPARSE_FLOAT; + // TODO(SPARSE): see todo in PlanImpl.h::PlaceHolder. + auto dim = is_sparse ? 0 : field.get_dim(); AssertInfo(record.is_ready(field_id), "[SearchOnSealed]Record isn't ready"); // Keep the field_indexing smart pointer, until all reference by raw dropped. @@ -44,6 +45,7 @@ SearchOnSealedIndex(const Schema& schema, "Metric type of field index isn't the same with search info"); auto dataset = knowhere::GenDataSet(num_queries, dim, query_data); + dataset->SetIsSparse(is_sparse); auto vec_index = dynamic_cast(field_indexing->indexing_.get()); if (!PrepareVectorIteratorsFromIndex(search_info, @@ -80,11 +82,16 @@ SearchOnSealed(const Schema& schema, auto field_id = search_info.field_id_; auto& field = schema[field_id]; + // TODO(SPARSE): see todo in PlanImpl.h::PlaceHolder. + auto dim = field.get_data_type() == DataType::VECTOR_SPARSE_FLOAT + ? 0 + : field.get_dim(); + query::dataset::SearchDataset dataset{search_info.metric_type_, num_queries, search_info.topk_, search_info.round_decimal_, - field.get_dim(), + dim, query_data}; auto data_type = field.get_data_type(); diff --git a/internal/core/src/query/generated/ExecPlanNodeVisitor.h b/internal/core/src/query/generated/ExecPlanNodeVisitor.h index 16599de1f05ff..d3b69a388d94a 100644 --- a/internal/core/src/query/generated/ExecPlanNodeVisitor.h +++ b/internal/core/src/query/generated/ExecPlanNodeVisitor.h @@ -34,6 +34,9 @@ class ExecPlanNodeVisitor : public PlanNodeVisitor { void visit(BFloat16VectorANNS& node) override; + void + visit(SparseFloatVectorANNS& node) override; + void visit(RetrievePlanNode& node) override; diff --git a/internal/core/src/query/generated/ExtractInfoPlanNodeVisitor.h b/internal/core/src/query/generated/ExtractInfoPlanNodeVisitor.h index 652ef01c52bab..48f813b7d5886 100644 --- a/internal/core/src/query/generated/ExtractInfoPlanNodeVisitor.h +++ b/internal/core/src/query/generated/ExtractInfoPlanNodeVisitor.h @@ -30,6 +30,9 @@ class ExtractInfoPlanNodeVisitor : public PlanNodeVisitor { void visit(BFloat16VectorANNS& node) override; + void + visit(SparseFloatVectorANNS& node) override; + void visit(RetrievePlanNode& node) override; diff --git a/internal/core/src/query/generated/PlanNode.cpp b/internal/core/src/query/generated/PlanNode.cpp index 4cc501e74f517..540ad68aa925f 100644 --- a/internal/core/src/query/generated/PlanNode.cpp +++ b/internal/core/src/query/generated/PlanNode.cpp @@ -35,6 +35,11 @@ BFloat16VectorANNS::accept(PlanNodeVisitor& visitor) { visitor.visit(*this); } +void +SparseFloatVectorANNS::accept(PlanNodeVisitor& visitor) { + visitor.visit(*this); +} + void RetrievePlanNode::accept(PlanNodeVisitor& visitor) { visitor.visit(*this); diff --git a/internal/core/src/query/generated/PlanNodeVisitor.h b/internal/core/src/query/generated/PlanNodeVisitor.h index be180a97d52fe..60dda9c3eb7fe 100644 --- a/internal/core/src/query/generated/PlanNodeVisitor.h +++ b/internal/core/src/query/generated/PlanNodeVisitor.h @@ -31,6 +31,9 @@ class PlanNodeVisitor { virtual void visit(BFloat16VectorANNS&) = 0; + virtual void + visit(SparseFloatVectorANNS&) = 0; + virtual void visit(RetrievePlanNode&) = 0; }; diff --git a/internal/core/src/query/generated/ShowPlanNodeVisitor.h b/internal/core/src/query/generated/ShowPlanNodeVisitor.h index 1a9dfbac5cc43..ec94659465471 100644 --- a/internal/core/src/query/generated/ShowPlanNodeVisitor.h +++ b/internal/core/src/query/generated/ShowPlanNodeVisitor.h @@ -34,6 +34,9 @@ class ShowPlanNodeVisitor : public PlanNodeVisitor { void visit(BFloat16VectorANNS& node) override; + void + visit(SparseFloatVectorANNS& node) override; + void visit(RetrievePlanNode& node) override; diff --git a/internal/core/src/query/generated/VerifyPlanNodeVisitor.h b/internal/core/src/query/generated/VerifyPlanNodeVisitor.h index a57a1f2059775..40836460da340 100644 --- a/internal/core/src/query/generated/VerifyPlanNodeVisitor.h +++ b/internal/core/src/query/generated/VerifyPlanNodeVisitor.h @@ -33,6 +33,9 @@ class VerifyPlanNodeVisitor : public PlanNodeVisitor { void visit(BFloat16VectorANNS& node) override; + void + visit(SparseFloatVectorANNS& node) override; + void visit(RetrievePlanNode& node) override; diff --git a/internal/core/src/query/visitors/ExecPlanNodeVisitor.cpp b/internal/core/src/query/visitors/ExecPlanNodeVisitor.cpp index 20392318ea7f1..833c822487f7d 100644 --- a/internal/core/src/query/visitors/ExecPlanNodeVisitor.cpp +++ b/internal/core/src/query/visitors/ExecPlanNodeVisitor.cpp @@ -149,7 +149,7 @@ ExecPlanNodeVisitor::VectorVisitorImpl(VectorPlanNode& node) { AssertInfo(segment, "support SegmentSmallIndex Only"); SearchResult search_result; auto& ph = placeholder_group_->at(0); - auto src_data = ph.get_blob>(); + auto src_data = ph.get_blob(); auto num_queries = ph.num_of_queries_; // TODO: add API to unify row_count @@ -308,4 +308,9 @@ ExecPlanNodeVisitor::visit(BFloat16VectorANNS& node) { VectorVisitorImpl(node); } +void +ExecPlanNodeVisitor::visit(SparseFloatVectorANNS& node) { + VectorVisitorImpl(node); +} + } // namespace milvus::query diff --git a/internal/core/src/query/visitors/ExtractInfoPlanNodeVisitor.cpp b/internal/core/src/query/visitors/ExtractInfoPlanNodeVisitor.cpp index 04528bbf11cc7..2de8f92df6d38 100644 --- a/internal/core/src/query/visitors/ExtractInfoPlanNodeVisitor.cpp +++ b/internal/core/src/query/visitors/ExtractInfoPlanNodeVisitor.cpp @@ -65,6 +65,15 @@ ExtractInfoPlanNodeVisitor::visit(BFloat16VectorANNS& node) { } } +void +ExtractInfoPlanNodeVisitor::visit(SparseFloatVectorANNS& node) { + plan_info_.add_involved_field(node.search_info_.field_id_); + if (node.predicate_.has_value()) { + ExtractInfoExprVisitor expr_visitor(plan_info_); + node.predicate_.value()->accept(expr_visitor); + } +} + void ExtractInfoPlanNodeVisitor::visit(RetrievePlanNode& node) { // Assert(node.predicate_.has_value()); diff --git a/internal/core/src/query/visitors/ShowPlanNodeVisitor.cpp b/internal/core/src/query/visitors/ShowPlanNodeVisitor.cpp index e71a1b54ffc4b..baa07e8126c15 100644 --- a/internal/core/src/query/visitors/ShowPlanNodeVisitor.cpp +++ b/internal/core/src/query/visitors/ShowPlanNodeVisitor.cpp @@ -144,6 +144,30 @@ ShowPlanNodeVisitor::visit(BFloat16VectorANNS& node) { ret_ = json_body; } +void +ShowPlanNodeVisitor::visit(SparseFloatVectorANNS& node) { + assert(!ret_); + auto& info = node.search_info_; + Json json_body{ + {"node_type", "SparseFloatVectorANNS"}, // + {"metric_type", info.metric_type_}, // + {"field_id_", info.field_id_.get()}, // + {"topk", info.topk_}, // + {"search_params", info.search_params_}, // + {"placeholder_tag", node.placeholder_tag_}, // + }; + if (node.predicate_.has_value()) { + ShowExprVisitor expr_show; + AssertInfo(node.predicate_.value(), + "[ShowPlanNodeVisitor]Can't get value from node predict"); + json_body["predicate"] = + expr_show.call_child(node.predicate_->operator*()); + } else { + json_body["predicate"] = "None"; + } + ret_ = json_body; +} + void ShowPlanNodeVisitor::visit(RetrievePlanNode& node) { } diff --git a/internal/core/src/query/visitors/VerifyPlanNodeVisitor.cpp b/internal/core/src/query/visitors/VerifyPlanNodeVisitor.cpp index 73ffa1041678b..2612e37daaa38 100644 --- a/internal/core/src/query/visitors/VerifyPlanNodeVisitor.cpp +++ b/internal/core/src/query/visitors/VerifyPlanNodeVisitor.cpp @@ -42,6 +42,10 @@ void VerifyPlanNodeVisitor::visit(BFloat16VectorANNS&) { } +void +VerifyPlanNodeVisitor::visit(SparseFloatVectorANNS&) { +} + void VerifyPlanNodeVisitor::visit(RetrievePlanNode&) { } diff --git a/internal/core/src/segcore/ConcurrentVector.h b/internal/core/src/segcore/ConcurrentVector.h index 05287460d9d81..aaa900405b807 100644 --- a/internal/core/src/segcore/ConcurrentVector.h +++ b/internal/core/src/segcore/ConcurrentVector.h @@ -129,6 +129,9 @@ class VectorBase { virtual bool empty() = 0; + virtual void + clear() = 0; + protected: const int64_t size_per_chunk_; }; @@ -282,7 +285,7 @@ class ConcurrentVectorImpl : public VectorBase { } void - clear() { + clear() override { chunks_.clear(); } diff --git a/internal/core/src/segcore/FieldIndexing.cpp b/internal/core/src/segcore/FieldIndexing.cpp index 7455346080ed1..3eb545812fa1b 100644 --- a/internal/core/src/segcore/FieldIndexing.cpp +++ b/internal/core/src/segcore/FieldIndexing.cpp @@ -70,6 +70,9 @@ VectorFieldIndexing::BuildIndexRange(int64_t ack_beg, } } +// for sparse float vector: +// * element_size is not used +// * output_raw pooints at a milvus::schema::proto::SparseFloatArray. void VectorFieldIndexing::GetDataFromIndex(const int64_t* seg_offsets, int64_t count, @@ -80,10 +83,16 @@ VectorFieldIndexing::GetDataFromIndex(const int64_t* seg_offsets, ids_ds->SetDim(1); ids_ds->SetIds(seg_offsets); ids_ds->SetIsOwner(false); - - auto vector = index_->GetVector(ids_ds); - - std::memcpy(output, vector.data(), count * element_size); + if (field_meta_.get_data_type() == DataType::VECTOR_SPARSE_FLOAT) { + auto vector = index_->GetSparseVector(ids_ds); + SparseRowsToProto( + [vec_ptr = vector.get()](size_t i) { return vec_ptr + i; }, + count, + reinterpret_cast(output)); + } else { + auto vector = index_->GetVector(ids_ds); + std::memcpy(output, vector.data(), count * element_size); + } } void @@ -242,7 +251,9 @@ VectorFieldIndexing::AppendSegmentIndexDense(int64_t reserved_offset, knowhere::Json VectorFieldIndexing::get_build_params() const { auto config = config_->GetBuildBaseParams(); - config[knowhere::meta::DIM] = std::to_string(field_meta_.get_dim()); + if (!datatype_is_sparse_vector(field_meta_.get_data_type())) { + config[knowhere::meta::DIM] = std::to_string(field_meta_.get_dim()); + } config[knowhere::meta::NUM_BUILD_THREAD] = std::to_string(1); // for sparse float vector: drop_ratio_build config is not allowed to be set // on growing segment index. @@ -255,10 +266,6 @@ VectorFieldIndexing::get_search_params(const SearchInfo& searchInfo) const { return conf; } -idx_t -VectorFieldIndexing::get_index_cursor() { - return index_cur_.load(); -} bool VectorFieldIndexing::sync_data_with_index() const { return sync_with_index_.load(); diff --git a/internal/core/src/segcore/FieldIndexing.h b/internal/core/src/segcore/FieldIndexing.h index 27609014caad0..ccd392997a5ed 100644 --- a/internal/core/src/segcore/FieldIndexing.h +++ b/internal/core/src/segcore/FieldIndexing.h @@ -86,9 +86,6 @@ class FieldIndexing { return field_meta_; } - virtual idx_t - get_index_cursor() = 0; - int64_t get_size_per_chunk() const { return segcore_config_.get_chunk_rows(); @@ -143,10 +140,6 @@ class ScalarFieldIndexing : public FieldIndexing { PanicInfo(Unsupported, "scalar index don't support get data from index"); } - idx_t - get_index_cursor() override { - return 0; - } int64_t get_build_threshold() const override { @@ -201,6 +194,9 @@ class VectorFieldIndexing : public FieldIndexing { const VectorBase* field_raw_data, const void* data_source) override; + // for sparse float vector: + // * element_size is not used + // * output_raw pooints at a milvus::schema::proto::SparseFloatArray. void GetDataFromIndex(const int64_t* seg_offsets, int64_t count, @@ -229,9 +225,6 @@ class VectorFieldIndexing : public FieldIndexing { bool has_raw_data() const override; - idx_t - get_index_cursor() override; - knowhere::Json get_build_params() const; @@ -369,6 +362,9 @@ class IndexingRecord { } } + // for sparse float vector: + // * element_size is not used + // * output_raw pooints at a milvus::schema::proto::SparseFloatArray. void GetDataFromIndex(FieldId fieldId, const int64_t* seg_offsets, @@ -377,9 +373,10 @@ class IndexingRecord { void* output_raw) const { if (is_in(fieldId)) { auto& indexing = field_indexings_.at(fieldId); - if (indexing->get_field_meta().is_vector() && + if (indexing->get_field_meta().get_data_type() == + DataType::VECTOR_FLOAT || indexing->get_field_meta().get_data_type() == - DataType::VECTOR_FLOAT) { + DataType::VECTOR_SPARSE_FLOAT) { indexing->GetDataFromIndex( seg_offsets, count, element_size, output_raw); } diff --git a/internal/core/src/segcore/IndexConfigGenerator.h b/internal/core/src/segcore/IndexConfigGenerator.h index ce8c20b609538..102e4f74f048c 100644 --- a/internal/core/src/segcore/IndexConfigGenerator.h +++ b/internal/core/src/segcore/IndexConfigGenerator.h @@ -38,7 +38,7 @@ class VecIndexConfig { {knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC, 0.1}}; inline static const std::unordered_set maintain_params = { - "radius", "range_filter"}; + "radius", "range_filter", "drop_ratio_search"}; public: VecIndexConfig(const int64_t max_index_row_count, diff --git a/internal/core/src/segcore/InsertRecord.h b/internal/core/src/segcore/InsertRecord.h index f667de5f07524..f5a604ae4947d 100644 --- a/internal/core/src/segcore/InsertRecord.h +++ b/internal/core/src/segcore/InsertRecord.h @@ -552,7 +552,7 @@ struct InsertRecord { return ptr; } - // append a column of scalar type + // append a column of scalar or sparse float vector type template void append_field_data(FieldId field_id, int64_t size_per_chunk) { diff --git a/internal/core/src/segcore/SegmentGrowingImpl.cpp b/internal/core/src/segcore/SegmentGrowingImpl.cpp index 3a6dc07a1cee5..463a31ae47f71 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.cpp +++ b/internal/core/src/segcore/SegmentGrowingImpl.cpp @@ -71,9 +71,14 @@ void SegmentGrowingImpl::try_remove_chunks(FieldId fieldId) { //remove the chunk data to reduce memory consumption if (indexing_record_.SyncDataWithIndex(fieldId)) { - auto vec_data_base = + VectorBase* vec_data_base = dynamic_cast*>( insert_record_.get_field_data_base(fieldId)); + if (!vec_data_base) { + vec_data_base = + dynamic_cast*>( + insert_record_.get_field_data_base(fieldId)); + } if (vec_data_base && vec_data_base->num_chunk() > 0 && chunk_mutex_.try_lock()) { vec_data_base->clear(); @@ -135,7 +140,9 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset, // update average row data size auto field_data_size = GetRawDataSizeOfDataArray( - &insert_record_proto->fields_data(data_offset), field_meta, num_rows); + &insert_record_proto->fields_data(data_offset), + field_meta, + num_rows); if (datatype_is_variable(field_meta.get_data_type())) { SegmentInternalInterface::set_field_avg_size( field_id, num_rows, field_data_size); @@ -485,6 +492,16 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, seg_offsets, count, result->mutable_vectors()->mutable_bfloat16_vector()->data()); + } else if (field_meta.get_data_type() == + DataType::VECTOR_SPARSE_FLOAT) { + bulk_subscript_sparse_float_vector_impl( + field_id, + (const ConcurrentVector*)vec_ptr, + seg_offsets, + count, + result->mutable_vectors()->mutable_sparse_float_vector()); + result->mutable_vectors()->set_dim( + result->vectors().sparse_float_vector().dim()); } else { PanicInfo(DataTypeInvalid, "logical error"); } @@ -601,6 +618,33 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, return result; } +void +SegmentGrowingImpl::bulk_subscript_sparse_float_vector_impl( + FieldId field_id, + const ConcurrentVector* vec_raw, + const int64_t* seg_offsets, + int64_t count, + milvus::proto::schema::SparseFloatArray* output) const { + AssertInfo(HasRawData(field_id.get()), "Growing segment loss raw data"); + + // if index has finished building index, grab from index + if (indexing_record_.SyncDataWithIndex(field_id)) { + indexing_record_.GetDataFromIndex( + field_id, seg_offsets, count, 0, output); + return; + } + // else copy from raw data + std::lock_guard guard(chunk_mutex_); + SparseRowsToProto( + [&](size_t i) { + auto offset = seg_offsets[i]; + return offset != INVALID_SEG_OFFSET ? vec_raw->get_element(offset) + : nullptr; + }, + count, + output); +} + template void SegmentGrowingImpl::bulk_subscript_ptr_impl( @@ -629,32 +673,27 @@ SegmentGrowingImpl::bulk_subscript_impl(FieldId field_id, AssertInfo(vec_ptr, "Pointer of vec_raw is nullptr"); auto& vec = *vec_ptr; - auto copy_from_chunk = [&]() { - auto output_base = reinterpret_cast(output_raw); - for (int i = 0; i < count; ++i) { - auto dst = output_base + i * element_sizeof; - auto offset = seg_offsets[i]; - if (offset == INVALID_SEG_OFFSET) { - memset(dst, 0, element_sizeof); - } else { - auto src = (const uint8_t*)vec.get_element(offset); - memcpy(dst, src, element_sizeof); - } - } - }; - //HasRawData interface guarantees that data can be fetched from growing segment - if (HasRawData(field_id.get())) { - //When data sync with index - if (indexing_record_.SyncDataWithIndex(field_id)) { - indexing_record_.GetDataFromIndex( - field_id, seg_offsets, count, element_sizeof, output_raw); + // HasRawData interface guarantees that data can be fetched from growing segment + AssertInfo(HasRawData(field_id.get()), "Growing segment loss raw data"); + // when data is in sync with index + if (indexing_record_.SyncDataWithIndex(field_id)) { + indexing_record_.GetDataFromIndex( + field_id, seg_offsets, count, element_sizeof, output_raw); + return; + } + // else copy from chunk + std::lock_guard guard(chunk_mutex_); + auto output_base = reinterpret_cast(output_raw); + for (int i = 0; i < count; ++i) { + auto dst = output_base + i * element_sizeof; + auto offset = seg_offsets[i]; + if (offset == INVALID_SEG_OFFSET) { + memset(dst, 0, element_sizeof); } else { - //Else copy from chunk - std::lock_guard guard(chunk_mutex_); - copy_from_chunk(); + auto src = (const uint8_t*)vec.get_element(offset); + memcpy(dst, src, element_sizeof); } } - AssertInfo(HasRawData(field_id.get()), "Growing segment loss raw data"); } template diff --git a/internal/core/src/segcore/SegmentGrowingImpl.h b/internal/core/src/segcore/SegmentGrowingImpl.h index d26fb6fb14822..2e7841dab9038 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.h +++ b/internal/core/src/segcore/SegmentGrowingImpl.h @@ -96,11 +96,6 @@ class SegmentGrowingImpl : public SegmentGrowing { return chunk_mutex_; } - const SealedIndexingRecord& - get_sealed_indexing_record() const { - return sealed_indexing_record_; - } - const Schema& get_schema() const override { return *schema_; @@ -180,6 +175,14 @@ class SegmentGrowingImpl : public SegmentGrowing { int64_t count, void* output_raw) const; + void + bulk_subscript_sparse_float_vector_impl( + FieldId field_id, + const ConcurrentVector* vec_raw, + const int64_t* seg_offsets, + int64_t count, + milvus::proto::schema::SparseFloatArray* output) const; + void bulk_subscript(SystemFieldType system_type, const int64_t* seg_offsets, @@ -292,7 +295,6 @@ class SegmentGrowingImpl : public SegmentGrowing { // small indexes for every chunk IndexingRecord indexing_record_; - SealedIndexingRecord sealed_indexing_record_; // not used // inserted fields data and row_ids, timestamps InsertRecord insert_record_; diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 120316529c7d4..f9a6add1647fa 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -848,65 +848,68 @@ SegmentSealedImpl::get_vector(FieldId field_id, if (has_raw_data) { // If index has raw data, get vector from memory. auto ids_ds = GenIdsDataset(count, ids); - auto vector = vec_index->GetVector(ids_ds); - return segcore::CreateVectorDataArrayFrom( - vector.data(), count, field_meta); - } else { - // If index doesn't have raw data, get vector from chunk cache. - auto cc = storage::ChunkCacheSingleton::GetInstance().GetChunkCache(); - - // group by data_path - auto id_to_data_path = - std::unordered_map>{}; - auto path_to_column = - std::unordered_map>{}; - for (auto i = 0; i < count; i++) { - const auto& tuple = GetFieldDataPath(field_id, ids[i]); - id_to_data_path.emplace(ids[i], tuple); - path_to_column.emplace(std::get<0>(tuple), nullptr); + if (field_meta.get_data_type() == DataType::VECTOR_SPARSE_FLOAT) { + auto res = vec_index->GetSparseVector(ids_ds); + return segcore::CreateVectorDataArrayFrom( + res.get(), count, field_meta); + } else { + // dense vector: + auto vector = vec_index->GetVector(ids_ds); + return segcore::CreateVectorDataArrayFrom( + vector.data(), count, field_meta); } + } - // read and prefetch - auto& pool = - ThreadPools::GetThreadPool(milvus::ThreadPoolPriority::HIGH); - std::vector< - std::future>>> - futures; - futures.reserve(path_to_column.size()); - for (const auto& iter : path_to_column) { - const auto& data_path = iter.first; - futures.emplace_back( - pool.Submit(ReadFromChunkCache, cc, data_path)); - } + AssertInfo(field_meta.get_data_type() != DataType::VECTOR_SPARSE_FLOAT, + "index of sparse float vector is guaranteed to have raw data"); - for (int i = 0; i < futures.size(); ++i) { - const auto& [data_path, column] = futures[i].get(); - path_to_column[data_path] = column; - } + // If index doesn't have raw data, get vector from chunk cache. + auto cc = storage::ChunkCacheSingleton::GetInstance().GetChunkCache(); - // assign to data array - auto row_bytes = field_meta.get_sizeof(); - auto buf = std::vector(count * row_bytes); - for (auto i = 0; i < count; i++) { - AssertInfo(id_to_data_path.count(ids[i]) != 0, "id not found"); - const auto& [data_path, offset_in_binlog] = - id_to_data_path.at(ids[i]); - AssertInfo(path_to_column.count(data_path) != 0, - "column not found"); - const auto& column = path_to_column.at(data_path); - AssertInfo( - offset_in_binlog * row_bytes < column->ByteSize(), - "column idx out of range, idx: {}, size: {}, data_path: {}", - offset_in_binlog * row_bytes, - column->ByteSize(), - data_path); - auto vector = &column->Data()[offset_in_binlog * row_bytes]; - std::memcpy(buf.data() + i * row_bytes, vector, row_bytes); - } - return segcore::CreateVectorDataArrayFrom( - buf.data(), count, field_meta); + // group by data_path + auto id_to_data_path = + std::unordered_map>{}; + auto path_to_column = + std::unordered_map>{}; + for (auto i = 0; i < count; i++) { + const auto& tuple = GetFieldDataPath(field_id, ids[i]); + id_to_data_path.emplace(ids[i], tuple); + path_to_column.emplace(std::get<0>(tuple), nullptr); } + + // read and prefetch + auto& pool = ThreadPools::GetThreadPool(milvus::ThreadPoolPriority::HIGH); + std::vector< + std::future>>> + futures; + futures.reserve(path_to_column.size()); + for (const auto& iter : path_to_column) { + const auto& data_path = iter.first; + futures.emplace_back(pool.Submit(ReadFromChunkCache, cc, data_path)); + } + + for (int i = 0; i < futures.size(); ++i) { + const auto& [data_path, column] = futures[i].get(); + path_to_column[data_path] = column; + } + + // assign to data array + auto row_bytes = field_meta.get_sizeof(); + auto buf = std::vector(count * row_bytes); + for (auto i = 0; i < count; i++) { + AssertInfo(id_to_data_path.count(ids[i]) != 0, "id not found"); + const auto& [data_path, offset_in_binlog] = id_to_data_path.at(ids[i]); + AssertInfo(path_to_column.count(data_path) != 0, "column not found"); + const auto& column = path_to_column.at(data_path); + AssertInfo(offset_in_binlog * row_bytes < column->ByteSize(), + "column idx out of range, idx: {}, size: {}, data_path: {}", + offset_in_binlog * row_bytes, + column->ByteSize(), + data_path); + auto vector = &column->Data()[offset_in_binlog * row_bytes]; + std::memcpy(buf.data() + i * row_bytes, vector, row_bytes); + } + return segcore::CreateVectorDataArrayFrom(buf.data(), count, field_meta); } void @@ -1102,7 +1105,7 @@ SegmentSealedImpl::bulk_subscript_array_impl( } } -// for vector +// for dense vector void SegmentSealedImpl::bulk_subscript_impl(int64_t element_sizeof, const void* src_raw, @@ -1250,7 +1253,6 @@ SegmentSealedImpl::get_raw_data(FieldId field_id, ->mutable_data()); break; } - case DataType::VECTOR_FLOAT: { bulk_subscript_impl(field_meta.get_sizeof(), column->Data(), @@ -1289,6 +1291,21 @@ SegmentSealedImpl::get_raw_data(FieldId field_id, ret->mutable_vectors()->mutable_binary_vector()->data()); break; } + case DataType::VECTOR_SPARSE_FLOAT: { + auto rows = static_cast*>( + static_cast(column->Data())); + auto dst = ret->mutable_vectors()->mutable_sparse_float_vector(); + SparseRowsToProto( + [&](size_t i) { + auto offset = seg_offsets[i]; + return offset != INVALID_SEG_OFFSET ? (rows + offset) + : nullptr; + }, + count, + dst); + ret->mutable_vectors()->set_dim(dst->dim()); + break; + } default: { PanicInfo(DataTypeInvalid, diff --git a/internal/core/src/segcore/Utils.cpp b/internal/core/src/segcore/Utils.cpp index 8e2132251da2c..8004fcab6d188 100644 --- a/internal/core/src/segcore/Utils.cpp +++ b/internal/core/src/segcore/Utils.cpp @@ -315,8 +315,11 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta) { field_meta.get_data_type())); auto vector_array = data_array->mutable_vectors(); - auto dim = field_meta.get_dim(); - vector_array->set_dim(dim); + auto dim = 0; + if (data_type != DataType::VECTOR_SPARSE_FLOAT) { + dim = field_meta.get_dim(); + vector_array->set_dim(dim); + } switch (data_type) { case DataType::VECTOR_FLOAT: { auto length = count * dim; @@ -494,9 +497,13 @@ CreateVectorDataArrayFrom(const void* data_raw, } case DataType::VECTOR_SPARSE_FLOAT: { SparseRowsToProto( - reinterpret_cast*>( - data_raw), - count, + [&](size_t i) { + return reinterpret_cast< + const knowhere::sparse::SparseRow*>( + data_raw) + + i; + }, + count, vector_array->mutable_sparse_float_vector()); vector_array->set_dim(vector_array->sparse_float_vector().dim()); break; @@ -541,8 +548,11 @@ MergeDataArray( "merge field data type not consistent"); if (field_meta.is_vector()) { auto vector_array = data_array->mutable_vectors(); - auto dim = field_meta.get_dim(); - vector_array->set_dim(dim); + auto dim = 0; + if (!datatype_is_sparse_vector(data_type)) { + dim = field_meta.get_dim(); + vector_array->set_dim(dim); + } if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) { auto data = VEC_FIELD_DATA(src_field_data, float).data(); auto obj = vector_array->mutable_float_vector(); diff --git a/internal/core/src/segcore/segment_c.cpp b/internal/core/src/segcore/segment_c.cpp index ff9f574576c3d..52aa2589aefe2 100644 --- a/internal/core/src/segcore/segment_c.cpp +++ b/internal/core/src/segcore/segment_c.cpp @@ -301,7 +301,8 @@ LoadFieldRawData(CSegmentInterface c_segment, auto field_meta = segment->get_schema()[milvus::FieldId(field_id)]; data_type = field_meta.get_data_type(); - if (milvus::datatype_is_vector(data_type)) { + if (milvus::datatype_is_vector(data_type) && + !milvus::datatype_is_sparse_vector(data_type)) { dim = field_meta.get_dim(); } } diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt index 70c9dd5e8ad9f..3318141200c9e 100644 --- a/internal/core/unittest/CMakeLists.txt +++ b/internal/core/unittest/CMakeLists.txt @@ -18,6 +18,7 @@ add_definitions(-DMILVUS_TEST_SEGCORE_YAML_PATH="${CMAKE_SOURCE_DIR}/unittest/te set(MILVUS_TEST_FILES init_gtest.cpp test_bf.cpp + test_bf_sparse.cpp test_binary.cpp test_bitmap.cpp test_bool_index.cpp diff --git a/internal/core/unittest/test_always_true_expr.cpp b/internal/core/unittest/test_always_true_expr.cpp index b89420112e253..ab0e03f1f3edf 100644 --- a/internal/core/unittest/test_always_true_expr.cpp +++ b/internal/core/unittest/test_always_true_expr.cpp @@ -23,13 +23,24 @@ #include "expr/ITypeExpr.h" #include "plan/PlanNode.h" -TEST(Expr, AlwaysTrue) { +class ExprAlwaysTrueTest : public ::testing::TestWithParam {}; + +INSTANTIATE_TEST_SUITE_P( + ExprAlwaysTrueParameters, + ExprAlwaysTrueTest, + ::testing::Values(milvus::DataType::VECTOR_FLOAT, + milvus::DataType::VECTOR_SPARSE_FLOAT)); + +TEST_P(ExprAlwaysTrueTest, AlwaysTrue) { using namespace milvus; using namespace milvus::query; using namespace milvus::segcore; + auto data_type = GetParam(); + auto metric_type = data_type == DataType::VECTOR_FLOAT + ? knowhere::metric::L2 + : knowhere::metric::IP; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i64_fid = schema->AddDebugField("age", DataType::INT64); schema->set_primary_field_id(i64_fid); @@ -64,4 +75,4 @@ TEST(Expr, AlwaysTrue) { auto val = age_col[i]; ASSERT_EQ(ans, true) << "@" << i << "!!" << val; } -} \ No newline at end of file +} diff --git a/internal/core/unittest/test_bf_sparse.cpp b/internal/core/unittest/test_bf_sparse.cpp new file mode 100644 index 0000000000000..f1224ee9c6974 --- /dev/null +++ b/internal/core/unittest/test_bf_sparse.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 2019-2024 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + +#include +#include + +#include "common/Utils.h" + +#include "query/SearchBruteForce.h" +#include "test_utils/Constants.h" +#include "test_utils/Distance.h" +#include "test_utils/DataGen.h" + +using namespace milvus; +using namespace milvus::segcore; +using namespace milvus::query; + +namespace { + +std::vector +Ref(const knowhere::sparse::SparseRow* base, + const knowhere::sparse::SparseRow& query, + int nb, + int topk, + const knowhere::MetricType& metric) { + std::vector> res; + for (int i = 0; i < nb; i++) { + auto& row = base[i]; + auto distance = row.dot(query); + res.emplace_back(-distance, i); + } + std::sort(res.begin(), res.end()); + std::vector offsets; + for (int i = 0; i < topk; i++) { + auto [distance, offset] = res[i]; + if (distance == 0) { + distance = std::numeric_limits::quiet_NaN(); + offset = -1; + } + offsets.push_back(offset); + } + return offsets; +} + +void +AssertMatch(const std::vector& expected, const int64_t* actual) { + for (int i = 0; i < expected.size(); i++) { + ASSERT_EQ(expected[i], actual[i]); + } +} + +bool +is_supported_sparse_float_metric(const std::string& metric) { + return milvus::IsMetricType(metric, knowhere::metric::IP); +} + +} // namespace + +class TestSparseFloatSearchBruteForce : public ::testing::Test { + public: + void + Run(int nb, + int nq, + int topk, + const knowhere::MetricType& metric_type) { + auto bitset = std::make_shared(); + bitset->resize(nb); + auto bitset_view = BitsetView(*bitset); + + auto base = milvus::segcore::GenerateRandomSparseFloatVector(nb); + auto query = milvus::segcore::GenerateRandomSparseFloatVector(nq); + + dataset::SearchDataset dataset{ + metric_type, nq, topk, -1, kTestSparseDim, query.get()}; + if (!is_supported_sparse_float_metric(metric_type)) { + ASSERT_ANY_THROW(BruteForceSearch(dataset, + base.get(), + nb, + knowhere::Json(), + bitset_view, + DataType::VECTOR_SPARSE_FLOAT)); + return; + } + auto result = BruteForceSearch(dataset, + base.get(), + nb, + knowhere::Json(), + bitset_view, + DataType::VECTOR_SPARSE_FLOAT); + for (int i = 0; i < nq; i++) { + auto ref = Ref(base.get(), + *(query.get() + i), + nb, + topk, + metric_type); + auto ans = result.get_seg_offsets() + i * topk; + AssertMatch(ref, ans); + } + } +}; + +TEST_F(TestSparseFloatSearchBruteForce, NotSupported) { + Run(100, 10, 5, "L2"); + Run(100, 10, 5, "l2"); + Run(100, 10, 5, "lxxx"); +} + +TEST_F(TestSparseFloatSearchBruteForce, IP) { + Run(100, 10, 5, "IP"); + Run(100, 10, 5, "ip"); +} + diff --git a/internal/core/unittest/test_binlog_index.cpp b/internal/core/unittest/test_binlog_index.cpp index 0ebb5fbd6d0f1..2e9dac8776f38 100644 --- a/internal/core/unittest/test_binlog_index.cpp +++ b/internal/core/unittest/test_binlog_index.cpp @@ -27,14 +27,13 @@ using namespace milvus; using namespace milvus::segcore; namespace pb = milvus::proto; -std::shared_ptr +std::unique_ptr GenRandomFloatVecData(int rows, int dim, int seed = 42) { - std::shared_ptr vecs = - std::shared_ptr(new float[rows * dim]); + auto vecs = std::make_unique(rows * dim); std::mt19937 rng(seed); std::uniform_int_distribution<> distrib(0.0, 100.0); for (int i = 0; i < rows * dim; ++i) vecs[i] = (float)distrib(rng); - return std::move(vecs); + return vecs; } inline float @@ -60,27 +59,42 @@ GetKnnSearchRecall( return ((float)matched_num) / ((float)nq * res_k); } -using Param = const char*; +using Param = + std::tuple; class BinlogIndexTest : public ::testing::TestWithParam { void SetUp() override { - auto param = GetParam(); - metricType = param; + std::tie(data_type, metric_type, index_type) = GetParam(); schema = std::make_shared(); - auto metric_type = metricType; - vec_field_id = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, data_d, metric_type); + vec_field_id = + schema->AddDebugField("fakevec", data_type, data_d, metric_type); auto i64_fid = schema->AddDebugField("counter", DataType::INT64); schema->set_primary_field_id(i64_fid); - - // generate vector field data - vec_data = GenRandomFloatVecData(data_n, data_d); - - vec_field_data = - storage::CreateFieldData(DataType::VECTOR_FLOAT, data_d); - vec_field_data->FillFieldData(vec_data.get(), data_n); + vec_field_data = storage::CreateFieldData(data_type, data_d); + + if (data_type == DataType::VECTOR_FLOAT) { + auto vec_data = GenRandomFloatVecData(data_n, data_d); + vec_field_data->FillFieldData(vec_data.get(), data_n); + raw_dataset = knowhere::GenDataSet(data_n, data_d, vec_data.get()); + raw_dataset->SetIsOwner(true); + vec_data.release(); + } else if (data_type == DataType::VECTOR_SPARSE_FLOAT) { + auto sparse_vecs = GenerateRandomSparseFloatVector(data_n); + vec_field_data->FillFieldData(sparse_vecs.get(), data_n); + data_d = std::dynamic_pointer_cast< + milvus::FieldData>( + vec_field_data) + ->Dim(); + raw_dataset = + knowhere::GenDataSet(data_n, data_d, sparse_vecs.get()); + raw_dataset->SetIsOwner(true); + raw_dataset->SetIsSparse(true); + sparse_vecs.release(); + } else { + throw std::runtime_error("not implemented"); + } } public: @@ -88,7 +102,7 @@ class BinlogIndexTest : public ::testing::TestWithParam { GetCollectionIndexMeta(std::string index_type) { std::map index_params = { {"index_type", index_type}, - {"metric_type", metricType}, + {"metric_type", metric_type}, {"nlist", "1024"}}; std::map type_params = {{"dim", "128"}}; FieldIndexMeta fieldIndexMeta( @@ -131,23 +145,34 @@ class BinlogIndexTest : public ::testing::TestWithParam { protected: milvus::SchemaPtr schema; - const char* metricType; + knowhere::MetricType metric_type; + DataType data_type; + std::string index_type; size_t data_n = 10000; size_t data_d = 128; size_t topk = 10; milvus::FieldDataPtr vec_field_data = nullptr; milvus::segcore::SegmentSealedUPtr segment = nullptr; milvus::FieldId vec_field_id; - std::shared_ptr vec_data; + knowhere::DataSetPtr raw_dataset; }; -INSTANTIATE_TEST_SUITE_P(MetricTypeParameters, - BinlogIndexTest, - ::testing::Values(knowhere::metric::L2)); +INSTANTIATE_TEST_SUITE_P( + MetricTypeParameters, + BinlogIndexTest, + ::testing::Values( + std::make_tuple(DataType::VECTOR_FLOAT, + knowhere::metric::L2, + knowhere::IndexEnum::INDEX_FAISS_IVFFLAT), + std::make_tuple(DataType::VECTOR_SPARSE_FLOAT, + knowhere::metric::IP, + knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX), + std::make_tuple(DataType::VECTOR_SPARSE_FLOAT, + knowhere::metric::IP, + knowhere::IndexEnum::INDEX_SPARSE_WAND))); TEST_P(BinlogIndexTest, Accuracy) { - IndexMetaPtr collection_index_meta = - GetCollectionIndexMeta(knowhere::IndexEnum::INDEX_FAISS_IVFFLAT); + IndexMetaPtr collection_index_meta = GetCollectionIndexMeta(index_type); segment = CreateSealedSegment(schema, collection_index_meta); LoadOtherFields(); @@ -159,6 +184,7 @@ TEST_P(BinlogIndexTest, Accuracy) { auto field_data_info = FieldDataInfo{ vec_field_id.get(), data_n, std::vector{vec_field_data}}; segment->LoadFieldData(vec_field_id, field_data_info); + //assert segment has been built binlog index EXPECT_TRUE(segment->HasIndex(vec_field_id)); EXPECT_EQ(segment->get_row_count(), data_n); @@ -166,7 +192,6 @@ TEST_P(BinlogIndexTest, Accuracy) { // 2. search binlog index auto num_queries = 10; - auto query_ptr = GenRandomFloatVecData(num_queries, data_d); milvus::proto::plan::PlanNode plan_node; auto vector_anns = plan_node.mutable_vector_anns(); @@ -176,12 +201,17 @@ TEST_P(BinlogIndexTest, Accuracy) { auto query_info = vector_anns->mutable_query_info(); query_info->set_topk(topk); query_info->set_round_decimal(3); - query_info->set_metric_type(metricType); + query_info->set_metric_type(metric_type); query_info->set_search_params(R"({"nprobe": 1024})"); auto plan_str = plan_node.SerializeAsString(); auto ph_group_raw = - CreatePlaceholderGroupFromBlob(num_queries, data_d, query_ptr.get()); + data_type == DataType::VECTOR_FLOAT + ? CreatePlaceholderGroupFromBlob( + num_queries, + data_d, + GenRandomFloatVecData(num_queries, data_d).get()) + : CreateSparseFloatPlaceholderGroup(num_queries); auto plan = milvus::query::CreateSearchPlanByExpr( *schema, plan_str.data(), plan_str.size()); @@ -201,27 +231,25 @@ TEST_P(BinlogIndexTest, Accuracy) { // 3. update vector index { milvus::index::CreateIndexInfo create_index_info; - create_index_info.field_type = DataType::VECTOR_FLOAT; - create_index_info.metric_type = metricType; - create_index_info.index_type = knowhere::IndexEnum::INDEX_FAISS_IVFFLAT; + create_index_info.field_type = data_type; + create_index_info.metric_type = metric_type; + create_index_info.index_type = index_type; create_index_info.index_engine_version = knowhere::Version::GetCurrentVersion().VersionNumber(); auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex( create_index_info, milvus::storage::FileManagerContext()); auto build_conf = - knowhere::Json{{knowhere::meta::METRIC_TYPE, metricType}, + knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type}, {knowhere::meta::DIM, std::to_string(data_d)}, {knowhere::indexparam::NLIST, "1024"}}; - - auto database = knowhere::GenDataSet(data_n, data_d, vec_data.get()); - indexing->BuildWithDataset(database, build_conf); + indexing->BuildWithDataset(raw_dataset, build_conf); LoadIndexInfo load_info; load_info.field_id = vec_field_id.get(); load_info.index = std::move(indexing); - load_info.index_params["metric_type"] = metricType; + load_info.index_params["metric_type"] = metric_type; segment->DropFieldData(vec_field_id); ASSERT_NO_THROW(segment->LoadIndex(load_info)); EXPECT_TRUE(segment->HasIndex(vec_field_id)); @@ -238,8 +266,7 @@ TEST_P(BinlogIndexTest, Accuracy) { } TEST_P(BinlogIndexTest, DisableInterimIndex) { - IndexMetaPtr collection_index_meta = - GetCollectionIndexMeta(knowhere::IndexEnum::INDEX_FAISS_IVFFLAT); + IndexMetaPtr collection_index_meta = GetCollectionIndexMeta(index_type); segment = CreateSealedSegment(schema, collection_index_meta); LoadOtherFields(); @@ -254,27 +281,26 @@ TEST_P(BinlogIndexTest, DisableInterimIndex) { EXPECT_TRUE(segment->HasFieldData(vec_field_id)); // load vector index milvus::index::CreateIndexInfo create_index_info; - create_index_info.field_type = DataType::VECTOR_FLOAT; - create_index_info.metric_type = metricType; - create_index_info.index_type = knowhere::IndexEnum::INDEX_FAISS_IVFFLAT; + create_index_info.field_type = data_type; + create_index_info.metric_type = metric_type; + create_index_info.index_type = index_type; create_index_info.index_engine_version = knowhere::Version::GetCurrentVersion().VersionNumber(); auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex( create_index_info, milvus::storage::FileManagerContext()); auto build_conf = - knowhere::Json{{knowhere::meta::METRIC_TYPE, metricType}, + knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type}, {knowhere::meta::DIM, std::to_string(data_d)}, {knowhere::indexparam::NLIST, "1024"}}; - auto database = knowhere::GenDataSet(data_n, data_d, vec_data.get()); - indexing->BuildWithDataset(database, build_conf); + indexing->BuildWithDataset(raw_dataset, build_conf); LoadIndexInfo load_info; load_info.field_id = vec_field_id.get(); load_info.index = std::move(indexing); - load_info.index_params["metric_type"] = metricType; + load_info.index_params["metric_type"] = metric_type; segment->DropFieldData(vec_field_id); ASSERT_NO_THROW(segment->LoadIndex(load_info)); diff --git a/internal/core/unittest/test_exec.cpp b/internal/core/unittest/test_exec.cpp index b70b3c11f455a..fc409ea92b20c 100644 --- a/internal/core/unittest/test_exec.cpp +++ b/internal/core/unittest/test_exec.cpp @@ -37,7 +37,7 @@ using namespace milvus::exec; using namespace milvus::query; using namespace milvus::segcore; -class TaskTest : public testing::Test { +class TaskTest : public testing::TestWithParam { protected: void SetUp() override { @@ -46,7 +46,7 @@ class TaskTest : public testing::Test { using namespace milvus::segcore; auto schema = std::make_shared(); auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + "fakevec", GetParam(), 16, knowhere::metric::L2); auto bool_fid = schema->AddDebugField("bool", DataType::BOOL); field_map_.insert({"bool", bool_fid}); auto bool_1_fid = schema->AddDebugField("bool1", DataType::BOOL); @@ -112,7 +112,12 @@ class TaskTest : public testing::Test { int64_t num_rows_{0}; }; -TEST_F(TaskTest, UnaryExpr) { +INSTANTIATE_TEST_SUITE_P(TaskTestSuite, + TaskTest, + ::testing::Values(DataType::VECTOR_FLOAT, + DataType::VECTOR_SPARSE_FLOAT)); + +TEST_P(TaskTest, UnaryExpr) { ::milvus::proto::plan::GenericValue value; value.set_int64_val(-1); auto logical_expr = std::make_shared( @@ -149,7 +154,7 @@ TEST_F(TaskTest, UnaryExpr) { EXPECT_EQ(num_rows, num_rows_); } -TEST_F(TaskTest, LogicalExpr) { +TEST_P(TaskTest, LogicalExpr) { ::milvus::proto::plan::GenericValue value; value.set_int64_val(-1); auto left = std::make_shared( @@ -193,13 +198,13 @@ TEST_F(TaskTest, LogicalExpr) { EXPECT_EQ(num_rows, num_rows_); } -TEST_F(TaskTest, CompileInputs_and) { +TEST_P(TaskTest, CompileInputs_and) { using namespace milvus; using namespace milvus::query; using namespace milvus::segcore; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = + schema->AddDebugField("fakevec", GetParam(), 16, knowhere::metric::L2); auto int64_fid = schema->AddDebugField("int64", DataType::INT64); proto::plan::GenericValue val; val.set_int64_val(10); @@ -236,13 +241,13 @@ TEST_F(TaskTest, CompileInputs_and) { } } -TEST_F(TaskTest, CompileInputs_or_with_and) { +TEST_P(TaskTest, CompileInputs_or_with_and) { using namespace milvus; using namespace milvus::query; using namespace milvus::segcore; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = + schema->AddDebugField("fakevec", GetParam(), 16, knowhere::metric::L2); auto int64_fid = schema->AddDebugField("int64", DataType::INT64); proto::plan::GenericValue val; val.set_int64_val(10); diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index df109b7d4970b..fd98b45e2bcf7 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -39,117 +39,43 @@ using namespace milvus; using namespace milvus::query; using namespace milvus::segcore; -TEST(Expr, Range) { - SUCCEED(); - // std::string dsl_string = R"({ - // "bool": { - // "must": [ - // { - // "range": { - // "age": { - // "GT": 1, - // "LT": 100 - // } - // } - // }, - // { - // "vector": { - // "fakevec": { - // "metric_type": "L2", - // "params": { - // "nprobe": 10 - // }, - // "query": "$0", - // "topk": 10, - // "round_decimal": 3 - // } - // } - // } - // ] - // } - // })"; - - const char* raw_plan = R"(vector_anns: < - field_id: 100 - predicates: < - binary_expr: < - op: LogicalAnd - left: < - unary_range_expr: < - column_info: < - field_id: 101 - data_type: Int32 - > - op: GreaterThan - value: < - int64_val: 1 - > - > - > - right: < - unary_range_expr: < - column_info: < - field_id: 101 - data_type: Int32 - > - op: LessThan - value: < - int64_val: 100 - > - > - > - > - > - query_info: < - topk: 10 - round_decimal: 3 - metric_type: "L2" - search_params: "{\"nprobe\": 10}" - > - placeholder_tag: "$0" - >)"; - auto plan_str = translate_text_plan_to_binary_plan(raw_plan); - auto schema = std::make_shared(); - schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); - schema->AddDebugField("age", DataType::INT32); - auto plan = - CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); - ShowPlanNodeVisitor shower; - Assert(plan->tag2field_.at("$0") == - schema->get_field_id(FieldName("fakevec"))); -} +class ExprTest : public ::testing::TestWithParam< + std::pair> { + public: + void + SetUp() override { + auto param = GetParam(); + data_type = param.first; + metric_type = param.second; + } + + // replace the metric type in the plan string with the proper type + std::vector + translate_text_plan_with_metric_type(std::string plan) { + return milvus::segcore:: + replace_metric_and_translate_text_plan_to_binary_plan( + std::move(plan), metric_type); + } + + milvus::DataType data_type; + knowhere::MetricType metric_type; +}; + +INSTANTIATE_TEST_SUITE_P( + ExprTestSuite, + ExprTest, + ::testing::Values( + std::pair(milvus::DataType::VECTOR_FLOAT, knowhere::metric::L2), + std::pair(milvus::DataType::VECTOR_SPARSE_FLOAT, knowhere::metric::IP), + std::pair(milvus::DataType::VECTOR_BINARY, knowhere::metric::JACCARD))); -TEST(Expr, RangeBinary) { +TEST_P(ExprTest, Range) { SUCCEED(); - // std::string dsl_string = R"({ - // "bool": { - // "must": [ - // { - // "range": { - // "age": { - // "GT": 1, - // "LT": 100 - // } - // } - // }, - // { - // "vector": { - // "fakevec": { - // "metric_type": "Jaccard", - // "params": { - // "nprobe": 10 - // }, - // "query": "$0", - // "topk": 10, - // "round_decimal": 3 - // } - // } - // } - // ] - // } - // })"; - const char* raw_plan = R"(vector_anns: < + using namespace milvus; + using namespace milvus::query; + using namespace milvus::segcore; + + std::string raw_plan = R"(vector_anns: < field_id: 100 predicates: < binary_expr: < @@ -183,15 +109,14 @@ TEST(Expr, RangeBinary) { query_info: < topk: 10 round_decimal: 3 - metric_type: "JACCARD" + metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0" >)"; - auto plan_str = translate_text_plan_to_binary_plan(raw_plan); + auto plan_str = translate_text_plan_with_metric_type(raw_plan); auto schema = std::make_shared(); - schema->AddDebugField( - "fakevec", DataType::VECTOR_BINARY, 512, knowhere::metric::JACCARD); + schema->AddDebugField("fakevec", data_type, 16, metric_type); schema->AddDebugField("age", DataType::INT32); auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); @@ -200,36 +125,9 @@ TEST(Expr, RangeBinary) { schema->get_field_id(FieldName("fakevec"))); } -TEST(Expr, InvalidRange) { +TEST_P(ExprTest, InvalidRange) { SUCCEED(); - // std::string dsl_string = R"( - // { - // "bool": { - // "must": [ - // { - // "range": { - // "age": { - // "GT": 1, - // "LT": "100" - // } - // } - // }, - // { - // "vector": { - // "fakevec": { - // "metric_type": "L2", - // "params": { - // "nprobe": 10 - // }, - // "query": "$0", - // "topk": 10 - // } - // } - // } - // ] - // } - // })"; - const char* raw_plan = R"(vector_anns: < + std::string raw_plan = R"(vector_anns: < field_id: 100 predicates: < binary_expr: < @@ -268,21 +166,19 @@ TEST(Expr, InvalidRange) { > placeholder_tag: "$0" >)"; - auto plan_str = translate_text_plan_to_binary_plan(raw_plan); + auto plan_str = translate_text_plan_with_metric_type(raw_plan); auto schema = std::make_shared(); - schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + schema->AddDebugField("fakevec", data_type, 16, metric_type); schema->AddDebugField("age", DataType::INT32); ASSERT_ANY_THROW( CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size())); } -TEST(Expr, ShowExecutor) { +TEST_P(ExprTest, ShowExecutor) { auto node = std::make_unique(); auto schema = std::make_shared(); - auto metric_type = knowhere::metric::L2; - auto field_id = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, metric_type); + auto field_id = + schema->AddDebugField("fakevec", data_type, 16, metric_type); int64_t num_queries = 100L; auto raw_data = DataGen(schema, num_queries); auto& info = node->search_info_; @@ -299,7 +195,7 @@ TEST(Expr, ShowExecutor) { std::cout << dup.dump(4); } -TEST(Expr, TestRange) { +TEST_P(ExprTest, TestRange) { std::vector>> testcases = { {R"(binary_range_expr: < column_info: < @@ -429,32 +325,6 @@ TEST(Expr, TestRange) { [](int v) { return v != 2000; }}, }; - // std::string dsl_string_tmp = R"({ - // "bool": { - // "must": [ - // { - // "range": { - // "age": { - // @@@@ - // } - // } - // }, - // { - // "vector": { - // "fakevec": { - // "metric_type": "L2", - // "params": { - // "nprobe": 10 - // }, - // "query": "$0", - // "topk": 10, - // "round_decimal": 3 - // } - // } - // } - // ] - // } - // })"; std::string raw_plan_tmp = R"(vector_anns: < field_id: 100 predicates: < @@ -469,8 +339,7 @@ TEST(Expr, TestRange) { placeholder_tag: "$0" >)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i64_fid = schema->AddDebugField("age", DataType::INT64); schema->set_primary_field_id(i64_fid); @@ -496,7 +365,7 @@ TEST(Expr, TestRange) { auto loc = raw_plan_tmp.find("@@@@"); auto raw_plan = raw_plan_tmp; raw_plan.replace(loc, 4, clause); - auto plan_str = translate_text_plan_to_binary_plan(raw_plan.c_str()); + auto plan_str = translate_text_plan_with_metric_type(raw_plan); auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP); @@ -517,7 +386,8 @@ TEST(Expr, TestRange) { } } -TEST(Expr, TestBinaryRangeJSON) { +TEST_P(ExprTest, TestBinaryRangeJSON) { + struct Testcase { bool lower_inclusive; bool upper_inclusive; @@ -616,7 +486,8 @@ TEST(Expr, TestBinaryRangeJSON) { } } -TEST(Expr, TestExistsJson) { +TEST_P(ExprTest, TestExistsJson) { + struct Testcase { std::vector nested_path; }; @@ -707,7 +578,7 @@ GetValueFromProto(const milvus::proto::plan::GenericValue& value_proto) { } }; -TEST(Expr, TestUnaryRangeJson) { +TEST_P(ExprTest, TestUnaryRangeJson) { struct Testcase { int64_t val; std::vector nested_path; @@ -876,7 +747,8 @@ TEST(Expr, TestUnaryRangeJson) { } } -TEST(Expr, TestTermJson) { +TEST_P(ExprTest, TestTermJson) { + struct Testcase { std::vector term; std::vector nested_path; @@ -947,7 +819,7 @@ TEST(Expr, TestTermJson) { } } -TEST(Expr, TestTerm) { +TEST_P(ExprTest, TestTerm) { auto vec_2k_3k = [] { std::string buf; for (int i = 2000; i < 3000; ++i) { @@ -977,33 +849,6 @@ TEST(Expr, TestTerm) { {vec_2k_3k, [](int v) { return 2000 <= v && v < 3000; }}, }; - // std::string dsl_string_tmp = R"({ - // "bool": { - // "must": [ - // { - // "term": { - // "age": { - // "values": @@@@, - // "is_in_field" : false - // } - // } - // }, - // { - // "vector": { - // "fakevec": { - // "metric_type": "L2", - // "params": { - // "nprobe": 10 - // }, - // "query": "$0", - // "topk": 10, - // "round_decimal": 3 - // } - // } - // } - // ] - // } - // })"; std::string raw_plan_tmp = R"(vector_anns: < field_id: 100 predicates: < @@ -1024,8 +869,7 @@ TEST(Expr, TestTerm) { placeholder_tag: "$0" >)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i64_fid = schema->AddDebugField("age", DataType::INT64); schema->set_primary_field_id(i64_fid); @@ -1051,7 +895,7 @@ TEST(Expr, TestTerm) { auto loc = raw_plan_tmp.find("@@@@"); auto raw_plan = raw_plan_tmp; raw_plan.replace(loc, 4, clause); - auto plan_str = translate_text_plan_to_binary_plan(raw_plan.c_str()); + auto plan_str = translate_text_plan_with_metric_type(raw_plan); auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); BitsetType final; @@ -1071,7 +915,7 @@ TEST(Expr, TestTerm) { } } -TEST(Expr, TestCompare) { +TEST_P(ExprTest, TestCompare) { std::vector>> testcases = { {R"(LessThan)", [](int a, int64_t b) { return a < b; }}, @@ -1082,33 +926,6 @@ TEST(Expr, TestCompare) { {R"(NotEqual)", [](int a, int64_t b) { return a != b; }}, }; - // std::string dsl_string_tpl = R"({ - // "bool": { - // "must": [ - // { - // "compare": { - // %1%: [ - // "age1", - // "age2" - // ] - // } - // }, - // { - // "vector": { - // "fakevec": { - // "metric_type": "L2", - // "params": { - // "nprobe": 10 - // }, - // "query": "$0", - // "topk": 10, - // "round_decimal": 3 - // } - // } - // } - // ] - // } - // })"; std::string raw_plan_tmp = R"(vector_anns: < field_id: 100 predicates: < @@ -1133,8 +950,7 @@ TEST(Expr, TestCompare) { placeholder_tag: "$0" >)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i32_fid = schema->AddDebugField("age1", DataType::INT32); auto i64_fid = schema->AddDebugField("age2", DataType::INT64); schema->set_primary_field_id(i64_fid); @@ -1166,7 +982,7 @@ TEST(Expr, TestCompare) { auto loc = raw_plan_tmp.find("@@@@"); auto raw_plan = raw_plan_tmp; raw_plan.replace(loc, 4, clause); - auto plan_str = translate_text_plan_to_binary_plan(raw_plan.c_str()); + auto plan_str = translate_text_plan_with_metric_type(raw_plan); auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); BitsetType final; @@ -1188,7 +1004,7 @@ TEST(Expr, TestCompare) { } } -TEST(Expr, TestCompareWithScalarIndex) { +TEST_P(ExprTest, TestCompareWithScalarIndex) { std::vector>> testcases = { {R"(LessThan)", [](int a, int64_t b) { return a < b; }}, @@ -1224,8 +1040,7 @@ TEST(Expr, TestCompareWithScalarIndex) { >)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i32_fid = schema->AddDebugField("age32", DataType::INT32); auto i64_fid = schema->AddDebugField("age64", DataType::INT64); schema->set_primary_field_id(i64_fid); @@ -1264,7 +1079,7 @@ TEST(Expr, TestCompareWithScalarIndex) { i32_fid.get() % proto::schema::DataType_Name(int(DataType::INT32)) % i64_fid.get() % proto::schema::DataType_Name(int(DataType::INT64)); auto binary_plan = - translate_text_plan_to_binary_plan(dsl_string.str().data()); + translate_text_plan_with_metric_type(dsl_string.str()); auto plan = CreateSearchPlanByExpr( *schema, binary_plan.data(), binary_plan.size()); // std::cout << ShowPlanNodeVisitor().call_child(*plan->plan_node_) << std::endl; @@ -1284,10 +1099,9 @@ TEST(Expr, TestCompareWithScalarIndex) { } } -TEST(Expr, TestCompareExpr) { +TEST_P(ExprTest, TestCompareExpr) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto bool_fid = schema->AddDebugField("bool", DataType::BOOL); auto bool_1_fid = schema->AddDebugField("bool1", DataType::BOOL); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); @@ -1433,10 +1247,9 @@ TEST(Expr, TestCompareExpr) { std::cout << "end compare test" << std::endl; } -TEST(Expr, TestMultiLogicalExprsOptimization) { +TEST_P(ExprTest, TestMultiLogicalExprsOptimization) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int64_fid = schema->AddDebugField("int64", DataType::INT64); auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR); schema->set_primary_field_id(str1_fid); @@ -1519,10 +1332,9 @@ TEST(Expr, TestMultiLogicalExprsOptimization) { ASSERT_LT(cost_op, cost_no_op); } -TEST(Expr, TestExprs) { +TEST_P(ExprTest, TestExprs) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -1691,11 +1503,10 @@ TEST(Expr, TestExprs) { // test_case(500); } -TEST(Expr, test_term_pk) { +TEST_P(ExprTest, test_term_pk) { auto schema = std::make_shared(); schema->AddField(FieldName("Timestamp"), FieldId(1), DataType::INT64); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR); auto int64_fid = schema->AddDebugField("int64", DataType::INT64); schema->set_primary_field_id(int64_fid); @@ -1755,10 +1566,9 @@ TEST(Expr, test_term_pk) { } } -TEST(Expr, TestSealedSegmentGetBatchSize) { +TEST_P(ExprTest, TestSealedSegmentGetBatchSize) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR); schema->set_primary_field_id(str1_fid); @@ -1817,10 +1627,9 @@ TEST(Expr, TestSealedSegmentGetBatchSize) { } } -TEST(Expr, TestGrowingSegmentGetBatchSize) { +TEST_P(ExprTest, TestGrowingSegmentGetBatchSize) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR); schema->set_primary_field_id(str1_fid); @@ -1873,10 +1682,9 @@ TEST(Expr, TestGrowingSegmentGetBatchSize) { } } -TEST(Expr, TestConjuctExpr) { +TEST_P(ExprTest, TestConjuctExpr) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -1941,10 +1749,9 @@ TEST(Expr, TestConjuctExpr) { } } -TEST(Expr, TestUnaryBenchTest) { +TEST_P(ExprTest, TestUnaryBenchTest) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -2013,10 +1820,9 @@ TEST(Expr, TestUnaryBenchTest) { } } -TEST(Expr, TestBinaryRangeBenchTest) { +TEST_P(ExprTest, TestBinaryRangeBenchTest) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -2094,10 +1900,9 @@ TEST(Expr, TestBinaryRangeBenchTest) { } } -TEST(Expr, TestLogicalUnaryBenchTest) { +TEST_P(ExprTest, TestLogicalUnaryBenchTest) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -2169,10 +1974,9 @@ TEST(Expr, TestLogicalUnaryBenchTest) { } } -TEST(Expr, TestBinaryLogicalBenchTest) { +TEST_P(ExprTest, TestBinaryLogicalBenchTest) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -2254,10 +2058,9 @@ TEST(Expr, TestBinaryLogicalBenchTest) { } } -TEST(Expr, TestBinaryArithOpEvalRangeBenchExpr) { +TEST_P(ExprTest, TestBinaryArithOpEvalRangeBenchExpr) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -2335,10 +2138,9 @@ TEST(Expr, TestBinaryArithOpEvalRangeBenchExpr) { } } -TEST(Expr, TestCompareExprBenchTest) { +TEST_P(ExprTest, TestCompareExprBenchTest) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -2409,10 +2211,9 @@ TEST(Expr, TestCompareExprBenchTest) { } } -TEST(Expr, TestRefactorExprs) { +TEST_P(ExprTest, TestRefactorExprs) { auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto int8_fid = schema->AddDebugField("int8", DataType::INT8); auto int8_1_fid = schema->AddDebugField("int81", DataType::INT8); auto int16_fid = schema->AddDebugField("int16", DataType::INT16); @@ -2579,7 +2380,7 @@ TEST(Expr, TestRefactorExprs) { // test_case(500); } -TEST(Expr, TestCompareWithScalarIndexMaris) { +TEST_P(ExprTest, TestCompareWithScalarIndexMaris) { std::vector< std::tuple>> testcases = { @@ -2597,7 +2398,7 @@ TEST(Expr, TestCompareWithScalarIndexMaris) { [](std::string a, std::string b) { return a.compare(b) != 0; }}, }; - const char* serialized_expr_plan = R"(vector_anns: < + std::string serialized_expr_plan = R"(vector_anns: < field_id: %1% predicates: < compare_expr: < @@ -2622,8 +2423,7 @@ TEST(Expr, TestCompareWithScalarIndexMaris) { >)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR); auto str2_fid = schema->AddDebugField("string2", DataType::VARCHAR); schema->set_primary_field_id(str1_fid); @@ -2658,7 +2458,7 @@ TEST(Expr, TestCompareWithScalarIndexMaris) { auto dsl_string = boost::format(serialized_expr_plan) % vec_fid.get() % clause % str1_fid.get() % str2_fid.get(); auto binary_plan = - translate_text_plan_to_binary_plan(dsl_string.str().data()); + translate_text_plan_with_metric_type(dsl_string.str()); auto plan = CreateSearchPlanByExpr( *schema, binary_plan.data(), binary_plan.size()); // std::cout << ShowPlanNodeVisitor().call_child(*plan->plan_node_) << std::endl; @@ -2678,7 +2478,7 @@ TEST(Expr, TestCompareWithScalarIndexMaris) { } } -TEST(Expr, TestBinaryArithOpEvalRange) { +TEST_P(ExprTest, TestBinaryArithOpEvalRange) { std::vector, DataType>> testcases = { // Add test cases for BinaryArithOpEvalRangeExpr EQ of various data types @@ -2893,31 +2693,6 @@ TEST(Expr, TestBinaryArithOpEvalRange) { DataType::INT64}, }; - // std::string dsl_string_tmp = R"({ - // "bool": { - // "must": [ - // { - // "range": { - // @@@@@ - // } - // }, - // { - // "vector": { - // "fakevec": { - // "metric_type": "L2", - // "params": { - // "nprobe": 10 - // }, - // "query": "$0", - // "topk": 10, - // "round_decimal": 3 - // } - // } - // } - // ] - // } - // })"; - std::string raw_plan_tmp = R"(vector_anns: < field_id: 100 predicates: < @@ -2932,8 +2707,7 @@ TEST(Expr, TestBinaryArithOpEvalRange) { placeholder_tag: "$0" >)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i8_fid = schema->AddDebugField("age8", DataType::INT8); auto i16_fid = schema->AddDebugField("age16", DataType::INT16); auto i32_fid = schema->AddDebugField("age32", DataType::INT32); @@ -3007,7 +2781,7 @@ TEST(Expr, TestBinaryArithOpEvalRange) { // } // loc = dsl_string.find("@@@@"); // dsl_string.replace(loc, 4, clause); - auto plan_str = translate_text_plan_to_binary_plan(raw_plan.c_str()); + auto plan_str = translate_text_plan_with_metric_type(raw_plan); auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); BitsetType final; @@ -3051,7 +2825,8 @@ TEST(Expr, TestBinaryArithOpEvalRange) { } } -TEST(Expr, TestBinaryArithOpEvalRangeJSON) { +TEST_P(ExprTest, TestBinaryArithOpEvalRangeJSON) { + struct Testcase { int64_t right_operand; int64_t value; @@ -3140,7 +2915,8 @@ TEST(Expr, TestBinaryArithOpEvalRangeJSON) { } } -TEST(Expr, TestBinaryArithOpEvalRangeJSONFloat) { +TEST_P(ExprTest, TestBinaryArithOpEvalRangeJSONFloat) { + struct Testcase { double right_operand; double value; @@ -3266,7 +3042,7 @@ TEST(Expr, TestBinaryArithOpEvalRangeJSONFloat) { } } -TEST(Expr, TestBinaryArithOpEvalRangeWithScalarSortIndex) { +TEST_P(ExprTest, TestBinaryArithOpEvalRangeWithScalarSortIndex) { std::vector, DataType>> testcases = { // Add test cases for BinaryArithOpEvalRangeExpr EQ of various data types @@ -3426,8 +3202,7 @@ TEST(Expr, TestBinaryArithOpEvalRangeWithScalarSortIndex) { @@@@)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i8_fid = schema->AddDebugField("age8", DataType::INT8); auto i16_fid = schema->AddDebugField("age16", DataType::INT16); auto i32_fid = schema->AddDebugField("age32", DataType::INT32); @@ -3539,8 +3314,7 @@ TEST(Expr, TestBinaryArithOpEvalRangeWithScalarSortIndex) { ASSERT_TRUE(false) << "No test case defined for this data type"; } - auto binary_plan = - translate_text_plan_to_binary_plan(expr.str().data()); + auto binary_plan = translate_text_plan_with_metric_type(expr.str()); auto plan = CreateSearchPlanByExpr( *schema, binary_plan.data(), binary_plan.size()); @@ -3582,7 +3356,7 @@ TEST(Expr, TestBinaryArithOpEvalRangeWithScalarSortIndex) { } } -TEST(Expr, TestUnaryRangeWithJSON) { +TEST_P(ExprTest, TestUnaryRangeWithJSON) { std::vector< std::tuple(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i64_fid = schema->AddDebugField("age64", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); schema->set_primary_field_id(i64_fid); @@ -3738,7 +3511,7 @@ TEST(Expr, TestUnaryRangeWithJSON) { } } - auto unary_plan = translate_text_plan_to_binary_plan(expr.str().data()); + auto unary_plan = translate_text_plan_with_metric_type(expr.str()); auto plan = CreateSearchPlanByExpr( *schema, unary_plan.data(), unary_plan.size()); @@ -3782,7 +3555,7 @@ TEST(Expr, TestUnaryRangeWithJSON) { } } -TEST(Expr, TestTermWithJSON) { +TEST_P(ExprTest, TestTermWithJSON) { std::vector< std::tuple(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i64_fid = schema->AddDebugField("age64", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); schema->set_primary_field_id(i64_fid); @@ -3916,7 +3688,7 @@ TEST(Expr, TestTermWithJSON) { } } - auto unary_plan = translate_text_plan_to_binary_plan(expr.str().data()); + auto unary_plan = translate_text_plan_with_metric_type(expr.str()); auto plan = CreateSearchPlanByExpr( *schema, unary_plan.data(), unary_plan.size()); @@ -3960,7 +3732,7 @@ TEST(Expr, TestTermWithJSON) { } } -TEST(Expr, TestExistsWithJSON) { +TEST_P(ExprTest, TestExistsWithJSON) { std::vector, DataType>> testcases = { {R"()", [](bool v) { return v; }, DataType::BOOL}, @@ -3995,8 +3767,7 @@ TEST(Expr, TestExistsWithJSON) { @@@@)"; auto schema = std::make_shared(); - auto vec_fid = schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type); auto i64_fid = schema->AddDebugField("age64", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); schema->set_primary_field_id(i64_fid); @@ -4068,7 +3839,7 @@ TEST(Expr, TestExistsWithJSON) { } } - auto unary_plan = translate_text_plan_to_binary_plan(expr.str().data()); + auto unary_plan = translate_text_plan_with_metric_type(expr.str()); auto plan = CreateSearchPlanByExpr( *schema, unary_plan.data(), unary_plan.size()); @@ -4120,7 +3891,8 @@ struct Testcase { bool res; }; -TEST(Expr, TestTermInFieldJson) { +TEST_P(ExprTest, TestTermInFieldJson) { + auto schema = std::make_shared(); auto i64_fid = schema->AddDebugField("id", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); @@ -4336,8 +4108,8 @@ TEST(Expr, TestTermInFieldJson) { } } -TEST(Expr, PraseJsonContainsExpr) { - std::vector raw_plans{ +TEST_P(ExprTest, PraseJsonContainsExpr) { + std::vector raw_plans{ R"(vector_anns:< field_id:100 predicates:< @@ -4469,17 +4241,17 @@ TEST(Expr, PraseJsonContainsExpr) { }; for (auto& raw_plan : raw_plans) { - auto plan_str = translate_text_plan_to_binary_plan(raw_plan); + auto plan_str = translate_text_plan_with_metric_type(raw_plan); auto schema = std::make_shared(); - schema->AddDebugField( - "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); + schema->AddDebugField("fakevec", data_type, 16, metric_type); schema->AddDebugField("json", DataType::JSON); auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size()); } } -TEST(Expr, TestJsonContainsAny) { +TEST_P(ExprTest, TestJsonContainsAny) { + auto schema = std::make_shared(); auto i64_fid = schema->AddDebugField("id", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); @@ -4699,7 +4471,8 @@ TEST(Expr, TestJsonContainsAny) { } } -TEST(Expr, TestJsonContainsAll) { +TEST_P(ExprTest, TestJsonContainsAll) { + auto schema = std::make_shared(); auto i64_fid = schema->AddDebugField("id", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); @@ -4943,7 +4716,8 @@ TEST(Expr, TestJsonContainsAll) { } } -TEST(Expr, TestJsonContainsArray) { +TEST_P(ExprTest, TestJsonContainsArray) { + auto schema = std::make_shared(); auto i64_fid = schema->AddDebugField("id", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); @@ -5270,7 +5044,8 @@ generatedArrayWithFourDiffType(int64_t int_val, return value; } -TEST(Expr, TestJsonContainsDiffTypeArray) { +TEST_P(ExprTest, TestJsonContainsDiffTypeArray) { + auto schema = std::make_shared(); auto i64_fid = schema->AddDebugField("id", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); @@ -5372,7 +5147,7 @@ TEST(Expr, TestJsonContainsDiffTypeArray) { } } -TEST(Expr, TestJsonContainsDiffType) { +TEST_P(ExprTest, TestJsonContainsDiffType) { auto schema = std::make_shared(); auto i64_fid = schema->AddDebugField("id", DataType::INT64); auto json_fid = schema->AddDebugField("json", DataType::JSON); diff --git a/internal/core/unittest/test_growing.cpp b/internal/core/unittest/test_growing.cpp index 671d5d23a78f9..f5421384e02fb 100644 --- a/internal/core/unittest/test_growing.cpp +++ b/internal/core/unittest/test_growing.cpp @@ -97,9 +97,50 @@ TEST(Growing, RealCount) { ASSERT_EQ(0, segment->get_real_count()); } -TEST(Growing, FillData) { +class GrowingTest + : public ::testing::TestWithParam< + std::tuple> { + public: + void + SetUp() override { + auto index_type = std::get<0>(GetParam()); + auto metric_type = std::get<1>(GetParam()); + if (index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT || + index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC) { + data_type = DataType::VECTOR_FLOAT; + } else if (index_type == + knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX || + index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) { + data_type = DataType::VECTOR_SPARSE_FLOAT; + } else { + ASSERT_TRUE(false); + } + } + knowhere::MetricType metric_type; + std::string index_type; + DataType data_type; +}; + +INSTANTIATE_TEST_SUITE_P( + FloatGrowingTest, + GrowingTest, + ::testing::Combine( + ::testing::Values(knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, + knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC), + ::testing::Values(knowhere::metric::L2, + knowhere::metric::IP, + knowhere::metric::COSINE))); + +INSTANTIATE_TEST_SUITE_P( + SparseFloatGrowingTest, + GrowingTest, + ::testing::Combine( + ::testing::Values(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, + knowhere::IndexEnum::INDEX_SPARSE_WAND), + ::testing::Values(knowhere::metric::IP))); + +TEST_P(GrowingTest, FillData) { auto schema = std::make_shared(); - auto metric_type = knowhere::metric::L2; auto bool_field = schema->AddDebugField("bool", DataType::BOOL); auto int8_field = schema->AddDebugField("int8", DataType::INT8); auto int16_field = schema->AddDebugField("int16", DataType::INT16); @@ -121,12 +162,11 @@ TEST(Growing, FillData) { "double_array", DataType::ARRAY, DataType::DOUBLE); auto float_array_field = schema->AddDebugField("float_array", DataType::ARRAY, DataType::FLOAT); - auto vec = schema->AddDebugField( - "embeddings", DataType::VECTOR_FLOAT, 128, metric_type); + auto vec = schema->AddDebugField("embeddings", data_type, 128, metric_type); schema->set_primary_field_id(int64_field); std::map index_params = { - {"index_type", "IVF_FLAT"}, + {"index_type", index_type}, {"metric_type", metric_type}, {"nlist", "128"}}; std::map type_params = {{"dim", "128"}}; @@ -146,25 +186,6 @@ TEST(Growing, FillData) { int64_t dim = 128; for (int64_t i = 0; i < n_batch; i++) { auto dataset = DataGen(schema, per_batch); - auto bool_values = dataset.get_col(bool_field); - auto int8_values = dataset.get_col(int8_field); - auto int16_values = dataset.get_col(int16_field); - auto int32_values = dataset.get_col(int32_field); - auto int64_values = dataset.get_col(int64_field); - auto float_values = dataset.get_col(float_field); - auto double_values = dataset.get_col(double_field); - auto varchar_values = dataset.get_col(varchar_field); - auto json_values = dataset.get_col(json_field); - auto int_array_values = dataset.get_col(int_array_field); - auto long_array_values = dataset.get_col(long_array_field); - auto bool_array_values = dataset.get_col(bool_array_field); - auto string_array_values = - dataset.get_col(string_array_field); - auto double_array_values = - dataset.get_col(double_array_field); - auto float_array_values = - dataset.get_col(float_array_field); - auto vector_values = dataset.get_col(vec); auto offset = segment->PreInsert(per_batch); segment->Insert(offset, @@ -220,8 +241,16 @@ TEST(Growing, FillData) { EXPECT_EQ(varchar_result->scalars().string_data().data_size(), num_inserted); EXPECT_EQ(json_result->scalars().json_data().data_size(), num_inserted); - EXPECT_EQ(vec_result->vectors().float_vector().data_size(), - num_inserted * dim); + if (data_type == DataType::VECTOR_FLOAT) { + EXPECT_EQ(vec_result->vectors().float_vector().data_size(), + num_inserted * dim); + } else if (data_type == DataType::VECTOR_SPARSE_FLOAT) { + EXPECT_EQ( + vec_result->vectors().sparse_float_vector().contents_size(), + num_inserted); + } else { + ASSERT_TRUE(false); + } EXPECT_EQ(int_array_result->scalars().array_data().data_size(), num_inserted); EXPECT_EQ(long_array_result->scalars().array_data().data_size(), diff --git a/internal/core/unittest/test_growing_index.cpp b/internal/core/unittest/test_growing_index.cpp index 55afd136a179f..7d619182b650d 100644 --- a/internal/core/unittest/test_growing_index.cpp +++ b/internal/core/unittest/test_growing_index.cpp @@ -11,9 +11,11 @@ #include +#include "common/Utils.h" #include "pb/plan.pb.h" #include "pb/schema.pb.h" #include "query/Plan.h" +#include "segcore/ConcurrentVector.h" #include "segcore/SegmentGrowing.h" #include "segcore/SegmentGrowingImpl.h" #include "test_utils/DataGen.h" @@ -22,16 +24,63 @@ using namespace milvus; using namespace milvus::segcore; namespace pb = milvus::proto; -TEST(GrowingIndex, Correctness) { +using Param = std::tuple; + +class GrowingIndexTest : public ::testing::TestWithParam { + void + SetUp() override { + auto param = GetParam(); + index_type = std::get<0>(param); + metric_type = std::get<1>(param); + if (index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT || + index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC) { + data_type = DataType::VECTOR_FLOAT; + } else if (index_type == + knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX || + index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) { + data_type = DataType::VECTOR_SPARSE_FLOAT; + is_sparse = true; + } else { + ASSERT_TRUE(false); + } + } + + protected: + std::string index_type; + knowhere::MetricType metric_type; + DataType data_type; + bool is_sparse = false; +}; + +INSTANTIATE_TEST_SUITE_P( + FloatIndexTypeParameters, + GrowingIndexTest, + ::testing::Combine( + ::testing::Values(knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, + knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC), + ::testing::Values(knowhere::metric::L2, + knowhere::metric::COSINE, + knowhere::metric::IP))); + +INSTANTIATE_TEST_SUITE_P( + SparseIndexTypeParameters, + GrowingIndexTest, + ::testing::Combine( + ::testing::Values(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, + knowhere::IndexEnum::INDEX_SPARSE_WAND), + ::testing::Values(knowhere::metric::IP))); + +TEST_P(GrowingIndexTest, Correctness) { auto schema = std::make_shared(); auto pk = schema->AddDebugField("pk", DataType::INT64); auto random = schema->AddDebugField("random", DataType::DOUBLE); - auto vec = schema->AddDebugField( - "embeddings", DataType::VECTOR_FLOAT, 128, knowhere::metric::L2); + auto vec = schema->AddDebugField("embeddings", data_type, 128, metric_type); schema->set_primary_field_id(pk); std::map index_params = { - {"index_type", "IVF_FLAT"}, {"metric_type", "L2"}, {"nlist", "128"}}; + {"index_type", index_type}, + {"metric_type", metric_type}, + {"nlist", "128"}}; std::map type_params = {{"dim", "128"}}; FieldIndexMeta fieldIndexMeta( vec, std::move(index_params), std::move(type_params)); @@ -46,28 +95,44 @@ TEST(GrowingIndex, Correctness) { milvus::proto::plan::PlanNode plan_node; auto vector_anns = plan_node.mutable_vector_anns(); - vector_anns->set_vector_type(milvus::proto::plan::VectorType::FloatVector); + if (is_sparse) { + vector_anns->set_vector_type( + milvus::proto::plan::VectorType::SparseFloatVector); + } else { + vector_anns->set_vector_type( + milvus::proto::plan::VectorType::FloatVector); + } vector_anns->set_placeholder_tag("$0"); vector_anns->set_field_id(102); auto query_info = vector_anns->mutable_query_info(); query_info->set_topk(5); query_info->set_round_decimal(3); - query_info->set_metric_type("l2"); + query_info->set_metric_type(metric_type); query_info->set_search_params(R"({"nprobe": 16})"); auto plan_str = plan_node.SerializeAsString(); milvus::proto::plan::PlanNode range_query_plan_node; auto vector_range_querys = range_query_plan_node.mutable_vector_anns(); - vector_range_querys->set_vector_type( - milvus::proto::plan::VectorType::FloatVector); + if (is_sparse) { + vector_range_querys->set_vector_type( + milvus::proto::plan::VectorType::SparseFloatVector); + } else { + vector_range_querys->set_vector_type( + milvus::proto::plan::VectorType::FloatVector); + } vector_range_querys->set_placeholder_tag("$0"); vector_range_querys->set_field_id(102); auto range_query_info = vector_range_querys->mutable_query_info(); range_query_info->set_topk(5); range_query_info->set_round_decimal(3); - range_query_info->set_metric_type("l2"); - range_query_info->set_search_params( - R"({"nprobe": 10, "radius": 600, "range_filter": 500})"); + range_query_info->set_metric_type(metric_type); + if (PositivelyRelated(metric_type)) { + range_query_info->set_search_params( + R"({"nprobe": 10, "radius": 500, "range_filter": 600})"); + } else { + range_query_info->set_search_params( + R"({"nprobe": 10, "radius": 600, "range_filter": 500})"); + } auto range_plan_str = range_query_plan_node.SerializeAsString(); int64_t per_batch = 10000; @@ -82,20 +147,32 @@ TEST(GrowingIndex, Correctness) { dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_); - auto filed_data = segmentImplPtr->get_insert_record() - .get_field_data(vec); + const VectorBase* field_data = nullptr; + if (is_sparse) { + field_data = segmentImplPtr->get_insert_record() + .get_field_data(vec); + } else { + field_data = segmentImplPtr->get_insert_record() + .get_field_data(vec); + } auto inserted = (i + 1) * per_batch; - //once index built, chunk data will be removed - if (i < 2) { - EXPECT_EQ(filed_data->num_chunk(), - upper_div(inserted, filed_data->get_size_per_chunk())); + // once index built, chunk data will be removed. + // growing index will only be built when num rows reached + // get_build_threshold(). This value for sparse is 0, thus sparse index + // will be built since the first chunk. Dense segment buffers the first + // 2 chunks before building an index in this test case. + if (!is_sparse && i < 2) { + EXPECT_EQ(field_data->num_chunk(), + upper_div(inserted, field_data->get_size_per_chunk())); } else { - EXPECT_EQ(filed_data->num_chunk(), 0); + EXPECT_EQ(field_data->num_chunk(), 0); } auto num_queries = 5; - auto ph_group_raw = CreatePlaceholderGroup(num_queries, 128, 1024); + auto ph_group_raw = + is_sparse ? CreateSparseFloatPlaceholderGroup(num_queries) + : CreatePlaceholderGroup(num_queries, 128, 1024); auto plan = milvus::query::CreateSearchPlanByExpr( *schema, plan_str.data(), plan_str.size()); @@ -109,6 +186,10 @@ TEST(GrowingIndex, Correctness) { EXPECT_EQ(sr->distances_.size(), num_queries * top_k); EXPECT_EQ(sr->seg_offsets_.size(), num_queries * top_k); + // range search for sparse is not yet supported + if (is_sparse) { + continue; + } auto range_plan = milvus::query::CreateSearchPlanByExpr( *schema, range_plan_str.data(), range_plan_str.size()); auto range_ph_group = ParsePlaceholderGroup( @@ -128,12 +209,11 @@ TEST(GrowingIndex, Correctness) { } } -TEST(GrowingIndex, MissIndexMeta) { +TEST_P(GrowingIndexTest, MissIndexMeta) { auto schema = std::make_shared(); auto pk = schema->AddDebugField("pk", DataType::INT64); auto random = schema->AddDebugField("random", DataType::DOUBLE); - auto vec = schema->AddDebugField( - "embeddings", DataType::VECTOR_FLOAT, 128, knowhere::metric::L2); + auto vec = schema->AddDebugField("embeddings", data_type, 128, metric_type); schema->set_primary_field_id(pk); auto& config = SegcoreConfig::default_config(); @@ -142,36 +222,16 @@ TEST(GrowingIndex, MissIndexMeta) { auto segment = CreateGrowingSegment(schema, nullptr); } -using Param = const char*; - -class GrowingIndexGetVectorTest : public ::testing::TestWithParam { - void - SetUp() override { - auto param = GetParam(); - metricType = param; - } - - protected: - const char* metricType; -}; - -INSTANTIATE_TEST_SUITE_P(IndexTypeParameters, - GrowingIndexGetVectorTest, - ::testing::Values(knowhere::metric::L2, - knowhere::metric::COSINE, - knowhere::metric::IP)); - -TEST_P(GrowingIndexGetVectorTest, GetVector) { +TEST_P(GrowingIndexTest, GetVector) { auto schema = std::make_shared(); auto pk = schema->AddDebugField("pk", DataType::INT64); auto random = schema->AddDebugField("random", DataType::DOUBLE); - auto vec = schema->AddDebugField( - "embeddings", DataType::VECTOR_FLOAT, 128, metricType); + auto vec = schema->AddDebugField("embeddings", data_type, 128, metric_type); schema->set_primary_field_id(pk); std::map index_params = { - {"index_type", "IVF_FLAT"}, - {"metric_type", metricType}, + {"index_type", index_type}, + {"metric_type", metric_type}, {"nlist", "128"}}; std::map type_params = {{"dim", "128"}}; FieldIndexMeta fieldIndexMeta( @@ -185,30 +245,74 @@ TEST_P(GrowingIndexGetVectorTest, GetVector) { auto segment_growing = CreateGrowingSegment(schema, metaPtr); auto segment = dynamic_cast(segment_growing.get()); - int64_t per_batch = 5000; - int64_t n_batch = 20; - int64_t dim = 128; - for (int64_t i = 0; i < n_batch; i++) { - auto dataset = DataGen(schema, per_batch); - auto fakevec = dataset.get_col(vec); - auto offset = segment->PreInsert(per_batch); - segment->Insert(offset, - per_batch, - dataset.row_ids_.data(), - dataset.timestamps_.data(), - dataset.raw_); - auto num_inserted = (i + 1) * per_batch; - auto ids_ds = GenRandomIds(num_inserted); - auto result = - segment->bulk_subscript(vec, ids_ds->GetIds(), num_inserted); - - auto vector = result.get()->mutable_vectors()->float_vector().data(); - EXPECT_TRUE(vector.size() == num_inserted * dim); - for (size_t i = 0; i < num_inserted; ++i) { - auto id = ids_ds->GetIds()[i]; - for (size_t j = 0; j < 128; ++j) { - EXPECT_TRUE(vector[i * dim + j] == - fakevec[(id % per_batch) * dim + j]); + if (data_type == DataType::VECTOR_FLOAT) { + // GetVector for VECTOR_FLOAT + int64_t per_batch = 5000; + int64_t n_batch = 20; + int64_t dim = 128; + for (int64_t i = 0; i < n_batch; i++) { + auto dataset = DataGen(schema, per_batch); + auto fakevec = dataset.get_col(vec); + auto offset = segment->PreInsert(per_batch); + segment->Insert(offset, + per_batch, + dataset.row_ids_.data(), + dataset.timestamps_.data(), + dataset.raw_); + auto num_inserted = (i + 1) * per_batch; + auto ids_ds = GenRandomIds(num_inserted); + auto result = + segment->bulk_subscript(vec, ids_ds->GetIds(), num_inserted); + + auto vector = + result.get()->mutable_vectors()->float_vector().data(); + EXPECT_TRUE(vector.size() == num_inserted * dim); + for (size_t i = 0; i < num_inserted; ++i) { + auto id = ids_ds->GetIds()[i]; + for (size_t j = 0; j < 128; ++j) { + EXPECT_TRUE(vector[i * dim + j] == + fakevec[(id % per_batch) * dim + j]); + } + } + } + } else if (is_sparse) { + // GetVector for VECTOR_SPARSE_FLOAT + int64_t per_batch = 5000; + int64_t n_batch = 20; + int64_t dim = 128; + for (int64_t i = 0; i < n_batch; i++) { + auto dataset = DataGen(schema, per_batch); + auto fakevec = + dataset.get_col>(vec); + auto offset = segment->PreInsert(per_batch); + segment->Insert(offset, + per_batch, + dataset.row_ids_.data(), + dataset.timestamps_.data(), + dataset.raw_); + auto num_inserted = (i + 1) * per_batch; + auto ids_ds = GenRandomIds(num_inserted); + auto result = + segment->bulk_subscript(vec, ids_ds->GetIds(), num_inserted); + + auto vector = result.get() + ->mutable_vectors() + ->sparse_float_vector() + .contents(); + EXPECT_TRUE(result.get() + ->mutable_vectors() + ->sparse_float_vector() + .contents_size() == num_inserted); + auto sparse_rows = SparseBytesToRows(vector); + for (size_t i = 0; i < num_inserted; ++i) { + auto id = ids_ds->GetIds()[i]; + auto actual_row = sparse_rows[i]; + auto expected_row = fakevec[(id % per_batch)]; + EXPECT_TRUE(actual_row.size() == expected_row.size()); + for (size_t j = 0; j < actual_row.size(); ++j) { + EXPECT_TRUE(actual_row[j].id == expected_row[j].id); + EXPECT_TRUE(actual_row[j].val == expected_row[j].val); + } } } } diff --git a/internal/core/unittest/test_index_wrapper.cpp b/internal/core/unittest/test_index_wrapper.cpp index 240797620df5c..ed53305265e46 100644 --- a/internal/core/unittest/test_index_wrapper.cpp +++ b/internal/core/unittest/test_index_wrapper.cpp @@ -166,12 +166,6 @@ TEST_P(IndexWrapperTest, BuildAndQuery) { ASSERT_NO_THROW(vec_index->Load(binary_set)); - if (vec_field_data_type == DataType::VECTOR_SPARSE_FLOAT) { - // TODO(SPARSE): complete test in PR adding search/query to sparse - // float vector. - return; - } - milvus::SearchInfo search_info; search_info.topk_ = K; search_info.metric_type_ = metric_type; diff --git a/internal/core/unittest/test_indexing.cpp b/internal/core/unittest/test_indexing.cpp index dd1cfdf68c69e..0206909b74f8f 100644 --- a/internal/core/unittest/test_indexing.cpp +++ b/internal/core/unittest/test_indexing.cpp @@ -296,11 +296,7 @@ TEST(Indexing, Naive) { vec_index->Query(query_ds, searchInfo, view, result); for (int i = 0; i < TOPK; ++i) { - if (result.seg_offsets_[i] < N / 2) { - std::cout << "WRONG: "; - } - std::cout << result.seg_offsets_[i] << "->" << result.distances_[i] - << std::endl; + ASSERT_FALSE(result.seg_offsets_[i] < N / 2); } } @@ -315,7 +311,6 @@ class IndexTest : public ::testing::TestWithParam { auto param = GetParam(); index_type = param.first; metric_type = param.second; - NB = 3000; // try to reduce the test time, // but the large dataset is needed for the case below. @@ -330,35 +325,42 @@ class IndexTest : public ::testing::TestWithParam { search_conf = generate_search_conf(index_type, metric_type); range_search_conf = generate_range_search_conf(index_type, metric_type); - std::map is_binary_map = { - {knowhere::IndexEnum::INDEX_FAISS_IDMAP, false}, - {knowhere::IndexEnum::INDEX_FAISS_IVFPQ, false}, - {knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, false}, - {knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, false}, - {knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT, true}, - {knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP, true}, - {knowhere::IndexEnum::INDEX_HNSW, false}, - {knowhere::IndexEnum::INDEX_DISKANN, false}, - }; - - is_binary = is_binary_map[index_type]; - if (is_binary) { + if (index_type == knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX || + index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) { + is_sparse = true; + vec_field_data_type = milvus::DataType::VECTOR_SPARSE_FLOAT; + } else if (index_type == knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT || + index_type == knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP) { + is_binary = true; vec_field_data_type = milvus::DataType::VECTOR_BINARY; } else { vec_field_data_type = milvus::DataType::VECTOR_FLOAT; } - auto dataset = GenDataset(NB, metric_type, is_binary); - if (!is_binary) { - xb_data = dataset.get_col(milvus::FieldId(100)); - xb_dataset = knowhere::GenDataSet(NB, DIM, xb_data.data()); - xq_dataset = knowhere::GenDataSet( - NQ, DIM, xb_data.data() + DIM * query_offset); - } else { + auto dataset = GenDatasetWithDataType(NB, metric_type, vec_field_data_type); + if (is_binary) { + // binary vector xb_bin_data = dataset.get_col(milvus::FieldId(100)); xb_dataset = knowhere::GenDataSet(NB, DIM, xb_bin_data.data()); xq_dataset = knowhere::GenDataSet( NQ, DIM, xb_bin_data.data() + DIM * query_offset); + } else if (is_sparse) { + // sparse vector + xb_sparse_data = + dataset.get_col>( + milvus::FieldId(100)); + xb_dataset = + knowhere::GenDataSet(NB, kTestSparseDim, xb_sparse_data.data()); + xb_dataset->SetIsSparse(true); + xq_dataset = knowhere::GenDataSet( + NQ, kTestSparseDim, xb_sparse_data.data() + query_offset); + xq_dataset->SetIsSparse(true); + } else { + // float vector + xb_data = dataset.get_col(milvus::FieldId(100)); + xb_dataset = knowhere::GenDataSet(NB, DIM, xb_data.data()); + xq_dataset = knowhere::GenDataSet( + NQ, DIM, xb_data.data() + DIM * query_offset); } } @@ -368,7 +370,8 @@ class IndexTest : public ::testing::TestWithParam { protected: std::string index_type, metric_type; - bool is_binary; + bool is_binary = false; + bool is_sparse = false; milvus::Config build_conf; milvus::Config load_conf; milvus::Config search_conf; @@ -377,9 +380,10 @@ class IndexTest : public ::testing::TestWithParam { knowhere::DataSetPtr xb_dataset; FixedVector xb_data; FixedVector xb_bin_data; + FixedVector> xb_sparse_data; knowhere::DataSetPtr xq_dataset; int64_t query_offset = 100; - int64_t NB = 3000; + int64_t NB = 3000; // will be updated to 27000 for mmap+hnsw StorageConfig storage_config_; }; @@ -397,6 +401,9 @@ INSTANTIATE_TEST_SUITE_P( knowhere::metric::JACCARD), std::pair(knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP, knowhere::metric::JACCARD), + std::pair(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, + knowhere::metric::IP), + std::pair(knowhere::IndexEnum::INDEX_SPARSE_WAND, knowhere::metric::IP), #ifdef BUILD_DISK_ANN std::pair(knowhere::IndexEnum::INDEX_DISKANN, knowhere::metric::L2), #endif @@ -506,7 +513,9 @@ TEST_P(IndexTest, BuildAndQuery) { load_conf["index_files"] = index_files; ASSERT_NO_THROW(vec_index->Load(milvus::tracer::TraceContext{}, load_conf)); EXPECT_EQ(vec_index->Count(), NB); - EXPECT_EQ(vec_index->GetDim(), DIM); + if (!is_sparse) { + EXPECT_EQ(vec_index->GetDim(), DIM); + } milvus::SearchInfo search_info; search_info.topk_ = K; @@ -518,11 +527,19 @@ TEST_P(IndexTest, BuildAndQuery) { EXPECT_EQ(result.unity_topK_, K); EXPECT_EQ(result.distances_.size(), NQ * K); EXPECT_EQ(result.seg_offsets_.size(), NQ * K); - if (!is_binary) { - EXPECT_EQ(result.seg_offsets_[0], query_offset); + if (metric_type == knowhere::metric::L2) { + // for L2 metric each vector is closest to itself + for (int i = 0; i < NQ; i++) { + EXPECT_EQ(result.seg_offsets_[i * K], query_offset + i); + } + // for other metrics we can't verify the correctness unless we perform + // brute force search to get the ground truth. + } + if (!is_sparse) { + // sparse doesn't support range search yet + search_info.search_params_ = range_search_conf; + vec_index->Query(xq_dataset, search_info, nullptr, result); } - search_info.search_params_ = range_search_conf; - vec_index->Query(xq_dataset, search_info, nullptr, result); } TEST_P(IndexTest, Mmap) { @@ -623,7 +640,9 @@ TEST_P(IndexTest, GetVector) { } else { vec_index->Load(milvus::tracer::TraceContext{}, load_conf); } - EXPECT_EQ(vec_index->GetDim(), DIM); + if (!is_sparse) { + EXPECT_EQ(vec_index->GetDim(), DIM); + } EXPECT_EQ(vec_index->Count(), NB); if (!vec_index->HasRawData()) { @@ -631,27 +650,38 @@ TEST_P(IndexTest, GetVector) { } auto ids_ds = GenRandomIds(NB); - auto results = vec_index->GetVector(ids_ds); - EXPECT_TRUE(results.size() > 0); - if (!is_binary) { - std::vector result_vectors(results.size() / (sizeof(float))); - memcpy(result_vectors.data(), results.data(), results.size()); - EXPECT_TRUE(result_vectors.size() == xb_data.size()); + if (is_binary) { + auto results = vec_index->GetVector(ids_ds); + EXPECT_EQ(results.size(), xb_bin_data.size()); + const auto data_bytes = DIM / 8; for (size_t i = 0; i < NB; ++i) { auto id = ids_ds->GetIds()[i]; - for (size_t j = 0; j < DIM; ++j) { - EXPECT_TRUE(result_vectors[i * DIM + j] == - xb_data[id * DIM + j]); + for (size_t j = 0; j < data_bytes; ++j) { + ASSERT_EQ(results[i * data_bytes + j], + xb_bin_data[id * data_bytes + j]); + } + } + } else if (is_sparse) { + auto sparse_rows = vec_index->GetSparseVector(ids_ds); + for (size_t i = 0; i < NB; ++i) { + auto id = ids_ds->GetIds()[i]; + auto& row = sparse_rows[i]; + ASSERT_EQ(row.size(), xb_sparse_data[id].size()); + for (size_t j = 0; j < row.size(); ++j) { + ASSERT_EQ(row[j].id, xb_sparse_data[id][j].id); + ASSERT_EQ(row[j].val, xb_sparse_data[id][j].val); } } } else { - EXPECT_TRUE(results.size() == xb_bin_data.size()); - const auto data_bytes = DIM / 8; + auto results = vec_index->GetVector(ids_ds); + std::vector result_vectors(results.size() / (sizeof(float))); + memcpy(result_vectors.data(), results.data(), results.size()); + ASSERT_EQ(result_vectors.size(), xb_data.size()); for (size_t i = 0; i < NB; ++i) { auto id = ids_ds->GetIds()[i]; - for (size_t j = 0; j < data_bytes; ++j) { - EXPECT_TRUE(results[i * data_bytes + j] == - xb_bin_data[id * data_bytes + j]); + for (size_t j = 0; j < DIM; ++j) { + ASSERT_EQ(result_vectors[i * DIM + j], + xb_data[id * DIM + j]); } } } diff --git a/internal/core/unittest/test_offset_ordered_array.cpp b/internal/core/unittest/test_offset_ordered_array.cpp index b69297294e30a..ec371c6114540 100644 --- a/internal/core/unittest/test_offset_ordered_array.cpp +++ b/internal/core/unittest/test_offset_ordered_array.cpp @@ -62,7 +62,7 @@ class TypedOffsetOrderedArrayTest : public testing::Test { }; using TypeOfPks = testing::Types; -TYPED_TEST_CASE_P(TypedOffsetOrderedArrayTest); +TYPED_TEST_SUITE_P(TypedOffsetOrderedArrayTest); TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) { std::vector offsets; @@ -117,5 +117,5 @@ TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) { ASSERT_EQ(0, offsets.size()); } -REGISTER_TYPED_TEST_CASE_P(TypedOffsetOrderedArrayTest, find_first); -INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TypedOffsetOrderedArrayTest, TypeOfPks); +REGISTER_TYPED_TEST_SUITE_P(TypedOffsetOrderedArrayTest, find_first); +INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TypedOffsetOrderedArrayTest, TypeOfPks); diff --git a/internal/core/unittest/test_offset_ordered_map.cpp b/internal/core/unittest/test_offset_ordered_map.cpp index aa40c7de408f3..be16aed9e0eed 100644 --- a/internal/core/unittest/test_offset_ordered_map.cpp +++ b/internal/core/unittest/test_offset_ordered_map.cpp @@ -57,7 +57,7 @@ class TypedOffsetOrderedMapTest : public testing::Test { }; using TypeOfPks = testing::Types; -TYPED_TEST_CASE_P(TypedOffsetOrderedMapTest); +TYPED_TEST_SUITE_P(TypedOffsetOrderedMapTest); TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) { std::vector offsets; @@ -110,5 +110,5 @@ TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) { ASSERT_EQ(0, offsets.size()); } -REGISTER_TYPED_TEST_CASE_P(TypedOffsetOrderedMapTest, find_first); -INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TypedOffsetOrderedMapTest, TypeOfPks); +REGISTER_TYPED_TEST_SUITE_P(TypedOffsetOrderedMapTest, find_first); +INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TypedOffsetOrderedMapTest, TypeOfPks); diff --git a/internal/core/unittest/test_retrieve.cpp b/internal/core/unittest/test_retrieve.cpp index 0139d2e7c1cc1..f63c474c5eecf 100644 --- a/internal/core/unittest/test_retrieve.cpp +++ b/internal/core/unittest/test_retrieve.cpp @@ -29,12 +29,34 @@ RetrieveUsingDefaultOutputSize(SegmentInterface* segment, return segment->Retrieve(plan, timestamp, DEFAULT_MAX_OUTPUT_SIZE); } -TEST(Retrieve, AutoID) { +using Param = DataType; +class RetrieveTest : public ::testing::TestWithParam { + public: + void + SetUp() override { + data_type = GetParam(); + metric_type = datatype_is_sparse_vector(data_type) + ? knowhere::metric::IP + : knowhere::metric::L2; + is_sparse = datatype_is_sparse_vector(data_type); + } + + DataType data_type; + knowhere::MetricType metric_type; + bool is_sparse = false; +}; + +INSTANTIATE_TEST_SUITE_P(RetrieveTest, + RetrieveTest, + ::testing::Values(DataType::VECTOR_FLOAT, + DataType::VECTOR_SPARSE_FLOAT)); + +TEST_P(RetrieveTest, AutoID) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; - auto fid_vec = schema->AddDebugField( - "vector_64", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + auto fid_vec = + schema->AddDebugField("vector_64", data_type, DIM, metric_type); schema->set_primary_field_id(fid_64); int64_t N = 100; @@ -48,12 +70,10 @@ TEST(Retrieve, AutoID) { auto plan = std::make_unique(*schema); std::vector values; - { - for (int i = 0; i < req_size; ++i) { - proto::plan::GenericValue val; - val.set_int64_val(i64_col[choose(i)]); - values.push_back(val); - } + for (int i = 0; i < req_size; ++i) { + proto::plan::GenericValue val; + val.set_int64_val(i64_col[choose(i)]); + values.push_back(val); } auto term_expr = std::make_shared( milvus::expr::ColumnInfo( @@ -72,11 +92,6 @@ TEST(Retrieve, AutoID) { Assert(field0.has_scalars()); auto field0_data = field0.scalars().long_data(); - for (int i = 0; i < req_size; ++i) { - auto index = choose(i); - auto data = field0_data.data(i); - } - for (int i = 0; i < req_size; ++i) { auto index = choose(i); auto data = field0_data.data(i); @@ -85,16 +100,21 @@ TEST(Retrieve, AutoID) { auto field1 = retrieve_results->fields_data(1); Assert(field1.has_vectors()); - auto field1_data = field1.vectors().float_vector(); - ASSERT_EQ(field1_data.data_size(), DIM * req_size); + if (!is_sparse) { + auto field1_data = field1.vectors().float_vector(); + ASSERT_EQ(field1_data.data_size(), DIM * req_size); + } else { + auto field1_data = field1.vectors().sparse_float_vector(); + ASSERT_EQ(field1_data.contents_size(), req_size); + } } -TEST(Retrieve, AutoID2) { +TEST_P(RetrieveTest, AutoID2) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; - auto fid_vec = schema->AddDebugField( - "vector_64", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + auto fid_vec = + schema->AddDebugField("vector_64", data_type, DIM, metric_type); schema->set_primary_field_id(fid_64); int64_t N = 100; @@ -140,16 +160,21 @@ TEST(Retrieve, AutoID2) { auto field1 = retrieve_results->fields_data(1); Assert(field1.has_vectors()); - auto field1_data = field1.vectors().float_vector(); - ASSERT_EQ(field1_data.data_size(), DIM * req_size); + if (!is_sparse) { + auto field1_data = field1.vectors().float_vector(); + ASSERT_EQ(field1_data.data_size(), DIM * req_size); + } else { + auto field1_data = field1.vectors().sparse_float_vector(); + ASSERT_EQ(field1_data.contents_size(), req_size); + } } -TEST(Retrieve, NotExist) { +TEST_P(RetrieveTest, NotExist) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; - auto fid_vec = schema->AddDebugField( - "vector_64", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + auto fid_vec = + schema->AddDebugField("vector_64", data_type, DIM, metric_type); schema->set_primary_field_id(fid_64); int64_t N = 100; @@ -200,16 +225,21 @@ TEST(Retrieve, NotExist) { auto field1 = retrieve_results->fields_data(1); Assert(field1.has_vectors()); - auto field1_data = field1.vectors().float_vector(); - ASSERT_EQ(field1_data.data_size(), DIM * req_size); + if (!is_sparse) { + auto field1_data = field1.vectors().float_vector(); + ASSERT_EQ(field1_data.data_size(), DIM * req_size); + } else { + auto field1_data = field1.vectors().sparse_float_vector(); + ASSERT_EQ(field1_data.contents_size(), req_size); + } } -TEST(Retrieve, Empty) { +TEST_P(RetrieveTest, Empty) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; auto fid_vec = schema->AddDebugField( - "vector_64", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + "vector_64", data_type, DIM, metric_type); schema->set_primary_field_id(fid_64); int64_t N = 100; @@ -246,15 +276,19 @@ TEST(Retrieve, Empty) { Assert(field0.has_scalars()); auto field0_data = field0.scalars().long_data(); Assert(field0_data.data_size() == 0); - Assert(field1.vectors().float_vector().data_size() == 0); + if (!is_sparse) { + ASSERT_EQ(field1.vectors().float_vector().data_size(), 0); + } else { + ASSERT_EQ(field1.vectors().sparse_float_vector().contents_size(), 0); + } } -TEST(Retrieve, Limit) { +TEST_P(RetrieveTest, Limit) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; - auto fid_vec = schema->AddDebugField( - "vector_64", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + auto fid_vec = + schema->AddDebugField("vector_64", data_type, DIM, metric_type); schema->set_primary_field_id(fid_64); int64_t N = 101; @@ -285,18 +319,22 @@ TEST(Retrieve, Limit) { auto field0 = retrieve_results->fields_data(0); auto field2 = retrieve_results->fields_data(2); Assert(field0.scalars().long_data().data_size() == N); - Assert(field2.vectors().float_vector().data_size() == N * DIM); + if (!is_sparse) { + Assert(field2.vectors().float_vector().data_size() == N * DIM); + } else { + Assert(field2.vectors().sparse_float_vector().contents_size() == N); + } } -TEST(Retrieve, FillEntry) { +TEST_P(RetrieveTest, FillEntry) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; auto fid_bool = schema->AddDebugField("bool", DataType::BOOL); auto fid_f32 = schema->AddDebugField("f32", DataType::FLOAT); auto fid_f64 = schema->AddDebugField("f64", DataType::DOUBLE); - auto fid_vec32 = schema->AddDebugField( - "vector_32", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + auto fid_vec = + schema->AddDebugField("vector", data_type, DIM, knowhere::metric::L2); auto fid_vecbin = schema->AddDebugField( "vec_bin", DataType::VECTOR_BINARY, DIM, knowhere::metric::L2); schema->set_primary_field_id(fid_64); @@ -323,7 +361,7 @@ TEST(Retrieve, FillEntry) { fid_bool, fid_f32, fid_f64, - fid_vec32, + fid_vec, fid_vecbin}; plan->field_ids_ = target_fields; EXPECT_THROW(segment->Retrieve(plan.get(), N, 1), std::runtime_error); @@ -333,12 +371,12 @@ TEST(Retrieve, FillEntry) { Assert(retrieve_results->fields_data_size() == target_fields.size()); } -TEST(Retrieve, LargeTimestamp) { +TEST_P(RetrieveTest, LargeTimestamp) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; - auto fid_vec = schema->AddDebugField( - "vector_64", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + auto fid_vec = + schema->AddDebugField("vector_64", data_type, DIM, metric_type); schema->set_primary_field_id(fid_64); int64_t N = 100; @@ -392,16 +430,21 @@ TEST(Retrieve, LargeTimestamp) { Assert(field_data.vectors().float_vector().data_size() == target_num * DIM); } + if (DataType(field_data.type()) == DataType::VECTOR_SPARSE_FLOAT) { + Assert(field_data.vectors() + .sparse_float_vector() + .contents_size() == target_num); + } } } } -TEST(Retrieve, Delete) { +TEST_P(RetrieveTest, Delete) { auto schema = std::make_shared(); auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto DIM = 16; - auto fid_vec = schema->AddDebugField( - "vector_64", DataType::VECTOR_FLOAT, DIM, knowhere::metric::L2); + auto fid_vec = + schema->AddDebugField("vector_64", data_type, DIM, metric_type); schema->set_primary_field_id(fid_64); auto fid_ts = schema->AddDebugField("Timestamp", DataType::INT64); @@ -465,8 +508,13 @@ TEST(Retrieve, Delete) { auto field2 = retrieve_results->fields_data(2); Assert(field2.has_vectors()); - auto field2_data = field2.vectors().float_vector(); - ASSERT_EQ(field2_data.data_size(), DIM * req_size); + if (!is_sparse) { + auto field2_data = field2.vectors().float_vector(); + ASSERT_EQ(field2_data.data_size(), DIM * req_size); + } else { + auto field2_data = field2.vectors().sparse_float_vector(); + ASSERT_EQ(field2_data.contents_size(), req_size); + } } int64_t row_count = 0; @@ -512,7 +560,12 @@ TEST(Retrieve, Delete) { auto field2 = retrieve_results->fields_data(2); Assert(field2.has_vectors()); - auto field2_data = field2.vectors().float_vector(); - ASSERT_EQ(field2_data.data_size(), DIM * size); + if (!is_sparse) { + auto field2_data = field2.vectors().float_vector(); + ASSERT_EQ(field2_data.data_size(), DIM * size); + } else { + auto field2_data = field2.vectors().sparse_float_vector(); + ASSERT_EQ(field2_data.contents_size(), size); + } } } diff --git a/internal/core/unittest/test_scalar_index.cpp b/internal/core/unittest/test_scalar_index.cpp index f1a52ef2d56ab..30311c7e2a25e 100644 --- a/internal/core/unittest/test_scalar_index.cpp +++ b/internal/core/unittest/test_scalar_index.cpp @@ -41,7 +41,7 @@ class TypedScalarIndexTest : public ::testing::Test { // } }; -TYPED_TEST_CASE_P(TypedScalarIndexTest); +TYPED_TEST_SUITE_P(TypedScalarIndexTest); TYPED_TEST_P(TypedScalarIndexTest, Dummy) { using T = TypeParam; @@ -213,7 +213,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) { using ScalarT = ::testing::Types; -REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, +REGISTER_TYPED_TEST_SUITE_P(TypedScalarIndexTest, Dummy, Constructor, Count, @@ -224,7 +224,7 @@ REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, Reverse, HasRawData); -INSTANTIATE_TYPED_TEST_CASE_P(ArithmeticCheck, TypedScalarIndexTest, ScalarT); +INSTANTIATE_TYPED_TEST_SUITE_P(ArithmeticCheck, TypedScalarIndexTest, ScalarT); template class TypedScalarIndexTestV2 : public ::testing::Test { @@ -344,7 +344,7 @@ struct TypedScalarIndexTestV2::Helper { using C = arrow::DoubleType; }; -TYPED_TEST_CASE_P(TypedScalarIndexTestV2); +TYPED_TEST_SUITE_P(TypedScalarIndexTestV2); TYPED_TEST_P(TypedScalarIndexTestV2, Base) { using T = TypeParam; @@ -386,6 +386,6 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) { } } -REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTestV2, Base); +REGISTER_TYPED_TEST_SUITE_P(TypedScalarIndexTestV2, Base); -INSTANTIATE_TYPED_TEST_CASE_P(ArithmeticCheck, TypedScalarIndexTestV2, ScalarT); +INSTANTIATE_TYPED_TEST_SUITE_P(ArithmeticCheck, TypedScalarIndexTestV2, ScalarT); diff --git a/internal/core/unittest/test_scalar_index_creator.cpp b/internal/core/unittest/test_scalar_index_creator.cpp index f766203d14d7b..14a1ac969e516 100644 --- a/internal/core/unittest/test_scalar_index_creator.cpp +++ b/internal/core/unittest/test_scalar_index_creator.cpp @@ -86,7 +86,7 @@ class TypedScalarIndexCreatorTest : public ::testing::Test { using ScalarT = ::testing:: Types; -TYPED_TEST_CASE_P(TypedScalarIndexCreatorTest); +TYPED_TEST_SUITE_P(TypedScalarIndexCreatorTest); TYPED_TEST_P(TypedScalarIndexCreatorTest, Dummy) { using T = TypeParam; @@ -149,11 +149,11 @@ TYPED_TEST_P(TypedScalarIndexCreatorTest, Codec) { } } -REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexCreatorTest, +REGISTER_TYPED_TEST_SUITE_P(TypedScalarIndexCreatorTest, Dummy, Constructor, Codec); -INSTANTIATE_TYPED_TEST_CASE_P(ArithmeticCheck, +INSTANTIATE_TYPED_TEST_SUITE_P(ArithmeticCheck, TypedScalarIndexCreatorTest, ScalarT); diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index f8f42d5a15c4b..a517d5090d5d3 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -34,6 +34,13 @@ using milvus::segcore::LoadIndexInfo; const int64_t ROW_COUNT = 10 * 1000; const int64_t BIAS = 4200; +using Param = std::string; +class SealedTest : public ::testing::TestWithParam { + public: + void SetUp() override { + } +}; + TEST(Sealed, without_predicate) { auto schema = std::make_shared(); auto dim = 16; diff --git a/internal/core/unittest/test_utils/Constants.h b/internal/core/unittest/test_utils/Constants.h index dfeae7b77f89c..3e8858da7dc5d 100644 --- a/internal/core/unittest/test_utils/Constants.h +++ b/internal/core/unittest/test_utils/Constants.h @@ -14,5 +14,5 @@ constexpr int64_t TestChunkSize = 32 * 1024; constexpr char TestLocalPath[] = "/tmp/milvus/local_data/"; constexpr char TestRemotePath[] = "/tmp/milvus/remote_data"; -constexpr int64_t kTestSparseDim = 10000; -constexpr float kTestSparseVectorDensity = 0.0003; +constexpr int64_t kTestSparseDim = 1000; +constexpr float kTestSparseVectorDensity = 0.003; diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h index a72ef198d3df2..ac58ad362606f 100644 --- a/internal/core/unittest/test_utils/DataGen.h +++ b/internal/core/unittest/test_utils/DataGen.h @@ -27,7 +27,6 @@ #include "index/ScalarIndexSort.h" #include "index/StringIndexSort.h" #include "index/VectorMemIndex.h" -#include "query/SearchOnIndex.h" #include "segcore/Collection.h" #include "segcore/SegmentGrowingImpl.h" #include "segcore/SegmentSealedImpl.h" @@ -246,7 +245,10 @@ struct GeneratedData { }; inline std::unique_ptr[]> -GenerateRandomSparseFloatVector(size_t rows, size_t cols, float density, int seed = 42) { +GenerateRandomSparseFloatVector(size_t rows, + size_t cols = kTestSparseDim, + float density = kTestSparseVectorDensity, + int seed = 42) { int32_t num_elements = static_cast(rows * cols * density); std::mt19937 rng(seed); @@ -1143,6 +1145,23 @@ translate_text_plan_to_binary_plan(const char* text_plan) { return ret; } +// we have lots of tests with literal string plan with hard coded metric type, +// so creating a helper function to replace metric type for different metrics. +inline std::vector +replace_metric_and_translate_text_plan_to_binary_plan( + std::string plan, knowhere::MetricType metric_type) { + if (metric_type != knowhere::metric::L2) { + std::string replace = R"(metric_type: "L2")"; + std::string target = "metric_type: \"" + metric_type + "\""; + size_t pos = 0; + while ((pos = plan.find(replace, pos)) != std::string::npos) { + plan.replace(pos, replace.length(), target); + pos += target.length(); + } + } + return translate_text_plan_to_binary_plan(plan.c_str()); +} + inline auto GenTss(int64_t num, int64_t begin_ts) { std::vector tss(num, 0); diff --git a/internal/core/unittest/test_utils/indexbuilder_test_utils.h b/internal/core/unittest/test_utils/indexbuilder_test_utils.h index fc1f3b67fc141..dd2b08cf7ad8b 100644 --- a/internal/core/unittest/test_utils/indexbuilder_test_utils.h +++ b/internal/core/unittest/test_utils/indexbuilder_test_utils.h @@ -102,6 +102,7 @@ generate_build_conf(const milvus::IndexType& index_type, index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) { return knowhere::Json{ {knowhere::meta::METRIC_TYPE, metric_type}, + {knowhere::indexparam::DROP_RATIO_BUILD, "0.1"}, }; } return knowhere::Json(); diff --git a/internal/proto/plan.proto b/internal/proto/plan.proto index 3a15ba1b9cec5..ca19f76ba2a1b 100644 --- a/internal/proto/plan.proto +++ b/internal/proto/plan.proto @@ -35,6 +35,7 @@ enum VectorType { FloatVector = 1; Float16Vector = 2; BFloat16Vector = 3; + SparseFloatVector = 4; }; message GenericValue {