diff --git a/core/src/codecs/snapshot/SSAttrsFormat.cpp b/core/src/codecs/snapshot/SSAttrsFormat.cpp deleted file mode 100644 index 6d4910327edba..0000000000000 --- a/core/src/codecs/snapshot/SSAttrsFormat.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "codecs/snapshot/SSAttrsFormat.h" - -#include -#include -#include -#include -#include - -#include - -#include "utils/Exception.h" -#include "utils/Log.h" -#include "utils/TimeRecorder.h" - -namespace milvus { -namespace codec { - -void -SSAttrsFormat::read_attrs_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, off_t offset, - size_t num, std::vector& raw_attrs, size_t& nbytes) { - auto open_res = fs_ptr->reader_ptr_->open(file_path.c_str()); - fiu_do_on("read_attrs_internal_open_file_fail", open_res = false); - if (!open_res) { - std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg); - } - - fs_ptr->reader_ptr_->read(&nbytes, sizeof(size_t)); - - num = std::min(num, nbytes - offset); - - offset += sizeof(size_t); - fs_ptr->reader_ptr_->seekg(offset); - - raw_attrs.resize(num / sizeof(uint8_t)); - fs_ptr->reader_ptr_->read(raw_attrs.data(), num); - - fs_ptr->reader_ptr_->close(); -} - -void -SSAttrsFormat::read_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - std::vector& uids) { - auto open_res = fs_ptr->reader_ptr_->open(file_path.c_str()); - fiu_do_on("read_uids_internal_open_file_fail", open_res = false); - if (!open_res) { - std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg); - } - - size_t num_bytes; - fs_ptr->reader_ptr_->read(&num_bytes, sizeof(size_t)); - - uids.resize(num_bytes / sizeof(int64_t)); - fs_ptr->reader_ptr_->read(uids.data(), num_bytes); - - fs_ptr->reader_ptr_->read(uids.data(), num_bytes); -} - -void -SSAttrsFormat::read(const milvus::storage::FSHandlerPtr& fs_ptr, milvus::segment::AttrsPtr& attrs_read) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - auto is_directory = boost::filesystem::is_directory(dir_path); - fiu_do_on("read_id_directory_false", is_directory = false); - if (!is_directory) { - std::string err_msg = "Directory: " + dir_path + "does not exist"; - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_INVALID_ARGUMENT, err_msg); - } - - boost::filesystem::path target_path(dir_path); - typedef boost::filesystem::directory_iterator d_it; - d_it it_end; - d_it uid_it(target_path); - std::vector uids; - for (; uid_it != it_end; ++uid_it) { - const auto& path = uid_it->path(); - if (path.extension().string() == user_id_extension_) { - read_uids_internal(fs_ptr, path.string(), uids); - break; - } - } - - d_it it(target_path); - for (; it != it_end; ++it) { - const auto& path = it->path(); - if (path.extension().string() == raw_attr_extension_) { - auto file_name = path.filename().string(); - auto field_name = file_name.substr(0, file_name.size() - 3); - std::vector attr_list; - size_t nbytes; - read_attrs_internal(fs_ptr, path.string(), 0, INT64_MAX, attr_list, nbytes); - milvus::segment::AttrPtr attr = - std::make_shared(attr_list, nbytes, uids, field_name); - attrs_read->attrs.insert(std::pair(field_name, attr)); - } - } -} - -void -SSAttrsFormat::write(const milvus::storage::FSHandlerPtr& fs_ptr, const milvus::segment::AttrsPtr& attrs_ptr) { - TimeRecorder rc("write attributes"); - - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - - auto it = attrs_ptr->attrs.begin(); - if (it == attrs_ptr->attrs.end()) { - // std::string err_msg = "Attributes is null"; - // LOG_ENGINE_ERROR_ << err_msg; - return; - } - -#if 0 - const std::string uid_file_path = dir_path + "/" + it->second->GetCollectionId() + user_id_extension_; - - int uid_fd = open(uid_file_path.c_str(), O_WRONLY | O_TRUNC | O_CREAT, 00664); - if (uid_fd == -1) { - std::string err_msg = "Failed to open file: " + uid_file_path + ", error: " + std::strerror(errno); - ENGINE_LOG_ERROR << err_msg; - throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg); - } - size_t uid_num_bytes = it->second->GetUids().size() * sizeof(int64_t); - if (::write(uid_fd, &uid_num_bytes, sizeof(size_t)) == -1) { - std::string err_msg = "Failed to write to file" + uid_file_path + ", error: " + std::strerror(errno); - ENGINE_LOG_ERROR << err_msg; - throw Exception(SERVER_WRITE_ERROR, err_msg); - } - if (::write(uid_fd, it->second->GetUids().data(), uid_num_bytes) == -1) { - std::string err_msg = "Failed to write to file" + uid_file_path + ", error: " + std::strerror(errno); - ENGINE_LOG_ERROR << err_msg; - throw Exception(SERVER_WRITE_ERROR, err_msg); - } - if (::close(uid_fd) == -1) { - std::string err_msg = "Failed to close file: " + uid_file_path + ", error: " + std::strerror(errno); - ENGINE_LOG_ERROR << err_msg; - throw Exception(SERVER_WRITE_ERROR, err_msg); - } - rc.RecordSection("write uids done"); -#endif - - for (; it != attrs_ptr->attrs.end(); it++) { - const std::string ra_file_path = dir_path + "/" + it->second->GetName() + raw_attr_extension_; - - int ra_fd = open(ra_file_path.c_str(), O_WRONLY | O_TRUNC | O_CREAT, 00664); - if (ra_fd == -1) { - std::string err_msg = "Failed to open file: " + ra_file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg); - } - - size_t ra_num_bytes = it->second->GetNbytes(); - if (::write(ra_fd, &ra_num_bytes, sizeof(size_t)) == -1) { - std::string err_msg = "Failed to write to file: " + ra_file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_WRITE_ERROR, err_msg); - } - if (::write(ra_fd, it->second->GetData().data(), ra_num_bytes) == -1) { - std::string err_msg = "Failed to write to file: " + ra_file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_WRITE_ERROR, err_msg); - } - if (::close(ra_fd) == -1) { - std::string err_msg = "Failed to close file: " + ra_file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_WRITE_ERROR, err_msg); - } - - rc.RecordSection("write rv done"); - } -} - -void -SSAttrsFormat::read_attrs(const milvus::storage::FSHandlerPtr& fs_ptr, const std::string& field_name, off_t offset, - size_t num_bytes, std::vector& raw_attrs) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - if (!boost::filesystem::is_directory(dir_path)) { - std::string err_msg = "Directory: " + dir_path + "does not exist"; - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_INVALID_ARGUMENT, err_msg); - } - - boost::filesystem::path target_path(dir_path); - typedef boost::filesystem::directory_iterator d_it; - d_it it_end; - d_it it(target_path); - - for (; it != it_end; ++it) { - const auto& path = it->path(); - std::string file_name = path.filename().string(); - if (path.extension().string() == raw_attr_extension_ && - file_name.substr(0, file_name.size() - 3) == field_name) { - size_t nbytes; - read_attrs_internal(fs_ptr, path.string(), offset, num_bytes, raw_attrs, nbytes); - } - } -} - -void -SSAttrsFormat::read_uids(const milvus::storage::FSHandlerPtr& fs_ptr, std::vector& uids) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - auto is_directory = boost::filesystem::is_directory(dir_path); - fiu_do_on("is_directory_false", is_directory = false); - if (!is_directory) { - std::string err_msg = "Directory: " + dir_path + "does not exist"; - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_INVALID_ARGUMENT, err_msg); - } - - boost::filesystem::path target_path(dir_path); - typedef boost::filesystem::directory_iterator d_it; - d_it it_end; - d_it it(target_path); - // for (auto& it : boost::filesystem::directory_iterator(dir_path)) { - for (; it != it_end; ++it) { - const auto& path = it->path(); - if (path.extension().string() == user_id_extension_) { - read_uids_internal(fs_ptr, path.string(), uids); - } - } -} - -} // namespace codec -} // namespace milvus diff --git a/core/src/codecs/snapshot/SSAttrsFormat.h b/core/src/codecs/snapshot/SSAttrsFormat.h deleted file mode 100644 index 2b53ddb7de80f..0000000000000 --- a/core/src/codecs/snapshot/SSAttrsFormat.h +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "segment/Attrs.h" -#include "storage/FSHandler.h" - -namespace milvus { -namespace codec { - -class SSAttrsFormat { - public: - SSAttrsFormat() = default; - - void - read(const storage::FSHandlerPtr& fs_ptr, segment::AttrsPtr& attrs_read); - - void - write(const storage::FSHandlerPtr& fs_ptr, const segment::AttrsPtr& attr); - - void - read_attrs(const storage::FSHandlerPtr& fs_ptr, const std::string& field_name, off_t offset, size_t num_bytes, - std::vector& raw_attrs); - - void - read_uids(const storage::FSHandlerPtr& fs_ptr, std::vector& uids); - - // No copy and move - SSAttrsFormat(const SSAttrsFormat&) = delete; - SSAttrsFormat(SSAttrsFormat&&) = delete; - - SSAttrsFormat& - operator=(const SSAttrsFormat&) = delete; - SSAttrsFormat& - operator=(SSAttrsFormat&&) = delete; - - private: - void - read_attrs_internal(const storage::FSHandlerPtr& fs_ptr, const std::string&, off_t, size_t, std::vector&, - size_t&); - - void - read_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string&, std::vector&); - - private: - const std::string raw_attr_extension_ = ".ra"; - const std::string user_id_extension_ = ".uid"; -}; - -using SSAttrsFormatPtr = std::shared_ptr; - -} // namespace codec -} // namespace milvus diff --git a/core/src/codecs/snapshot/SSBlockFormat.cpp b/core/src/codecs/snapshot/SSBlockFormat.cpp index 743cd1c3b2d5f..5dde918bf1b79 100644 --- a/core/src/codecs/snapshot/SSBlockFormat.cpp +++ b/core/src/codecs/snapshot/SSBlockFormat.cpp @@ -48,6 +48,78 @@ SSBlockFormat::read(const storage::FSHandlerPtr& fs_ptr, const std::string& file fs_ptr->reader_ptr_->close(); } +void +SSBlockFormat::read(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, int64_t offset, + int64_t num_bytes, std::vector& raw) { + if (offset < 0 || num_bytes <= 0) { + std::string err_msg = "Invalid input to read: " + file_path; + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_INVALID_ARGUMENT, err_msg); + } + + if (!fs_ptr->reader_ptr_->open(file_path.c_str())) { + std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); + } + + size_t total_num_bytes; + fs_ptr->reader_ptr_->read(&total_num_bytes, sizeof(size_t)); + + offset += sizeof(size_t); // Beginning of file is num_bytes + if (offset + num_bytes > total_num_bytes) { + std::string err_msg = "Invalid input to read: " + file_path; + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_INVALID_ARGUMENT, err_msg); + } + + raw.resize(num_bytes); + fs_ptr->reader_ptr_->seekg(offset); + fs_ptr->reader_ptr_->read(raw.data(), num_bytes); + fs_ptr->reader_ptr_->close(); +} + +void +SSBlockFormat::read(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, const ReadRanges& read_ranges, + std::vector& raw) { + if (read_ranges.empty()) { + return; + } + + if (!fs_ptr->reader_ptr_->open(file_path.c_str())) { + std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); + } + + size_t total_num_bytes; + fs_ptr->reader_ptr_->read(&total_num_bytes, sizeof(size_t)); + + int64_t total_bytes = 0; + for (auto& range : read_ranges) { + int64_t offset = range.offset_ + sizeof(size_t); + if (offset + range.num_bytes_ > total_num_bytes) { + std::string err_msg = "Invalid input to read: " + file_path; + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_INVALID_ARGUMENT, err_msg); + } + + total_bytes += range.num_bytes_; + } + + raw.clear(); + raw.resize(total_bytes); + int64_t poz = 0; + for (auto& range : read_ranges) { + int64_t offset = range.offset_ + sizeof(size_t); + fs_ptr->reader_ptr_->seekg(offset); + fs_ptr->reader_ptr_->read(raw.data() + poz, range.num_bytes_); + poz += range.num_bytes_; + } + + fs_ptr->reader_ptr_->close(); +} + void SSBlockFormat::write(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, const std::vector& raw) { diff --git a/core/src/codecs/snapshot/SSBlockFormat.h b/core/src/codecs/snapshot/SSBlockFormat.h index 5efe490974b4c..24bedbae1f267 100644 --- a/core/src/codecs/snapshot/SSBlockFormat.h +++ b/core/src/codecs/snapshot/SSBlockFormat.h @@ -27,6 +27,15 @@ namespace milvus { namespace codec { +struct ReadRange { + ReadRange(int64_t offset, int64_t num_bytes) : offset_(offset), num_bytes_(num_bytes) { + } + int64_t offset_; + int64_t num_bytes_; +}; + +using ReadRanges = std::vector; + class SSBlockFormat { public: SSBlockFormat() = default; @@ -34,6 +43,14 @@ class SSBlockFormat { void read(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, std::vector& raw); + void + read(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, int64_t offset, int64_t num_bytes, + std::vector& raw); + + void + read(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, const ReadRanges& read_ranges, + std::vector& raw); + void write(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, const std::vector& raw); diff --git a/core/src/codecs/snapshot/SSCodec.cpp b/core/src/codecs/snapshot/SSCodec.cpp index ca7202ffce64e..a0cf7ba69e8e0 100644 --- a/core/src/codecs/snapshot/SSCodec.cpp +++ b/core/src/codecs/snapshot/SSCodec.cpp @@ -19,12 +19,10 @@ #include -#include "SSAttrsFormat.h" -#include "SSAttrsIndexFormat.h" #include "SSDeletedDocsFormat.h" #include "SSIdBloomFilterFormat.h" +#include "SSStructuredIndexFormat.h" #include "SSVectorIndexFormat.h" -#include "SSVectorsFormat.h" namespace milvus { namespace codec { @@ -37,10 +35,8 @@ SSCodec::instance() { SSCodec::SSCodec() { block_format_ptr_ = std::make_shared(); - vectors_format_ptr_ = std::make_shared(); - attrs_format_ptr_ = std::make_shared(); + structured_index_format_ptr_ = std::make_shared(); vector_index_format_ptr_ = std::make_shared(); - attrs_index_format_ptr_ = std::make_shared(); deleted_docs_format_ptr_ = std::make_shared(); id_bloom_filter_format_ptr_ = std::make_shared(); vector_compress_format_ptr_ = std::make_shared(); @@ -51,24 +47,14 @@ SSCodec::GetBlockFormat() { return block_format_ptr_; } -SSVectorsFormatPtr -SSCodec::GetVectorsFormat() { - return vectors_format_ptr_; -} - -SSAttrsFormatPtr -SSCodec::GetAttrsFormat() { - return attrs_format_ptr_; -} - SSVectorIndexFormatPtr SSCodec::GetVectorIndexFormat() { return vector_index_format_ptr_; } -SSAttrsIndexFormatPtr -SSCodec::GetAttrsIndexFormat() { - return attrs_index_format_ptr_; +SSStructuredIndexFormatPtr +SSCodec::GetStructuredIndexFormat() { + return structured_index_format_ptr_; } SSDeletedDocsFormatPtr diff --git a/core/src/codecs/snapshot/SSCodec.h b/core/src/codecs/snapshot/SSCodec.h index 85c3487e5d639..b5ee9cab729c8 100644 --- a/core/src/codecs/snapshot/SSCodec.h +++ b/core/src/codecs/snapshot/SSCodec.h @@ -17,14 +17,12 @@ #pragma once -#include "codecs/snapshot/SSAttrsFormat.h" -#include "codecs/snapshot/SSAttrsIndexFormat.h" #include "codecs/snapshot/SSBlockFormat.h" #include "codecs/snapshot/SSDeletedDocsFormat.h" #include "codecs/snapshot/SSIdBloomFilterFormat.h" +#include "codecs/snapshot/SSStructuredIndexFormat.h" #include "codecs/snapshot/SSVectorCompressFormat.h" #include "codecs/snapshot/SSVectorIndexFormat.h" -#include "codecs/snapshot/SSVectorsFormat.h" namespace milvus { namespace codec { @@ -37,17 +35,11 @@ class SSCodec { SSBlockFormatPtr GetBlockFormat(); - SSVectorsFormatPtr - GetVectorsFormat(); - - SSAttrsFormatPtr - GetAttrsFormat(); - SSVectorIndexFormatPtr GetVectorIndexFormat(); - SSAttrsIndexFormatPtr - GetAttrsIndexFormat(); + SSStructuredIndexFormatPtr + GetStructuredIndexFormat(); SSDeletedDocsFormatPtr GetDeletedDocsFormat(); @@ -63,10 +55,8 @@ class SSCodec { private: SSBlockFormatPtr block_format_ptr_; - SSVectorsFormatPtr vectors_format_ptr_; - SSAttrsFormatPtr attrs_format_ptr_; + SSStructuredIndexFormatPtr structured_index_format_ptr_; SSVectorIndexFormatPtr vector_index_format_ptr_; - SSAttrsIndexFormatPtr attrs_index_format_ptr_; SSDeletedDocsFormatPtr deleted_docs_format_ptr_; SSIdBloomFilterFormatPtr id_bloom_filter_format_ptr_; SSVectorCompressFormatPtr vector_compress_format_ptr_; diff --git a/core/src/codecs/snapshot/SSIdBloomFilterFormat.cpp b/core/src/codecs/snapshot/SSIdBloomFilterFormat.cpp index 973b0c70a60dc..82f54078a8f75 100644 --- a/core/src/codecs/snapshot/SSIdBloomFilterFormat.cpp +++ b/core/src/codecs/snapshot/SSIdBloomFilterFormat.cpp @@ -59,9 +59,9 @@ SSIdBloomFilterFormat::write(const storage::FSHandlerPtr& fs_ptr, const std::str } void -SSIdBloomFilterFormat::create(const storage::FSHandlerPtr& fs_ptr, segment::IdBloomFilterPtr& id_bloom_filter_ptr) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - const std::string bloom_filter_file_path = dir_path + "/" + bloom_filter_filename_; +SSIdBloomFilterFormat::create(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, + segment::IdBloomFilterPtr& id_bloom_filter_ptr) { + const std::string bloom_filter_file_path = file_path; scaling_bloom_t* bloom_filter = new_scaling_bloom(bloom_filter_capacity, bloom_filter_error_rate, bloom_filter_file_path.c_str()); if (bloom_filter == nullptr) { diff --git a/core/src/codecs/snapshot/SSIdBloomFilterFormat.h b/core/src/codecs/snapshot/SSIdBloomFilterFormat.h index 036ae2009b643..18e8dac72aaff 100644 --- a/core/src/codecs/snapshot/SSIdBloomFilterFormat.h +++ b/core/src/codecs/snapshot/SSIdBloomFilterFormat.h @@ -39,7 +39,8 @@ class SSIdBloomFilterFormat { const segment::IdBloomFilterPtr& id_bloom_filter_ptr); void - create(const storage::FSHandlerPtr& fs_ptr, segment::IdBloomFilterPtr& id_bloom_filter_ptr); + create(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, + segment::IdBloomFilterPtr& id_bloom_filter_ptr); // No copy and move SSIdBloomFilterFormat(const SSIdBloomFilterFormat&) = delete; diff --git a/core/src/codecs/snapshot/SSAttrsIndexFormat.cpp b/core/src/codecs/snapshot/SSStructuredIndexFormat.cpp similarity index 91% rename from core/src/codecs/snapshot/SSAttrsIndexFormat.cpp rename to core/src/codecs/snapshot/SSStructuredIndexFormat.cpp index d9511af791ee7..29a5e6ed63de9 100644 --- a/core/src/codecs/snapshot/SSAttrsIndexFormat.cpp +++ b/core/src/codecs/snapshot/SSStructuredIndexFormat.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "codecs/snapshot/SSAttrsIndexFormat.h" +#include "codecs/snapshot/SSStructuredIndexFormat.h" #include #include @@ -35,7 +35,7 @@ namespace milvus { namespace codec { knowhere::IndexPtr -SSAttrsIndexFormat::create_structured_index(const milvus::engine::meta::hybrid::DataType data_type) { +SSStructuredIndexFormat::create_structured_index(const milvus::engine::meta::hybrid::DataType data_type) { knowhere::IndexPtr index = nullptr; switch (data_type) { case engine::meta::hybrid::DataType::INT8: { @@ -71,8 +71,8 @@ SSAttrsIndexFormat::create_structured_index(const milvus::engine::meta::hybrid:: } void -SSAttrsIndexFormat::read_internal(const milvus::storage::FSHandlerPtr& fs_ptr, const std::string& path, - knowhere::IndexPtr& index, engine::meta::hybrid::DataType& attr_type) { +SSStructuredIndexFormat::read_internal(const milvus::storage::FSHandlerPtr& fs_ptr, const std::string& path, + knowhere::IndexPtr& index, engine::meta::hybrid::DataType& attr_type) { milvus::TimeRecorder recorder("read_index"); knowhere::BinarySet load_data_list; @@ -137,7 +137,8 @@ SSAttrsIndexFormat::read_internal(const milvus::storage::FSHandlerPtr& fs_ptr, c } void -SSAttrsIndexFormat::read(const milvus::storage::FSHandlerPtr& fs_ptr, milvus::segment::AttrsIndexPtr& attrs_index) { +SSStructuredIndexFormat::read(const milvus::storage::FSHandlerPtr& fs_ptr, + milvus::segment::AttrsIndexPtr& attrs_index) { std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); if (!boost::filesystem::is_directory(dir_path)) { std::string err_msg = "Directory: " + dir_path + "does not exist"; @@ -165,8 +166,8 @@ SSAttrsIndexFormat::read(const milvus::storage::FSHandlerPtr& fs_ptr, milvus::se } void -SSAttrsIndexFormat::write(const milvus::storage::FSHandlerPtr& fs_ptr, - const milvus::segment::AttrsIndexPtr& attrs_index) { +SSStructuredIndexFormat::write(const milvus::storage::FSHandlerPtr& fs_ptr, + const milvus::segment::AttrsIndexPtr& attrs_index) { milvus::TimeRecorder recorder("write_index"); recorder.RecordSection("Start"); diff --git a/core/src/codecs/snapshot/SSAttrsIndexFormat.h b/core/src/codecs/snapshot/SSStructuredIndexFormat.h similarity index 78% rename from core/src/codecs/snapshot/SSAttrsIndexFormat.h rename to core/src/codecs/snapshot/SSStructuredIndexFormat.h index 21cd4719933f6..f34f6deb4f834 100644 --- a/core/src/codecs/snapshot/SSAttrsIndexFormat.h +++ b/core/src/codecs/snapshot/SSStructuredIndexFormat.h @@ -28,9 +28,9 @@ namespace milvus { namespace codec { -class SSAttrsIndexFormat { +class SSStructuredIndexFormat { public: - SSAttrsIndexFormat() = default; + SSStructuredIndexFormat() = default; void read(const storage::FSHandlerPtr& fs_ptr, segment::AttrsIndexPtr& attr_index); @@ -39,13 +39,13 @@ class SSAttrsIndexFormat { write(const storage::FSHandlerPtr& fs_ptr, const segment::AttrsIndexPtr& attr_index); // No copy and move - SSAttrsIndexFormat(const SSAttrsIndexFormat&) = delete; - SSAttrsIndexFormat(SSAttrsIndexFormat&&) = delete; + SSStructuredIndexFormat(const SSStructuredIndexFormat&) = delete; + SSStructuredIndexFormat(SSStructuredIndexFormat&&) = delete; - SSAttrsIndexFormat& - operator=(const SSAttrsIndexFormat&) = delete; - SSAttrsIndexFormat& - operator=(SSAttrsIndexFormat&&) = delete; + SSStructuredIndexFormat& + operator=(const SSStructuredIndexFormat&) = delete; + SSStructuredIndexFormat& + operator=(SSStructuredIndexFormat&&) = delete; private: void @@ -59,7 +59,7 @@ class SSAttrsIndexFormat { const std::string attr_index_extension_ = ".idx"; }; -using SSAttrsIndexFormatPtr = std::shared_ptr; +using SSStructuredIndexFormatPtr = std::shared_ptr; } // namespace codec } // namespace milvus diff --git a/core/src/codecs/snapshot/SSVectorIndexFormat.cpp b/core/src/codecs/snapshot/SSVectorIndexFormat.cpp index c00afb62ef55e..321ee91a16a79 100644 --- a/core/src/codecs/snapshot/SSVectorIndexFormat.cpp +++ b/core/src/codecs/snapshot/SSVectorIndexFormat.cpp @@ -23,7 +23,6 @@ #include "knowhere/common/BinarySet.h" #include "knowhere/index/vector_index/VecIndex.h" #include "knowhere/index/vector_index/VecIndexFactory.h" -#include "segment/VectorIndex.h" #include "utils/Exception.h" #include "utils/Log.h" #include "utils/TimeRecorder.h" @@ -31,22 +30,44 @@ namespace milvus { namespace codec { -knowhere::VecIndexPtr -SSVectorIndexFormat::read_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& path, - const std::string& extern_key, const knowhere::BinaryPtr& extern_data) { +void +SSVectorIndexFormat::read_raw(const storage::FSHandlerPtr& fs_ptr, const std::string& location, + knowhere::BinaryPtr& data) { + if (!fs_ptr->reader_ptr_->open(location.c_str())) { + std::string err_msg = "Failed to open file: " + location + ", error: " + std::strerror(errno); + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); + } + + size_t num_bytes; + fs_ptr->reader_ptr_->read(&num_bytes, sizeof(size_t)); + + data = std::make_shared(); + data->size = num_bytes; + data->data = std::shared_ptr(new uint8_t[num_bytes]); + + // Beginning of file is num_bytes + fs_ptr->reader_ptr_->seekg(sizeof(size_t)); + fs_ptr->reader_ptr_->read(data->data.get(), num_bytes); + fs_ptr->reader_ptr_->close(); +} + +void +SSVectorIndexFormat::read_index(const storage::FSHandlerPtr& fs_ptr, const std::string& location, + knowhere::BinarySet& data) { milvus::TimeRecorder recorder("read_index"); - knowhere::BinarySet load_data_list; recorder.RecordSection("Start"); - if (!fs_ptr->reader_ptr_->open(path)) { - LOG_ENGINE_ERROR_ << "Fail to open vector index: " << path; - return nullptr; + if (!fs_ptr->reader_ptr_->open(location)) { + std::string err_msg = "Failed to open file: " + location + ", error: " + std::strerror(errno); + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); } int64_t length = fs_ptr->reader_ptr_->length(); if (length <= 0) { - LOG_ENGINE_ERROR_ << "Invalid vector index length: " << path; - return nullptr; + LOG_ENGINE_ERROR_ << "Invalid vector index length: " << location; + return; } int64_t rp = 0; @@ -57,7 +78,7 @@ SSVectorIndexFormat::read_internal(const storage::FSHandlerPtr& fs_ptr, const st rp += sizeof(current_type); fs_ptr->reader_ptr_->seekg(rp); - LOG_ENGINE_DEBUG_ << "Start to read_index(" << path << ") length: " << length << " bytes"; + LOG_ENGINE_DEBUG_ << "Start to read_index(" << location << ") length: " << length << " bytes"; while (rp < length) { size_t meta_length; fs_ptr->reader_ptr_->read(&meta_length, sizeof(meta_length)); @@ -80,78 +101,68 @@ SSVectorIndexFormat::read_internal(const storage::FSHandlerPtr& fs_ptr, const st fs_ptr->reader_ptr_->seekg(rp); std::shared_ptr binptr(bin); - load_data_list.Append(std::string(meta, meta_length), binptr, bin_length); + data.Append(std::string(meta, meta_length), binptr, bin_length); delete[] meta; } fs_ptr->reader_ptr_->close(); double span = recorder.RecordSection("End"); double rate = length * 1000000.0 / span / 1024 / 1024; - LOG_ENGINE_DEBUG_ << "read_index(" << path << ") rate " << rate << "MB/s"; - - knowhere::VecIndexFactory& vec_index_factory = knowhere::VecIndexFactory::GetInstance(); - auto index = - vec_index_factory.CreateVecIndex(knowhere::OldIndexTypeToStr(current_type), knowhere::IndexMode::MODE_CPU); - if (index != nullptr) { - if (extern_data != nullptr) { - LOG_ENGINE_DEBUG_ << "load index with " << extern_key << " " << extern_data->size; - load_data_list.Append(extern_key, extern_data); - length += extern_data->size; - } - - index->Load(load_data_list); - index->SetIndexSize(length); - } else { - LOG_ENGINE_ERROR_ << "Fail to create vector index: " << path; - } + LOG_ENGINE_DEBUG_ << "read_index(" << location << ") rate " << rate << "MB/s"; +} - return index; +void +SSVectorIndexFormat::read_compress(const storage::FSHandlerPtr& fs_ptr, const std::string& location, + knowhere::BinaryPtr& data) { + auto& ss_codec = codec::SSCodec::instance(); + ss_codec.GetVectorCompressFormat()->read(fs_ptr, location, data); } void -SSVectorIndexFormat::read(const storage::FSHandlerPtr& fs_ptr, const std::string& location, ExternalData externalData, - segment::VectorIndexPtr& vector_index) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - if (!boost::filesystem::is_directory(dir_path)) { - std::string err_msg = "Directory: " + dir_path + "does not exist"; - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_INVALID_ARGUMENT, err_msg); - } +SSVectorIndexFormat::convert_raw(const std::vector& raw, knowhere::BinaryPtr& data) { + data = std::make_shared(); + data->size = raw.size(); + data->data = std::shared_ptr(new uint8_t[data->size]); +} - knowhere::VecIndexPtr index = nullptr; - switch (externalData) { - case ExternalData_None: { - index = read_internal(fs_ptr, location); - break; +void +SSVectorIndexFormat::construct_index(const std::string& index_name, knowhere::BinarySet& index_data, + knowhere::BinaryPtr& raw_data, knowhere::BinaryPtr& compress_data, + knowhere::VecIndexPtr& index) { + knowhere::VecIndexFactory& vec_index_factory = knowhere::VecIndexFactory::GetInstance(); + index = vec_index_factory.CreateVecIndex(index_name, knowhere::IndexMode::MODE_CPU); + if (index != nullptr) { + int64_t length = 0; + for (auto& pair : index_data.binary_map_) { + length += pair.second->size; } - case ExternalData_RawData: { - auto& ss_codec = codec::SSCodec::instance(); - knowhere::BinaryPtr raw_data = nullptr; - ss_codec.GetVectorsFormat()->read_vectors(fs_ptr, raw_data); - index = read_internal(fs_ptr, location, RAW_DATA, raw_data); - break; + if (raw_data != nullptr) { + LOG_ENGINE_DEBUG_ << "load index with " << RAW_DATA << " " << raw_data->size; + index_data.Append(RAW_DATA, raw_data); + length += raw_data->size; } - case ExternalData_SQ8: { - auto& ss_codec = codec::SSCodec::instance(); - knowhere::BinaryPtr sq8_data = nullptr; - ss_codec.GetVectorCompressFormat()->read(fs_ptr, location, sq8_data); - index = read_internal(fs_ptr, location, SQ8_DATA, sq8_data); - break; + if (compress_data != nullptr) { + LOG_ENGINE_DEBUG_ << "load index with " << SQ8_DATA << " " << compress_data->size; + index_data.Append(SQ8_DATA, compress_data); + length += compress_data->size; } - } - vector_index->SetVectorIndex(index); + index->Load(index_data); + index->SetIndexSize(length); + } else { + std::string err_msg = "Fail to create vector index"; + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_UNEXPECTED_ERROR, err_msg); + } } void -SSVectorIndexFormat::write(const storage::FSHandlerPtr& fs_ptr, const std::string& location, - const segment::VectorIndexPtr& vector_index) { +SSVectorIndexFormat::write_index(const storage::FSHandlerPtr& fs_ptr, const std::string& location, + const knowhere::VecIndexPtr& index) { milvus::TimeRecorder recorder("write_index"); - knowhere::VecIndexPtr index = vector_index->GetVectorIndex(); - auto binaryset = index->Serialize(knowhere::Config()); int32_t index_type = knowhere::StrToOldIndexType(index->index_type()); @@ -181,5 +192,19 @@ SSVectorIndexFormat::write(const storage::FSHandlerPtr& fs_ptr, const std::strin LOG_ENGINE_DEBUG_ << "write_index(" << location << ") rate " << rate << "MB/s"; } +void +SSVectorIndexFormat::write_compress(const storage::FSHandlerPtr& fs_ptr, const std::string& location, + const knowhere::VecIndexPtr& index) { + milvus::TimeRecorder recorder("write_index"); + + auto binaryset = index->Serialize(knowhere::Config()); + + auto sq8_data = binaryset.Erase(SQ8_DATA); + if (sq8_data != nullptr) { + auto& ss_codec = codec::SSCodec::instance(); + ss_codec.GetVectorCompressFormat()->write(fs_ptr, location, sq8_data); + } +} + } // namespace codec } // namespace milvus diff --git a/core/src/codecs/snapshot/SSVectorIndexFormat.h b/core/src/codecs/snapshot/SSVectorIndexFormat.h index 7c3ccc9d27a89..0c59b69c26ebc 100644 --- a/core/src/codecs/snapshot/SSVectorIndexFormat.h +++ b/core/src/codecs/snapshot/SSVectorIndexFormat.h @@ -19,9 +19,9 @@ #include #include +#include #include "codecs/VectorIndexFormat.h" -#include "segment/VectorIndex.h" #include "storage/FSHandler.h" namespace milvus { @@ -32,12 +32,27 @@ class SSVectorIndexFormat { SSVectorIndexFormat() = default; void - read(const storage::FSHandlerPtr& fs_ptr, const std::string& location, ExternalData externalData, - segment::VectorIndexPtr& vector_index); + read_raw(const storage::FSHandlerPtr& fs_ptr, const std::string& location, knowhere::BinaryPtr& data); void - write(const storage::FSHandlerPtr& fs_ptr, const std::string& location, - const segment::VectorIndexPtr& vector_index); + read_index(const storage::FSHandlerPtr& fs_ptr, const std::string& location, knowhere::BinarySet& data); + + void + read_compress(const storage::FSHandlerPtr& fs_ptr, const std::string& location, knowhere::BinaryPtr& data); + + void + convert_raw(const std::vector& raw, knowhere::BinaryPtr& data); + + void + construct_index(const std::string& index_name, knowhere::BinarySet& index_data, knowhere::BinaryPtr& raw_data, + knowhere::BinaryPtr& compress_data, knowhere::VecIndexPtr& index); + + void + write_index(const storage::FSHandlerPtr& fs_ptr, const std::string& location, const knowhere::VecIndexPtr& index); + + void + write_compress(const storage::FSHandlerPtr& fs_ptr, const std::string& location, + const knowhere::VecIndexPtr& index); // No copy and move SSVectorIndexFormat(const SSVectorIndexFormat&) = delete; @@ -47,11 +62,6 @@ class SSVectorIndexFormat { operator=(const SSVectorIndexFormat&) = delete; SSVectorIndexFormat& operator=(SSVectorIndexFormat&&) = delete; - - private: - knowhere::VecIndexPtr - read_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& path, const std::string& extern_key = "", - const knowhere::BinaryPtr& extern_data = nullptr); }; using SSVectorIndexFormatPtr = std::shared_ptr; diff --git a/core/src/codecs/snapshot/SSVectorsFormat.cpp b/core/src/codecs/snapshot/SSVectorsFormat.cpp deleted file mode 100644 index 610c09d9f3880..0000000000000 --- a/core/src/codecs/snapshot/SSVectorsFormat.cpp +++ /dev/null @@ -1,222 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "codecs/snapshot/SSVectorsFormat.h" - -#include -#include -#include -#include - -#include - -#include "utils/Exception.h" -#include "utils/Log.h" -#include "utils/TimeRecorder.h" - -namespace milvus { -namespace codec { - -void -SSVectorsFormat::read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, off_t offset, - size_t num, std::vector& raw_vectors) { - if (!fs_ptr->reader_ptr_->open(file_path.c_str())) { - std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); - } - - size_t num_bytes; - fs_ptr->reader_ptr_->read(&num_bytes, sizeof(size_t)); - - num = std::min(num, num_bytes - offset); - - offset += sizeof(size_t); // Beginning of file is num_bytes - fs_ptr->reader_ptr_->seekg(offset); - - raw_vectors.resize(num / sizeof(uint8_t)); - fs_ptr->reader_ptr_->read(raw_vectors.data(), num); - - fs_ptr->reader_ptr_->close(); -} - -void -SSVectorsFormat::read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - knowhere::BinaryPtr& raw_vectors) { - if (!fs_ptr->reader_ptr_->open(file_path.c_str())) { - std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); - } - - size_t num_bytes; - fs_ptr->reader_ptr_->read(&num_bytes, sizeof(size_t)); - - raw_vectors = std::make_shared(); - raw_vectors->size = num_bytes; - raw_vectors->data = std::shared_ptr(new uint8_t[num_bytes]); - - // Beginning of file is num_bytes - fs_ptr->reader_ptr_->seekg(sizeof(size_t)); - - fs_ptr->reader_ptr_->read(raw_vectors->data.get(), num_bytes); - - fs_ptr->reader_ptr_->close(); -} - -void -SSVectorsFormat::read_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - std::vector& uids) { - if (!fs_ptr->reader_ptr_->open(file_path.c_str())) { - std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); - } - - size_t num_bytes; - fs_ptr->reader_ptr_->read(&num_bytes, sizeof(size_t)); - - uids.resize(num_bytes / sizeof(segment::doc_id_t)); - fs_ptr->reader_ptr_->read(uids.data(), num_bytes); - - fs_ptr->reader_ptr_->close(); -} - -void -SSVectorsFormat::read(const storage::FSHandlerPtr& fs_ptr, segment::VectorsPtr& vectors_read) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - if (!boost::filesystem::is_directory(dir_path)) { - std::string err_msg = "Directory: " + dir_path + "does not exist"; - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_INVALID_ARGUMENT, err_msg); - } - - boost::filesystem::path target_path(dir_path); - typedef boost::filesystem::directory_iterator d_it; - d_it it_end; - d_it it(target_path); - // for (auto& it : boost::filesystem::directory_iterator(dir_path)) { - for (; it != it_end; ++it) { - const auto& path = it->path(); - if (path.extension().string() == raw_vector_extension_) { - auto& vector_list = vectors_read->GetMutableData(); - read_vectors_internal(fs_ptr, path.string(), 0, INT64_MAX, vector_list); - vectors_read->SetName(path.stem().string()); - } else if (path.extension().string() == user_id_extension_) { - auto& uids = vectors_read->GetMutableUids(); - read_uids_internal(fs_ptr, path.string(), uids); - } - } -} - -void -SSVectorsFormat::write_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& raw_vectors) { - if (!fs_ptr->writer_ptr_->open(file_path.c_str())) { - std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg); - } - - size_t rv_num_bytes = raw_vectors.size() * sizeof(uint8_t); - fs_ptr->writer_ptr_->write(&rv_num_bytes, sizeof(size_t)); - fs_ptr->writer_ptr_->write((void*)raw_vectors.data(), rv_num_bytes); - fs_ptr->writer_ptr_->close(); -} - -void -SSVectorsFormat::write_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& uids) { - if (!fs_ptr->writer_ptr_->open(file_path.c_str())) { - std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_CANNOT_CREATE_FILE, err_msg); - } - - size_t uid_num_bytes = uids.size() * sizeof(segment::doc_id_t); - fs_ptr->writer_ptr_->write(&uid_num_bytes, sizeof(size_t)); - fs_ptr->writer_ptr_->write((void*)uids.data(), uid_num_bytes); - fs_ptr->writer_ptr_->close(); -} - -void -SSVectorsFormat::write(const storage::FSHandlerPtr& fs_ptr, const segment::VectorsPtr& vectors) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - - const std::string rv_file_path = dir_path + "/" + vectors->GetName() + raw_vector_extension_; - const std::string uid_file_path = dir_path + "/" + vectors->GetName() + user_id_extension_; - - TimeRecorder rc("write vectors"); - - write_vectors_internal(fs_ptr, rv_file_path, vectors->GetData()); - - rc.RecordSection("write rv done"); - - write_uids_internal(fs_ptr, uid_file_path, vectors->GetUids()); - - rc.RecordSection("write uids done"); -} - -void -SSVectorsFormat::read_uids(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - std::vector& uids) { - read_uids_internal(fs_ptr, file_path, uids); -} - -void -SSVectorsFormat::read_vectors(const storage::FSHandlerPtr& fs_ptr, knowhere::BinaryPtr& raw_vectors) { - std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); - if (!boost::filesystem::is_directory(dir_path)) { - std::string err_msg = "Directory: " + dir_path + "does not exist"; - LOG_ENGINE_ERROR_ << err_msg; - throw Exception(SERVER_INVALID_ARGUMENT, err_msg); - } - - boost::filesystem::path target_path(dir_path); - typedef boost::filesystem::directory_iterator d_it; - d_it it_end; - d_it it(target_path); - // for (auto& it : boost::filesystem::directory_iterator(dir_path)) { - for (; it != it_end; ++it) { - const auto& path = it->path(); - if (path.extension().string() == raw_vector_extension_) { - read_vectors_internal(fs_ptr, path.string(), raw_vectors); - break; - } - } -} - -void -SSVectorsFormat::read_vectors(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, off_t offset, - size_t num_bytes, std::vector& raw_vectors) { - read_vectors_internal(fs_ptr, file_path, offset, num_bytes, raw_vectors); -} - -void -SSVectorsFormat::write_uids(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& uids) { - write_uids_internal(fs_ptr, file_path, uids); -} - -void -SSVectorsFormat::write_vectors(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& raw_vectors) { - write_vectors_internal(fs_ptr, file_path, raw_vectors); -} - -} // namespace codec -} // namespace milvus diff --git a/core/src/codecs/snapshot/SSVectorsFormat.h b/core/src/codecs/snapshot/SSVectorsFormat.h deleted file mode 100644 index a74db9272a896..0000000000000 --- a/core/src/codecs/snapshot/SSVectorsFormat.h +++ /dev/null @@ -1,97 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "knowhere/common/BinarySet.h" -#include "segment/Vectors.h" -#include "storage/FSHandler.h" - -namespace milvus { -namespace codec { - -class SSVectorsFormat { - public: - SSVectorsFormat() = default; - - void - read(const storage::FSHandlerPtr& fs_ptr, segment::VectorsPtr& vectors_read); - - void - write(const storage::FSHandlerPtr& fs_ptr, const segment::VectorsPtr& vectors); - - void - read_uids(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, std::vector& uids); - - void - read_vectors(const storage::FSHandlerPtr& fs_ptr, knowhere::BinaryPtr& raw_vectors); - - void - read_vectors(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, off_t offset, size_t num_bytes, - std::vector& raw_vectors); - - void - write_uids(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& uids); - - void - write_vectors(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& raw_vectors); - - // No copy and move - SSVectorsFormat(const SSVectorsFormat&) = delete; - SSVectorsFormat(SSVectorsFormat&&) = delete; - - SSVectorsFormat& - operator=(const SSVectorsFormat&) = delete; - SSVectorsFormat& - operator=(SSVectorsFormat&&) = delete; - - private: - void - read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, off_t offset, size_t num, - std::vector& raw_vectors); - - void - read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - knowhere::BinaryPtr& raw_vectors); - - void - read_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - std::vector& uids); - - void - write_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& raw_vectors); - - void - write_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, - const std::vector& uids); - - private: - const std::string raw_vector_extension_ = ".rv"; - const std::string user_id_extension_ = ".uid"; -}; - -using SSVectorsFormatPtr = std::shared_ptr; - -} // namespace codec -} // namespace milvus diff --git a/core/src/db/SSDBImpl.cpp b/core/src/db/SSDBImpl.cpp index 9a3345573470d..40c8dcfd7d7d7 100644 --- a/core/src/db/SSDBImpl.cpp +++ b/core/src/db/SSDBImpl.cpp @@ -13,6 +13,7 @@ #include "cache/CpuCacheMgr.h" #include "db/IDGenerator.h" #include "db/merge/MergeManagerFactory.h" +#include "db/merge/SSMergeTask.h" #include "db/snapshot/CompoundOperations.h" #include "db/snapshot/ResourceHelper.h" #include "db/snapshot/ResourceTypes.h" @@ -23,6 +24,8 @@ #include "metrics/SystemInfo.h" #include "scheduler/Definition.h" #include "scheduler/SchedInst.h" +#include "segment/SSSegmentReader.h" +#include "segment/SSSegmentWriter.h" #include "utils/Exception.h" #include "utils/StringHelpFunctions.h" #include "utils/TimeRecorder.h" @@ -84,32 +87,33 @@ SSDBImpl::Start() { // wal if (options_.wal_enable_) { - auto error_code = DB_ERROR; - if (wal_mgr_ != nullptr) { - error_code = wal_mgr_->Init(); - } - if (error_code != WAL_SUCCESS) { - throw Exception(error_code, "Wal init error!"); - } - - // recovery - while (true) { - wal::MXLogRecord record; - auto error_code = wal_mgr_->GetNextRecovery(record); - if (error_code != WAL_SUCCESS) { - throw Exception(error_code, "Wal recovery error!"); - } - if (record.type == wal::MXLogType::None) { - break; - } - ExecWalRecord(record); - } - - // for distribute version, some nodes are read only - if (options_.mode_ != DBOptions::MODE::CLUSTER_READONLY) { - // background wal thread - bg_wal_thread_ = std::thread(&SSDBImpl::BackgroundWalThread, this); - } + return Status(SERVER_NOT_IMPLEMENT, "Wal not implemented"); + // auto error_code = DB_ERROR; + // if (wal_mgr_ != nullptr) { + // error_code = wal_mgr_->Init(); + // } + // if (error_code != WAL_SUCCESS) { + // throw Exception(error_code, "Wal init error!"); + // } + // + // // recovery + // while (true) { + // wal::MXLogRecord record; + // auto error_code = wal_mgr_->GetNextRecovery(record); + // if (error_code != WAL_SUCCESS) { + // throw Exception(error_code, "Wal recovery error!"); + // } + // if (record.type == wal::MXLogType::None) { + // break; + // } + // ExecWalRecord(record); + // } + // + // // for distribute version, some nodes are read only + // if (options_.mode_ != DBOptions::MODE::CLUSTER_READONLY) { + // // background wal thread + // bg_wal_thread_ = std::thread(&SSDBImpl::BackgroundWalThread, this); + // } } else { // for distribute version, some nodes are read only if (options_.mode_ != DBOptions::MODE::CLUSTER_READONLY) { @@ -143,9 +147,9 @@ SSDBImpl::Stop() { if (options_.mode_ != DBOptions::MODE::CLUSTER_READONLY) { if (options_.wal_enable_) { - // wait wal thread finish - swn_wal_.Notify(); - bg_wal_thread_.join(); + // // wait wal thread finish + // swn_wal_.Notify(); + // bg_wal_thread_.join(); } else { // flush all without merge wal::MXLogRecord record; @@ -178,8 +182,28 @@ SSDBImpl::CreateCollection(const snapshot::CreateCollectionContext& context) { CHECK_INITIALIZED; auto ctx = context; + // check uid existence/validation + bool has_uid = false; + for (auto& pair : ctx.fields_schema) { + if (pair.first->GetFtype() == meta::hybrid::DataType::UID) { + has_uid = true; + break; + } + } + + // add uid field if not specified + if (!has_uid) { + auto uid_field = std::make_shared(DEFAULT_UID_NAME, 0, milvus::engine::FieldType::UID); + auto bloom_filter_element = std::make_shared( + 0, 0, DEFAULT_BLOOM_FILTER_NAME, milvus::engine::FieldElementType::FET_BLOOM_FILTER); + auto delete_doc_element = std::make_shared( + 0, 0, DEFAULT_DELETED_DOCS_NAME, milvus::engine::FieldElementType::FET_DELETED_DOCS); + + ctx.fields_schema[uid_field] = {bloom_filter_element, delete_doc_element}; + } + if (options_.wal_enable_) { - ctx.lsn = wal_mgr_->CreateCollection(context.collection->GetName()); + // ctx.lsn = wal_mgr_->CreateCollection(context.collection->GetName()); } auto op = std::make_shared(ctx); return op->Push(); @@ -251,6 +275,19 @@ SSDBImpl::GetCollectionRowCount(const std::string& collection_name, uint64_t& ro return Status::OK(); } +Status +SSDBImpl::PreloadCollection(const server::ContextPtr& context, const std::string& collection_name, bool force) { + CHECK_INITIALIZED; + + snapshot::ScopedSnapshotT ss; + STATUS_CHECK(snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_name)); + + auto handler = std::make_shared(context, ss); + handler->Iterate(); + + return handler->GetStatus(); +} + Status SSDBImpl::CreatePartition(const std::string& collection_name, const std::string& partition_name) { CHECK_INITIALIZED; @@ -303,27 +340,84 @@ SSDBImpl::ShowPartitions(const std::string& collection_name, std::vectorGetPartition(partition_name); + if (partition_ptr == nullptr) { + return Status(DB_NOT_FOUND, "Fail to get partition " + partition_name); + } - snapshot::OperationContext context; - STATUS_CHECK(ss->GetFieldElement(field_name, field_element_name, context.stale_field_element)); - auto op = std::make_shared(context, ss); - STATUS_CHECK(op->Push()); + /* Generate id */ + if (data_chunk->fixed_fields_.find(engine::DEFAULT_UID_NAME) == data_chunk->fixed_fields_.end()) { + SafeIDGenerator& id_generator = SafeIDGenerator::GetInstance(); + IDNumbers ids; + STATUS_CHECK(id_generator.GetNextIDNumbers(data_chunk->count_, ids)); + FIXED_FIELD_DATA& id_data = data_chunk->fixed_fields_[engine::DEFAULT_UID_NAME]; + id_data.resize(ids.size() * sizeof(int64_t)); + memcpy(id_data.data(), ids.data(), ids.size() * sizeof(int64_t)); + } + + if (options_.wal_enable_) { + return Status(SERVER_NOT_IMPLEMENT, "Wal not implemented"); + // auto vector_it = entity.vector_data_.begin(); + // if (!vector_it->second.binary_data_.empty()) { + // wal_mgr_->InsertEntities(collection_name, partition_name, entity.id_array_, + // vector_it->second.binary_data_, + // attr_nbytes, attr_data); + // } else if (!vector_it->second.float_data_.empty()) { + // wal_mgr_->InsertEntities(collection_name, partition_name, entity.id_array_, + // vector_it->second.float_data_, + // attr_nbytes, attr_data); + // } + // swn_wal_.Notify(); + } else { + // insert entities: collection_name is field id + wal::MXLogRecord record; + record.lsn = 0; + record.collection_id = collection_name; + record.partition_tag = partition_name; + record.data_chunk = data_chunk; + record.length = data_chunk->count_; + record.type = wal::MXLogType::Entity; + + STATUS_CHECK(ExecWalRecord(record)); + } - // SS TODO: Start merge task needed? - /* std::set merge_collection_ids = {collection_id}; */ - /* StartMergeTask(merge_collection_ids, true); */ return Status::OK(); } +Status +SSDBImpl::DeleteEntities(const std::string& collection_name, engine::IDNumbers entity_ids) { + CHECK_INITIALIZED; + + Status status; + if (options_.wal_enable_) { + return Status(SERVER_NOT_IMPLEMENT, "Wal not implemented"); + // wal_mgr_->DeleteById(collection_name, entity_ids); + // swn_wal_.Notify(); + } else { + wal::MXLogRecord record; + record.lsn = 0; // need to get from meta ? + record.type = wal::MXLogType::Delete; + record.collection_id = collection_name; + record.ids = entity_ids.data(); + record.length = entity_ids.size(); + + status = ExecWalRecord(record); + } + + return status; +} + Status SSDBImpl::Flush(const std::string& collection_name) { if (!initialized_.load(std::memory_order_acquire)) { @@ -344,16 +438,17 @@ SSDBImpl::Flush(const std::string& collection_name) { LOG_ENGINE_DEBUG_ << "Begin flush collection: " << collection_name; if (options_.wal_enable_) { - LOG_ENGINE_DEBUG_ << "WAL flush"; - auto lsn = wal_mgr_->Flush(collection_name); - if (lsn != 0) { - swn_wal_.Notify(); - flush_req_swn_.Wait(); - } else { - // no collection flushed, call merge task to cleanup files - std::set merge_collection_ids; - StartMergeTask(merge_collection_ids); - } + return Status(SERVER_NOT_IMPLEMENT, "Wal not implemented"); + // LOG_ENGINE_DEBUG_ << "WAL flush"; + // auto lsn = wal_mgr_->Flush(collection_name); + // if (lsn != 0) { + // swn_wal_.Notify(); + // flush_req_swn_.Wait(); + // } else { + // // no collection flushed, call merge task to cleanup files + // std::set merge_collection_ids; + // StartMergeTask(merge_collection_ids); + // } } else { LOG_ENGINE_DEBUG_ << "MemTable flush"; InternalFlush(collection_name); @@ -375,16 +470,17 @@ SSDBImpl::Flush() { Status status; fiu_do_on("options_wal_enable_false", options_.wal_enable_ = false); if (options_.wal_enable_) { - LOG_ENGINE_DEBUG_ << "WAL flush"; - auto lsn = wal_mgr_->Flush(); - if (lsn != 0) { - swn_wal_.Notify(); - flush_req_swn_.Wait(); - } else { - // no collection flushed, call merge task to cleanup files - std::set merge_collection_ids; - StartMergeTask(merge_collection_ids); - } + return Status(SERVER_NOT_IMPLEMENT, "Wal not implemented"); + // LOG_ENGINE_DEBUG_ << "WAL flush"; + // auto lsn = wal_mgr_->Flush(); + // if (lsn != 0) { + // swn_wal_.Notify(); + // flush_req_swn_.Wait(); + // } else { + // // no collection flushed, call merge task to cleanup files + // std::set merge_collection_ids; + // StartMergeTask(merge_collection_ids); + // } } else { LOG_ENGINE_DEBUG_ << "MemTable flush"; InternalFlush(); @@ -402,6 +498,10 @@ SSDBImpl::Compact(const std::shared_ptr& context, const std::st return SHUTDOWN_ERROR; } + LOG_ENGINE_DEBUG_ << "Before compacting, wait for build index thread to finish..."; + const std::lock_guard index_lock(build_index_mutex_); + const std::lock_guard merge_lock(flush_merge_compact_mutex_); + Status status; bool has_collection; status = HasCollection(collection_name, has_collection); @@ -413,33 +513,58 @@ SSDBImpl::Compact(const std::shared_ptr& context, const std::st return Status(DB_NOT_FOUND, "Collection to compact does not exist"); } - LOG_ENGINE_DEBUG_ << "Before compacting, wait for build index thread to finish..."; + snapshot::ScopedSnapshotT latest_ss; + status = snapshot::Snapshots::GetInstance().GetSnapshot(latest_ss, collection_name); + if (!status.ok()) { + return status; + } - snapshot::ScopedSnapshotT ss; - status = snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_name); + auto& segments = latest_ss->GetResources(); + for (auto& kv : segments) { + // client break the connection, no need to continue + if (context && context->IsConnectionBroken()) { + LOG_ENGINE_DEBUG_ << "Client connection broken, stop compact operation"; + break; + } - std::vector part_names = ss->GetPartitionNames(); + snapshot::ID_TYPE segment_id = kv.first; + auto read_visitor = engine::SegmentVisitor::Build(latest_ss, segment_id); + segment::SSSegmentReaderPtr segment_reader = + std::make_shared(options_.meta_.path_, read_visitor); - return status; -} + segment::DeletedDocsPtr deleted_docs; + status = segment_reader->LoadDeletedDocs(deleted_docs); + if (!status.ok() || deleted_docs == nullptr) { + continue; // no deleted docs, no need to compact + } -Status -SSDBImpl::PreloadCollection(const server::ContextPtr& context, const std::string& collection_name, bool force) { - CHECK_INITIALIZED; + auto segment_commit = latest_ss->GetSegmentCommitBySegmentId(segment_id); + auto row_count = segment_commit->GetRowCount(); + if (row_count == 0) { + continue; + } - snapshot::ScopedSnapshotT ss; - STATUS_CHECK(snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_name)); + auto deleted_count = deleted_docs->GetSize(); + if (deleted_count / (row_count + deleted_count) < threshold) { + continue; // no need to compact + } - auto handler = std::make_shared(context, ss); - handler->Iterate(); + snapshot::IDS_TYPE ids = {segment_id}; + SSMergeTask merge_task(options_, latest_ss, ids); + status = merge_task.Execute(); + if (!status.ok()) { + LOG_ENGINE_ERROR_ << "Compact failed for segment " << segment_reader->GetSegmentPath() << ": " + << status.message(); + continue; // skip this file and try compact next one + } + } - return handler->GetStatus(); + return status; } Status SSDBImpl::GetEntityByID(const std::string& collection_name, const IDNumbers& id_array, - const std::vector& field_names, std::vector& vector_data, - /*std::vector& attr_type,*/ std::vector& attr_data) { + const std::vector& field_names, DataChunkPtr& data_chunk) { CHECK_INITIALIZED; snapshot::ScopedSnapshotT ss; @@ -450,227 +575,59 @@ SSDBImpl::GetEntityByID(const std::string& collection_name, const IDNumbers& id_ handler->Iterate(); STATUS_CHECK(handler->GetStatus()); - // vector_data = std::move(handler->segment_ptr_->vectors_ptr_); - // attr_type = std::move(handler->attr_type_); - // attr_data = std::move(handler->attr_data_); - + data_chunk = handler->data_chunk_; return Status::OK(); } Status -CopyToAttr(const std::vector& record, int64_t row_num, const std::vector& field_names, - std::unordered_map& attr_types, - std::unordered_map>& attr_datas, - std::unordered_map& attr_nbytes, - std::unordered_map& attr_data_size) { - int64_t offset = 0; - for (auto name : field_names) { - switch (attr_types.at(name)) { - case meta::hybrid::DataType::INT8: { - std::vector data; - data.resize(row_num * sizeof(int8_t)); - - std::vector attr_value(row_num, 0); - memcpy(attr_value.data(), record.data() + offset, row_num * sizeof(int64_t)); - - std::vector raw_value(row_num, 0); - for (uint64_t i = 0; i < row_num; ++i) { - raw_value[i] = attr_value[i]; - } - - memcpy(data.data(), raw_value.data(), row_num * sizeof(int8_t)); - attr_datas.insert(std::make_pair(name, data)); - - attr_nbytes.insert(std::make_pair(name, sizeof(int8_t))); - attr_data_size.insert(std::make_pair(name, row_num * sizeof(int8_t))); - offset += row_num * sizeof(int64_t); - break; - } - case meta::hybrid::DataType::INT16: { - std::vector data; - data.resize(row_num * sizeof(int16_t)); - - std::vector attr_value(row_num, 0); - memcpy(attr_value.data(), record.data() + offset, row_num * sizeof(int64_t)); - - std::vector raw_value(row_num, 0); - for (uint64_t i = 0; i < row_num; ++i) { - raw_value[i] = attr_value[i]; - } - - memcpy(data.data(), raw_value.data(), row_num * sizeof(int16_t)); - attr_datas.insert(std::make_pair(name, data)); - - attr_nbytes.insert(std::make_pair(name, sizeof(int16_t))); - attr_data_size.insert(std::make_pair(name, row_num * sizeof(int16_t))); - offset += row_num * sizeof(int64_t); - break; - } - case meta::hybrid::DataType::INT32: { - std::vector data; - data.resize(row_num * sizeof(int32_t)); - - std::vector attr_value(row_num, 0); - memcpy(attr_value.data(), record.data() + offset, row_num * sizeof(int64_t)); - - std::vector raw_value(row_num, 0); - for (uint64_t i = 0; i < row_num; ++i) { - raw_value[i] = attr_value[i]; - } - - memcpy(data.data(), raw_value.data(), row_num * sizeof(int32_t)); - attr_datas.insert(std::make_pair(name, data)); - - attr_nbytes.insert(std::make_pair(name, sizeof(int32_t))); - attr_data_size.insert(std::make_pair(name, row_num * sizeof(int32_t))); - offset += row_num * sizeof(int64_t); - break; - } - case meta::hybrid::DataType::INT64: { - std::vector data; - data.resize(row_num * sizeof(int64_t)); - memcpy(data.data(), record.data() + offset, row_num * sizeof(int64_t)); - attr_datas.insert(std::make_pair(name, data)); - - std::vector test_data(row_num); - memcpy(test_data.data(), record.data(), row_num * sizeof(int64_t)); - - attr_nbytes.insert(std::make_pair(name, sizeof(int64_t))); - attr_data_size.insert(std::make_pair(name, row_num * sizeof(int64_t))); - offset += row_num * sizeof(int64_t); - break; - } - case meta::hybrid::DataType::FLOAT: { - std::vector data; - data.resize(row_num * sizeof(float)); - - std::vector attr_value(row_num, 0); - memcpy(attr_value.data(), record.data() + offset, row_num * sizeof(double)); - - std::vector raw_value(row_num, 0); - for (uint64_t i = 0; i < row_num; ++i) { - raw_value[i] = attr_value[i]; - } +SSDBImpl::GetEntityIDs(const std::string& collection_id, int64_t segment_id, IDNumbers& entity_ids) { + return Status::OK(); +} - memcpy(data.data(), raw_value.data(), row_num * sizeof(float)); - attr_datas.insert(std::make_pair(name, data)); +Status +SSDBImpl::CreateIndex(const std::shared_ptr& context, const std::string& collection_id, + const std::string& field_name, const CollectionIndex& index) { + return Status::OK(); +} - attr_nbytes.insert(std::make_pair(name, sizeof(float))); - attr_data_size.insert(std::make_pair(name, row_num * sizeof(float))); - offset += row_num * sizeof(double); - break; - } - case meta::hybrid::DataType::DOUBLE: { - std::vector data; - data.resize(row_num * sizeof(double)); - memcpy(data.data(), record.data() + offset, row_num * sizeof(double)); - attr_datas.insert(std::make_pair(name, data)); - - attr_nbytes.insert(std::make_pair(name, sizeof(double))); - attr_data_size.insert(std::make_pair(name, row_num * sizeof(double))); - offset += row_num * sizeof(double); - break; - } - default: - break; - } - } +Status +SSDBImpl::DescribeIndex(const std::string& collection_id, const std::string& field_name, CollectionIndex& index) { return Status::OK(); } Status -SSDBImpl::InsertEntities(const std::string& collection_name, const std::string& partition_name, - const std::vector& field_names, Entity& entity, - std::unordered_map& attr_types) { +SSDBImpl::DropIndex(const std::string& collection_name, const std::string& field_name, + const std::string& element_name) { CHECK_INITIALIZED; + LOG_ENGINE_DEBUG_ << "Drop index for collection: " << collection_name; snapshot::ScopedSnapshotT ss; STATUS_CHECK(snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_name)); - auto partition_ptr = ss->GetPartition(partition_name); - if (partition_ptr == nullptr) { - return Status(DB_NOT_FOUND, "Fail to get partition " + partition_name); - } - - /* Generate id */ - if (entity.id_array_.empty()) { - SafeIDGenerator& id_generator = SafeIDGenerator::GetInstance(); - STATUS_CHECK(id_generator.GetNextIDNumbers(entity.entity_count_, entity.id_array_)); - } - - std::unordered_map> attr_data; - std::unordered_map attr_nbytes; - std::unordered_map attr_data_size; - STATUS_CHECK(CopyToAttr(entity.attr_value_, entity.entity_count_, field_names, attr_types, attr_data, attr_nbytes, - attr_data_size)); - - if (options_.wal_enable_) { - auto vector_it = entity.vector_data_.begin(); - if (!vector_it->second.binary_data_.empty()) { - wal_mgr_->InsertEntities(collection_name, partition_name, entity.id_array_, vector_it->second.binary_data_, - attr_nbytes, attr_data); - } else if (!vector_it->second.float_data_.empty()) { - wal_mgr_->InsertEntities(collection_name, partition_name, entity.id_array_, vector_it->second.float_data_, - attr_nbytes, attr_data); - } - swn_wal_.Notify(); - } else { - // insert entities: collection_name is field id - wal::MXLogRecord record; - record.lsn = 0; - record.collection_id = collection_name; - record.partition_tag = partition_name; - record.ids = entity.id_array_.data(); - record.length = entity.entity_count_; - record.attr_data = attr_data; - record.attr_nbytes = attr_nbytes; - record.attr_data_size = attr_data_size; - - auto vector_it = entity.vector_data_.begin(); - if (vector_it->second.binary_data_.empty()) { - record.type = wal::MXLogType::InsertVector; - record.data = vector_it->second.float_data_.data(); - record.data_size = vector_it->second.float_data_.size() * sizeof(float); - } else { - record.type = wal::MXLogType::InsertBinary; - record.data = vector_it->second.binary_data_.data(); - record.data_size = vector_it->second.binary_data_.size() * sizeof(uint8_t); - } + // SS TODO: Check Index Type - STATUS_CHECK(ExecWalRecord(record)); - } + snapshot::OperationContext context; + STATUS_CHECK(ss->GetFieldElement(field_name, element_name, context.stale_field_element)); + auto op = std::make_shared(context, ss); + STATUS_CHECK(op->Push()); + // SS TODO: Start merge task needed? + /* std::set merge_collection_ids = {collection_id}; */ + /* StartMergeTask(merge_collection_ids, true); */ return Status::OK(); } Status -SSDBImpl::DeleteEntities(const std::string& collection_name, engine::IDNumbers entity_ids) { - CHECK_INITIALIZED; - - Status status; - if (options_.wal_enable_) { - wal_mgr_->DeleteById(collection_name, entity_ids); - swn_wal_.Notify(); - } else { - wal::MXLogRecord record; - record.lsn = 0; // need to get from meta ? - record.type = wal::MXLogType::Delete; - record.collection_id = collection_name; - record.ids = entity_ids.data(); - record.length = entity_ids.size(); - - status = ExecWalRecord(record); - } - - return status; +SSDBImpl::DropIndex(const std::string& collection_id) { + return Status::OK(); } Status -SSDBImpl::HybridQuery(const server::ContextPtr& context, const std::string& collection_name, - const std::vector& partition_patterns, query::GeneralQueryPtr general_query, - query::QueryPtr query_ptr, std::vector& field_names, - std::unordered_map& attr_type, - engine::QueryResult& result) { +SSDBImpl::Query(const server::ContextPtr& context, const std::string& collection_name, + const std::vector& partition_patterns, query::GeneralQueryPtr general_query, + query::QueryPtr query_ptr, std::vector& field_names, + std::unordered_map& attr_type, + engine::QueryResult& result) { CHECK_INITIALIZED; auto query_ctx = context->Child("Query"); @@ -680,51 +637,51 @@ SSDBImpl::HybridQuery(const server::ContextPtr& context, const std::string& coll snapshot::ScopedSnapshotT ss; STATUS_CHECK(snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_name)); - auto handler = std::make_shared(nullptr, ss, partition_patterns); - handler->Iterate(); - STATUS_CHECK(handler->GetStatus()); - - LOG_ENGINE_DEBUG_ << LogOut("Engine query begin, segment count: %ld", handler->segments_.size()); - - VectorsData vectors; - scheduler::SearchJobPtr job = - std::make_shared(query_ctx, general_query, query_ptr, attr_type, vectors); - for (auto& segment : handler->segments_) { - // job->AddSegment(segment); - } - - // step 2: put search job to scheduler and wait result - scheduler::JobMgrInst::GetInstance()->Put(job); - job->WaitResult(); - - if (!job->GetStatus().ok()) { - return job->GetStatus(); - } - - // step 3: construct results - result.row_num_ = job->vector_count(); - result.result_ids_ = job->GetResultIds(); - result.result_distances_ = job->GetResultDistances(); - - // step 4: get entities by result ids - STATUS_CHECK(GetEntityByID(collection_name, result.result_ids_, field_names, result.vectors_, result.attrs_)); - - // step 5: filter entities by field names - // std::vector filter_attrs; - // for (auto attr : result.attrs_) { - // AttrsData attrs_data; - // attrs_data.attr_type_ = attr.attr_type_; - // attrs_data.attr_count_ = attr.attr_count_; - // attrs_data.id_array_ = attr.id_array_; - // for (auto& name : field_names) { - // if (attr.attr_data_.find(name) != attr.attr_data_.end()) { - // attrs_data.attr_data_.insert(std::make_pair(name, attr.attr_data_.at(name))); - // } - // } - // filter_attrs.emplace_back(attrs_data); + // auto handler = std::make_shared(nullptr, ss, partition_patterns); + // handler->Iterate(); + // STATUS_CHECK(handler->GetStatus()); + // + // LOG_ENGINE_DEBUG_ << LogOut("Engine query begin, segment count: %ld", handler->segments_.size()); + // + // VectorsData vectors; + // scheduler::SearchJobPtr job = + // std::make_shared(query_ctx, general_query, query_ptr, attr_type, vectors); + // for (auto& segment : handler->segments_) { + // // job->AddSegment(segment); + // } + // + // // step 2: put search job to scheduler and wait result + // scheduler::JobMgrInst::GetInstance()->Put(job); + // job->WaitResult(); + // + // if (!job->GetStatus().ok()) { + // return job->GetStatus(); // } // - // result.attrs_ = filter_attrs; + // // step 3: construct results + // result.row_num_ = job->vector_count(); + // result.result_ids_ = job->GetResultIds(); + // result.result_distances_ = job->GetResultDistances(); + // + // // step 4: get entities by result ids + // STATUS_CHECK(GetEntityByID(collection_name, result.result_ids_, field_names, result.vectors_, result.attrs_)); + // + // // step 5: filter entities by field names + // // std::vector filter_attrs; + // // for (auto attr : result.attrs_) { + // // AttrsData attrs_data; + // // attrs_data.attr_type_ = attr.attr_type_; + // // attrs_data.attr_count_ = attr.attr_count_; + // // attrs_data.id_array_ = attr.id_array_; + // // for (auto& name : field_names) { + // // if (attr.attr_data_.find(name) != attr.attr_data_.end()) { + // // attrs_data.attr_data_.insert(std::make_pair(name, attr.attr_data_.at(name))); + // // } + // // } + // // filter_attrs.emplace_back(attrs_data); + // // } + // // + // // result.attrs_ = filter_attrs; rc.ElapseFromBegin("Engine query totally cost"); @@ -827,14 +784,14 @@ SSDBImpl::StartBuildIndexTask() { { std::lock_guard lck(index_result_mutex_); if (index_thread_results_.empty()) { - index_thread_results_.push_back(index_thread_pool_.enqueue(&SSDBImpl::BackgroundWaitBuildIndex, this)); + index_thread_results_.push_back(index_thread_pool_.enqueue(&SSDBImpl::BackgroundBuildIndexTask, this)); } } } void -SSDBImpl::BackgroundWaitBuildIndex() { - // TODO: update segment to index state and wait BackgroundIndexThread to build index +SSDBImpl::BackgroundBuildIndexTask() { + std::unique_lock lock(build_index_mutex_); } void @@ -857,6 +814,16 @@ SSDBImpl::BackgroundIndexThread() { } } +void +SSDBImpl::WaitBuildIndexFinish() { + // LOG_ENGINE_DEBUG_ << "Begin WaitBuildIndexFinish"; + std::lock_guard lck(index_result_mutex_); + for (auto& iter : index_thread_results_) { + iter.wait(); + } + // LOG_ENGINE_DEBUG_ << "End WaitBuildIndexFinish"; +} + void SSDBImpl::BackgroundWalThread() { SetThreadName("wal_thread"); @@ -918,98 +885,22 @@ SSDBImpl::BackgroundWalThread() { } } -void -SSDBImpl::StartMergeTask(const std::set& merge_collection_ids, bool force_merge_all) { - // LOG_ENGINE_DEBUG_ << "Begin StartMergeTask"; - // merge task has been finished? - { - std::lock_guard lck(merge_result_mutex_); - if (!merge_thread_results_.empty()) { - std::chrono::milliseconds span(10); - if (merge_thread_results_.back().wait_for(span) == std::future_status::ready) { - merge_thread_results_.pop_back(); - } - } - } - - // add new merge task - { - std::lock_guard lck(merge_result_mutex_); - if (merge_thread_results_.empty()) { - // start merge file thread - merge_thread_results_.push_back( - merge_thread_pool_.enqueue(&SSDBImpl::BackgroundMerge, this, merge_collection_ids, force_merge_all)); - } - } - - // LOG_ENGINE_DEBUG_ << "End StartMergeTask"; -} - -void -SSDBImpl::BackgroundMerge(std::set collection_names, bool force_merge_all) { - // LOG_ENGINE_TRACE_ << " Background merge thread start"; - - Status status; - for (auto& collection_name : collection_names) { - const std::lock_guard lock(flush_merge_compact_mutex_); - - auto old_strategy = merge_mgr_ptr_->Strategy(); - if (force_merge_all) { - merge_mgr_ptr_->UseStrategy(MergeStrategyType::ADAPTIVE); - } - - auto status = merge_mgr_ptr_->MergeFiles(collection_name); - merge_mgr_ptr_->UseStrategy(old_strategy); - if (!status.ok()) { - LOG_ENGINE_ERROR_ << "Failed to get merge files for collection: " << collection_name - << " reason:" << status.message(); - } - - if (!initialized_.load(std::memory_order_acquire)) { - LOG_ENGINE_DEBUG_ << "Server will shutdown, skip merge action for collection: " << collection_name; - break; - } - } - - // TODO: cleanup with ttl -} - -void -SSDBImpl::WaitMergeFileFinish() { - // LOG_ENGINE_DEBUG_ << "Begin WaitMergeFileFinish"; - std::lock_guard lck(merge_result_mutex_); - for (auto& iter : merge_thread_results_) { - iter.wait(); - } - // LOG_ENGINE_DEBUG_ << "End WaitMergeFileFinish"; -} - -void -SSDBImpl::WaitBuildIndexFinish() { - // LOG_ENGINE_DEBUG_ << "Begin WaitBuildIndexFinish"; - std::lock_guard lck(index_result_mutex_); - for (auto& iter : index_thread_results_) { - iter.wait(); - } - // LOG_ENGINE_DEBUG_ << "End WaitBuildIndexFinish"; -} - Status SSDBImpl::ExecWalRecord(const wal::MXLogRecord& record) { auto collections_flushed = [&](const std::string collection_id, const std::set& target_collection_names) -> uint64_t { uint64_t max_lsn = 0; if (options_.wal_enable_ && !target_collection_names.empty()) { - uint64_t lsn = 0; - for (auto& collection_name : target_collection_names) { - snapshot::ScopedSnapshotT ss; - snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_name); - lsn = ss->GetMaxLsn(); - if (lsn > max_lsn) { - max_lsn = lsn; - } - } - wal_mgr_->CollectionFlushed(collection_id, lsn); + // uint64_t lsn = 0; + // for (auto& collection_name : target_collection_names) { + // snapshot::ScopedSnapshotT ss; + // snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_name); + // lsn = ss->GetMaxLsn(); + // if (lsn > max_lsn) { + // max_lsn = lsn; + // } + // } + // wal_mgr_->CollectionFlushed(collection_id, lsn); } std::set merge_collection_ids; @@ -1057,20 +948,7 @@ SSDBImpl::ExecWalRecord(const wal::MXLogRecord& record) { return status; } - // construct chunk data - DataChunkPtr chunk = std::make_shared(); - chunk->count_ = record.length; - chunk->fixed_fields_ = record.attr_data; - std::vector uid_data; - uid_data.resize(record.length * sizeof(int64_t)); - memcpy(uid_data.data(), record.ids, record.length * sizeof(int64_t)); - chunk->fixed_fields_.insert(std::make_pair(engine::DEFAULT_UID_NAME, uid_data)); - std::vector vector_data; - vector_data.resize(record.data_size); - memcpy(vector_data.data(), record.data, record.data_size); - chunk->fixed_fields_.insert(std::make_pair(VECTOR_FIELD, vector_data)); - - status = mem_mgr_->InsertEntities(collection_id, partition_id, chunk, record.lsn); + status = mem_mgr_->InsertEntities(collection_id, partition_id, record.data_chunk, record.lsn); force_flush_if_mem_full(); // metrics @@ -1131,7 +1009,7 @@ SSDBImpl::ExecWalRecord(const wal::MXLogRecord& record) { std::set flushed_collections; for (auto id : collection_ids) { snapshot::ScopedSnapshotT ss; - auto status = snapshot::Snapshots::GetInstance().GetSnapshot(ss, record.collection_id); + auto status = snapshot::Snapshots::GetInstance().GetSnapshot(ss, id); if (!status.ok()) { LOG_WAL_ERROR_ << LogOut("[%s][%ld] ", "flush", 0) << "Get snapshot fail: " << status.message(); return status; @@ -1142,7 +1020,7 @@ SSDBImpl::ExecWalRecord(const wal::MXLogRecord& record) { uint64_t lsn = collections_flushed("", flushed_collections); if (options_.wal_enable_) { - wal_mgr_->RemoveOldFiles(lsn); + // wal_mgr_->RemoveOldFiles(lsn); } } break; @@ -1155,6 +1033,72 @@ SSDBImpl::ExecWalRecord(const wal::MXLogRecord& record) { return status; } +void +SSDBImpl::StartMergeTask(const std::set& merge_collection_ids, bool force_merge_all) { + // LOG_ENGINE_DEBUG_ << "Begin StartMergeTask"; + // merge task has been finished? + { + std::lock_guard lck(merge_result_mutex_); + if (!merge_thread_results_.empty()) { + std::chrono::milliseconds span(10); + if (merge_thread_results_.back().wait_for(span) == std::future_status::ready) { + merge_thread_results_.pop_back(); + } + } + } + + // add new merge task + { + std::lock_guard lck(merge_result_mutex_); + if (merge_thread_results_.empty()) { + // start merge file thread + merge_thread_results_.push_back( + merge_thread_pool_.enqueue(&SSDBImpl::BackgroundMerge, this, merge_collection_ids, force_merge_all)); + } + } + + // LOG_ENGINE_DEBUG_ << "End StartMergeTask"; +} + +void +SSDBImpl::BackgroundMerge(std::set collection_names, bool force_merge_all) { + // LOG_ENGINE_TRACE_ << " Background merge thread start"; + + Status status; + for (auto& collection_name : collection_names) { + const std::lock_guard lock(flush_merge_compact_mutex_); + + auto old_strategy = merge_mgr_ptr_->Strategy(); + if (force_merge_all) { + merge_mgr_ptr_->UseStrategy(MergeStrategyType::ADAPTIVE); + } + + auto status = merge_mgr_ptr_->MergeFiles(collection_name); + merge_mgr_ptr_->UseStrategy(old_strategy); + if (!status.ok()) { + LOG_ENGINE_ERROR_ << "Failed to get merge files for collection: " << collection_name + << " reason:" << status.message(); + } + + if (!initialized_.load(std::memory_order_acquire)) { + LOG_ENGINE_DEBUG_ << "Server will shutdown, skip merge action for collection: " << collection_name; + break; + } + } + + // TODO: cleanup with ttl +} + +void +SSDBImpl::WaitMergeFileFinish() { + // LOG_ENGINE_DEBUG_ << "Begin WaitMergeFileFinish"; + std::lock_guard lck(merge_result_mutex_); + for (auto& iter : merge_thread_results_) { + iter.wait(); + } + // LOG_ENGINE_DEBUG_ << "End WaitMergeFileFinish"; +} + void SSDBImpl::SuspendIfFirst() { std::lock_guard lock(suspend_build_mutex_); diff --git a/core/src/db/SSDBImpl.h b/core/src/db/SSDBImpl.h index 4b7605f1ddc38..4fa691b8227fe 100644 --- a/core/src/db/SSDBImpl.h +++ b/core/src/db/SSDBImpl.h @@ -81,7 +81,10 @@ class SSDBImpl { ShowPartitions(const std::string& collection_name, std::vector& partition_names); Status - DropIndex(const std::string& collection_name, const std::string& field_name, const std::string& field_element_name); + InsertEntities(const std::string& collection_name, const std::string& partition_name, DataChunkPtr& data_chunk); + + Status + DeleteEntities(const std::string& collection_name, engine::IDNumbers entity_ids); Status Flush(const std::string& collection_name); @@ -95,23 +98,29 @@ class SSDBImpl { Status GetEntityByID(const std::string& collection_name, const IDNumbers& id_array, - const std::vector& field_names, std::vector& vector_data, - /*std::vector& attr_type,*/ std::vector& attr_data); + const std::vector& field_names, DataChunkPtr& data_chunk); Status - InsertEntities(const std::string& collection_name, const std::string& partition_name, - const std::vector& field_names, Entity& entity, - std::unordered_map& attr_types); + GetEntityIDs(const std::string& collection_id, int64_t segment_id, IDNumbers& entity_ids); Status - DeleteEntities(const std::string& collection_name, engine::IDNumbers entity_ids); + CreateIndex(const std::shared_ptr& context, const std::string& collection_id, + const std::string& field_name, const CollectionIndex& index); + + Status + DescribeIndex(const std::string& collection_id, const std::string& field_name, CollectionIndex& index); + + Status + DropIndex(const std::string& collection_name, const std::string& field_name, const std::string& element_name); Status - HybridQuery(const server::ContextPtr& context, const std::string& collection_name, - const std::vector& partition_patterns, query::GeneralQueryPtr general_query, - query::QueryPtr query_ptr, std::vector& field_names, - std::unordered_map& attr_type, - engine::QueryResult& result); + DropIndex(const std::string& collection_id); + + Status + Query(const server::ContextPtr& context, const std::string& collection_name, + const std::vector& partition_patterns, query::GeneralQueryPtr general_query, + query::QueryPtr query_ptr, std::vector& field_names, + std::unordered_map& attr_type, engine::QueryResult& result); private: void @@ -130,7 +139,7 @@ class SSDBImpl { StartBuildIndexTask(); void - BackgroundWaitBuildIndex(); + BackgroundBuildIndexTask(); void BackgroundIndexThread(); @@ -141,6 +150,9 @@ class SSDBImpl { void BackgroundWalThread(); + Status + ExecWalRecord(const wal::MXLogRecord& record); + void StartMergeTask(const std::set& merge_collection_names, bool force_merge_all = false); @@ -150,9 +162,6 @@ class SSDBImpl { void WaitMergeFileFinish(); - Status - ExecWalRecord(const wal::MXLogRecord& record); - void SuspendIfFirst(); diff --git a/core/src/db/SnapshotHandlers.cpp b/core/src/db/SnapshotHandlers.cpp index 95bbaa9806037..a1de92977f45a 100644 --- a/core/src/db/SnapshotHandlers.cpp +++ b/core/src/db/SnapshotHandlers.cpp @@ -111,19 +111,7 @@ GetEntityByIdSegmentHandler::GetEntityByIdSegmentHandler(const std::shared_ptr& field_names) - : BaseT(ss), - context_(context), - dir_root_(dir_root), - ids_(ids), - field_names_(field_names), - vector_data_(), - attr_type_(), - attr_data_() { - for (auto& field_name : field_names_) { - auto field_ptr = ss_->GetField(field_name); - auto field_type = field_ptr->GetFtype(); - attr_type_.push_back((meta::hybrid::DataType)field_type); - } + : BaseT(ss), context_(context), dir_root_(dir_root), ids_(ids), field_names_(field_names) { } Status @@ -139,108 +127,44 @@ GetEntityByIdSegmentHandler::Handle(const snapshot::SegmentPtr& segment) { auto uid_field_visitor = segment_visitor->GetFieldVisitor(DEFAULT_UID_NAME); /* load UID's bloom filter file */ - auto uid_blf_visitor = uid_field_visitor->GetElementVisitor(FieldElementType::FET_BLOOM_FILTER); - std::string uid_blf_path = snapshot::GetResPath(dir_root_, uid_blf_visitor->GetFile()); - segment::IdBloomFilterPtr id_bloom_filter_ptr; - STATUS_CHECK(segment_reader.LoadBloomFilter(uid_blf_path, id_bloom_filter_ptr)); + STATUS_CHECK(segment_reader.LoadBloomFilter(id_bloom_filter_ptr)); /* load UID's raw data */ - auto uid_raw_visitor = uid_field_visitor->GetElementVisitor(FieldElementType::FET_RAW); - std::string uid_raw_path = snapshot::GetResPath(dir_root_, uid_raw_visitor->GetFile()); - std::vector uids; - STATUS_CHECK(segment_reader.LoadUids(uid_raw_path, uids)); + std::vector uids; + STATUS_CHECK(segment_reader.LoadUids(uids)); /* load UID's deleted docs */ - auto uid_del_visitor = uid_field_visitor->GetElementVisitor(FieldElementType::FET_DELETED_DOCS); - std::string uid_del_path = snapshot::GetResPath(dir_root_, uid_del_visitor->GetFile()); segment::DeletedDocsPtr deleted_docs_ptr; - STATUS_CHECK(segment_reader.LoadDeletedDocs(uid_del_path, deleted_docs_ptr)); + STATUS_CHECK(segment_reader.LoadDeletedDocs(deleted_docs_ptr)); auto& deleted_docs = deleted_docs_ptr->GetDeletedDocs(); + std::vector offsets; for (auto id : ids_) { - AttrsData& attr_ref = attr_data_[id]; - VectorsData& vector_ref = vector_data_[id]; - - /* fast check using bloom filter */ + // fast check using bloom filter if (!id_bloom_filter_ptr->Check(id)) { continue; } - /* check if id really exists in uids */ + // check if id really exists in uids auto found = std::find(uids.begin(), uids.end(), id); if (found == uids.end()) { continue; } - /* check if this id is deleted */ + // check if this id is deleted auto offset = std::distance(uids.begin(), found); auto deleted = std::find(deleted_docs.begin(), deleted_docs.end(), offset); if (deleted != deleted_docs.end()) { continue; } - // std::unordered_map> raw_attrs; - // for (size_t i = 0; i < field_names_.size(); i++) { - // auto& field_name = field_names_[i]; - // auto field_ptr = ss_->GetField(field_name); - // - // auto field_type = attr_type_[i]; - // - // if (field_type == meta::hybrid::DataType::VECTOR_BINARY) { - // auto field_params = field_ptr->GetParams(); - // auto dim = field_params[knowhere::meta::DIM].get(); - // size_t vector_size = dim / 8; - // std::vector raw_vector; - // STATUS_CHECK(segment_reader.LoadVectors(offset * vector_size, vector_size, raw_vector)); - // - // vector_ref.vector_count_ = 1; - // vector_ref.binary_data_.swap(raw_vector); - // } else if (field_type == meta::hybrid::DataType::VECTOR_FLOAT) { - // auto field_params = field_ptr->GetParams(); - // auto dim = field_params[knowhere::meta::DIM].get(); - // size_t vector_size = dim * sizeof(float); - // std::vector raw_vector; - // STATUS_CHECK(segment_reader.LoadVectors(offset * vector_size, vector_size, raw_vector)); - // - // vector_ref.vector_count_ = 1; - // std::vector float_vector; - // float_vector.resize(dim); - // memcpy(float_vector.data(), raw_vector.data(), vector_size); - // vector_ref.float_data_.swap(float_vector); - // } else { - // size_t num_bytes; - // switch (field_type) { - // case meta::hybrid::DataType::INT8: - // num_bytes = 1; - // break; - // case meta::hybrid::DataType::INT16: - // num_bytes = 2; - // break; - // case meta::hybrid::DataType::INT32: - // case meta::hybrid::DataType::FLOAT: - // num_bytes = 4; - // break; - // case meta::hybrid::DataType::INT64: - // case meta::hybrid::DataType::DOUBLE: - // num_bytes = 8; - // break; - // default: { - // std::string msg = "Field type of " + field_name + " not supported"; - // return Status(DB_ERROR, msg); - // } - // } - // std::vector raw_attr; - // STATUS_CHECK(segment_reader.LoadAttrs(field_name, offset * num_bytes, num_bytes, raw_attr)); - // raw_attrs.insert(std::make_pair(field_name, raw_attr)); - // } - // } - // - // attr_ref.attr_count_ = 1; - // attr_ref.attr_data_ = raw_attrs; + offsets.push_back(offset); } + STATUS_CHECK(segment_reader.LoadFieldsEntities(field_names_, offsets, data_chunk_)); + return Status::OK(); } diff --git a/core/src/db/SnapshotHandlers.h b/core/src/db/SnapshotHandlers.h index a6e28c975cc4a..17098bcc0fb62 100644 --- a/core/src/db/SnapshotHandlers.h +++ b/core/src/db/SnapshotHandlers.h @@ -15,6 +15,7 @@ #include "db/meta/FilesHolder.h" #include "db/snapshot/IterateHandler.h" #include "db/snapshot/Snapshot.h" +#include "segment/Segment.h" #include "segment/Types.h" #include "server/context/Context.h" #include "utils/Log.h" @@ -76,9 +77,7 @@ struct GetEntityByIdSegmentHandler : public snapshot::IterateHandler field_names_; - std::vector vector_data_; - std::vector attr_type_; - std::vector attr_data_; + engine::DataChunkPtr data_chunk_; }; /////////////////////////////////////////////////////////////////////////////// diff --git a/core/src/db/Types.h b/core/src/db/Types.h index c3a3e9df7440b..e30563e2fd54e 100644 --- a/core/src/db/Types.h +++ b/core/src/db/Types.h @@ -92,6 +92,7 @@ enum FieldElementType { FET_BLOOM_FILTER = 2, FET_DELETED_DOCS = 3, FET_INDEX = 4, + FET_COMPRESS_SQ8 = 5, }; } // namespace engine diff --git a/core/src/db/Utils.cpp b/core/src/db/Utils.cpp index 38fffb7406974..4bd9d555e3195 100644 --- a/core/src/db/Utils.cpp +++ b/core/src/db/Utils.cpp @@ -304,43 +304,6 @@ EraseFromCache(const std::string& item_key) { } #endif } - -Status -CreatePath(const snapshot::Segment* segment, const DBOptions& options, std::string& path) { - std::string tables_path = options.meta_.path_ + TABLES_FOLDER; - STATUS_CHECK(CommonUtil::CreateDirectory(tables_path)); - std::string collection_path = tables_path + "/" + std::to_string(segment->GetCollectionId()); - STATUS_CHECK(CommonUtil::CreateDirectory(collection_path)); - std::string partition_path = collection_path + "/" + std::to_string(segment->GetPartitionId()); - STATUS_CHECK(CommonUtil::CreateDirectory(partition_path)); - path = partition_path + "/" + std::to_string(segment->GetID()); - STATUS_CHECK(CommonUtil::CreateDirectory(path)); - - return Status::OK(); -} - -Status -CreatePath(const snapshot::Partition* partition, const DBOptions& options, std::string& path) { - std::string tables_path = options.meta_.path_ + TABLES_FOLDER; - STATUS_CHECK(CommonUtil::CreateDirectory(tables_path)); - std::string collection_path = tables_path + "/" + std::to_string(partition->GetCollectionId()); - STATUS_CHECK(CommonUtil::CreateDirectory(collection_path)); - path = collection_path + "/" + std::to_string(partition->GetID()); - STATUS_CHECK(CommonUtil::CreateDirectory(path)); - - return Status::OK(); -} - -Status -CreatePath(const snapshot::Collection* collection, const DBOptions& options, std::string& path) { - std::string tables_path = options.meta_.path_ + TABLES_FOLDER; - STATUS_CHECK(CommonUtil::CreateDirectory(tables_path)); - path = tables_path + "/" + std::to_string(collection->GetID()); - STATUS_CHECK(CommonUtil::CreateDirectory(path)); - - return Status::OK(); -} - } // namespace utils } // namespace engine } // namespace milvus diff --git a/core/src/db/Utils.h b/core/src/db/Utils.h index f6c869fbe373a..7b65fe0265058 100644 --- a/core/src/db/Utils.h +++ b/core/src/db/Utils.h @@ -86,15 +86,6 @@ ExitOnWriteError(Status& status); void EraseFromCache(const std::string& item_key); - -Status -CreatePath(const snapshot::Segment* segment, const DBOptions& options, std::string& path); - -Status -CreatePath(const snapshot::Partition* partition, const DBOptions& options, std::string& path); - -Status -CreatePath(const snapshot::Collection* collection, const DBOptions& options, std::string& path); } // namespace utils } // namespace engine } // namespace milvus diff --git a/core/src/db/insert/SSMemCollection.cpp b/core/src/db/insert/SSMemCollection.cpp index a4569ef01bc85..b4a410c0b11c3 100644 --- a/core/src/db/insert/SSMemCollection.cpp +++ b/core/src/db/insert/SSMemCollection.cpp @@ -46,6 +46,8 @@ SSMemCollection::Add(const milvus::engine::SSVectorSourcePtr& source) { status = new_mem_segment->Add(source); if (status.ok()) { mem_segment_list_.emplace_back(new_mem_segment); + } else { + return status; } } else { status = current_mem_segment->Add(source); diff --git a/core/src/db/insert/SSMemSegment.cpp b/core/src/db/insert/SSMemSegment.cpp index f43105813836f..3f4cc967c02a9 100644 --- a/core/src/db/insert/SSMemSegment.cpp +++ b/core/src/db/insert/SSMemSegment.cpp @@ -116,7 +116,7 @@ SSMemSegment::CreateSegment() { } Status -SSMemSegment::GetSingleEntitySize(size_t& single_size) { +SSMemSegment::GetSingleEntitySize(int64_t& single_size) { snapshot::ScopedSnapshotT ss; auto status = snapshot::Snapshots::GetInstance().GetSnapshot(ss, collection_id_); if (!status.ok()) { @@ -153,6 +153,7 @@ SSMemSegment::GetSingleEntitySize(size_t& single_size) { case meta::hybrid::DataType::INT64: single_size += sizeof(uint64_t); break; + case meta::hybrid::DataType::VECTOR: case meta::hybrid::DataType::VECTOR_FLOAT: case meta::hybrid::DataType::VECTOR_BINARY: { json params = field->GetParams(); @@ -179,7 +180,7 @@ SSMemSegment::GetSingleEntitySize(size_t& single_size) { Status SSMemSegment::Add(const SSVectorSourcePtr& source) { - size_t single_entity_mem_size = 0; + int64_t single_entity_mem_size = 0; auto status = GetSingleEntitySize(single_entity_mem_size); if (!status.ok()) { return status; @@ -187,8 +188,8 @@ SSMemSegment::Add(const SSVectorSourcePtr& source) { size_t mem_left = GetMemLeft(); if (mem_left >= single_entity_mem_size) { - size_t num_entities_to_add = std::ceil(mem_left / single_entity_mem_size); - size_t num_entities_added; + int64_t num_entities_to_add = std::ceil(mem_left / single_entity_mem_size); + int64_t num_entities_added; auto status = source->Add(segment_writer_ptr_, num_entities_to_add, num_entities_added); @@ -254,19 +255,19 @@ SSMemSegment::Delete(const std::vector& doc_ids) { return Status::OK(); } -size_t +int64_t SSMemSegment::GetCurrentMem() { return current_mem_; } -size_t +int64_t SSMemSegment::GetMemLeft() { return (MAX_TABLE_FILE_MEM - current_mem_); } bool SSMemSegment::IsFull() { - size_t single_entity_mem_size = 0; + int64_t single_entity_mem_size = 0; auto status = GetSingleEntitySize(single_entity_mem_size); if (!status.ok()) { return true; @@ -280,29 +281,15 @@ SSMemSegment::Serialize(uint64_t wal_lsn) { int64_t size = GetCurrentMem(); server::CollectSerializeMetrics metrics(size); - snapshot::SegmentFileContext sf_context; - sf_context.field_name = "vector"; - sf_context.field_element_name = "raw"; - sf_context.collection_id = segment_->GetCollectionId(); - sf_context.partition_id = segment_->GetPartitionId(); - sf_context.segment_id = segment_->GetID(); - snapshot::SegmentFilePtr seg_file; - auto status = operation_->CommitNewSegmentFile(sf_context, seg_file); - - status = segment_writer_ptr_->Serialize(); + auto status = segment_writer_ptr_->Serialize(); if (!status.ok()) { LOG_ENGINE_ERROR_ << "Failed to serialize segment: " << segment_->GetID(); return status; } - seg_file->SetSize(segment_writer_ptr_->Size()); - seg_file->SetRowCount(segment_writer_ptr_->RowCount()); - + status = operation_->CommitRowCount(segment_writer_ptr_->RowCount()); status = operation_->Push(); - - LOG_ENGINE_DEBUG_ << "New file " << seg_file->GetID() << " of size " << seg_file->GetSize() - << " bytes, lsn = " << wal_lsn; - + LOG_ENGINE_DEBUG_ << "New segment " << segment_->GetID() << " serialized, lsn = " << wal_lsn; return status; } diff --git a/core/src/db/insert/SSMemSegment.h b/core/src/db/insert/SSMemSegment.h index 07becaeedf569..2f72d7144ff09 100644 --- a/core/src/db/insert/SSMemSegment.h +++ b/core/src/db/insert/SSMemSegment.h @@ -42,10 +42,10 @@ class SSMemSegment : public server::CacheConfigHandler { Status Delete(const std::vector& doc_ids); - size_t + int64_t GetCurrentMem(); - size_t + int64_t GetMemLeft(); bool @@ -66,7 +66,7 @@ class SSMemSegment : public server::CacheConfigHandler { CreateSegment(); Status - GetSingleEntitySize(size_t& single_size); + GetSingleEntitySize(int64_t& single_size); private: int64_t collection_id_; @@ -75,7 +75,7 @@ class SSMemSegment : public server::CacheConfigHandler { std::shared_ptr operation_; snapshot::SegmentPtr segment_; DBOptions options_; - size_t current_mem_; + int64_t current_mem_; // ExecutionEnginePtr execution_engine_; segment::SSSegmentWriterPtr segment_writer_ptr_; diff --git a/core/src/db/insert/SSVectorSource.cpp b/core/src/db/insert/SSVectorSource.cpp index 0538db0e3768b..527c344299357 100644 --- a/core/src/db/insert/SSVectorSource.cpp +++ b/core/src/db/insert/SSVectorSource.cpp @@ -27,55 +27,19 @@ SSVectorSource::SSVectorSource(const DataChunkPtr& chunk) : chunk_(chunk) { } Status -SSVectorSource::Add(const segment::SSSegmentWriterPtr& segment_writer_ptr, const size_t& num_entities_to_add, - size_t& num_entities_added) { +SSVectorSource::Add(const segment::SSSegmentWriterPtr& segment_writer_ptr, const int64_t& num_entities_to_add, + int64_t& num_entities_added) { // TODO: n = vectors_.vector_count_;??? - uint64_t n = chunk_->count_; + int64_t n = chunk_->count_; num_entities_added = current_num_added_ + num_entities_to_add <= n ? num_entities_to_add : n - current_num_added_; - // IDNumbers vector_ids_to_add; - // if (vectors_.id_array_.empty()) { - // SafeIDGenerator& id_generator = SafeIDGenerator::GetInstance(); - // Status status = id_generator.GetNextIDNumbers(num_entities_added, vector_ids_to_add); - // if (!status.ok()) { - // return status; - // } - // } else { - // vector_ids_to_add.resize(num_entities_added); - // for (size_t pos = current_num_attrs_added; pos < current_num_attrs_added + num_entities_added; pos++) { - // vector_ids_to_add[pos - current_num_attrs_added] = vectors_.id_array_[pos]; - // } - // } - // - // Status status; - // status = segment_writer_ptr->AddAttrs("", attr_size_, attr_data_, vector_ids_to_add); - // - // if (status.ok()) { - // current_num_attrs_added += num_entities_added; - // } else { - // LOG_ENGINE_ERROR_ << LogOut("[%s][%ld]", "insert", 0) << "Generate ids fail: " << status.message(); - // return status; - // } - // - // std::vector vectors; - // auto size = num_entities_added * dimension * sizeof(float); - // vectors.resize(size); - // memcpy(vectors.data(), vectors_.float_data_.data() + current_num_vectors_added * dimension, size); - // LOG_ENGINE_DEBUG_ << LogOut("[%s][%ld]", "insert", 0) << "Insert into segment"; - // status = segment_writer_ptr->AddVectors("", vectors, vector_ids_to_add); - // if (status.ok()) { - // current_num_vectors_added += num_entities_added; - // vector_ids_.insert(vector_ids_.end(), std::make_move_iterator(vector_ids_to_add.begin()), - // std::make_move_iterator(vector_ids_to_add.end())); - // } - // - // // don't need to add current_num_attrs_added again - // if (!status.ok()) { - // LOG_ENGINE_ERROR_ << LogOut("[%s][%ld]", "insert", 0) << "SSVectorSource::Add failed: " + - // status.ToString(); return status; - // } - // - // return status; - return Status::OK(); + + auto status = segment_writer_ptr->AddChunk(chunk_, current_num_added_, num_entities_added); + if (!status.ok()) { + return status; + } + + current_num_added_ += num_entities_added; + return status; } bool diff --git a/core/src/db/insert/SSVectorSource.h b/core/src/db/insert/SSVectorSource.h index f9c171e07ae66..8f6189b79569c 100644 --- a/core/src/db/insert/SSVectorSource.h +++ b/core/src/db/insert/SSVectorSource.h @@ -33,7 +33,8 @@ class SSVectorSource { explicit SSVectorSource(const DataChunkPtr& chunk); Status - Add(const segment::SSSegmentWriterPtr& segment_writer_ptr, const size_t& num_attrs_to_add, size_t& num_attrs_added); + Add(const segment::SSSegmentWriterPtr& segment_writer_ptr, const int64_t& num_attrs_to_add, + int64_t& num_attrs_added); bool AllAdded(); @@ -41,7 +42,7 @@ class SSVectorSource { private: DataChunkPtr chunk_; - size_t current_num_added_ = 0; + int64_t current_num_added_ = 0; }; // SSVectorSource using SSVectorSourcePtr = std::shared_ptr; diff --git a/core/src/db/merge/SSMergeManagerImpl.cpp b/core/src/db/merge/SSMergeManagerImpl.cpp index f46a70761f6d9..8d117e0477ffa 100644 --- a/core/src/db/merge/SSMergeManagerImpl.cpp +++ b/core/src/db/merge/SSMergeManagerImpl.cpp @@ -70,6 +70,8 @@ SSMergeManagerImpl::MergeFiles(const std::string& collection_name) { for (it = part2seg.begin(); it != part2seg.end();) { if (it->second.size() <= 1) { part2seg.erase(it++); + } else { + it++; } } diff --git a/core/src/db/merge/SSMergeSimpleStrategy.cpp b/core/src/db/merge/SSMergeSimpleStrategy.cpp index ff8f97f711ea3..83547c169c5b5 100644 --- a/core/src/db/merge/SSMergeSimpleStrategy.cpp +++ b/core/src/db/merge/SSMergeSimpleStrategy.cpp @@ -49,6 +49,10 @@ SSMergeSimpleStrategy::RegroupSegments(const snapshot::ScopedSnapshotT& ss, cons continue; } } + + if (!ids.empty()) { + groups.push_back(ids); + } } return Status::OK(); diff --git a/core/src/db/merge/SSMergeTask.cpp b/core/src/db/merge/SSMergeTask.cpp index 6397fb7897b53..7607b5c6f0532 100644 --- a/core/src/db/merge/SSMergeTask.cpp +++ b/core/src/db/merge/SSMergeTask.cpp @@ -15,8 +15,8 @@ #include "db/snapshot/Operations.h" #include "db/snapshot/Snapshots.h" #include "metrics/Metrics.h" -#include "segment/SegmentReader.h" -#include "segment/SegmentWriter.h" +#include "segment/SSSegmentReader.h" +#include "segment/SSSegmentWriter.h" #include "utils/Log.h" #include @@ -40,7 +40,7 @@ SSMergeTask::Execute() { for (auto& id : segments_) { auto seg = snapshot_->GetResource(id); if (!seg) { - return Status(DB_ERROR, "snapshot segment is null"); + return Status(DB_ERROR, "Snapshot segment is null"); } context.stale_segments.push_back(seg); @@ -57,15 +57,78 @@ SSMergeTask::Execute() { return status; } - // TODO: merge each field, each field create a new SegmentFile - snapshot::SegmentFileContext sf_context; - sf_context.field_name = "vector"; - sf_context.field_element_name = "ivfsq8"; - sf_context.segment_id = 1; - sf_context.partition_id = 1; - sf_context.segment_id = new_seg->GetID(); - snapshot::SegmentFilePtr seg_file; - status = op->CommitNewSegmentFile(sf_context, seg_file); + // create segment raw files (placeholder) + auto names = snapshot_->GetFieldNames(); + for (auto& name : names) { + snapshot::SegmentFileContext sf_context; + sf_context.collection_id = new_seg->GetCollectionId(); + sf_context.partition_id = new_seg->GetPartitionId(); + sf_context.segment_id = new_seg->GetID(); + sf_context.field_name = name; + sf_context.field_element_name = engine::DEFAULT_RAW_DATA_NAME; + + snapshot::SegmentFilePtr seg_file; + status = op->CommitNewSegmentFile(sf_context, seg_file); + if (!status.ok()) { + std::string err_msg = "SSMergeTask create segment failed: " + status.ToString(); + LOG_ENGINE_ERROR_ << err_msg; + return status; + } + } + + // create deleted_doc and bloom_filter files (placeholder) + { + snapshot::SegmentFileContext sf_context; + sf_context.collection_id = new_seg->GetCollectionId(); + sf_context.partition_id = new_seg->GetPartitionId(); + sf_context.segment_id = new_seg->GetID(); + sf_context.field_name = engine::DEFAULT_UID_NAME; + sf_context.field_element_name = engine::DEFAULT_DELETED_DOCS_NAME; + + snapshot::SegmentFilePtr delete_doc_file, bloom_filter_file; + status = op->CommitNewSegmentFile(sf_context, delete_doc_file); + if (!status.ok()) { + std::string err_msg = "SSMergeTask create bloom filter segment file failed: " + status.ToString(); + LOG_ENGINE_ERROR_ << err_msg; + return status; + } + + sf_context.field_element_name = engine::DEFAULT_BLOOM_FILTER_NAME; + status = op->CommitNewSegmentFile(sf_context, bloom_filter_file); + if (!status.ok()) { + std::string err_msg = "SSMergeTask create deleted-doc segment file failed: " + status.ToString(); + LOG_ENGINE_ERROR_ << err_msg; + return status; + } + } + + auto ctx = op->GetContext(); + auto visitor = SegmentVisitor::Build(snapshot_, ctx.new_segment, ctx.new_segment_files); + + // create segment writer + segment::SSSegmentWriterPtr segment_writer = + std::make_shared(options_.meta_.path_, visitor); + + // merge + for (auto& id : segments_) { + auto seg = snapshot_->GetResource(id); + + auto read_visitor = SegmentVisitor::Build(snapshot_, id); + segment::SSSegmentReaderPtr segment_reader = + std::make_shared(options_.meta_.path_, read_visitor); + status = segment_writer->Merge(segment_reader); + if (!status.ok()) { + std::string err_msg = "SSMergeTask merge failed: " + status.ToString(); + LOG_ENGINE_ERROR_ << err_msg; + return status; + } + } + + status = segment_writer->Serialize(); + if (!status.ok()) { + LOG_ENGINE_ERROR_ << "Failed to serialize segment: " << new_seg->GetID(); + return status; + } status = op->Push(); diff --git a/core/src/db/snapshot/Snapshot.h b/core/src/db/snapshot/Snapshot.h index 0c75e06599a58..f79c14053e1bf 100644 --- a/core/src/db/snapshot/Snapshot.h +++ b/core/src/db/snapshot/Snapshot.h @@ -95,7 +95,8 @@ class Snapshot : public ReferenceProxy { Status GetPartitionId(const std::string& name, ID_TYPE& id) const { - auto it = partition_names_map_.find(name); + std::string real_name = name.empty() ? DEFAULT_PARTITON_TAG : name; + auto it = partition_names_map_.find(real_name); if (it == partition_names_map_.end()) { return Status(SS_NOT_FOUND_ERROR, "Specified partition name not found"); } diff --git a/core/src/db/wal/WalDefinations.h b/core/src/db/wal/WalDefinations.h index 6808e902240e0..8e38282f6e66f 100644 --- a/core/src/db/wal/WalDefinations.h +++ b/core/src/db/wal/WalDefinations.h @@ -18,6 +18,7 @@ #include "db/Types.h" #include "db/meta/MetaTypes.h" +#include "segment/Segment.h" namespace milvus { namespace engine { @@ -41,12 +42,14 @@ struct MXLogRecord { const IDNumber* ids; uint32_t data_size; const void* data; - std::vector field_names; + std::vector field_names; // will be removed // std::vector attrs_size; // std::vector attrs_data; - std::unordered_map attr_nbytes; - std::unordered_map attr_data_size; - std::unordered_map> attr_data; + std::unordered_map attr_nbytes; // will be removed + std::unordered_map attr_data_size; // will be removed + std::unordered_map> attr_data; // will be removed + + engine::DataChunkPtr data_chunk; // for hybird data transfer }; struct MXLogConfiguration { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h b/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h index 35283cda31997..3997e67ff5daf 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h @@ -25,6 +25,7 @@ namespace milvus { namespace knowhere { +#define INDEX_DATA "INDEX_DATA" #define RAW_DATA "RAW_DATA" #define SQ8_DATA "SQ8_DATA" diff --git a/core/src/segment/SSSegmentReader.cpp b/core/src/segment/SSSegmentReader.cpp index 8db3e0c4be7f1..b81f3568161b4 100644 --- a/core/src/segment/SSSegmentReader.cpp +++ b/core/src/segment/SSSegmentReader.cpp @@ -17,13 +17,16 @@ #include "segment/SSSegmentReader.h" +#include #include +#include #include "Vectors.h" #include "codecs/snapshot/SSCodec.h" #include "db/Types.h" #include "db/snapshot/ResourceHelper.h" #include "knowhere/index/vector_index/VecIndex.h" +#include "knowhere/index/vector_index/helpers/IndexParameter.h" #include "storage/disk/DiskIOReader.h" #include "storage/disk/DiskIOWriter.h" #include "storage/disk/DiskOperation.h" @@ -34,84 +37,74 @@ namespace segment { SSSegmentReader::SSSegmentReader(const std::string& dir_root, const engine::SegmentVisitorPtr& segment_visitor) : dir_root_(dir_root), segment_visitor_(segment_visitor) { - auto& segment_ptr = segment_visitor_->GetSegment(); + Initialize(); +} + +Status +SSSegmentReader::Initialize() { std::string directory = - engine::snapshot::GetResPath(dir_root_, segment_visitor->GetSegment()); + engine::snapshot::GetResPath(dir_root_, segment_visitor_->GetSegment()); storage::IOReaderPtr reader_ptr = std::make_shared(); storage::IOWriterPtr writer_ptr = std::make_shared(); storage::OperationPtr operation_ptr = std::make_shared(directory); fs_ptr_ = std::make_shared(reader_ptr, writer_ptr, operation_ptr); - segment_ptr_ = std::make_shared(); -} + segment_ptr_ = std::make_shared(); + + const engine::SegmentVisitor::IdMapT& field_map = segment_visitor_->GetFieldVisitors(); + for (auto& iter : field_map) { + const engine::snapshot::FieldPtr& field = iter.second->GetField(); + std::string name = field->GetName(); + engine::FIELD_TYPE ftype = static_cast(field->GetFtype()); + if (ftype == engine::FIELD_TYPE::VECTOR || ftype == engine::FIELD_TYPE::VECTOR_FLOAT || + ftype == engine::FIELD_TYPE::VECTOR_BINARY) { + json params = field->GetParams(); + if (params.find(knowhere::meta::DIM) == params.end()) { + std::string msg = "Vector field params must contain: dimension"; + LOG_SERVER_ERROR_ << msg; + return Status(DB_ERROR, msg); + } + + int64_t field_width = 0; + int64_t dimension = params[knowhere::meta::DIM]; + if (ftype == engine::FIELD_TYPE::VECTOR_BINARY) { + field_width += (dimension / 8); + } else { + field_width += (dimension * sizeof(float)); + } + segment_ptr_->AddField(name, ftype, field_width); + } else { + segment_ptr_->AddField(name, ftype); + } + } -Status -SSSegmentReader::LoadCache(bool& in_cache) { - in_cache = false; return Status::OK(); } Status SSSegmentReader::Load() { - try { - // auto& ss_codec = codec::SSCodec::instance(); + STATUS_CHECK(LoadFields()); - auto uid_field_visitor = segment_visitor_->GetFieldVisitor(engine::DEFAULT_UID_NAME); + STATUS_CHECK(LoadBloomFilter()); + + STATUS_CHECK(LoadDeletedDocs()); + + STATUS_CHECK(LoadVectorIndice()); - /* load UID's raw data */ - auto uid_raw_visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_RAW); - std::string uid_raw_path = - engine::snapshot::GetResPath(dir_root_, uid_raw_visitor->GetFile()); - STATUS_CHECK(LoadUids(uid_raw_path, segment_ptr_->vectors_ptr_->GetMutableUids())); - - /* load UID's deleted docs */ - auto uid_del_visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_DELETED_DOCS); - std::string uid_del_path = - engine::snapshot::GetResPath(dir_root_, uid_del_visitor->GetFile()); - STATUS_CHECK(LoadDeletedDocs(uid_del_path, segment_ptr_->deleted_docs_ptr_)); - - /* load other data */ - Status s; - auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); - for (auto& f_kv : field_visitors_map) { - auto& fv = f_kv.second; - auto& field = fv->GetField(); - for (auto& file_kv : fv->GetElementVistors()) { - auto& fev = file_kv.second; - std::string file_path = - engine::snapshot::GetResPath(dir_root_, fev->GetFile()); - if (!s.ok()) { - LOG_ENGINE_WARNING_ << "Cannot get resource path"; - } - - auto& segment_file = fev->GetFile(); - if (segment_file == nullptr) { - continue; - } - auto& field_element = fev->GetElement(); - - if ((field->GetFtype() == engine::FieldType::VECTOR_FLOAT || - field->GetFtype() == engine::FieldType::VECTOR_BINARY) && - field_element->GetFtype() == engine::FieldElementType::FET_RAW) { - STATUS_CHECK(LoadVectors(file_path, 0, INT64_MAX, segment_ptr_->vectors_ptr_->GetMutableData())); - } - - /* SS TODO: load attr data ? */ - } - } - } catch (std::exception& e) { - return Status(DB_ERROR, e.what()); - } return Status::OK(); } Status -SSSegmentReader::LoadVectors(const std::string& file_path, off_t offset, size_t num_bytes, - std::vector& raw_vectors) { +SSSegmentReader::LoadField(const std::string& field_name, std::vector& raw) { try { + auto field_visitor = segment_visitor_->GetFieldVisitor(field_name); + auto raw_visitor = field_visitor->GetElementVisitor(engine::FieldElementType::FET_RAW); + std::string file_path = + engine::snapshot::GetResPath(dir_root_, raw_visitor->GetFile()); + auto& ss_codec = codec::SSCodec::instance(); - ss_codec.GetVectorsFormat()->read_vectors(fs_ptr_, file_path, offset, num_bytes, raw_vectors); + ss_codec.GetBlockFormat()->read(fs_ptr_, file_path, raw); } catch (std::exception& e) { std::string err_msg = "Failed to load raw vectors: " + std::string(e.what()); LOG_ENGINE_ERROR_ << err_msg; @@ -121,55 +114,177 @@ SSSegmentReader::LoadVectors(const std::string& file_path, off_t offset, size_t } Status -SSSegmentReader::LoadAttrs(const std::string& field_name, off_t offset, size_t num_bytes, - std::vector& raw_attrs) { - try { - auto& ss_codec = codec::SSCodec::instance(); - ss_codec.GetAttrsFormat()->read_attrs(fs_ptr_, field_name, offset, num_bytes, raw_attrs); - } catch (std::exception& e) { - std::string err_msg = "Failed to load raw attributes: " + std::string(e.what()); - LOG_ENGINE_ERROR_ << err_msg; - return Status(DB_ERROR, err_msg); +SSSegmentReader::LoadFields() { + engine::FIXEDX_FIELD_MAP& field_map = segment_ptr_->GetFixedFields(); + auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); + for (auto& iter : field_visitors_map) { + const engine::snapshot::FieldPtr& field = iter.second->GetField(); + std::string name = field->GetName(); + engine::FIXED_FIELD_DATA raw_data; + segment_ptr_->GetFixedFieldData(name, raw_data); + + auto element_visitor = iter.second->GetElementVisitor(engine::FieldElementType::FET_RAW); + std::string file_path = + engine::snapshot::GetResPath(dir_root_, element_visitor->GetFile()); + STATUS_CHECK(LoadField(file_path, raw_data)); + + field_map.insert(std::make_pair(name, raw_data)); } + return Status::OK(); } Status -SSSegmentReader::LoadUids(const std::string& file_path, std::vector& uids) { +SSSegmentReader::LoadEntities(const std::string& field_name, const std::vector& offsets, + std::vector& raw) { try { + auto field_visitor = segment_visitor_->GetFieldVisitor(field_name); + auto raw_visitor = field_visitor->GetElementVisitor(engine::FieldElementType::FET_RAW); + std::string file_path = + engine::snapshot::GetResPath(dir_root_, raw_visitor->GetFile()); + + int64_t field_width = 0; + segment_ptr_->GetFixedFieldWidth(field_name, field_width); + if (field_width <= 0) { + return Status(DB_ERROR, "Invalid field width"); + } + + codec::ReadRanges ranges; + for (auto offset : offsets) { + ranges.push_back(codec::ReadRange(offset, field_width)); + } auto& ss_codec = codec::SSCodec::instance(); - ss_codec.GetVectorsFormat()->read_uids(fs_ptr_, file_path, uids); + ss_codec.GetBlockFormat()->read(fs_ptr_, file_path, ranges, raw); } catch (std::exception& e) { - std::string err_msg = "Failed to load uids: " + std::string(e.what()); + std::string err_msg = "Failed to load raw vectors: " + std::string(e.what()); LOG_ENGINE_ERROR_ << err_msg; return Status(DB_ERROR, err_msg); } + return Status::OK(); } Status -SSSegmentReader::GetSegment(SegmentPtr& segment_ptr) { - segment_ptr = segment_ptr_; +SSSegmentReader::LoadFieldsEntities(const std::vector& fields_name, const std::vector& offsets, + engine::DataChunkPtr& data_chunk) { + data_chunk = std::make_shared(); + data_chunk->count_ = offsets.size(); + for (auto& name : fields_name) { + engine::FIXED_FIELD_DATA raw_data; + auto status = LoadEntities(name, offsets, raw_data); + if (!status.ok()) { + return status; + } + + data_chunk->fixed_fields_[name] = raw_data; + } + return Status::OK(); } Status -SSSegmentReader::LoadVectorIndex(const std::string& location, codec::ExternalData external_data, - segment::VectorIndexPtr& vector_index_ptr) { +SSSegmentReader::LoadUids(std::vector& uids) { + std::vector raw; + auto status = LoadField(engine::DEFAULT_UID_NAME, raw); + if (!status.ok()) { + LOG_ENGINE_ERROR_ << status.message(); + return status; + } + + if (raw.size() % sizeof(int64_t) != 0) { + std::string err_msg = "Failed to load uids: illegal file size"; + LOG_ENGINE_ERROR_ << err_msg; + return Status(DB_ERROR, err_msg); + } + + uids.clear(); + uids.resize(raw.size() / sizeof(int64_t)); + memcpy(uids.data(), raw.data(), raw.size()); + + return Status::OK(); +} + +Status +SSSegmentReader::LoadVectorIndex(const std::string& field_name, segment::VectorIndexPtr& vector_index_ptr) { try { auto& ss_codec = codec::SSCodec::instance(); - ss_codec.GetVectorIndexFormat()->read(fs_ptr_, location, external_data, vector_index_ptr); + auto field_visitor = segment_visitor_->GetFieldVisitor(field_name); + knowhere::BinarySet index_data; + knowhere::BinaryPtr raw_data, compress_data; + + auto index_visitor = field_visitor->GetElementVisitor(engine::FieldElementType::FET_INDEX); + if (index_visitor) { + std::string file_path = + engine::snapshot::GetResPath(dir_root_, index_visitor->GetFile()); + ss_codec.GetVectorIndexFormat()->read_index(fs_ptr_, file_path, index_data); + } + + engine::FIXED_FIELD_DATA fixed_data; + auto status = segment_ptr_->GetFixedFieldData(field_name, fixed_data); + if (status.ok()) { + ss_codec.GetVectorIndexFormat()->convert_raw(fixed_data, raw_data); + } else if (auto visitor = field_visitor->GetElementVisitor(engine::FieldElementType::FET_RAW)) { + std::string file_path = + engine::snapshot::GetResPath(dir_root_, visitor->GetFile()); + + ss_codec.GetVectorIndexFormat()->read_raw(fs_ptr_, file_path, raw_data); + } + + if (auto visitor = field_visitor->GetElementVisitor(engine::FieldElementType::FET_COMPRESS_SQ8)) { + std::string file_path = + engine::snapshot::GetResPath(dir_root_, visitor->GetFile()); + ss_codec.GetVectorIndexFormat()->read_compress(fs_ptr_, file_path, compress_data); + } + + knowhere::VecIndexPtr index; + std::string index_name = index_visitor->GetElement()->GetName(); + ss_codec.GetVectorIndexFormat()->construct_index(index_name, index_data, raw_data, compress_data, index); + + vector_index_ptr = std::make_shared(index); } catch (std::exception& e) { std::string err_msg = "Failed to load vector index: " + std::string(e.what()); LOG_ENGINE_ERROR_ << err_msg; return Status(DB_ERROR, err_msg); } + + return Status::OK(); +} + +Status +SSSegmentReader::LoadVectorIndice() { + auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); + for (auto& iter : field_visitors_map) { + const engine::snapshot::FieldPtr& field = iter.second->GetField(); + std::string name = field->GetName(); + + auto element_visitor = iter.second->GetElementVisitor(engine::FieldElementType::FET_INDEX); + if (element_visitor == nullptr) { + continue; + } + + if (field->GetFtype() == engine::FIELD_TYPE::VECTOR || field->GetFtype() == engine::FIELD_TYPE::VECTOR_FLOAT || + field->GetFtype() == engine::FIELD_TYPE::VECTOR_BINARY) { + std::string file_path = + engine::snapshot::GetResPath(dir_root_, element_visitor->GetFile()); + + segment::VectorIndexPtr vector_index_ptr; + STATUS_CHECK(LoadVectorIndex(name, vector_index_ptr)); + + segment_ptr_->SetVectorIndex(name, vector_index_ptr->GetVectorIndex()); + } + } + return Status::OK(); } Status -SSSegmentReader::LoadBloomFilter(const std::string file_path, segment::IdBloomFilterPtr& id_bloom_filter_ptr) { +SSSegmentReader::LoadBloomFilter(segment::IdBloomFilterPtr& id_bloom_filter_ptr) { try { + auto uid_field_visitor = segment_visitor_->GetFieldVisitor(engine::DEFAULT_UID_NAME); + auto visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_BLOOM_FILTER); + std::string file_path = + engine::snapshot::GetResPath(dir_root_, visitor->GetFile()); + auto& ss_codec = codec::SSCodec::instance(); ss_codec.GetIdBloomFilterFormat()->read(fs_ptr_, file_path, id_bloom_filter_ptr); } catch (std::exception& e) { @@ -181,8 +296,28 @@ SSSegmentReader::LoadBloomFilter(const std::string file_path, segment::IdBloomFi } Status -SSSegmentReader::LoadDeletedDocs(const std::string& file_path, segment::DeletedDocsPtr& deleted_docs_ptr) { +SSSegmentReader::LoadBloomFilter() { + segment::IdBloomFilterPtr id_bloom_filter_ptr; + auto status = LoadBloomFilter(id_bloom_filter_ptr); + if (!status.ok()) { + return status; + } + + segment_ptr_->SetBloomFilter(id_bloom_filter_ptr); + return Status::OK(); +} + +Status +SSSegmentReader::LoadDeletedDocs(segment::DeletedDocsPtr& deleted_docs_ptr) { try { + auto uid_field_visitor = segment_visitor_->GetFieldVisitor(engine::DEFAULT_UID_NAME); + auto visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_DELETED_DOCS); + std::string file_path = + engine::snapshot::GetResPath(dir_root_, visitor->GetFile()); + if (!boost::filesystem::exists(file_path)) { + return Status::OK(); // file doesn't exist + } + auto& ss_codec = codec::SSCodec::instance(); ss_codec.GetDeletedDocsFormat()->read(fs_ptr_, file_path, deleted_docs_ptr); } catch (std::exception& e) { @@ -193,6 +328,18 @@ SSSegmentReader::LoadDeletedDocs(const std::string& file_path, segment::DeletedD return Status::OK(); } +Status +SSSegmentReader::LoadDeletedDocs() { + segment::DeletedDocsPtr deleted_docs_ptr; + auto status = LoadDeletedDocs(deleted_docs_ptr); + if (!status.ok()) { + return status; + } + + segment_ptr_->SetDeletedDocs(deleted_docs_ptr); + return Status::OK(); +} + Status SSSegmentReader::ReadDeletedDocsSize(size_t& size) { try { @@ -205,5 +352,32 @@ SSSegmentReader::ReadDeletedDocsSize(size_t& size) { } return Status::OK(); } + +Status +SSSegmentReader::GetSegment(engine::SegmentPtr& segment_ptr) { + segment_ptr = segment_ptr_; + return Status::OK(); +} + +Status +SSSegmentReader::GetSegmentID(int64_t& id) { + if (segment_visitor_) { + auto segment = segment_visitor_->GetSegment(); + if (segment) { + id = segment->GetID(); + return Status::OK(); + } + } + + return Status(DB_ERROR, "SSSegmentWriter::GetSegmentID: null pointer"); +} + +std::string +SSSegmentReader::GetSegmentPath() { + std::string seg_path = + engine::snapshot::GetResPath(dir_root_, segment_visitor_->GetSegment()); + return seg_path; +} + } // namespace segment } // namespace milvus diff --git a/core/src/segment/SSSegmentReader.h b/core/src/segment/SSSegmentReader.h index d8ab4b123be26..36cab0dd63234 100644 --- a/core/src/segment/SSSegmentReader.h +++ b/core/src/segment/SSSegmentReader.h @@ -23,7 +23,7 @@ #include "codecs/Codec.h" #include "db/SnapshotVisitor.h" -#include "segment/Types.h" +#include "segment/Segment.h" #include "storage/FSHandler.h" #include "utils/Status.h" @@ -34,42 +34,63 @@ class SSSegmentReader { public: explicit SSSegmentReader(const std::string& dir_root, const engine::SegmentVisitorPtr& segment_visitor); - // TODO(zhiru) Status - LoadCache(bool& in_cache); + Load(); Status - Load(); + LoadField(const std::string& field_name, std::vector& raw); + + Status + LoadFields(); + + Status + LoadEntities(const std::string& field_name, const std::vector& offsets, std::vector& raw); + + Status + LoadFieldsEntities(const std::vector& fields_name, const std::vector& offsets, + engine::DataChunkPtr& data_chunk); Status - LoadVectors(const std::string& file_path, off_t offset, size_t num_bytes, std::vector& raw_vectors); + LoadUids(std::vector& uids); Status - LoadAttrs(const std::string& field_name, off_t offset, size_t num_bytes, std::vector& raw_attrs); + LoadVectorIndex(const std::string& field_name, segment::VectorIndexPtr& vector_index_ptr); Status - LoadUids(const std::string& file_path, std::vector& uids); + LoadVectorIndice(); Status - LoadVectorIndex(const std::string& location, codec::ExternalData external_data, - segment::VectorIndexPtr& vector_index_ptr); + LoadBloomFilter(segment::IdBloomFilterPtr& id_bloom_filter_ptr); Status - LoadBloomFilter(const std::string file_path, segment::IdBloomFilterPtr& id_bloom_filter_ptr); + LoadBloomFilter(); Status - LoadDeletedDocs(const std::string& file_path, segment::DeletedDocsPtr& deleted_docs_ptr); + LoadDeletedDocs(segment::DeletedDocsPtr& deleted_docs_ptr); Status - GetSegment(SegmentPtr& segment_ptr); + LoadDeletedDocs(); Status ReadDeletedDocsSize(size_t& size); + Status + GetSegment(engine::SegmentPtr& segment_ptr); + + Status + GetSegmentID(int64_t& id); + + std::string + GetSegmentPath(); + + private: + Status + Initialize(); + private: engine::SegmentVisitorPtr segment_visitor_; storage::FSHandlerPtr fs_ptr_; - SegmentPtr segment_ptr_; + engine::SegmentPtr segment_ptr_; std::string dir_root_; }; diff --git a/core/src/segment/SSSegmentWriter.cpp b/core/src/segment/SSSegmentWriter.cpp index 37dd2727bb128..09ce734ab437f 100644 --- a/core/src/segment/SSSegmentWriter.cpp +++ b/core/src/segment/SSSegmentWriter.cpp @@ -59,7 +59,8 @@ SSSegmentWriter::Initialize() { const engine::snapshot::FieldPtr& field = iter.second->GetField(); std::string name = field->GetName(); engine::FIELD_TYPE ftype = static_cast(field->GetFtype()); - if (ftype == engine::FIELD_TYPE::VECTOR_FLOAT || ftype == engine::FIELD_TYPE::VECTOR_BINARY) { + if (ftype == engine::FIELD_TYPE::VECTOR || ftype == engine::FIELD_TYPE::VECTOR_FLOAT || + ftype == engine::FIELD_TYPE::VECTOR_BINARY) { json params = field->GetParams(); if (params.find(knowhere::meta::DIM) == params.end()) { std::string msg = "Vector field params must contain: dimension"; @@ -89,39 +90,20 @@ SSSegmentWriter::AddChunk(const engine::DataChunkPtr& chunk_ptr) { } Status -SSSegmentWriter::AddChunk(const engine::DataChunkPtr& chunk_ptr, uint64_t from, uint64_t to) { +SSSegmentWriter::AddChunk(const engine::DataChunkPtr& chunk_ptr, int64_t from, int64_t to) { return segment_ptr_->AddChunk(chunk_ptr, from, to); } Status SSSegmentWriter::Serialize() { - auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); - auto uid_field_visitor = segment_visitor_->GetFieldVisitor(engine::DEFAULT_UID_NAME); - - /* write fields raw data */ - for (auto& iter : field_visitors_map) { - const engine::snapshot::FieldPtr& field = iter.second->GetField(); - std::string name = field->GetName(); - engine::FIXED_FIELD_DATA raw_data; - segment_ptr_->GetFixedFieldData(name, raw_data); - - auto element_visitor = iter.second->GetElementVisitor(engine::FieldElementType::FET_RAW); - std::string file_path = - engine::snapshot::GetResPath(dir_root_, element_visitor->GetFile()); - STATUS_CHECK(WriteField(file_path, raw_data)); - } + // write fields raw data + STATUS_CHECK(WriteFields()); - /* write empty UID's deleted docs */ - auto uid_del_visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_DELETED_DOCS); - std::string uid_del_path = - engine::snapshot::GetResPath(dir_root_, uid_del_visitor->GetFile()); - STATUS_CHECK(WriteDeletedDocs(uid_del_path)); + // write empty UID's deleted docs + STATUS_CHECK(WriteDeletedDocs()); - /* don't write UID's bloom filter */ - // auto uid_blf_visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_BLOOM_FILTER); - // std::string uid_blf_path = - // engine::snapshot::GetResPath(dir_root_, uid_blf_visitor->GetFile()); - // STATUS_CHECK(WriteBloomFilter(uid_blf_path, segment_ptr_->GetBloomFilter())); + // write UID's bloom filter + STATUS_CHECK(WriteBloomFilter()); return Status::OK(); } @@ -142,10 +124,26 @@ SSSegmentWriter::WriteField(const std::string& file_path, const engine::FIXED_FI } Status -SSSegmentWriter::WriteBloomFilter(const std::string& file_path) { - try { - auto& ss_codec = codec::SSCodec::instance(); +SSSegmentWriter::WriteFields() { + auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); + for (auto& iter : field_visitors_map) { + const engine::snapshot::FieldPtr& field = iter.second->GetField(); + std::string name = field->GetName(); + engine::FIXED_FIELD_DATA raw_data; + segment_ptr_->GetFixedFieldData(name, raw_data); + + auto element_visitor = iter.second->GetElementVisitor(engine::FieldElementType::FET_RAW); + std::string file_path = + engine::snapshot::GetResPath(dir_root_, element_visitor->GetFile()); + STATUS_CHECK(WriteField(file_path, raw_data)); + } + return Status::OK(); +} + +Status +SSSegmentWriter::WriteBloomFilter() { + try { TimeRecorder recorder("SSSegmentWriter::WriteBloomFilter"); engine::FIXED_FIELD_DATA uid_data; @@ -154,19 +152,26 @@ SSSegmentWriter::WriteBloomFilter(const std::string& file_path) { return status; } - segment::IdBloomFilterPtr bloom_filter_ptr; - ss_codec.GetIdBloomFilterFormat()->create(fs_ptr_, bloom_filter_ptr); + auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); + auto uid_field_visitor = segment_visitor_->GetFieldVisitor(engine::DEFAULT_UID_NAME); + auto uid_blf_visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_BLOOM_FILTER); + std::string uid_blf_path = + engine::snapshot::GetResPath(dir_root_, uid_blf_visitor->GetFile()); - recorder.RecordSection("Initializing bloom filter"); + auto& ss_codec = codec::SSCodec::instance(); + segment::IdBloomFilterPtr bloom_filter_ptr; + ss_codec.GetIdBloomFilterFormat()->create(fs_ptr_, uid_blf_path, bloom_filter_ptr); int64_t* uids = (int64_t*)(uid_data.data()); int64_t row_count = segment_ptr_->GetRowCount(); - for (uint64_t i = 0; i < row_count; i++) { + for (int64_t i = 0; i < row_count; i++) { bloom_filter_ptr->Add(uids[i]); } segment_ptr_->SetBloomFilter(bloom_filter_ptr); - recorder.RecordSection("Adding " + std::to_string(row_count) + " ids to bloom filter"); + recorder.RecordSection("Initialize bloom filter"); + + return WriteBloomFilter(uid_blf_path, segment_ptr_->GetBloomFilter()); } catch (std::exception& e) { std::string err_msg = "Failed to write vectors: " + std::string(e.what()); LOG_ENGINE_ERROR_ << err_msg; @@ -174,15 +179,21 @@ SSSegmentWriter::WriteBloomFilter(const std::string& file_path) { engine::utils::SendExitSignal(); return Status(SERVER_WRITE_ERROR, err_msg); } - - return WriteBloomFilter(file_path, segment_ptr_->GetBloomFilter()); } Status SSSegmentWriter::WriteBloomFilter(const std::string& file_path, const IdBloomFilterPtr& id_bloom_filter_ptr) { + if (id_bloom_filter_ptr == nullptr) { + return Status(DB_ERROR, "WriteBloomFilter: null pointer"); + } + try { + TimeRecorder recorder("SSSegmentWriter::WriteBloomFilter"); + auto& ss_codec = codec::SSCodec::instance(); ss_codec.GetIdBloomFilterFormat()->write(fs_ptr_, file_path, id_bloom_filter_ptr); + + recorder.RecordSection("Write bloom filter file"); } catch (std::exception& e) { std::string err_msg = "Failed to write bloom filter: " + std::string(e.what()); LOG_ENGINE_ERROR_ << err_msg; @@ -194,16 +205,25 @@ SSSegmentWriter::WriteBloomFilter(const std::string& file_path, const IdBloomFil } Status -SSSegmentWriter::WriteDeletedDocs(const std::string& file_path) { - DeletedDocsPtr deleted_docs_ptr = std::make_shared(); - STATUS_CHECK(WriteDeletedDocs(file_path, deleted_docs_ptr)); - segment_ptr_->SetDeletedDocs(deleted_docs_ptr); - return Status::OK(); +SSSegmentWriter::WriteDeletedDocs() { + auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); + auto uid_field_visitor = segment_visitor_->GetFieldVisitor(engine::DEFAULT_UID_NAME); + auto del_doc_visitor = uid_field_visitor->GetElementVisitor(engine::FieldElementType::FET_DELETED_DOCS); + std::string file_path = + engine::snapshot::GetResPath(dir_root_, del_doc_visitor->GetFile()); + + return WriteDeletedDocs(file_path, segment_ptr_->GetDeletedDocs()); } Status SSSegmentWriter::WriteDeletedDocs(const std::string& file_path, const DeletedDocsPtr& deleted_docs) { + if (deleted_docs == nullptr) { + return Status::OK(); + } + try { + TimeRecorderAuto recorder("SSSegmentWriter::WriteDeletedDocs"); + auto& ss_codec = codec::SSCodec::instance(); ss_codec.GetDeletedDocsFormat()->write(fs_ptr_, file_path, deleted_docs); } catch (std::exception& e) { @@ -217,70 +237,68 @@ SSSegmentWriter::WriteDeletedDocs(const std::string& file_path, const DeletedDoc } Status -SSSegmentWriter::GetSegment(engine::SegmentPtr& segment_ptr) { - segment_ptr = segment_ptr_; - return Status::OK(); -} +SSSegmentWriter::Merge(const SSSegmentReaderPtr& segment_reader) { + if (segment_reader == nullptr) { + return Status(DB_ERROR, "Segment reader is null"); + } -Status -SSSegmentWriter::Merge(const SSSegmentReaderPtr& segment_to_merge) { - // if (dir_to_merge == fs_ptr_->operation_ptr_->GetDirectory()) { - // return Status(DB_ERROR, "Cannot Merge Self"); - // } - // - // LOG_ENGINE_DEBUG_ << "Merging from " << dir_to_merge << " to " << fs_ptr_->operation_ptr_->GetDirectory(); - // - // TimeRecorder recorder("SSSegmentWriter::Merge"); - // - // SSSegmentReader segment_reader_to_merge(dir_to_merge); - // bool in_cache; - // auto status = segment_reader_to_merge.LoadCache(in_cache); - // if (!in_cache) { - // status = segment_reader_to_merge.Load(); - // if (!status.ok()) { - // std::string msg = "Failed to load segment from " + dir_to_merge; - // LOG_ENGINE_ERROR_ << msg; - // return Status(DB_ERROR, msg); - // } - // } - // SegmentPtr segment_to_merge; - // segment_reader_to_merge.GetSegment(segment_to_merge); - // // auto& uids = segment_to_merge->vectors_ptr_->GetUids(); - // - // recorder.RecordSection("Loading segment"); - // - // if (segment_to_merge->deleted_docs_ptr_ != nullptr) { - // auto offsets_to_delete = segment_to_merge->deleted_docs_ptr_->GetDeletedDocs(); - // - // // Erase from raw data - // segment_to_merge->vectors_ptr_->Erase(offsets_to_delete); - // } - // - // recorder.RecordSection("erase"); - // - // AddVectors(name, segment_to_merge->vectors_ptr_->GetData(), segment_to_merge->vectors_ptr_->GetUids()); - // - // auto rows = segment_to_merge->vectors_ptr_->GetCount(); - // recorder.RecordSection("Adding " + std::to_string(rows) + " vectors and uids"); - // - // std::unordered_map attr_nbytes; - // std::unordered_map> attr_data; - // auto attr_it = segment_to_merge->attrs_ptr_->attrs.begin(); - // for (; attr_it != segment_to_merge->attrs_ptr_->attrs.end(); attr_it++) { - // attr_nbytes.insert(std::make_pair(attr_it->first, attr_it->second->GetNbytes())); - // attr_data.insert(std::make_pair(attr_it->first, attr_it->second->GetData())); - // - // if (segment_to_merge->deleted_docs_ptr_ != nullptr) { - // auto offsets_to_delete = segment_to_merge->deleted_docs_ptr_->GetDeletedDocs(); - // - // // Erase from field data - // attr_it->second->Erase(offsets_to_delete); - // } - // } - // AddAttrs(name, attr_nbytes, attr_data, segment_to_merge->vectors_ptr_->GetUids()); - // - // LOG_ENGINE_DEBUG_ << "Merging completed from " << dir_to_merge << " to " << - // fs_ptr_->operation_ptr_->GetDirectory(); + // check conflict + int64_t src_id, target_id; + auto status = GetSegmentID(target_id); + if (!status.ok()) { + return status; + } + status = segment_reader->GetSegmentID(src_id); + if (!status.ok()) { + return status; + } + if (src_id == target_id) { + return Status(DB_ERROR, "Cannot Merge Self"); + } + + LOG_ENGINE_DEBUG_ << "Merging from " << segment_reader->GetSegmentPath() << " to " << GetSegmentPath(); + + TimeRecorder recorder("SSSegmentWriter::Merge"); + + // merge deleted docs (Note: this step must before merge raw data) + segment::DeletedDocsPtr src_deleted_docs; + status = segment_reader->LoadDeletedDocs(src_deleted_docs); + if (!status.ok()) { + return status; + } + + engine::SegmentPtr src_segment; + status = segment_reader->GetSegment(src_segment); + if (!status.ok()) { + return status; + } + + if (src_deleted_docs) { + const std::vector& delete_ids = src_deleted_docs->GetDeletedDocs(); + for (auto offset : delete_ids) { + src_segment->DeleteEntity(offset); + } + } + + // merge filed raw data + engine::DataChunkPtr chunk = std::make_shared(); + auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); + for (auto& iter : field_visitors_map) { + const engine::snapshot::FieldPtr& field = iter.second->GetField(); + std::string name = field->GetName(); + engine::FIXED_FIELD_DATA raw_data; + segment_reader->LoadField(name, raw_data); + chunk->fixed_fields_[name] = raw_data; + } + + auto& uid_data = chunk->fixed_fields_[engine::DEFAULT_UID_NAME]; + chunk->count_ = uid_data.size() / sizeof(int64_t); + status = AddChunk(chunk); + if (!status.ok()) { + return status; + } + + // Note: no need to merge bloom filter, the bloom filter will be created during serialize return Status::OK(); } @@ -301,15 +319,38 @@ SSSegmentWriter::SetVectorIndex(const std::string& field_name, const milvus::kno } Status -SSSegmentWriter::WriteVectorIndex(const std::string& field_name, const std::string& file_path) { +SSSegmentWriter::WriteVectorIndex(const std::string& field_name) { try { knowhere::VecIndexPtr index; - segment_ptr_->GetVectorIndex(field_name, index); - segment::VectorIndexPtr index_ptr = std::make_shared(index); + auto status = segment_ptr_->GetVectorIndex(field_name, index); + if (!status.ok() || index == nullptr) { + return Status(DB_ERROR, "Index doesn't exist: " + status.message()); + } + + auto& field_visitors_map = segment_visitor_->GetFieldVisitors(); + auto field = segment_visitor_->GetFieldVisitor(field_name); + if (field == nullptr) { + return Status(DB_ERROR, "Invalid filed name: " + field_name); + } + + auto element_visitor = field->GetElementVisitor(engine::FieldElementType::FET_INDEX); + if (element_visitor == nullptr) { + return Status(DB_ERROR, "Invalid filed name: " + field_name); + } auto& ss_codec = codec::SSCodec::instance(); fs_ptr_->operation_ptr_->CreateDirectory(); - ss_codec.GetVectorIndexFormat()->write(fs_ptr_, file_path, index_ptr); + + std::string file_path = + engine::snapshot::GetResPath(dir_root_, element_visitor->GetFile()); + ss_codec.GetVectorIndexFormat()->write_index(fs_ptr_, file_path, index); + + element_visitor = field->GetElementVisitor(engine::FieldElementType::FET_COMPRESS_SQ8); + if (element_visitor != nullptr) { + file_path = + engine::snapshot::GetResPath(dir_root_, element_visitor->GetFile()); + ss_codec.GetVectorIndexFormat()->write_compress(fs_ptr_, file_path, index); + } } catch (std::exception& e) { std::string err_msg = "Failed to write vector index: " + std::string(e.what()); LOG_ENGINE_ERROR_ << err_msg; @@ -317,8 +358,35 @@ SSSegmentWriter::WriteVectorIndex(const std::string& field_name, const std::stri engine::utils::SendExitSignal(); return Status(SERVER_WRITE_ERROR, err_msg); } + + return Status::OK(); +} + +Status +SSSegmentWriter::GetSegment(engine::SegmentPtr& segment_ptr) { + segment_ptr = segment_ptr_; return Status::OK(); } +Status +SSSegmentWriter::GetSegmentID(int64_t& id) { + if (segment_visitor_) { + auto segment = segment_visitor_->GetSegment(); + if (segment) { + id = segment->GetID(); + return Status::OK(); + } + } + + return Status(DB_ERROR, "SSSegmentWriter::GetSegmentID: null pointer"); +} + +std::string +SSSegmentWriter::GetSegmentPath() { + std::string seg_path = + engine::snapshot::GetResPath(dir_root_, segment_visitor_->GetSegment()); + return seg_path; +} + } // namespace segment } // namespace milvus diff --git a/core/src/segment/SSSegmentWriter.h b/core/src/segment/SSSegmentWriter.h index fa3f85e904cae..7ccb99d0ffd14 100644 --- a/core/src/segment/SSSegmentWriter.h +++ b/core/src/segment/SSSegmentWriter.h @@ -40,7 +40,7 @@ class SSSegmentWriter { AddChunk(const engine::DataChunkPtr& chunk_ptr); Status - AddChunk(const engine::DataChunkPtr& chunk_ptr, uint64_t from, uint64_t to); + AddChunk(const engine::DataChunkPtr& chunk_ptr, int64_t from, int64_t to); Status WriteBloomFilter(const std::string& file_path, const IdBloomFilterPtr& bloom_filter_ptr); @@ -52,10 +52,7 @@ class SSSegmentWriter { Serialize(); Status - GetSegment(engine::SegmentPtr& segment_ptr); - - Status - Merge(const SSSegmentReaderPtr& segment_to_merge); + Merge(const SSSegmentReaderPtr& segment_reader); size_t Size(); @@ -67,7 +64,16 @@ class SSSegmentWriter { SetVectorIndex(const std::string& field_name, const knowhere::VecIndexPtr& index); Status - WriteVectorIndex(const std::string& field_name, const std::string& file_path); + WriteVectorIndex(const std::string& field_name); + + Status + GetSegment(engine::SegmentPtr& segment_ptr); + + Status + GetSegmentID(int64_t& id); + + std::string + GetSegmentPath(); private: Status @@ -77,10 +83,13 @@ class SSSegmentWriter { WriteField(const std::string& file_path, const engine::FIXED_FIELD_DATA& raw); Status - WriteBloomFilter(const std::string& file_path); + WriteFields(); + + Status + WriteBloomFilter(); Status - WriteDeletedDocs(const std::string& file_path); + WriteDeletedDocs(); private: engine::SegmentVisitorPtr segment_visitor_; diff --git a/core/src/segment/Segment.cpp b/core/src/segment/Segment.cpp index 8166b10938dd3..10a3102a0d1f8 100644 --- a/core/src/segment/Segment.cpp +++ b/core/src/segment/Segment.cpp @@ -24,12 +24,12 @@ namespace milvus { namespace engine { Status -Segment::AddField(const std::string& field_name, FIELD_TYPE field_type, uint64_t field_width) { +Segment::AddField(const std::string& field_name, FIELD_TYPE field_type, int64_t field_width) { if (field_types_.find(field_name) != field_types_.end()) { return Status(DB_ERROR, "duplicate field: " + field_name); } - uint64_t real_field_width = 0; + int64_t real_field_width = 0; switch (field_type) { case FIELD_TYPE::BOOL: real_field_width = sizeof(bool); @@ -56,7 +56,7 @@ Segment::AddField(const std::string& field_name, FIELD_TYPE field_type, uint64_t case FIELD_TYPE::VECTOR: case FIELD_TYPE::VECTOR_FLOAT: case FIELD_TYPE::VECTOR_BINARY: { - if (field_width == 0) { + if (field_width <= 0) { std::string msg = "vecor field dimension required: " + field_name; LOG_SERVER_ERROR_ << msg; return Status(DB_ERROR, msg); @@ -83,8 +83,9 @@ Segment::AddChunk(const DataChunkPtr& chunk_ptr) { } Status -Segment::AddChunk(const DataChunkPtr& chunk_ptr, uint64_t from, uint64_t to) { - if (chunk_ptr == nullptr || from > chunk_ptr->count_ || to > chunk_ptr->count_ || from >= to) { +Segment::AddChunk(const DataChunkPtr& chunk_ptr, int64_t from, int64_t to) { + if (chunk_ptr == nullptr || from < 0 || to < 0 || from > chunk_ptr->count_ || to > chunk_ptr->count_ || + from >= to) { return Status(DB_ERROR, "invalid input"); } @@ -101,14 +102,14 @@ Segment::AddChunk(const DataChunkPtr& chunk_ptr, uint64_t from, uint64_t to) { } // consume - uint64_t add_count = to - from; + int64_t add_count = to - from; for (auto& width_iter : fixed_fields_width_) { auto input = chunk_ptr->fixed_fields_.find(width_iter.first); auto& data = fixed_fields_[width_iter.first]; size_t origin_bytes = data.size(); - uint64_t add_bytes = add_count * width_iter.second; - uint64_t previous_bytes = row_count_ * width_iter.second; - uint64_t target_bytes = previous_bytes + add_bytes; + int64_t add_bytes = add_count * width_iter.second; + int64_t previous_bytes = row_count_ * width_iter.second; + int64_t target_bytes = previous_bytes + add_bytes; data.resize(target_bytes); if (input == chunk_ptr->fixed_fields_.end()) { // this field is not provided, complicate by 0 @@ -129,9 +130,9 @@ Segment::AddChunk(const DataChunkPtr& chunk_ptr, uint64_t from, uint64_t to) { } Status -Segment::DeleteEntity(int32_t offset) { +Segment::DeleteEntity(int64_t offset) { for (auto& pair : fixed_fields_) { - uint64_t width = fixed_fields_width_[pair.first]; + int64_t width = fixed_fields_width_[pair.first]; if (width != 0) { auto step = offset * width; FIXED_FIELD_DATA& data = pair.second; @@ -154,7 +155,7 @@ Segment::GetFieldType(const std::string& field_name, FIELD_TYPE& type) { } Status -Segment::GetFixedFieldWidth(const std::string& field_name, uint64_t width) { +Segment::GetFixedFieldWidth(const std::string& field_name, int64_t& width) { auto iter = fixed_fields_width_.find(field_name); if (iter == fixed_fields_width_.end()) { return Status(DB_ERROR, "invalid field name: " + field_name); diff --git a/core/src/segment/Segment.h b/core/src/segment/Segment.h index 42dfe7a7aad7d..6eef5d5fd2212 100644 --- a/core/src/segment/Segment.h +++ b/core/src/segment/Segment.h @@ -33,7 +33,7 @@ namespace engine { using FIELD_TYPE = engine::meta::hybrid::DataType; using FIELD_TYPE_MAP = std::unordered_map; -using FIELD_WIDTH_MAP = std::unordered_map; +using FIELD_WIDTH_MAP = std::unordered_map; using FIXED_FIELD_DATA = std::vector; using FIXEDX_FIELD_MAP = std::unordered_map; using VARIABLE_FIELD_DATA = std::vector; @@ -41,7 +41,7 @@ using VARIABLE_FIELD_MAP = std::unordered_map; using VECTOR_INDEX_MAP = std::unordered_map; struct DataChunk { - uint64_t count_ = 0; + int64_t count_ = 0; FIXEDX_FIELD_MAP fixed_fields_; VARIABLE_FIELD_MAP variable_fields_; }; @@ -51,22 +51,22 @@ using DataChunkPtr = std::shared_ptr; class Segment { public: Status - AddField(const std::string& field_name, FIELD_TYPE field_type, uint64_t field_width = 0); + AddField(const std::string& field_name, FIELD_TYPE field_type, int64_t field_width = 0); Status AddChunk(const DataChunkPtr& chunk_ptr); Status - AddChunk(const DataChunkPtr& chunk_ptr, uint64_t from, uint64_t to); + AddChunk(const DataChunkPtr& chunk_ptr, int64_t from, int64_t to); Status - DeleteEntity(int32_t offset); + DeleteEntity(int64_t offset); Status GetFieldType(const std::string& field_name, FIELD_TYPE& type); Status - GetFixedFieldWidth(const std::string& field_name, uint64_t width); + GetFixedFieldWidth(const std::string& field_name, int64_t& width); Status GetFixedFieldData(const std::string& field_name, FIXED_FIELD_DATA& data); @@ -126,7 +126,7 @@ class Segment { VARIABLE_FIELD_MAP variable_fields_; VECTOR_INDEX_MAP vector_indice_; - uint64_t row_count_ = 0; + int64_t row_count_ = 0; segment::DeletedDocsPtr deleted_docs_ptr_ = nullptr; segment::IdBloomFilterPtr id_bloom_filter_ptr_ = nullptr; diff --git a/core/unittest/ssdb/test_db.cpp b/core/unittest/ssdb/test_db.cpp index 29bbb42e5cf44..2b4206c7c02fa 100644 --- a/core/unittest/ssdb/test_db.cpp +++ b/core/unittest/ssdb/test_db.cpp @@ -20,12 +20,13 @@ #include "ssdb/utils.h" #include "db/SnapshotVisitor.h" #include "db/snapshot/IterateHandler.h" +#include "knowhere/index/vector_index/helpers/IndexParameter.h" using SegmentVisitor = milvus::engine::SegmentVisitor; namespace { milvus::Status -CreateCollection(std::shared_ptr db, const std::string &collection_name, const LSN_TYPE &lsn) { +CreateCollection(std::shared_ptr db, const std::string& collection_name, const LSN_TYPE& lsn) { CreateCollectionContext context; context.lsn = lsn; auto collection_schema = std::make_shared(collection_name); @@ -41,6 +42,91 @@ CreateCollection(std::shared_ptr db, const std::string &collection_nam return db->CreateCollection(context); } + +static constexpr int64_t COLLECTION_DIM = 128; + +milvus::Status +CreateCollection2(std::shared_ptr db, const std::string& collection_name, const LSN_TYPE& lsn) { + CreateCollectionContext context; + context.lsn = lsn; + auto collection_schema = std::make_shared(collection_name); + context.collection = collection_schema; + + nlohmann::json params; + params[milvus::knowhere::meta::DIM] = COLLECTION_DIM; + auto vector_field = std::make_shared("vector", 0, milvus::engine::FieldType::VECTOR, params); + context.fields_schema[vector_field] = {}; + + std::unordered_map attr_type = { + {"field_0", milvus::engine::FieldType::INT32}, + {"field_1", milvus::engine::FieldType::INT64}, + {"field_2", milvus::engine::FieldType::DOUBLE}, + }; + + std::vector field_names; + for (auto& pair : attr_type) { + auto field = std::make_shared(pair.first, 0, pair.second); + context.fields_schema[field] = {}; + field_names.push_back(pair.first); + } + + return db->CreateCollection(context); +} + +void +BuildEntities(uint64_t n, uint64_t batch_index, milvus::engine::DataChunkPtr& data_chunk) { + data_chunk = std::make_shared(); + data_chunk->count_ = n; + + milvus::engine::VectorsData vectors; + vectors.vector_count_ = n; + vectors.float_data_.clear(); + vectors.float_data_.resize(n * COLLECTION_DIM); + float* data = vectors.float_data_.data(); + for (uint64_t i = 0; i < n; i++) { + for (int64_t j = 0; j < COLLECTION_DIM; j++) data[COLLECTION_DIM * i + j] = drand48(); + data[COLLECTION_DIM * i] += i / 2000.; + + vectors.id_array_.push_back(n * batch_index + i); + } + + milvus::engine::FIXED_FIELD_DATA& raw = data_chunk->fixed_fields_["vector"]; + raw.resize(vectors.float_data_.size() * sizeof(float)); + memcpy(raw.data(), vectors.float_data_.data(), vectors.float_data_.size() * sizeof(float)); + + std::vector value_0; + std::vector value_1; + std::vector value_2; + value_0.resize(n); + value_1.resize(n); + value_2.resize(n); + + std::default_random_engine e; + std::uniform_real_distribution u(0, 1); + for (uint64_t i = 0; i < n; ++i) { + value_0[i] = i; + value_1[i] = i + n; + value_2[i] = u(e); + } + + { + milvus::engine::FIXED_FIELD_DATA& raw = data_chunk->fixed_fields_["field_0"]; + raw.resize(value_0.size() * sizeof(int32_t)); + memcpy(raw.data(), value_0.data(), value_0.size() * sizeof(int32_t)); + } + + { + milvus::engine::FIXED_FIELD_DATA& raw = data_chunk->fixed_fields_["field_1"]; + raw.resize(value_1.size() * sizeof(int64_t)); + memcpy(raw.data(), value_1.data(), value_1.size() * sizeof(int64_t)); + } + + { + milvus::engine::FIXED_FIELD_DATA& raw = data_chunk->fixed_fields_["field_2"]; + raw.resize(value_2.size() * sizeof(double)); + memcpy(raw.data(), value_2.data(), value_2.size() * sizeof(double)); + } +} } // namespace TEST_F(SSDBTest, CollectionTest) { @@ -250,7 +336,7 @@ TEST_F(SSDBTest, VisitorTest) { status = Snapshots::GetInstance().GetSnapshot(ss, c1); ASSERT_TRUE(status.ok()); - auto executor = [&] (const Segment::Ptr& segment, SegmentIterator* handler) -> Status { + auto executor = [&](const Segment::Ptr& segment, SegmentIterator* handler) -> Status { auto visitor = SegmentVisitor::Build(ss, segment->GetID()); if (!visitor) { return Status(milvus::SS_ERROR, "Cannot build segment visitor"); @@ -313,3 +399,50 @@ TEST_F(SSDBTest, VisitorTest) { ASSERT_EQ(ss->GetCollectionCommit()->GetRowCount(), row_cnt + new_segment_row_cnt); std::cout << ss->ToString() << std::endl; } + +TEST_F(SSDBTest, InsertTest) { + std::string collection_name = "MERGE_TEST"; + auto status = CreateCollection2(db_, collection_name, 0); + ASSERT_TRUE(status.ok()); + + const uint64_t entity_count = 100; + milvus::engine::DataChunkPtr data_chunk; + BuildEntities(entity_count, 0, data_chunk); + + status = db_->InsertEntities(collection_name, "", data_chunk); + ASSERT_TRUE(status.ok()); + + status = db_->Flush(); + ASSERT_TRUE(status.ok()); + + uint64_t row_count = 0; + status = db_->GetCollectionRowCount(collection_name, row_count); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(row_count, entity_count); +} + +TEST_F(SSDBTest, MergeTest) { + std::string collection_name = "MERGE_TEST"; + auto status = CreateCollection2(db_, collection_name, 0); + ASSERT_TRUE(status.ok()); + + const uint64_t entity_count = 100; + milvus::engine::DataChunkPtr data_chunk; + BuildEntities(entity_count, 0, data_chunk); + + int64_t repeat = 2; + for (int32_t i = 0; i < repeat; i++) { + status = db_->InsertEntities(collection_name, "", data_chunk); + ASSERT_TRUE(status.ok()); + + status = db_->Flush(); + ASSERT_TRUE(status.ok()); + } + + sleep(2); // wait to merge + + uint64_t row_count = 0; + status = db_->GetCollectionRowCount(collection_name, row_count); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(row_count, entity_count * repeat); +} diff --git a/core/unittest/ssdb/utils.cpp b/core/unittest/ssdb/utils.cpp index e10e02354e7de..50fcf0305beaa 100644 --- a/core/unittest/ssdb/utils.cpp +++ b/core/unittest/ssdb/utils.cpp @@ -180,6 +180,15 @@ SnapshotTest::TearDown() { } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// +milvus::engine::DBOptions +SSDBTest::GetOptions() { + auto options = milvus::engine::DBOptions(); + options.meta_.path_ = "/tmp/milvus_ss"; + options.meta_.backend_uri_ = "sqlite://:@:/"; + options.wal_enable_ = false; + return options; +} + void SSDBTest::SetUp() { BaseTest::SetUp(); @@ -200,9 +209,7 @@ SSDBTest::SetUp() { milvus::engine::snapshot::Snapshots::GetInstance().Reset(); milvus::engine::snapshot::Snapshots::GetInstance().Init(); - auto options = milvus::engine::DBOptions(); - options.wal_enable_ = false; - db_ = std::make_shared(options); + db_ = std::make_shared(GetOptions()); } void @@ -213,6 +220,9 @@ SSDBTest::TearDown() { milvus::engine::snapshot::EventExecutor::GetInstance().Stop(); milvus::engine::snapshot::OperationExecutor::GetInstance().Stop(); + auto options = GetOptions(); + boost::filesystem::remove_all(options.meta_.path_); + BaseTest::TearDown(); } diff --git a/core/unittest/ssdb/utils.h b/core/unittest/ssdb/utils.h index 961ad175913d2..2e2d5f168d4c3 100644 --- a/core/unittest/ssdb/utils.h +++ b/core/unittest/ssdb/utils.h @@ -304,6 +304,9 @@ class SSDBTest : public BaseTest { protected: std::shared_ptr db_; + milvus::engine::DBOptions + GetOptions(); + void SetUp() override; void