From 077e4e3cfb2260ffd193ae833a35797e459fd3d5 Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 12:54:24 +0200 Subject: [PATCH 1/9] PS-10416: Calculate and send checksum header for uploads to support S3 Object Lock https://perconadev.atlassian.net/browse/PS-10416 Object Lock feature of the AWS S3 requires Content-MD5 request header to be present in the PUT request. Added calculation of this header. It is calculated always, as it simplifies the logic and does not cause any harm even if not needed. --- mysqlshdk/libs/storage/backend/object_storage_bucket.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mysqlshdk/libs/storage/backend/object_storage_bucket.cc b/mysqlshdk/libs/storage/backend/object_storage_bucket.cc index 9409efc1c..1e5ab5786 100644 --- a/mysqlshdk/libs/storage/backend/object_storage_bucket.cc +++ b/mysqlshdk/libs/storage/backend/object_storage_bucket.cc @@ -32,6 +32,8 @@ #include "mysqlshdk/libs/utils/fault_injection.h" #include "mysqlshdk/libs/utils/logger.h" +#include "mysqlshdk/libs/utils/utils_encoding.h" +#include "mysqlshdk/libs/utils/utils_ssl.h" namespace mysqlshdk { namespace storage { @@ -157,6 +159,11 @@ void Container::put_object(const std::string &object_name, const char *data, size_t size) { Headers headers{{"content-type", "application/octet-stream"}}; + auto &md5 = headers["Content-MD5"]; + std::string_view data_view(data, size); + const auto hash = shcore::ssl::restricted::md5(data_view); + shcore::encode_base64(hash.data(), hash.size(), &md5); + auto request = put_object_request(object_name, std::move(headers)); request.body = data; request.size = size; From 3034723a6bf1ae5c9c416e938b8c3f23a14df85e Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 12:56:43 +0200 Subject: [PATCH 2/9] PS-10413: Improve chunking strategy for tables with the composite PK https://perconadev.atlassian.net/browse/PS-10413 Introduced enhanced chunking algorithm. Problem: The original algorithm consider only 1st column of the primary key when chunking the table for parallel dump. If the table contains the composite PK it may happen that there is a huge amount of rows for a given key part. As the result, chunk sizes are not well-balanced. Dumping process is delegated to parallel workers. Each chunk is dumped by the separate thread. If there is a huge chunk and multiple small chunks, all small chunks will be quickly processed in parallel, but the huge one will use a thread for a long time, while other worker threads are idle. Solution: Implemented chunking algorithms that uses other key parts to produce table chunks. Chunking Algorithm Overview The chunking mechanism divides large table into manageable chunks to enable parallel data extraction and optimal memory usage. The algorithm supports two strategies: ORIGINAL and ENHANCED. ORIGINAL is the default behavior of mysqlsh. Integer Column Chunking: For numeric primary keys (INTEGER, UNSIGNED INTEGER, DECIMAL): Phase 1: Range Expansion (Linear)** - Starts with an estimated step size based on: index_range / estimated_chunks - Expands the search range until enough rows are found (rows >= rows_per_chunk) - Stops if maximum range is reached or sufficient rows are found Phase 2: Binary Search (Shrinking)** - If too many rows are found (rows > rows_per_chunk + accuracy) - Uses binary search to narrow the range to match target row count - Continues until: delta <= accuracy OR range shrinks to 1 - Falls back to nested chunking if single value still exceeds row limit Nested Chunking (Deep Chunking): When a single value in the current key part exceeds rows_per_chunk: - Recursively chunks the next key part with boundary condition - Only applies if next column is optimizable (INT-like type) Chunk Gluing (ENHANCED strategy only): The Gluer class merges small chunks to optimize dump file count: - Accumulates consecutive chunks when row count < max_rows_cnt - Flushes when accumulated size > 3 * max_rows_cnt or at table end - Prevents fragmentation by combining undersized chunks - DummyGluer (for ORIGINAL) disables this optimization Introduced dump configuration options: adaptiveStepStrategy - strategy used for determining chunk boundaries original - Default. Use the original implementation enhanced - Use the new approach for calculations maxKeyPrefixLength - limits the number of key parts used for chunking (depth) 0 - Use the whole length of the key (up not compatible column) Default: 1 to keep the original behavior # Conflicts: # modules/util/dump/dump_options.cc --- modules/util/dump/decimal.h | 5 + modules/util/dump/dump_options.cc | 16 +- modules/util/dump/dump_options.h | 19 + modules/util/dump/dumper.cc | 721 ++++++++++++++++++++++++++++-- 4 files changed, 717 insertions(+), 44 deletions(-) diff --git a/modules/util/dump/decimal.h b/modules/util/dump/decimal.h index 2d521d6db..bdcd03966 100644 --- a/modules/util/dump/decimal.h +++ b/modules/util/dump/decimal.h @@ -77,6 +77,11 @@ class Decimal final { Decimal &operator*=(const Decimal &rhs); Decimal &operator/=(const Decimal &rhs); + inline Decimal &operator=(shcore::Bignum rhs) { + m_decimal = shcore::Bignum(rhs); + return *this; + } + inline Decimal &operator+=(shcore::Bignum rhs) { *this += convert(std::move(rhs)); return *this; diff --git a/modules/util/dump/dump_options.cc b/modules/util/dump/dump_options.cc index 40af58caa..a8837b54a 100644 --- a/modules/util/dump/dump_options.cc +++ b/modules/util/dump/dump_options.cc @@ -62,6 +62,8 @@ const shcore::Option_pack_def &Dump_options::options() { .optional("defaultCharacterSet", &Dump_options::m_character_set) // WL17279-FR1.2: `allowDataMasking` option .optional("allowDataMasking", &Dump_options::m_allow_data_masking) + .optional("maxKeyPrefixLength", &Dump_options::m_max_key_prefix_len) + .optional("adaptiveStepStrategy", &Dump_options::set_string_option) .include(&Dump_options::m_dialect_unpacker) .on_done(&Dump_options::on_unpacked_options); @@ -76,6 +78,17 @@ void Dump_options::on_start_unpack(const shcore::Dictionary_t &options) { m_options = options; } +AdaptiveStepStrategy Dump_options::to_adaptive_step_strategy( + const std::string option) { + if (option == "enhanced") { + return AdaptiveStepStrategy::ENHANCED; + } else if (option == "original") { + return AdaptiveStepStrategy::ORIGINAL; + } + throw std::invalid_argument( + "Invalid value for 'adaptiveStepStrategy' option: " + option); +} + void Dump_options::set_string_option(const std::string &option, const std::string &value) { if (option == "maxRate") { @@ -87,9 +100,10 @@ void Dump_options::set_string_option(const std::string &option, throw std::invalid_argument( "The option 'compression' cannot be set to an empty string."); } - m_compression = mysqlshdk::storage::to_compression(value, &m_compression_options); + } else if (option == "adaptiveStepStrategy") { + m_adaptive_step_strategy = to_adaptive_step_strategy(value); } else { // This function should only be called with the options above. assert(false); diff --git a/modules/util/dump/dump_options.h b/modules/util/dump/dump_options.h index edd54cb97..5434cc59b 100644 --- a/modules/util/dump/dump_options.h +++ b/modules/util/dump/dump_options.h @@ -57,6 +57,11 @@ enum class Dry_run { DONT_WRITE_ANY_FILES, }; +enum class AdaptiveStepStrategy { + ORIGINAL, + ENHANCED, +}; + class Dump_options : public mysqlsh::common::Common_options { public: using Filtering_options = mysqlshdk::db::Filtering_options; @@ -158,6 +163,12 @@ class Dump_options : public mysqlsh::common::Common_options { void set_report_dump_option(bool value) { m_report_dump_option = value; } + AdaptiveStepStrategy adaptive_step_strategy() const { + return m_adaptive_step_strategy; + } + + size_t max_key_prefix_length() const { return m_max_key_prefix_len; } + virtual bool split() const = 0; virtual uint64_t bytes_per_chunk() const = 0; @@ -259,6 +270,8 @@ class Dump_options : public mysqlsh::common::Common_options { void validate_partitions() const; + AdaptiveStepStrategy to_adaptive_step_strategy(const std::string option); + // input arguments shcore::Dictionary_t m_options; @@ -304,6 +317,12 @@ class Dump_options : public mysqlsh::common::Common_options { Compatibility_options m_compatibility_options; std::optional m_target_version; std::optional m_lakehouse_target; + + // max nesting depth for composite keys + size_t m_max_key_prefix_len = 1; + + AdaptiveStepStrategy m_adaptive_step_strategy = + AdaptiveStepStrategy::ORIGINAL; }; } // namespace dump diff --git a/modules/util/dump/dumper.cc b/modules/util/dump/dumper.cc index 39e3e7a35..fcf553c5a 100644 --- a/modules/util/dump/dumper.cc +++ b/modules/util/dump/dumper.cc @@ -44,6 +44,7 @@ #include #include +#include "dump_options.h" #include "mysqlshdk/include/scripting/shexcept.h" #include "mysqlshdk/include/shellcore/console.h" #include "mysqlshdk/include/shellcore/shell_init.h" @@ -782,6 +783,14 @@ void Dumper::Output_config::create_directory() { void Dumper::Output_config::fini() { dir.reset(); } +template +std::string to_string(const T &value) { + return std::to_string(value); +} +std::string to_string(const Decimal &value) { return value.to_string(); } + +#define V2S(x) to_string(x).c_str() + class Dumper::Table_worker final { public: enum class Exception_strategy { ABORT, CONTINUE }; @@ -1236,6 +1245,9 @@ class Dumper::Table_worker final { std::string order_by; std::string order_by_desc; std::size_t index_column; + std::size_t &ranges_counter; + + Chunking_info(std::size_t &counter) : ranges_counter(counter) {} }; static std::string compare(const Chunking_info &info, const Row &value, @@ -1388,14 +1400,24 @@ class Dumper::Table_worker final { } template - static T constant_step(const T & /* from */, const T &step) { - return step; + static T mul(uint64_t value1, const T &value2) { + if (value1 == 0 || value2 == 0) return T{0}; + + using R = std::common_type_t; + + R v1 = static_cast(value1); + R v2 = static_cast(value2); + R max = static_cast(std::numeric_limits::max()); + + if (v1 > max / v2) return std::numeric_limits::max(); + + return static_cast(v1 * v2); } template T adaptive_step(const T &from, const T &step, const T &max, - const Chunking_info &info, - const std::string &chunk_id) const { + const Chunking_info &info, const std::string &chunk_id, + uint64_t *rows_cnt, bool *use_returned_cnt) const { static constexpr int k_chunker_retries = 10; static constexpr int k_chunker_iterations = 20; @@ -1408,6 +1430,8 @@ class Dumper::Table_worker final { int retry = 0; uint64_t delta = info.accuracy + 1; + *use_returned_cnt = false; + const auto row_count = [&info, &comment, this](const auto begin, const auto end) { return row_count_from_explain( @@ -1424,6 +1448,7 @@ class Dumper::Table_worker final { if (max - retry * double_step <= from) { // if left boundary is greater than max, stop here middle = max; + *use_returned_cnt = true; break; } @@ -1465,6 +1490,7 @@ class Dumper::Table_worker final { if (delta <= info.accuracy) { // we're close enough + *use_returned_cnt = true; break; } } @@ -1486,13 +1512,329 @@ class Dumper::Table_worker final { } } + *rows_cnt = rows; return ensure_not_zero(middle - from); } + template + static T constant_step(const T &, const T &step_hint, const T & /*max*/, + const Chunking_info &info, const std::string &, + uint64_t *rows_cnt, bool *) { + *rows_cnt = info.rows_per_chunk; + return step_hint; + } + +#define DBG_STEP(x) // x +#define DBG(x) // x +#define DBG_GLUE(x) // x + + template + class IGluer { + public: + struct GlueResult { + T begin; + T end; + uint64_t rows_cnt; + bool empty; + bool flushed; + + GlueResult(T b, T e, uint64_t r, bool em, bool f) + : begin(b), end(e), rows_cnt(r), empty(em), flushed(f) {} + GlueResult() + : begin(0), end(0), rows_cnt(0), empty(true), flushed(false) {} + }; + + virtual ~IGluer() {} + virtual GlueResult flush() = 0; + virtual GlueResult glue(T begin, T end, uint64_t rows_cnt, bool last) = 0; + }; + + template + class DummyGluer : public IGluer { + public: + DummyGluer() {} + + virtual IGluer::GlueResult flush() override { + typename IGluer::GlueResult res; + return res; + } + + virtual IGluer::GlueResult glue(T begin, T end, uint64_t rows_cnt, + bool) override { + typename IGluer::GlueResult res(begin, end, rows_cnt, false, true); + return res; + } + }; + + template + class Gluer : public IGluer { + public: + Gluer(uint64_t max_rows_cnt) + : begin_(0), + end_(0), + rows_cnt_(0), + max_rows_cnt_(max_rows_cnt), + zero_result_(), + empty_(true) {} + + virtual ~Gluer() { + DBG_GLUE(if (!empty_ || rows_cnt_ > 0 || begin_ > 0 || end_ > 0) { + log_debug( + "~Gluer() destroying nonepty Gluer! (%s - %s) rows: %ld, " + "empty?: %d", + V2S(begin_), V2S(end_), rows_cnt_, empty_); + }) + + // Before Gluer deletion, it has to be empty (and flushed). + assert(empty_ && rows_cnt_ == 0 && begin_ == 0 && end_ == 0); + } + + virtual typename IGluer::GlueResult flush() override { + DBG_GLUE(log_debug("Gluer::flush() (%s - %s) rows: %ld, empty?: %d", + V2S(begin_), V2S(end_), rows_cnt_, empty_);) + typename IGluer::GlueResult res(begin_, end_, rows_cnt_, empty_, true); + begin_ = end_ = 0; + rows_cnt_ = 0; + empty_ = true; + return res; + } + + virtual typename IGluer::GlueResult glue(T begin, T end, + uint64_t rows_cnt, + bool last) override { + DBG_GLUE( + log_debug("Gluer::glue() %s - %s, rows: %ld, last?: %d (current: " + "%s - %s, %ld)", + V2S(begin), V2S(end), rows_cnt, last, V2S(begin_), + V2S(end_), rows_cnt_);) + if (empty_) { + // Whatever is the chunk, wait for the next one to decide + DBG_GLUE(log_debug("Gluer::glue() - Gluer empty, starting new glue");) + begin_ = begin; + end_ = end; + rows_cnt_ += rows_cnt; + empty_ = false; + } else { + // We have some rows accumulated + + // If we decided to glue, the next chunk needs to follow what we already + // accumulated + assert(begin == end_ + 1); + DBG_GLUE(if (begin != end_ + 1) { + log_debug("Gluer::glue() inconsistency detected! end_: %s, begin: %s", + V2S(end_), V2S(begin)); + }) + + if (rows_cnt_ > max_rows_cnt_) { + // we are ready to flush, just wait for good conditions + if (rows_cnt > max_rows_cnt_ / 2) { + // if the upcoming chunk is a good candidate to start new glue, + // return the current glue and remember this chunk + DBG_GLUE(log_debug("Gluer::glue() - return current, start new");) + typename IGluer::GlueResult res(begin_, end_, rows_cnt_, empty_, + false); + begin_ = begin; + end_ = end; + rows_cnt_ = rows_cnt; + return res; + } else { + // glue it + DBG_GLUE(log_debug("Gluer::glue() - ready to flush but decided to " + "glue. rows: %ld", + rows_cnt_);) + end_ = end; + rows_cnt_ += rows_cnt; + } + } else { + // we didn't accumulate enough rows yet + // glue it + DBG_GLUE(log_debug("Gluer::glue() - gluing");) + end_ = end; + rows_cnt_ += rows_cnt; + } + + if (rows_cnt_ > 3 * max_rows_cnt_ || last) { + // flush gluer if we accumulated a lot of chunks or this is the last + // one + DBG_GLUE(log_debug("Gluer::glue() - Gluer full. rows: %ld, last?: %d", + rows_cnt_, last);) + return flush(); + } + } + + if (last) { + // flush last chunk + DBG_GLUE( + log_debug( + "Gluer::glue() - Last chunk. Flushing. rows: %ld, last?: %d", + rows_cnt_, last);) + return flush(); + } + + // The chunk was accumulated + DBG_GLUE(log_debug("Gluer::glue() - accumulated %s - %s, rows: %ld", + V2S(begin_), V2S(end_), rows_cnt_);) + return zero_result_; + } + + private: + T begin_; + T end_; + uint64_t rows_cnt_; + uint64_t max_rows_cnt_; + IGluer::GlueResult zero_result_; + bool empty_; + }; + + template + T adaptive_step_v2(const T &from, const T &step_hint, const T &max, + const Chunking_info &info, const std::string &chunk_id, + uint64_t *rows_cnt, bool *use_returned_cnt) const { + auto rows = info.rows_per_chunk; + const auto comment = this->get_query_comment(*info.table, chunk_id); + + uint64_t retry = 0; + uint64_t delta = info.accuracy + 1; + + const auto row_count = [&info, &comment, this](const auto begin, + const auto end) { + return row_count_from_explain( + query("EXPLAIN FORMAT=JSON SELECT " + + m_dumper->optimizer_hints(info.table->info) + "COUNT(*) FROM " + + info.table->quoted_name + info.partition + + where(*info.table, between(info, begin, end)) + info.order_by + + comment) + ->fetch_one_or_throw() + ->get_as_string(0)); + }; + + auto right = from; + + *use_returned_cnt = false; + + /* Phase 1: Linear expansion. + Expand the range until we have enough rows or we reach the maximum range. + */ + while (true) { + // Calculate the current search range + right = sum(from, mul((retry + 1), step_hint)); + right = std::min(right, max); + + DBG_STEP(log_debug( + "Nest level: %ld, retry: %ld searching range: %s - %s (%s), " + "step_hint: %s", + info.index_column, retry, V2S(from), V2S(right), + V2S(right - from), V2S(step_hint));) + + // check if there is enough rows in currently checked range + rows = row_count(from, right); + DBG_STEP(log_debug("Nest level: %ld, rows: %ld in range: %s - %s", + info.index_column, rows, V2S(from), V2S(right));) + if (rows >= info.rows_per_chunk) { + // We have enough rows, no need to expand the range, move to the next + // phase + break; + } + if (right >= max) { + // We have reached the maximum range, stop here. + break; + } + // We didn't find enough rows in this range, move farther to the right + retry++; + } + + // Phase 2: Binary chop - shrinking (if needed) + if (rows > info.rows_per_chunk + info.accuracy) { + /* We have too much rows. + The previous step produced the range that contains too much rows. + Here we will try to shrink it by doing a binary search in therange. + We may eventually end up with the range of 1 and still too much rows. + We will return this range and the caller will try to chunk by the next + keypart if possible. */ + + DBG_STEP(log_debug("Nest level: %ld, rows: %ld Trying to chop the last " + "range: %s - %s", + info.index_column, rows, V2S(from), V2S(right));) + + auto left = from; + right = from + (right - left) / 2; + while (1) { + DBG_STEP( + log_debug("Nest level: %ld, checking range: %s - %s (left: %s)", + info.index_column, V2S(from), V2S(right), V2S(left));) + rows = row_count(from, right); + + delta = rows > info.rows_per_chunk ? rows - info.rows_per_chunk + : info.rows_per_chunk - rows; + if (delta <= info.accuracy) { + DBG_STEP(log_debug("Nest level: %ld, close enough: rows: %ld, " + "rows_per_chunk: %ld, delta: %ld, range: %s - %s", + info.index_column, rows, info.rows_per_chunk, + delta, V2S(from), V2S(right));) + *use_returned_cnt = true; + break; + } + + if (right == from) { + // We shrinked the range to 1. No possibility to shrink more. + // It means that for this keypart there are more rows than requested. + // Return the current result, the caller will try to chunk by the next + // keypart if possible. + log_debug( + " Nest level: %ld, reached range 1, rows: %ld, range: %s " + "- %s", + info.index_column, rows, V2S(from), V2S(right)); + break; + } + + if (left >= right) { + // No way to chop it more. Use the current estimate. + DBG_STEP(log_debug("Nest level: %ld, No way to chop it more. Will " + "use the current estimate. rows: %ld, " + "rows_per_chunk: %ld, delta: %ld, range: %s - %s", + info.index_column, rows, info.rows_per_chunk, + delta, V2S(from), V2S(right));) + *use_returned_cnt = true; + break; + } + + if (rows > info.rows_per_chunk) { + // Shrink the range + DBG_STEP(log_debug("Nest level: %ld, (shrink) > rows: %ld, " + "rows_per_chunk: %ld, range: %s - %s", + info.index_column, rows, info.rows_per_chunk, + V2S(from), V2S(right));) + right = right - ensure_not_zero((right - left) / 2); + } else { + // Expand the range + DBG_STEP(log_debug("Nest level: %ld, (expand) <= rows: %ld, " + "rows_per_chunk: %ld, range: %s - %s", + info.index_column, rows, info.rows_per_chunk, + V2S(from), V2S(right));) + auto left_tmp = right; + right = right + (right - left) / 2; + left = left_tmp; + } + } + } + + *rows_cnt = rows; + DBG_STEP(log_debug("Nest level: %ld, adaptive_step() returning %ld rows " + "(left: %s, right: %s, ret: %s)", + info.index_column, rows, V2S(from), V2S(right), + V2S(ensure_not_zero(right - from)));) + return ensure_not_zero(right - from); + } + + bool optimized_chunking_possible(mysqlshdk::db::Type type) const { + return (type == mysqlshdk::db::Type::Integer || + type == mysqlshdk::db::Type::UInteger || + type == mysqlshdk::db::Type::Decimal); + } + template std::size_t chunk_integer_column(const Chunking_info &info, const T &min, const T &max) const { - std::size_t ranges_count = 0; // if rows_per_chunk <= 1 it may mean that the rows are bigger than chunk // size, which means we # chunks ~= # rows @@ -1514,52 +1856,308 @@ class Dumper::Table_worker final { ? index_range - info.row_count : info.row_count - index_range) <= row_count_accuracy; + DBG(log_debug( + "Nest level: %ld, chunk_integer_column(). min: %s, max: %s, " + "row_count: " + "%ld, estimated_chunks: %ld, estimated_step: %s, constant?: %d", + info.index_column, V2S(min), V2S(max), info.row_count, + estimated_chunks, V2S(estimated_step), use_constant_step);) + std::string chunk_id; - const auto next_step = - use_constant_step - ? std::function( - constant_step) - // using the default capture [&] below results in problems with - // GCC 5.4.0 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80543) - : [&info, &max, &chunk_id, this](const auto &from, - const auto &step) { - return this->adaptive_step(from, step, max, info, chunk_id); - }; - - auto current = min; - const auto step = estimated_step; - - log_info("%sChunking %s using integer algorithm with %s step", - m_log_id.c_str(), info.table->task_name.c_str(), - use_constant_step ? "constant" : "adaptive"); + using Fn = std::function; + + const Fn adaptive_step_fn = + (m_dumper->m_options.adaptive_step_strategy() == + mysqlsh::dump::AdaptiveStepStrategy::ORIGINAL) + ? Fn{[&info, &max, &chunk_id, this]( + const auto &from, const auto &step, uint64_t *rows_cnt, + bool *use_returned_cnt) { + return this->adaptive_step(from, step, max, info, chunk_id, + rows_cnt, use_returned_cnt); + }} + : Fn{[&info, &max, &chunk_id, this]( + const auto &from, const auto &step, uint64_t *rows_cnt, + bool *use_returned_cnt) { + return this->adaptive_step_v2(from, step, max, info, chunk_id, + rows_cnt, use_returned_cnt); + }}; + + const Fn next_step = + use_constant_step ? Fn{[&info, &max, &chunk_id, this]( + const auto &from, const auto &step, + uint64_t *rows_cnt, bool *use_returned_cnt) { + return this->constant_step(from, step, max, info, chunk_id, rows_cnt, + use_returned_cnt); + }} + : adaptive_step_fn; + + auto current_chunk_begin = min; - bool last_chunk = false; + log_debug( + "%sChunking %s using integer algorithm with %s step. step: " + "%s, alg: %s, depth: %ld", + m_log_id.c_str(), info.table->task_name.c_str(), + use_constant_step ? "constant" : "adaptive", V2S(estimated_step), + (m_dumper->m_options.adaptive_step_strategy() == + mysqlsh::dump::AdaptiveStepStrategy::ORIGINAL + ? "original" + : "enhanced"), + m_dumper->m_options.max_key_prefix_length()); + + DBG(log_debug( + "Nest level: %ld, trying to chunk by %ld, rows to chunk: %ld, rows " + "per chunk: %ld, index columns cnt: %ld", + info.index_column, info.index_column, info.row_count, + info.rows_per_chunk, info.table->index.info->columns().size());) + + bool last_chunk_in_dump = false; + bool last_chunk_on_this_level = false; + bool gluing_to_next_chunk = false; + + std::shared_ptr> gluer; + if (m_dumper->m_options.adaptive_step_strategy() == + mysqlsh::dump::AdaptiveStepStrategy::ORIGINAL) { + gluer = std::make_shared>(); + } else { + gluer = std::make_shared>(info.rows_per_chunk); + } - while (!last_chunk) { + while (!last_chunk_in_dump && !last_chunk_on_this_level) { if (m_dumper->m_worker_interrupt.test()) { - return ranges_count; + return info.ranges_counter; } - chunk_id = std::to_string(ranges_count); - const auto begin = current; - auto new_step = next_step(current, step); + chunk_id = std::to_string(info.ranges_counter); + uint64_t rows_cnt = 0; + bool use_returned_cnt = false; + bool processed_by_deep_chunking = false; + + auto new_step = next_step(current_chunk_begin, estimated_step, &rows_cnt, + &use_returned_cnt); + + size_t idx_columns_cnt = info.table->index.info->columns().size(); + bool possible_to_chunk_next_column = + info.index_column < idx_columns_cnt - 1 && + (info.index_column < + m_dumper->m_options.max_key_prefix_length() - 1 || + m_dumper->m_options.max_key_prefix_length() == 0); + + if (new_step == 1 && !possible_to_chunk_next_column) { + DBG(log_debug("Nest level: %ld, Not possible to chunk deeper. Use the " + "current result", + info.index_column);) + use_returned_cnt = true; + } + + if (!use_returned_cnt && new_step == 1 && possible_to_chunk_next_column) { + /* We reached the range of 1. It is possible that: + 1. There is still too much rows + 2. Rows count is OK, or too small */ + if (rows_cnt > info.rows_per_chunk + info.accuracy) { + /* For this keypart, there is too much rows. + We will try to use the next keypart for chunking */ + const auto type = + info.table->index.info->columns()[info.index_column + 1]->type; + if (!optimized_chunking_possible(type)) { + /* The next column is not INT-like type, so it is not possible to + use optimized chunking when using it. + TODO: Maybe we could mix chunk_integer_column and + chunk_non_integer_column approaches in the future? In sucha case + it would be possible to deep chunk all kind PKs. On some levels + (non-INT) it would do the full range scan (not necessarily + the full table scan) and on other level (INT) it would use the + optimized approach */ + DBG(log_debug("Nest level: %ld, range: %s, but rows_cnt: %ld. Next " + "PK column " + "not compatible with deep chunking. Dumping as is.", + info.index_column, V2S(new_step), rows_cnt);) + use_returned_cnt = true; + gluing_to_next_chunk = false; + } else { + /* The next column is compatible with deep chunking. */ + Chunking_info new_info = info; + if (!new_info.boundary.empty()) { + new_info.boundary += " AND "; + } + new_info.boundary += + info.table->index.info->columns()[info.index_column] + ->quoted_name + + "=" + to_string(current_chunk_begin); + new_info.index_column++; + new_info.row_count = rows_cnt; + DBG(log_debug( + "Nest level: %ld, too much rows (tried chunk by %ld), " + "trying to " + "chunk by %ld, rows to chunk: %ld, rows per chunk: %ld, " + "rows_cnt (from previous): %ld, new_step: %s", + info.index_column, new_info.index_column - 1, + new_info.index_column, new_info.row_count, + new_info.rows_per_chunk, rows_cnt, V2S(new_step));) + + // If we have anything in gluer accumulated before going to the next + // level - flush it + typename Gluer::GlueResult glue_res = gluer->flush(); + if (!glue_res.empty) { + DBG(log_debug( + "Nest level: %ld, will chunk deeper. creating dump task " + "for chunk: " + "%s, rows_cnt: %ld (r: %2f, rpc: %ld, acc: %ld), " + "new_step: " + "%s, last?: %d, idx_column: %ld, cond: %s", + info.index_column, chunk_id.c_str(), glue_res.rows_cnt, + (double)glue_res.rows_cnt / (double)info.rows_per_chunk, + info.rows_per_chunk, info.accuracy, V2S((new_step + 1)), + last_chunk_in_dump, info.index_column, + between(info, glue_res.begin, glue_res.end).c_str());) + create_and_push_table_data_chunk_task( + *info.table, between(info, glue_res.begin, glue_res.end), + chunk_id, info.ranges_counter++, last_chunk_in_dump); + } + + chunk_column(new_info); + processed_by_deep_chunking = true; + } // optimized chunking possible + } else { + // We have some rows. For sure not too much. + if (gluing_to_next_chunk) { + /* We already tried to glue the previous chunk to this one, + but still not enough rows. It might happen that the chunker logic + produced the same chunk again. This is the case when small chunk + precedes a huge one. Most probably in the next iteration we will + go into the nested chunking. Just use this chunk. */ + DBG(log_debug("Nest level: %ld, range: %s, but rows_cnt: %ld. No " + "possibility to " + "glue. Using it.", + info.index_column, V2S(new_step), rows_cnt);) + use_returned_cnt = true; + gluing_to_next_chunk = false; + } else { + /* Try to glue this chunk to the next one */ + DBG(log_debug("Nest level: %ld, range: %s, but rows_cnt: %ld. " + "Trying to glue.", + info.index_column, V2S(new_step), rows_cnt);) + gluing_to_next_chunk = true; + } + } + } // nested chunking // ensure that there's no integer overflow --new_step; - current = (current > max - new_step ? max : current + new_step); + auto current_chunk_end = (current_chunk_begin > max - new_step + ? max + : current_chunk_begin + new_step); + + if (current_chunk_end >= max) { + DBG(log_debug("Nest level: %ld, End of range. rows (%ld). No more rows " + "in range.", + info.index_column, rows_cnt);) + DBG(log_debug("Nest level: %ld, This was the last chunk on this level. " + "Need to " + "dump it as it is", + info.index_column);) + last_chunk_on_this_level = true; + } - const auto end = current; + last_chunk_in_dump = last_chunk_on_this_level && info.index_column == 0; - last_chunk = (current >= max); + // If the current chunk was processed by deep chunking, we have nothing to + // dump. Just go to the next chunk on this level. + // In other case, we need to glue. + auto current_chunk_begin_for_glue = current_chunk_begin; - create_and_push_table_data_chunk_task(*info.table, - between(info, begin, end), chunk_id, - ranges_count++, last_chunk); + current_chunk_begin = current_chunk_end; + ++current_chunk_begin; + if (processed_by_deep_chunking && !last_chunk_in_dump) { + continue; + } + + // Put the chunk through gluer logic + typename Gluer::GlueResult glue_res = + gluer->glue(current_chunk_begin_for_glue, current_chunk_end, rows_cnt, + last_chunk_on_this_level || last_chunk_in_dump); + if (!glue_res.empty) { + DBG( + if (last_chunk_in_dump && glue_res.flushed) { + log_debug( + "Nest level: %ld, this is the last chunk in the dump. Was " + "flushed " + "during last glue.", + info.index_column); + } if (last_chunk_on_this_level && !last_chunk_in_dump && + glue_res.flushed) { + log_debug( + "Nest level: %ld, this is the last chunk on this nested " + "level. Was " + "flushed during last glue.", + info.index_column); + } log_info("Nest level: %ld) creating dump task for chunk: %s, " + "rows_cnt: " + "%ld (r: %2f, rpc: %ld, acc: %ld), new_step: %s, last?: " + "%d, idx_column: %ld, cond: %s", + info.index_column, chunk_id.c_str(), glue_res.rows_cnt, + (double)glue_res.rows_cnt / (double)info.rows_per_chunk, + info.rows_per_chunk, info.accuracy, V2S(new_step + 1), + last_chunk_in_dump, info.index_column, + between(info, glue_res.begin, glue_res.end).c_str());) + create_and_push_table_data_chunk_task( + *info.table, between(info, glue_res.begin, glue_res.end), chunk_id, + info.ranges_counter++, (last_chunk_in_dump && glue_res.flushed)); + } - ++current; + if (!glue_res.flushed && + (last_chunk_on_this_level || last_chunk_in_dump)) { + // Gluer was not flushed during last glue, and this is the last chunk on + // this level or on top level + chunk_id = std::to_string(info.ranges_counter); + glue_res = gluer->flush(); + + if (last_chunk_in_dump) { + // top level. Create even an empty last chunk + DBG(log_debug( + "Nest level: %ld, this is the last chunk in the dump. Needed " + "additional flush after last glue.", + info.index_column); + log_debug( + "Nest level: %ld, creating dump task for chunk: %s, " + "rows_cnt: " + "%ld (r: %2f, rpc: %ld, acc: %ld), new_step: %s, last?: " + "%d, idx_column: %ld, cond: %s", + info.index_column, chunk_id.c_str(), glue_res.rows_cnt, + (double)glue_res.rows_cnt / (double)info.rows_per_chunk, + info.rows_per_chunk, info.accuracy, V2S(new_step + 1), + last_chunk_in_dump, info.index_column, + between(info, glue_res.begin, glue_res.end).c_str());) + create_and_push_table_data_chunk_task( + *info.table, between(info, glue_res.begin, glue_res.end), + chunk_id, info.ranges_counter++, true); + } else { + // Nested level. If there is something to dump, do it. + if (!glue_res.empty) { + DBG(log_debug("Nest level: %ld, this is the last chunk on this " + "nested level. " + "Needed " + "additional flush after last glue.", + info.index_column); + log_debug( + "Nest level: %ld, creating dump task for chunk: %s, " + "rows_cnt: " + "%ld (r: " + "%2f, rpc: %ld, acc: %ld), new_step: %s, last?: %d, " + "idx_column: %ld, cond: %s", + info.index_column, chunk_id.c_str(), glue_res.rows_cnt, + (double)glue_res.rows_cnt / (double)info.rows_per_chunk, + info.rows_per_chunk, info.accuracy, V2S(new_step + 1), + last_chunk_in_dump, info.index_column, + between(info, glue_res.begin, glue_res.end).c_str());) + create_and_push_table_data_chunk_task( + *info.table, between(info, glue_res.begin, glue_res.end), + chunk_id, info.ranges_counter++, false); + } + } + } } - return ranges_count; + return info.ranges_counter; } std::size_t chunk_integer_column(const Chunking_info &info, const Row &begin, @@ -1662,8 +2260,11 @@ class Dumper::Table_worker final { auto row = result->fetch_one(); const auto handle_empty_table = [&info, this]() { - create_and_push_table_data_chunk_task(*info.table, info.boundary, "0", 0, - true); + // If this is PK top level, it means the table is empty + if (info.index_column == 0) { + create_and_push_table_data_chunk_task(*info.table, info.boundary, "0", + 0, true); + } return 1; }; @@ -1750,9 +2351,10 @@ class Dumper::Table_worker final { current_console()->print_note(msg); } } + size_t ranges_counter = 0; + Chunking_info info(ranges_counter); - Chunking_info info; - + info.ranges_counter = ranges_counter; info.table = &table; info.row_count = partition ? partition->row_count : table.info->row_count; info.rows_per_chunk = @@ -1889,8 +2491,8 @@ class Dumper::Table_worker final { uint64_t row_count_from_explain(const std::string &explain) const { const auto error = [&explain](const std::string &msg) { - log_error("JSON output of malformed EXPLAIN statement:\n%s", - explain.c_str()); + log_error("JSON output of malformed EXPLAIN statement:\n%s\nmsg: %s", + explain.c_str(), msg.c_str()); return std::runtime_error(msg); }; @@ -1903,13 +2505,26 @@ class Dumper::Table_worker final { "Failed to parse JSON output of an EXPLAIN statement: %s", e.what())); } + if (auto *v = rapidjson::Pointer("/query_block/message").Get(json)) { + if (v->IsString()) { + std::string msg = v->GetString(); + if (msg.find("no matching row") != std::string::npos || + msg.find("no rows") != std::string::npos) { + log_info( + "EXPLAIN statement returned message indicating that there are no " + "rows to process: %s", + msg.c_str()); + return 0; + } + } + } if (!m_json_path) { for (const auto path : { "/query_plan/inputs/0/estimated_rows", // v2 + BUG#35239659 "/inputs/0/estimated_rows", // JSON format ver. 2 "/query_block/table/rows_examined_per_scan", // 5.7+ (ver. 1) "/query_block/ordering_operation/table/rows", // 5.6 - "/query_block/nested_loop/0/table/rows", // MariaDB + "/query_block/nested_loop/0/table/rows" // MariaDB }) { if (rapidjson::Pointer(path).Get(json)) { m_json_path = path; @@ -1940,6 +2555,10 @@ class Dumper::Table_worker final { return value->GetUint64(); } } else { + log_debug( + "Value at path '%s' in JSON output of an EXPLAIN statement is not a " + "number: %s", + m_json_path, shcore::json::to_string(*value).c_str()); throw error( "The row count in JSON output of an EXPLAIN statement is not a " "number"); @@ -1965,6 +2584,10 @@ template <> Decimal Dumper::Table_worker::sum(const Decimal &value, const Decimal &delta) { return value + delta; } +template <> +Decimal Dumper::Table_worker::mul(uint64_t value1, const Decimal &value2) { + return value1 * value2; +} Dumper::Dumper(const Dump_options &options) : m_options(options), @@ -2249,6 +2872,18 @@ void Dumper::do_run() { current_console()->print_status(msg); } + std::string strategy = (m_options.adaptive_step_strategy() == + mysqlsh::dump::AdaptiveStepStrategy::ENHANCED) + ? "enhanced" + : "original"; + msg = "Using " + strategy + " adaptive step strategy."; + current_console()->print_status(msg); + msg = "Maximum chunking nesting depth: " + + (m_options.max_key_prefix_length() == 0 + ? "unlimited" + : std::to_string(m_options.max_key_prefix_length())); + current_console()->print_status(msg); + if (!m_options.is_dry_run() && m_options.show_progress() && m_options.dump_data()) { current_console()->print_note( From ece24b1b0e19ff5022d36aa2d9569a6353cc34b9 Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 12:58:33 +0200 Subject: [PATCH 3/9] PS-10898: Missing help for new options https://perconadev.atlassian.net/browse/PS-10898 --- modules/util/mod_util.cc | 13 ++++ .../js_shell/validation/cli_help_norecord.js | 64 +++++++++++++++++++ .../js_shell/validation/util_help_norecord.js | 36 +++++++++++ .../py_shell/validation/util_help_norecord.py | 36 +++++++++++ 4 files changed, 149 insertions(+) diff --git a/modules/util/mod_util.cc b/modules/util/mod_util.cc index 900cc44db..87c9e8925 100644 --- a/modules/util/mod_util.cc +++ b/modules/util/mod_util.cc @@ -1806,6 +1806,12 @@ compatibility issues with MySQL HeatWave Service. number of bytes to be written to each chunk file, enables chunking. @li threads: int (default: 4) - Use N threads to dump data chunks from the server. +@li adaptiveStepStrategy: string (default: original) - Select which +algorithm to use for chunk boundary calculation. Set to “original” for +the legacy algorithm or “enhanced” for the new algorithm. +@li maxKeyPrefixLength: int (default: 1) - Define how many primary key +columns, starting from the left, are used for chunking. Set to 0 for no limit. +If the value exceeds the key’s length, the entire key is used. )*"); REGISTER_HELP_DETAIL_TEXT(TOPIC_UTIL_DUMP_DDL_COMPRESSION, R"*( @@ -2430,6 +2436,13 @@ number of bytes to be copied in each chunk, enables chunking. the source server and additional N threads to write the data to the target server. +@li adaptiveStepStrategy: string (default: original) - Select which +algorithm to use for chunk boundary calculation. Set to “original” for +the legacy algorithm or “enhanced” for the new algorithm. +@li maxKeyPrefixLength: int (default: 1) - Define how many primary key +columns, starting from the left, are used for chunking. Set to 0 for no limit. +If the value exceeds the key’s length, the entire key is used. + @li maxRate: string (default: "0") - Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. diff --git a/unittest/scripts/auto/js_shell/validation/cli_help_norecord.js b/unittest/scripts/auto/js_shell/validation/cli_help_norecord.js index c5cbdbaf6..c26b443ff 100644 --- a/unittest/scripts/auto/js_shell/validation/cli_help_norecord.js +++ b/unittest/scripts/auto/js_shell/validation/cli_help_norecord.js @@ -344,6 +344,11 @@ OPTIONS with masking policies will result in an error. When enabled, downgrades this error to a warning. Default: false. +--adaptiveStepStrategy= + Select which algorithm to use for chunk boundary calculation. Set + to “original” for the legacy algorithm or “enhanced” for + the new algorithm. Default: original. + --analyzeTables= "off", "on", "histogram" (default: off) - If 'on', executes ANALYZE TABLE for all tables, once copied. If set to 'histogram', only @@ -498,6 +503,11 @@ OPTIONS (Megabytes), G (Gigabytes). Minimum value: 4096. Default: the value of bytesPerChunk. +--maxKeyPrefixLength= + Define how many primary key columns, starting from the left, are + used for chunking. Set to 0 for no limit. If the value exceeds the + key’s length, the entire key is used. Default: 1. + --maxRate= Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. Default: "0". @@ -578,6 +588,11 @@ OPTIONS with masking policies will result in an error. When enabled, downgrades this error to a warning. Default: false. +--adaptiveStepStrategy= + Select which algorithm to use for chunk boundary calculation. Set + to “original” for the legacy algorithm or “enhanced” for + the new algorithm. Default: original. + --analyzeTables= "off", "on", "histogram" (default: off) - If 'on', executes ANALYZE TABLE for all tables, once copied. If set to 'histogram', only @@ -712,6 +727,11 @@ OPTIONS (Megabytes), G (Gigabytes). Minimum value: 4096. Default: the value of bytesPerChunk. +--maxKeyPrefixLength= + Define how many primary key columns, starting from the left, are + used for chunking. Set to 0 for no limit. If the value exceeds the + key’s length, the entire key is used. Default: 1. + --maxRate= Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. Default: "0". @@ -785,6 +805,11 @@ WHERE establish a connection to the target instance. OPTIONS +--adaptiveStepStrategy= + Select which algorithm to use for chunk boundary calculation. Set + to “original” for the legacy algorithm or “enhanced” for + the new algorithm. Default: original. + --all= Copy all views and tables from the specified schema, requires the tables argument to be an empty list. Default: false. @@ -891,6 +916,11 @@ OPTIONS (Megabytes), G (Gigabytes). Minimum value: 4096. Default: the value of bytesPerChunk. +--maxKeyPrefixLength= + Define how many primary key columns, starting from the left, are + used for chunking. Set to 0 for no limit. If the value exceeds the + key’s length, the entire key is used. Default: 1. + --maxRate= Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. Default: "0". @@ -1066,6 +1096,11 @@ OPTIONS with masking policies will result in an error. When enabled, downgrades this error to a warning. Default: false. +--adaptiveStepStrategy= + Select which algorithm to use for chunk boundary calculation. Set + to “original” for the legacy algorithm or “enhanced” for + the new algorithm. Default: original. + --azureConfigFile= Use the specified Azure configuration file instead of the one at the default location. Default: not set. @@ -1230,6 +1265,11 @@ OPTIONS SELECT ... INTO OUTFILE. See Section 13.2.10.1, "SELECT ... INTO Statement". Default: "\n". +--maxKeyPrefixLength= + Define how many primary key columns, starting from the left, are + used for chunking. Set to 0 for no limit. If the value exceeds the + key’s length, the entire key is used. Default: 1. + --maxRate= Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. Default: "0". @@ -1344,6 +1384,11 @@ OPTIONS with masking policies will result in an error. When enabled, downgrades this error to a warning. Default: false. +--adaptiveStepStrategy= + Select which algorithm to use for chunk boundary calculation. Set + to “original” for the legacy algorithm or “enhanced” for + the new algorithm. Default: original. + --azureConfigFile= Use the specified Azure configuration file instead of the one at the default location. Default: not set. @@ -1488,6 +1533,11 @@ OPTIONS SELECT ... INTO OUTFILE. See Section 13.2.10.1, "SELECT ... INTO Statement". Default: "\n". +--maxKeyPrefixLength= + Define how many primary key columns, starting from the left, are + used for chunking. Set to 0 for no limit. If the value exceeds the + key’s length, the entire key is used. Default: 1. + --maxRate= Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. Default: "0". @@ -1597,6 +1647,11 @@ WHERE tables: List of tables/views to be dumped. OPTIONS +--adaptiveStepStrategy= + Select which algorithm to use for chunk boundary calculation. Set + to “original” for the legacy algorithm or “enhanced” for + the new algorithm. Default: original. + --all= Dump all views and tables from the specified schema. Default: false. @@ -1713,6 +1768,11 @@ OPTIONS SELECT ... INTO OUTFILE. See Section 13.2.10.1, "SELECT ... INTO Statement". Default: "\n". +--maxKeyPrefixLength= + Define how many primary key columns, starting from the left, are + used for chunking. Set to 0 for no limit. If the value exceeds the + key’s length, the entire key is used. Default: 1. + --maxRate= Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. Default: "0". @@ -1823,6 +1883,8 @@ OPTIONS with masking policies will result in an error. When enabled, downgrades this error to a warning. Default: false. +--adaptiveStepStrategy= + --azureConfigFile= Use the specified Azure configuration file instead of the one at the default location. Default: not set. @@ -1878,6 +1940,8 @@ OPTIONS SELECT ... INTO OUTFILE. See Section 13.2.10.1, "SELECT ... INTO Statement". Default: "\n". +--maxKeyPrefixLength= + --maxRate= Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. Default: "0". diff --git a/unittest/scripts/auto/js_shell/validation/util_help_norecord.js b/unittest/scripts/auto/js_shell/validation/util_help_norecord.js index 2a14f8aba..5ab8fbd95 100644 --- a/unittest/scripts/auto/js_shell/validation/util_help_norecord.js +++ b/unittest/scripts/auto/js_shell/validation/util_help_norecord.js @@ -298,6 +298,12 @@ DESCRIPTION - threads: int (default: 4) - Use N threads to read the data from the source server and additional N threads to write the data to the target server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - maxRate: string (default: "0") - Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. @@ -455,6 +461,12 @@ DESCRIPTION - threads: int (default: 4) - Use N threads to read the data from the source server and additional N threads to write the data to the target server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - maxRate: string (default: "0") - Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. @@ -595,6 +607,12 @@ DESCRIPTION - threads: int (default: 4) - Use N threads to read the data from the source server and additional N threads to write the data to the target server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - maxRate: string (default: "0") - Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. @@ -891,6 +909,12 @@ DESCRIPTION of bytes to be written to each chunk file, enables chunking. - threads: int (default: 4) - Use N threads to dump data chunks from the server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - fieldsTerminatedBy: string (default: "\t") - This option has the same meaning as the corresponding clause for SELECT ... INTO OUTFILE. - fieldsEnclosedBy: char (default: '') - This option has the same meaning @@ -1363,6 +1387,12 @@ DESCRIPTION of bytes to be written to each chunk file, enables chunking. - threads: int (default: 4) - Use N threads to dump data chunks from the server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - fieldsTerminatedBy: string (default: "\t") - This option has the same meaning as the corresponding clause for SELECT ... INTO OUTFILE. - fieldsEnclosedBy: char (default: '') - This option has the same meaning @@ -1806,6 +1836,12 @@ DESCRIPTION of bytes to be written to each chunk file, enables chunking. - threads: int (default: 4) - Use N threads to dump data chunks from the server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - fieldsTerminatedBy: string (default: "\t") - This option has the same meaning as the corresponding clause for SELECT ... INTO OUTFILE. - fieldsEnclosedBy: char (default: '') - This option has the same meaning diff --git a/unittest/scripts/auto/py_shell/validation/util_help_norecord.py b/unittest/scripts/auto/py_shell/validation/util_help_norecord.py index c672dd4f8..2250d6853 100644 --- a/unittest/scripts/auto/py_shell/validation/util_help_norecord.py +++ b/unittest/scripts/auto/py_shell/validation/util_help_norecord.py @@ -298,6 +298,12 @@ - threads: int (default: 4) - Use N threads to read the data from the source server and additional N threads to write the data to the target server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - maxRate: string (default: "0") - Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. @@ -455,6 +461,12 @@ - threads: int (default: 4) - Use N threads to read the data from the source server and additional N threads to write the data to the target server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - maxRate: string (default: "0") - Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. @@ -595,6 +607,12 @@ - threads: int (default: 4) - Use N threads to read the data from the source server and additional N threads to write the data to the target server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - maxRate: string (default: "0") - Limit data read throughput to maximum rate, measured in bytes per second per thread. Use maxRate="0" to set no limit. @@ -892,6 +910,12 @@ of bytes to be written to each chunk file, enables chunking. - threads: int (default: 4) - Use N threads to dump data chunks from the server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - fieldsTerminatedBy: string (default: "\t") - This option has the same meaning as the corresponding clause for SELECT ... INTO OUTFILE. - fieldsEnclosedBy: char (default: '') - This option has the same meaning @@ -1364,6 +1388,12 @@ of bytes to be written to each chunk file, enables chunking. - threads: int (default: 4) - Use N threads to dump data chunks from the server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - fieldsTerminatedBy: string (default: "\t") - This option has the same meaning as the corresponding clause for SELECT ... INTO OUTFILE. - fieldsEnclosedBy: char (default: '') - This option has the same meaning @@ -1807,6 +1837,12 @@ of bytes to be written to each chunk file, enables chunking. - threads: int (default: 4) - Use N threads to dump data chunks from the server. + - adaptiveStepStrategy: string (default: original) - Select which + algorithm to use for chunk boundary calculation. Set to “original” + for the legacy algorithm or “enhanced” for the new algorithm. + - maxKeyPrefixLength: int (default: 1) - Define how many primary key + columns, starting from the left, are used for chunking. Set to 0 for no + limit. If the value exceeds the key’s length, the entire key is used. - fieldsTerminatedBy: string (default: "\t") - This option has the same meaning as the corresponding clause for SELECT ... INTO OUTFILE. - fieldsEnclosedBy: char (default: '') - This option has the same meaning From 3f4d89a1167cd506f5db46f2241d0a7121dacaf3 Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 12:58:59 +0200 Subject: [PATCH 4/9] PS-10897: util.dumpInstance o/p shows incorrect no of rows w/ adaptiveStepStrategy: "enhanced" PS-10912: Partition table dumpInstance o/p shows incorrect no of rows w/ adaptiveStepStrategy: "enhanced" PS-10935: dumpInstance: rows written does not match for Unique Indexs w/ adaptiveStepStrategy: "enhanced" https://perconadev.atlassian.net/browse/PS-10897 https://perconadev.atlassian.net/browse/PS-10912 https://perconadev.atlassian.net/browse/PS-10935 Problem: When the last PK(0) is processed by the nested chunking, the nested chunk is the last chunk in the dump. In such a case, when we return from nested chunking logic, there is nothing else to be chunked on the top level. However, the top level logic was not aware of the above and attempted to dump the last chunk, which was the whole PK(0) key. Effectively PK(0) was dumped twice: the first time by nested chunking, the second time by the top level. Solution: Top level generates the last chunk which is empty. Generating the last chunk is required by the protocol. --- modules/util/dump/dumper.cc | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/modules/util/dump/dumper.cc b/modules/util/dump/dumper.cc index fcf553c5a..e48f0f464 100644 --- a/modules/util/dump/dumper.cc +++ b/modules/util/dump/dumper.cc @@ -2060,15 +2060,28 @@ class Dumper::Table_worker final { last_chunk_in_dump = last_chunk_on_this_level && info.index_column == 0; - // If the current chunk was processed by deep chunking, we have nothing to - // dump. Just go to the next chunk on this level. + // If the current chunk was processed by nested chunking, we have + // nothing to dump. Just go to the next chunk on this level. // In other case, we need to glue. auto current_chunk_begin_for_glue = current_chunk_begin; current_chunk_begin = current_chunk_end; ++current_chunk_begin; - if (processed_by_deep_chunking && !last_chunk_in_dump) { - continue; + if (processed_by_deep_chunking) { + if (!last_chunk_in_dump) continue; + + // This is the last chunk in the dump and it was processed by nested + // chunking. Nothing else to dump. We need to create an empty chunk + // to mark the end of dump. + DBG(log_debug("Nest level: %ld, this is the last chunk in the dump. It " + "was processed by nested chunking. Creating an empty " + "chunk to mark the end of dump.", + info.index_column);) + // The following condition will evaluate to 'false' always, causing + // the empty chunk to be created. + const_cast(info).boundary = "1=0"; + chunk_id = std::to_string(info.ranges_counter); + rows_cnt = 0; } // Put the chunk through gluer logic @@ -2083,22 +2096,24 @@ class Dumper::Table_worker final { "flushed " "during last glue.", info.index_column); - } if (last_chunk_on_this_level && !last_chunk_in_dump && + } + if (last_chunk_on_this_level && !last_chunk_in_dump && glue_res.flushed) { log_debug( "Nest level: %ld, this is the last chunk on this nested " "level. Was " "flushed during last glue.", info.index_column); - } log_info("Nest level: %ld) creating dump task for chunk: %s, " - "rows_cnt: " - "%ld (r: %2f, rpc: %ld, acc: %ld), new_step: %s, last?: " - "%d, idx_column: %ld, cond: %s", - info.index_column, chunk_id.c_str(), glue_res.rows_cnt, - (double)glue_res.rows_cnt / (double)info.rows_per_chunk, - info.rows_per_chunk, info.accuracy, V2S(new_step + 1), - last_chunk_in_dump, info.index_column, - between(info, glue_res.begin, glue_res.end).c_str());) + } + log_info("Nest level: %ld) creating dump task for chunk: %s, " + "rows_cnt: " + "%ld (r: %2f, rpc: %ld, acc: %ld), new_step: %s, last?: " + "%d, idx_column: %ld, cond: %s", + info.index_column, chunk_id.c_str(), glue_res.rows_cnt, + (double)glue_res.rows_cnt / (double)info.rows_per_chunk, + info.rows_per_chunk, info.accuracy, V2S(new_step + 1), + last_chunk_in_dump, info.index_column, + between(info, glue_res.begin, glue_res.end).c_str());) create_and_push_table_data_chunk_task( *info.table, between(info, glue_res.begin, glue_res.end), chunk_id, info.ranges_counter++, (last_chunk_in_dump && glue_res.flushed)); From 70e38233174602bad34742cefad6f02f4538835a Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 12:59:16 +0200 Subject: [PATCH 5/9] PS-10922: Chunking errors when partitioned tables w/ adaptiveStepStrategy: "enhanced" https://perconadev.atlassian.net/browse/PS-10922 Problem: When trying to estimate rows count in a given range, parsing of the EXPLAIN query result fails. This is because the original implementaton of parsing EXPLAIN output JSON does not cover all possible return values. In such a case exception is raised and execution stops with error. Solution: Added handling of the case when EXPLAIN output says 'zero_rows_aggregated', which means zero rows in a range. --- modules/util/dump/dumper.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/modules/util/dump/dumper.cc b/modules/util/dump/dumper.cc index e48f0f464..b08bceb04 100644 --- a/modules/util/dump/dumper.cc +++ b/modules/util/dump/dumper.cc @@ -2520,6 +2520,13 @@ class Dumper::Table_worker final { "Failed to parse JSON output of an EXPLAIN statement: %s", e.what())); } + // This function is fragile, as originally it expects info about rows + // count to be in a specific place in the JSON output of EXPLAIN statement. + // Moreover it expects it to be a number. However, in some cases + // (e.g. when there are no rows to process) the output may differ between + // MySQL versions. + // That's why I think it should return 0 when it is not able to find the row + // count in the expected place. if (auto *v = rapidjson::Pointer("/query_block/message").Get(json)) { if (v->IsString()) { std::string msg = v->GetString(); @@ -2533,6 +2540,18 @@ class Dumper::Table_worker final { } } } + if (auto *v = rapidjson::Pointer("/query_plan/access_type").Get(json)) { + if (v->IsString()) { + std::string msg = v->GetString(); + if (msg.find("zero_rows_aggregated") != std::string::npos) { + log_info( + "EXPLAIN statement returned message indicating that there are no " + "rows to process: %s", + msg.c_str()); + return 0; + } + } + } if (!m_json_path) { for (const auto path : { "/query_plan/inputs/0/estimated_rows", // v2 + BUG#35239659 From 33881b46f26f0e476687e273099823a1be85a5fa Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 13:00:23 +0200 Subject: [PATCH 6/9] PS-10933: dumpInstance() when chunking:false skip messages for step strategy & chunking nesting depth https://perconadev.atlassian.net/browse/PS-10933 Improved config options dependencies handling. --- modules/util/dump/ddl_dumper_options.cc | 40 ++++++++++++++++++++++++ modules/util/dump/ddl_dumper_options.h | 16 ++++++++++ modules/util/dump/dump_options.cc | 15 --------- modules/util/dump/dump_options.h | 14 ++------- modules/util/dump/dumper.cc | 25 ++++++++------- modules/util/dump/export_table_options.h | 6 ++++ 6 files changed, 78 insertions(+), 38 deletions(-) diff --git a/modules/util/dump/ddl_dumper_options.cc b/modules/util/dump/ddl_dumper_options.cc index aca6d5e6d..f518482cf 100644 --- a/modules/util/dump/ddl_dumper_options.cc +++ b/modules/util/dump/ddl_dumper_options.cc @@ -90,6 +90,10 @@ const shcore::Option_pack_def .optional("checksum", &Ddl_dumper_options::m_checksum) .optional("lakehouseTarget", &Ddl_dumper_options::set_lakehouse_target) + .optional("maxKeyPrefixLength", + &Ddl_dumper_options::set_max_key_prefix_len) + .optional("adaptiveStepStrategy", + &Ddl_dumper_options::set_adaptive_step_strategy) .on_done(&Ddl_dumper_options::on_unpacked_options); return opts; @@ -207,6 +211,42 @@ void Ddl_dumper_options::set_threads(uint64_t threads) { m_worker_threads = threads; } +AdaptiveStepStrategy Ddl_dumper_options::to_adaptive_step_strategy( + const std::string option) { + if (option == "enhanced") { + return AdaptiveStepStrategy::ENHANCED; + } else if (option == "original") { + return AdaptiveStepStrategy::ORIGINAL; + } + throw std::invalid_argument( + "Invalid value for 'adaptiveStepStrategy' option: " + option); +} + +void Ddl_dumper_options::set_max_key_prefix_len(const size_t &value) { + if (!split()) { + throw std::invalid_argument( + "The option 'maxKeyPrefixLength' cannot be used if the 'chunking' " + "option is set to false."); + } + + m_max_key_prefix_len = value; +} + +void Ddl_dumper_options::set_adaptive_step_strategy(const std::string &value) { + if (value.empty()) { + throw std::invalid_argument( + "The option 'adaptiveStepStrategy' cannot be set to an empty string."); + } + + if (!split()) { + throw std::invalid_argument( + "The option 'adaptiveStepStrategy' cannot be used if the 'chunking' " + "option is set to false."); + } + + m_adaptive_step_strategy = to_adaptive_step_strategy(value); +} + void Ddl_dumper_options::on_set_url( const std::string &url, Storage_type storage, const mysqlshdk::storage::Config_ptr &config) { diff --git a/modules/util/dump/ddl_dumper_options.h b/modules/util/dump/ddl_dumper_options.h index 8b185b19d..dd5ca0b95 100644 --- a/modules/util/dump/ddl_dumper_options.h +++ b/modules/util/dump/ddl_dumper_options.h @@ -81,6 +81,12 @@ class Ddl_dumper_options : public Dump_options { bool checksum() const override { return m_checksum; } + AdaptiveStepStrategy adaptive_step_strategy() const override { + return m_adaptive_step_strategy; + } + + size_t max_key_prefix_length() const override { return m_max_key_prefix_len; } + void enable_mds_compatibility_checks(); using Dump_options::set_compatibility_option; @@ -106,6 +112,10 @@ class Ddl_dumper_options : public Dump_options { void set_target_version_str(const std::string &value); void set_dry_run(bool dry_run); void set_threads(uint64_t threads); + void set_max_key_prefix_len(const size_t &value); + void set_adaptive_step_strategy(const std::string &value); + + AdaptiveStepStrategy to_adaptive_step_strategy(const std::string option); bool m_split = true; uint64_t m_bytes_per_chunk; @@ -126,6 +136,12 @@ class Ddl_dumper_options : public Dump_options { bool m_skip_consistency_checks = false; bool m_skip_upgrade_checks = false; bool m_checksum = false; + + // max nesting depth for composite keys + size_t m_max_key_prefix_len = 1; + + AdaptiveStepStrategy m_adaptive_step_strategy = + AdaptiveStepStrategy::ORIGINAL; }; } // namespace dump diff --git a/modules/util/dump/dump_options.cc b/modules/util/dump/dump_options.cc index a8837b54a..0f09007be 100644 --- a/modules/util/dump/dump_options.cc +++ b/modules/util/dump/dump_options.cc @@ -62,8 +62,6 @@ const shcore::Option_pack_def &Dump_options::options() { .optional("defaultCharacterSet", &Dump_options::m_character_set) // WL17279-FR1.2: `allowDataMasking` option .optional("allowDataMasking", &Dump_options::m_allow_data_masking) - .optional("maxKeyPrefixLength", &Dump_options::m_max_key_prefix_len) - .optional("adaptiveStepStrategy", &Dump_options::set_string_option) .include(&Dump_options::m_dialect_unpacker) .on_done(&Dump_options::on_unpacked_options); @@ -78,17 +76,6 @@ void Dump_options::on_start_unpack(const shcore::Dictionary_t &options) { m_options = options; } -AdaptiveStepStrategy Dump_options::to_adaptive_step_strategy( - const std::string option) { - if (option == "enhanced") { - return AdaptiveStepStrategy::ENHANCED; - } else if (option == "original") { - return AdaptiveStepStrategy::ORIGINAL; - } - throw std::invalid_argument( - "Invalid value for 'adaptiveStepStrategy' option: " + option); -} - void Dump_options::set_string_option(const std::string &option, const std::string &value) { if (option == "maxRate") { @@ -102,8 +89,6 @@ void Dump_options::set_string_option(const std::string &option, } m_compression = mysqlshdk::storage::to_compression(value, &m_compression_options); - } else if (option == "adaptiveStepStrategy") { - m_adaptive_step_strategy = to_adaptive_step_strategy(value); } else { // This function should only be called with the options above. assert(false); diff --git a/modules/util/dump/dump_options.h b/modules/util/dump/dump_options.h index 5434cc59b..fa6799c42 100644 --- a/modules/util/dump/dump_options.h +++ b/modules/util/dump/dump_options.h @@ -163,11 +163,9 @@ class Dump_options : public mysqlsh::common::Common_options { void set_report_dump_option(bool value) { m_report_dump_option = value; } - AdaptiveStepStrategy adaptive_step_strategy() const { - return m_adaptive_step_strategy; - } + virtual AdaptiveStepStrategy adaptive_step_strategy() const = 0; - size_t max_key_prefix_length() const { return m_max_key_prefix_len; } + virtual size_t max_key_prefix_length() const = 0; virtual bool split() const = 0; @@ -270,8 +268,6 @@ class Dump_options : public mysqlsh::common::Common_options { void validate_partitions() const; - AdaptiveStepStrategy to_adaptive_step_strategy(const std::string option); - // input arguments shcore::Dictionary_t m_options; @@ -317,12 +313,6 @@ class Dump_options : public mysqlsh::common::Common_options { Compatibility_options m_compatibility_options; std::optional m_target_version; std::optional m_lakehouse_target; - - // max nesting depth for composite keys - size_t m_max_key_prefix_len = 1; - - AdaptiveStepStrategy m_adaptive_step_strategy = - AdaptiveStepStrategy::ORIGINAL; }; } // namespace dump diff --git a/modules/util/dump/dumper.cc b/modules/util/dump/dumper.cc index b08bceb04..79bdaca32 100644 --- a/modules/util/dump/dumper.cc +++ b/modules/util/dump/dumper.cc @@ -2906,17 +2906,20 @@ void Dumper::do_run() { current_console()->print_status(msg); } - std::string strategy = (m_options.adaptive_step_strategy() == - mysqlsh::dump::AdaptiveStepStrategy::ENHANCED) - ? "enhanced" - : "original"; - msg = "Using " + strategy + " adaptive step strategy."; - current_console()->print_status(msg); - msg = "Maximum chunking nesting depth: " + - (m_options.max_key_prefix_length() == 0 - ? "unlimited" - : std::to_string(m_options.max_key_prefix_length())); - current_console()->print_status(msg); + // print chunking strategy information only if chunking is enabled + if (m_options.split()) { + std::string strategy = (m_options.adaptive_step_strategy() == + mysqlsh::dump::AdaptiveStepStrategy::ENHANCED) + ? "enhanced" + : "original"; + msg = "Using " + strategy + " adaptive step strategy."; + current_console()->print_status(msg); + msg = "Maximum chunking nesting depth: " + + (m_options.max_key_prefix_length() == 0 + ? "unlimited" + : std::to_string(m_options.max_key_prefix_length())); + current_console()->print_status(msg); + } if (!m_options.is_dry_run() && m_options.show_progress() && m_options.dump_data()) { diff --git a/modules/util/dump/export_table_options.h b/modules/util/dump/export_table_options.h index e43a382fd..14d445bcc 100644 --- a/modules/util/dump/export_table_options.h +++ b/modules/util/dump/export_table_options.h @@ -93,6 +93,12 @@ class Export_table_options : public Dump_options { bool checksum() const override { return false; } + AdaptiveStepStrategy adaptive_step_strategy() const override { + return AdaptiveStepStrategy::ORIGINAL; + } + + size_t max_key_prefix_length() const override { return 1; } + private: void on_set_session( const std::shared_ptr &session) override; From d0ea7a0963c1f9fdeb19d2e46e275e42bc41bf27 Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 13:00:45 +0200 Subject: [PATCH 7/9] Addressed review comments. --- modules/util/dump/ddl_dumper_options.cc | 22 +++++++++++----------- modules/util/dump/ddl_dumper_options.h | 2 -- modules/util/dump/dumper.cc | 3 ++- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/modules/util/dump/ddl_dumper_options.cc b/modules/util/dump/ddl_dumper_options.cc index f518482cf..95aad0743 100644 --- a/modules/util/dump/ddl_dumper_options.cc +++ b/modules/util/dump/ddl_dumper_options.cc @@ -211,17 +211,6 @@ void Ddl_dumper_options::set_threads(uint64_t threads) { m_worker_threads = threads; } -AdaptiveStepStrategy Ddl_dumper_options::to_adaptive_step_strategy( - const std::string option) { - if (option == "enhanced") { - return AdaptiveStepStrategy::ENHANCED; - } else if (option == "original") { - return AdaptiveStepStrategy::ORIGINAL; - } - throw std::invalid_argument( - "Invalid value for 'adaptiveStepStrategy' option: " + option); -} - void Ddl_dumper_options::set_max_key_prefix_len(const size_t &value) { if (!split()) { throw std::invalid_argument( @@ -232,6 +221,17 @@ void Ddl_dumper_options::set_max_key_prefix_len(const size_t &value) { m_max_key_prefix_len = value; } +static AdaptiveStepStrategy to_adaptive_step_strategy( + const std::string &option) { + if (option == "enhanced") { + return AdaptiveStepStrategy::ENHANCED; + } else if (option == "original") { + return AdaptiveStepStrategy::ORIGINAL; + } + throw std::invalid_argument( + "Invalid value for 'adaptiveStepStrategy' option: " + option); +} + void Ddl_dumper_options::set_adaptive_step_strategy(const std::string &value) { if (value.empty()) { throw std::invalid_argument( diff --git a/modules/util/dump/ddl_dumper_options.h b/modules/util/dump/ddl_dumper_options.h index dd5ca0b95..302ed7378 100644 --- a/modules/util/dump/ddl_dumper_options.h +++ b/modules/util/dump/ddl_dumper_options.h @@ -115,8 +115,6 @@ class Ddl_dumper_options : public Dump_options { void set_max_key_prefix_len(const size_t &value); void set_adaptive_step_strategy(const std::string &value); - AdaptiveStepStrategy to_adaptive_step_strategy(const std::string option); - bool m_split = true; uint64_t m_bytes_per_chunk; diff --git a/modules/util/dump/dumper.cc b/modules/util/dump/dumper.cc index 79bdaca32..d23b98169 100644 --- a/modules/util/dump/dumper.cc +++ b/modules/util/dump/dumper.cc @@ -1653,7 +1653,8 @@ class Dumper::Table_worker final { rows_cnt_ += rows_cnt; } - if (rows_cnt_ > 3 * max_rows_cnt_ || last) { + static constexpr uint64_t flush_threshold_multiplier = 3; + if (rows_cnt_ > flush_threshold_multiplier * max_rows_cnt_ || last) { // flush gluer if we accumulated a lot of chunks or this is the last // one DBG_GLUE(log_debug("Gluer::glue() - Gluer full. rows: %ld, last?: %d", From e5e57ab636bd88d9e3554f269db1d3f1903df0ec Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 23 Apr 2026 14:17:03 +0200 Subject: [PATCH 8/9] 1. Addressed review comments. 2. The project default language mode moved to C++23 in 9.7.0, which changes how 0 -> std::string is resolved, and that trips the deleted std::string(nullptr_t) constructor when Decimal is constructed from 0. --- modules/util/dump/decimal.cc | 2 ++ modules/util/dump/decimal.h | 3 +++ modules/util/dump/dumper.cc | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/util/dump/decimal.cc b/modules/util/dump/decimal.cc index 967bcd38d..407b5b0eb 100644 --- a/modules/util/dump/decimal.cc +++ b/modules/util/dump/decimal.cc @@ -48,6 +48,8 @@ Decimal::Decimal(const std::string &d) { } } +Decimal::Decimal(std::int64_t value) : m_decimal(value) {} + std::string Decimal::to_string() const { auto str = m_decimal.to_string(); diff --git a/modules/util/dump/decimal.h b/modules/util/dump/decimal.h index bdcd03966..de16af4ac 100644 --- a/modules/util/dump/decimal.h +++ b/modules/util/dump/decimal.h @@ -51,6 +51,9 @@ class Decimal final { public: explicit Decimal(const std::string &d); + /** Integer value with no fractional digits (e.g. generic chunking code). */ + explicit Decimal(std::int64_t value); + Decimal(const Decimal &other) = default; Decimal(Decimal &&other) = default; diff --git a/modules/util/dump/dumper.cc b/modules/util/dump/dumper.cc index d23b98169..2807f9582 100644 --- a/modules/util/dump/dumper.cc +++ b/modules/util/dump/dumper.cc @@ -1401,7 +1401,7 @@ class Dumper::Table_worker final { template static T mul(uint64_t value1, const T &value2) { - if (value1 == 0 || value2 == 0) return T{0}; + if (value1 == 0 || value2 == 0) return T(0); using R = std::common_type_t; From 4b62d970e48e356a892bdbf5cb94a384f16d6c38 Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 14 May 2026 15:11:40 +0200 Subject: [PATCH 9/9] Chunker: stabilize EXPLAIN row-count estimates by forcing the chunking index in adaptive_step_v2 Problem: When chunking integer columns with adaptiveStepStrategy: "enhanced", adaptive_step_v2() asks the server for a per-range row-count estimate via EXPLAIN FORMAT=JSON SELECT COUNT(*) and uses that estimate to drive a binary chop on the chunking range. This relies on the estimate being roughly monotonic in the range width. In practice, on tables with composite keys and additional secondary indexes covering a leading key part, the optimizer can pick a different access path for the EXPLAIN'd COUNT(*) than the index the chunker is iterating on (e.g. a `ref` lookup on a shorter index that ignores the BETWEEN predicate on a later key part). When that happens, EXPLAIN returns a constant cardinality (~ rows for the leading key part) regardless of the BETWEEN range, while for narrower ranges it can flip to the primary key and report 0. The chop loop sees this 0/N flapping, its `left` cursor is raised on the first false "expand" probe, and the loop exits via `left >= right` with rows == 0 and a wide step. Because new_step != 1, the deep-chunking branch (chunk by next key part) is never entered and the chunk is emitted as one big slice. Solution: Pin the EXPLAIN COUNT(*) probe in adaptive_step_v2() to the same index the chunker is iterating on by appending a FORCE INDEX (...) clause after the table reference. With that, EXPLAIN's `rows` is the records_in_range estimate against the chunking index, monotone in the range width (modulo small dive noise). The chop converges to range 1 when the range really is too big, which lets the existing deep-chunking path engage on the next key part as designed. The change is intentionally narrow: * only adaptive_step_v2 (the "enhanced" strategy); * only the EXPLAIN probe (not the dump-data SELECT, not the boundary SELECTs in chunk_column / chunk_non_integer_column, not the original adaptive_step v1); * only the integer column path (adaptive_step_v2 is reachable only from chunk_integer_column). To carry the index name into the chunker, Instance_cache::Index gains a quoted_name() accessor populated via set_name() at build time from the index map key in the cache. A small helper force_index_clause() in dumper.cc formats the clause and returns "" when the table has no usable chunking index, so behavior is unchanged in that case. --- modules/util/dump/dumper.cc | 14 ++++++++++++++ modules/util/dump/instance_cache.cc | 5 +++++ modules/util/dump/instance_cache.h | 6 ++++++ 3 files changed, 25 insertions(+) diff --git a/modules/util/dump/dumper.cc b/modules/util/dump/dumper.cc index 2807f9582..d1eef6169 100644 --- a/modules/util/dump/dumper.cc +++ b/modules/util/dump/dumper.cc @@ -190,6 +190,19 @@ std::string quote(const std::string &schema, const std::string &table) { shcore::quote_identifier(table); } +// Returns a `FORCE INDEX (...)` clause (with leading space) that pins the query +// to the index used by the chunker, or an empty string when no index is in use. +// Used by the EXPLAIN COUNT(*) probes in `adaptive_step_v2` for the integer +// chunking path, to stabilize optimizer row-count estimates - without it, the +// optimizer may pick a different access path (e.g. a `ref` lookup on a shorter +// index that ignores the BETWEEN predicate on a later key part), making the +// EXPLAIN row counts non-monotonic in the range width and breaking the binary +// chop loop. +std::string force_index_clause(const Instance_cache::Index *index) { + if (!index) return {}; + return " FORCE INDEX (" + index->quoted_name() + ")"; +} + Row fetch_row(const mysqlshdk::db::IRow *row) { Row result; @@ -1703,6 +1716,7 @@ class Dumper::Table_worker final { query("EXPLAIN FORMAT=JSON SELECT " + m_dumper->optimizer_hints(info.table->info) + "COUNT(*) FROM " + info.table->quoted_name + info.partition + + force_index_clause(info.table->index.info) + where(*info.table, between(info, begin, end)) + info.order_by + comment) ->fetch_one_or_throw() diff --git a/modules/util/dump/instance_cache.cc b/modules/util/dump/instance_cache.cc index 116d40f62..4fe9ff9ae 100644 --- a/modules/util/dump/instance_cache.cc +++ b/modules/util/dump/instance_cache.cc @@ -133,6 +133,10 @@ void Instance_cache::Index::add_column(const Column *column) { m_columns_sql += column->quoted_name; } +void Instance_cache::Index::set_name(const std::string &name) { + m_quoted_name = shcore::quote_identifier(name); +} + Instance_cache_builder::Instance_cache_builder( const std::shared_ptr &session, const mysqlshdk::db::Filtering_options &filters, Instance_cache &&cache) @@ -839,6 +843,7 @@ void Instance_cache_builder::fetch_table_indexes() { } if (add_index) { + new_index.set_name(index.first); auto ptr = &t.indexes.emplace(index.first, std::move(new_index)) .first->second; diff --git a/modules/util/dump/instance_cache.h b/modules/util/dump/instance_cache.h index 470e6554e..a425dc7f7 100644 --- a/modules/util/dump/instance_cache.h +++ b/modules/util/dump/instance_cache.h @@ -86,9 +86,15 @@ struct Instance_cache { inline const std::string &columns_sql() const { return m_columns_sql; } + inline const std::string "ed_name() const { return m_quoted_name; } + + void set_name(const std::string &name); + void add_column(const Column *column); private: + std::string m_quoted_name; + std::vector m_columns; std::string m_columns_sql;