From 007081a182744ad9c608e3b4c72c34cbb93d349a Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Sat, 24 Oct 2020 12:54:37 -0700 Subject: [PATCH] Faster value decoding Upstream commit ID : fb-mysql-5.6.35/0e45b20a6a7e205f230314e1c6d1d91b752ac372 PS-7731 : Merge percona-202102 Summary: Similar to integer key decoding improvement, but for values: * Remove dependency to field* and cache those information in Rdb_field_encoder * No more Field::move_field - just write to buffer directly We were originally planning to use Rdb_protocol_value_decoder for MySQL bypass project to eliminate overhead of decoding to integer then convert to string, but in practice it probably makes little difference and it is not going to be needed when we go with bypass RPC (which is thrift binary protocol). So this changes removes Rdb_protocol_value_decoder completely. One thing that worth mentioning is I've unified pack_length and pack_length_in_rec because they are essentially identical in our scenarios and in the case they could be different (more packed BITS format that borrows bits from null byte when storage engine supports it) we don't really support it at all, so unifying it to avoid confusion and assert it. With this MyRocks is now either faster or on par with InnoDB when it comes to SELECT integer columns stored in value. NOTE:FDO data needs to be updated for this diff in order to take full advantage of this change. Reference Patch: https://github.com/facebook/mysql-5.6/commit/f4cefbaca8753c582ee1b372be9be9c6fc9915b4 Reviewed By: Pushapgl Differential Revision: D25880543 fbshipit-source-id: 19faf536554 --- storage/rocksdb/rdb_converter.cc | 147 +++++++++++++++---------------- storage/rocksdb/rdb_converter.h | 19 ++-- storage/rocksdb/rdb_datadic.cc | 2 +- storage/rocksdb/rdb_datadic.h | 15 +++- 4 files changed, 93 insertions(+), 90 deletions(-) diff --git a/storage/rocksdb/rdb_converter.cc b/storage/rocksdb/rdb_converter.cc index 8fec38072f0e..bac2eda7acbe 100644 --- a/storage/rocksdb/rdb_converter.cc +++ b/storage/rocksdb/rdb_converter.cc @@ -59,56 +59,43 @@ void dbug_modify_key_varchar8(String *on_disk_rec) { 0 OK other HA_ERR error code (can be SE-specific) */ -int Rdb_convert_to_record_value_decoder::decode(uchar *const buf, uint *offset, - TABLE *table, - my_core::Field *field, +int Rdb_convert_to_record_value_decoder::decode(uchar *const buf, TABLE *table, Rdb_field_encoder *field_dec, Rdb_string_reader *reader, bool decode, bool is_null) { DBUG_TRACE; int err = HA_EXIT_SUCCESS; - - uint field_offset = field->field_ptr() - table->record[0]; - *offset = field_offset; - uint null_offset = field->null_offset(); - bool maybe_null = field->is_nullable(); - field->move_field(buf + field_offset, - maybe_null ? buf + null_offset : nullptr, field->null_bit); - + auto ptr = buf + field_dec->m_field_offset; if (is_null) { - if (decode) { + if (decode && field_dec->maybe_null()) { // This sets the NULL-bit of this record - field->set_null(); + buf[field_dec->m_field_null_offset] |= field_dec->m_field_null_mask; + /* Besides that, set the field value to default value. CHECKSUM TABLE depends on this. */ - memcpy(field->field_ptr(), table->s->default_values + field_offset, - field->pack_length()); + memcpy(ptr, table->s->default_values + field_dec->m_field_offset, + field_dec->m_field_pack_length); } } else { - if (decode) { + if (decode && field_dec->maybe_null()) { // sets non-null bits for this record - field->set_notnull(); + buf[field_dec->m_field_null_offset] &= ~(field_dec->m_field_null_mask); } - if (!field->is_virtual_gcol()) { + if (!field_dec->m_is_virtual_gcol) { if (field_dec->m_field_type == MYSQL_TYPE_BLOB || field_dec->m_field_type == MYSQL_TYPE_JSON) { - err = decode_blob(table, field, reader, decode); + err = decode_blob(table, ptr, field_dec, reader, decode); } else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR) { - err = decode_varchar(field, reader, decode); + err = decode_varchar(ptr, field_dec, reader, decode); } else { - err = decode_fixed_length_field(field, field_dec, reader, decode); + err = decode_fixed_length_field(ptr, field_dec, reader, decode); } } } - // Restore field->ptr and field->null_ptr - field->move_field(table->record[0] + field_offset, - maybe_null ? table->record[0] + null_offset : nullptr, - field->null_bit); - return err; } @@ -122,21 +109,20 @@ int Rdb_convert_to_record_value_decoder::decode(uchar *const buf, uint *offset, 0 OK other HA_ERR error code (can be SE-specific) */ -int Rdb_convert_to_record_value_decoder::decode_blob(TABLE *table, Field *field, - Rdb_string_reader *reader, - bool decode) { - my_core::Field_blob *blob = (my_core::Field_blob *)field; - +int Rdb_convert_to_record_value_decoder::decode_blob( + TABLE *table, uchar *const buf, Rdb_field_encoder *field_dec, + Rdb_string_reader *reader, bool decode) { // Get the number of bytes needed to store length - const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr; + const uint length_bytes = + field_dec->m_field_pack_length - portable_sizeof_char_ptr; const char *data_len_str; if (!(data_len_str = reader->read(length_bytes))) { return HA_ERR_ROCKSDB_CORRUPT_DATA; } - memcpy(blob->field_ptr(), data_len_str, length_bytes); - uint32 data_len = blob->get_length( + memcpy(buf, data_len_str, length_bytes); + uint32 data_len = Field_blob::get_length( reinterpret_cast(data_len_str), length_bytes); const char *blob_ptr; if (!(blob_ptr = reader->read(data_len))) { @@ -146,8 +132,8 @@ int Rdb_convert_to_record_value_decoder::decode_blob(TABLE *table, Field *field, if (decode) { // set 8-byte pointer to 0, like innodb does (relevant for 32-bit // platforms) - memset(blob->field_ptr() + length_bytes, 0, 8); - memcpy(blob->field_ptr() + length_bytes, &blob_ptr, sizeof(uchar **)); + memset(buf + length_bytes, 0, 8); + memcpy(buf + length_bytes, &blob_ptr, sizeof(uchar **)); } return HA_EXIT_SUCCESS; @@ -165,9 +151,9 @@ int Rdb_convert_to_record_value_decoder::decode_blob(TABLE *table, Field *field, other HA_ERR error code (can be SE-specific) */ int Rdb_convert_to_record_value_decoder::decode_fixed_length_field( - my_core::Field *const field, Rdb_field_encoder *field_dec, + uchar *const buf, Rdb_field_encoder *field_dec, Rdb_string_reader *const reader, bool decode) { - uint len = field_dec->m_pack_length_in_rec; + uint len = field_dec->m_field_pack_length; if (len > 0) { const char *data_bytes; if ((data_bytes = reader->read(len)) == nullptr) { @@ -175,7 +161,7 @@ int Rdb_convert_to_record_value_decoder::decode_fixed_length_field( } if (decode) { - memcpy(field->field_ptr(), data_bytes, len); + memcpy(buf, data_bytes, len); } } @@ -193,24 +179,23 @@ int Rdb_convert_to_record_value_decoder::decode_fixed_length_field( other HA_ERR error code (can be SE-specific) */ int Rdb_convert_to_record_value_decoder::decode_varchar( - Field *field, Rdb_string_reader *const reader, bool decode) { - my_core::Field_varstring *const field_var = (my_core::Field_varstring *)field; - + uchar *const buf, Rdb_field_encoder *field_dec, + Rdb_string_reader *const reader, bool decode) { const char *data_len_str; - if (!(data_len_str = reader->read(field_var->get_length_bytes()))) { + if (!(data_len_str = reader->read(field_dec->m_field_length_bytes))) { return HA_ERR_ROCKSDB_CORRUPT_DATA; } uint data_len; - // field_var->length_bytes is 1 or 2 - if (field_var->get_length_bytes() == 1) { + // field_dec->length_bytes is 1 or 2 + if (field_dec->m_field_length_bytes == 1) { data_len = (uchar)data_len_str[0]; } else { - DBUG_ASSERT(field_var->get_length_bytes() == 2); + DBUG_ASSERT(field_dec->m_field_length_bytes == 2); data_len = uint2korr(data_len_str); } - if (data_len > field_var->field_length) { + if (data_len > field_dec->m_field_length) { // The data on disk is longer than table DDL allows? return HA_ERR_ROCKSDB_CORRUPT_DATA; } @@ -220,8 +205,7 @@ int Rdb_convert_to_record_value_decoder::decode_varchar( } if (decode) { - memcpy(field_var->field_ptr(), data_len_str, - field_var->get_length_bytes() + data_len); + memcpy(buf, data_len_str, field_dec->m_field_length_bytes + data_len); } return HA_EXIT_SUCCESS; @@ -241,7 +225,6 @@ Rdb_value_field_iterator::Rdb_value_field_iterator( m_field_iter = fields->begin(); m_field_end = fields->end(); m_null_bytes = rdb_converter->get_null_bytes(); - m_offset = 0; } // Iterate each requested field and decode one by one @@ -262,11 +245,9 @@ int Rdb_value_field_iterator::next() { return HA_ERR_ROCKSDB_CORRUPT_DATA; } - m_field = m_table->field[m_field_dec->m_field_index]; // Decode each field - err = value_field_decoder::decode(m_buf, &m_offset, m_table, m_field, - m_field_dec, m_value_slice_reader, decode, - m_is_null); + err = value_field_decoder::decode(m_buf, m_table, m_field_dec, + m_value_slice_reader, decode, m_is_null); if (err != HA_EXIT_SUCCESS) { return err; } @@ -284,18 +265,6 @@ bool Rdb_value_field_iterator::end_of_fields() const { return m_field_iter == m_field_end; } -template -Field *Rdb_value_field_iterator::get_field() const { - DBUG_ASSERT(m_field != nullptr); - return m_field; -} - -template -void *Rdb_value_field_iterator::get_dst() const { - DBUG_ASSERT(m_buf != nullptr); - return m_buf + m_offset; -} - template int Rdb_value_field_iterator::get_field_index() const { DBUG_ASSERT(m_field_dec != nullptr); @@ -311,7 +280,6 @@ enum_field_types Rdb_value_field_iterator::get_field_type() template bool Rdb_value_field_iterator::is_null() const { - DBUG_ASSERT(m_field != nullptr); return m_is_null; } @@ -437,7 +405,7 @@ void Rdb_converter::setup_field_decoders(const MY_BITMAP *field_map, } else { // Fixed-width field can be skipped without looking at it. // Add appropriate skip_size to the next field. - skip_size += m_encoder_arr[i].m_pack_length_in_rec; + skip_size += m_encoder_arr[i].m_field_pack_length; } } } @@ -489,13 +457,41 @@ void Rdb_converter::setup_field_encoders() { } } - m_encoder_arr[i].m_field_type = field->real_type(); + /* + The difference between pack_length and pack_length_in_rec is fairly + subtle. The only difference is in Field_bit case where it borrows some + bits in null bytes in memory to store the 'uneven' high bits, therefore + the pack_length is the length of remaining bits while the + pack_length_in_rec is the full length of all bits when you store it on + disk. Only MyIsam and archive supports it, indicating by + HA_CAN_BIT_FIELD. We don't handle this case today at all (nor do we need + to), and we use pack_length everywhere, so just assert it and move on. + */ + DBUG_ASSERT(field->pack_length() == field->pack_length_in_rec()); + + auto field_type = field->real_type(); + m_encoder_arr[i].m_field_type = field_type; m_encoder_arr[i].m_field_index = i; - m_encoder_arr[i].m_pack_length_in_rec = field->pack_length_in_rec(); + m_encoder_arr[i].m_field_pack_length = field->pack_length(); + m_encoder_arr[i].m_field_offset = field->field_ptr() - m_table->record[0]; + m_encoder_arr[i].m_field_type = field_type; + m_encoder_arr[i].m_is_virtual_gcol = field->is_virtual_gcol(); + + if (field_type == MYSQL_TYPE_VARCHAR) { + auto varchar = reinterpret_cast(field); + m_encoder_arr[i].m_field_length = varchar->field_length; + m_encoder_arr[i].m_field_length_bytes = varchar->get_length_bytes(); + } else { + m_encoder_arr[i].m_field_length = -1; + m_encoder_arr[i].m_field_length_bytes = -1; + } - if (field->is_nullable()) { + auto maybe_null = field->is_nullable(); + if (maybe_null) { m_encoder_arr[i].m_null_mask = cur_null_mask; m_encoder_arr[i].m_null_offset = null_bytes_length; + m_encoder_arr[i].m_field_null_offset = field->null_offset(); + m_encoder_arr[i].m_field_null_mask = field->null_bit; if (cur_null_mask == 0x80) { cur_null_mask = 0x1; null_bytes_length++; @@ -503,6 +499,7 @@ void Rdb_converter::setup_field_encoders() { cur_null_mask = cur_null_mask << 1; } } else { + m_encoder_arr[i].m_null_offset = 0; m_encoder_arr[i].m_null_mask = 0; } } @@ -626,9 +623,9 @@ int Rdb_converter::convert_record_from_storage_format( err = pk_def->unpack_record(m_table, dst, key_slice, !unpack_slice.empty() ? &unpack_slice : nullptr, false /* verify_checksum */); - } - if (err != HA_EXIT_SUCCESS) { - return err; + if (err != HA_EXIT_SUCCESS) { + return err; + } } if (!decode_value || get_decode_fields()->size() == 0) { @@ -845,7 +842,7 @@ int Rdb_converter::encode_value_slice( field_var->get_length_bytes() + data_len); } else { /* Copy the field data */ - const uint len = field->pack_length_in_rec(); + const uint len = field->pack_length(); m_storage_record.append(reinterpret_cast(field->field_ptr()), len); } diff --git a/storage/rocksdb/rdb_converter.h b/storage/rocksdb/rdb_converter.h index c6d794b31d6d..d9905df9d54a 100644 --- a/storage/rocksdb/rdb_converter.h +++ b/storage/rocksdb/rdb_converter.h @@ -56,20 +56,21 @@ class Rdb_convert_to_record_value_decoder { Rdb_convert_to_record_value_decoder &operator=( const Rdb_convert_to_record_value_decoder &decoder) = delete; - static int decode(uchar *const buf, uint *offset, TABLE *table, - my_core::Field *field, Rdb_field_encoder *field_dec, - Rdb_string_reader *reader, bool decode, bool is_null); + static int decode(uchar *const buf, TABLE *table, + Rdb_field_encoder *field_dec, Rdb_string_reader *reader, + bool decode, bool is_null); private: - static int decode_blob(TABLE *table, Field *field, Rdb_string_reader *reader, - bool decode); - static int decode_fixed_length_field(Field *const field, + static int decode_blob(TABLE *table, uchar *const buf, + Rdb_field_encoder *field_dec, + Rdb_string_reader *reader, bool decode); + static int decode_fixed_length_field(uchar *const buf, Rdb_field_encoder *field_dec, Rdb_string_reader *const reader, bool decode); - static int decode_varchar(Field *const field, Rdb_string_reader *const reader, - bool decode); + static int decode_varchar(uchar *const buf, Rdb_field_encoder *field_dec, + Rdb_string_reader *const reader, bool decode); }; /** @@ -118,8 +119,6 @@ class Rdb_value_field_iterator { int get_field_index() const; // get current field type enum_field_types get_field_type() const; - // get current field - Field *get_field() const; }; /** diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc index 71abafdf1ab9..1a2a6a170223 100644 --- a/storage/rocksdb/rdb_datadic.cc +++ b/storage/rocksdb/rdb_datadic.cc @@ -2117,7 +2117,7 @@ int Rdb_key_def::unpack_record(TABLE *const table, uchar *const buf, has_covered_bitmap ? &covered_bitmap : nullptr, buf); while (iter.has_next()) { err = iter.next(); - if (err) { + if (unlikely(err)) { return err; } } diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h index 04a3833a3572..82d5f639d2e6 100644 --- a/storage/rocksdb/rdb_datadic.h +++ b/storage/rocksdb/rdb_datadic.h @@ -1251,13 +1251,20 @@ class Rdb_field_encoder { STORAGE_TYPE m_storage_type; uint m_null_offset; - uint16 m_field_index; - uchar m_null_mask; // 0 means the field cannot be null + /* + Cached field information + */ my_core::enum_field_types m_field_type; - - uint m_pack_length_in_rec; + uchar m_field_null_mask; + uint16 m_field_index; + uint m_field_pack_length; + uint m_field_length_bytes; + uint m_field_length; + ptrdiff_t m_field_null_offset; + ptrdiff_t m_field_offset; + bool m_is_virtual_gcol; bool maybe_null() const { return m_null_mask != 0; }