Skip to content

Commit

Permalink
Bug#33534218 : replication_sender_observe_commit_only=ON leads to bre…
Browse files Browse the repository at this point in the history
…ak semisync replication

Problem
-------

When replication_sender_observe_commit_only is enabled on a semisync source node
in GTID mode, its replica cannot start semisync replication after a restart
of mysqld.

Analysis / Root-cause analysis
------------------------------

Additional optimizations implemented in the Observe_transmission_guard
class, which turns off the m_observe_transmission flag in specific cases, turn
off execution of the following hooks: before_send_hook, after_send_hook and
reserve_header_hook. When GTIDs are used and the auto position protocol is
executed, additional heartbeat events are sent to skip transactions and update
log positions. However, reserve_header_hook is not executed and the magic number
is not properly encoded in the outgoing message. The replica receives a message
with an unknown magic number and reports an error.

Solution
--------

Observe_transmission_guard, together with before_send_hook and after_send_hook,
applies to outgoing messages with events. Therefore:
1. Observe_transmission_guard is created before a call to before_send_hook.
2. The before_send_hook is called before a call to send_packet function.
3. The after_send_hook is called just after execution of the send_packet
function.
This way, reserve_header_hook is always called when sending a heartbeat event,
and the magic number is properly encoded. Moreover, before_send_hook and
after_send_hook are never called when the event is in the exclude
group and the packet is not sent.

Signed-off-by: Karolina Szczepankiewicz <karolina.szczepankiewicz@oracle.com>
Change-Id: I86309b23447b9745b2f3862626dc8a129b923d41
  • Loading branch information
Karolina Szczepankiewicz committed Jul 28, 2022
1 parent d8c680c commit 650d2f7
Show file tree
Hide file tree
Showing 9 changed files with 200 additions and 8 deletions.
10 changes: 10 additions & 0 deletions mysql-test/include/spawn_monitoring_process.inc
Expand Up @@ -37,6 +37,16 @@ if (!$have_windows) {
OR VARIABLE_NAME LIKE 'enforce_gtid_consistency'
OR VARIABLE_NAME LIKE 'gtid_mode'
ORDER BY VARIABLE_NAME;

if(!$rpl_group_replication) {
INSERT INTO test.r_vars (var_name, var_value)
SELECT * FROM performance_schema.global_variables
WHERE VARIABLE_NAME LIKE 'relay_log'
OR VARIABLE_NAME LIKE 'rpl_semi_sync_replica_enabled'
OR VARIABLE_NAME LIKE 'rpl_semi_sync_source_enabled'
ORDER BY VARIABLE_NAME;
}

if($rpl_group_replication)
{
--let $gr_ssl_disabled= `SELECT @@group_replication_ssl_mode LIKE 'DISABLED'`
Expand Down
Expand Up @@ -31,6 +31,7 @@ SET GLOBAL replication_sender_observe_commit_only = 1;
SHOW VARIABLES LIKE 'rpl_semi_sync_source_enabled';
Variable_name Value
CALL mtr.add_suppression("Semi-sync master failed on net_flush.*");
CALL mtr.add_suppression(".* Timeout waiting for reply of binlog .*");
include/install_semisync_source.inc
[connection server_3]
SET GLOBAL replication_optimize_for_static_plugin_config = 1;
Expand Down
Expand Up @@ -67,6 +67,7 @@ while ($i <= $servers)
{
SHOW VARIABLES LIKE 'rpl_semi_sync_source_enabled';
CALL mtr.add_suppression("Semi-sync master failed on net_flush.*");
CALL mtr.add_suppression(".* Timeout waiting for reply of binlog .*");
--source include/install_semisync_source.inc
}
if ($i != 2)
Expand Down
@@ -0,0 +1,46 @@
# 1. Set-up semi sync replication environment
# a) Setup a 2 layered topology with the given topology 1->2
include/rpl_init.inc [topology=1->2]
Warnings:
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
[connection server_1]
SET GLOBAL replication_sender_observe_commit_only = 1;
SET GLOBAL replication_optimize_for_static_plugin_config = 1;
CALL mtr.add_suppression("Semi-sync master failed on net_flush.*");
CALL mtr.add_suppression("Read semi-sync reply magic number error.*");
CALL mtr.add_suppression("A message intended for a client cannot be sent there as no client-session is attached.*");
CALL mtr.add_suppression("Timeout waiting for reply of binlog.*");
include/install_semisync_source.inc
[connection server_2]
include/install_semisync_replica.inc
CALL mtr.add_suppression("The slave coordinator and worker threads are stopped.*");
# b) Ensure that replication_sender_observe_commit_only is turned on
[connection server_1]
# replication_sender_observe_commit_only: 1
include/assert.inc [replication_sender_observe_commit_only should be ON (source)]
# 2. Prepare a testing environment and execute statements in the background
CREATE TABLE t (a int);
# 3. Issue the replica restart while the source is running statements
[connection server_2]
include/rpl_restart_server.inc [server_number=2 gtids=on parameters: --skip_slave_start=0 --rpl_semi_sync_replica_enabled=1]
# 4. Sanity check - check whether semi-sync replication was correctly restarted
include/check_slave_is_running.inc
include/check_slave_no_error.inc
include/assert.inc [Semi sync should be enabled on the replica]
# 5. Wait for source to finish the execution of statements
[connection server_1]
# 6. Wait for replica to catch-up with latest changes
[connection server_2]
include/rpl_sync.inc
# 7. Check whether the replica replicated table correctly
include/diff_tables.inc [server_1:t, server_2:t]
# 8. Clean up
[connection server_2]
include/uninstall_semisync_replica.inc
[connection server_1]
SET GLOBAL replication_sender_observe_commit_only = OFF;
SET GLOBAL replication_optimize_for_static_plugin_config = OFF;
include/uninstall_semisync_source.inc
DROP TABLE t;
include/rpl_end.inc
@@ -0,0 +1 @@
$SEMISYNC_PLUGIN_OPT
@@ -0,0 +1 @@
$SEMISYNC_PLUGIN_OPT
@@ -0,0 +1,9 @@
!include ../my.cnf

[mysqld.1]
log-replica-updates
server_id=1

[mysqld.2]
log-replica-updates
server_id=2
123 changes: 123 additions & 0 deletions mysql-test/suite/rpl_gtid/t/rpl_semi_sync_observe_commit_only.test
@@ -0,0 +1,123 @@
# ==== Purpose ====
#
# The purpose of this test is to test semi-synchronous replication in case the
# replication_sender_observe_commit_only=ON on the source
# and replication is running in the GTID mode.
#
# ==== Implementation ====
#
# 1) Set-up semi sync replication environment and ensure that
# replication_sender_observe_commit_only=ON on the source
# 2) Prepare a testing environment and execute statements in the background
# 3) Issue replica restart while source is running statements
# 4) Sanity check - check whether semi-sync replication was correctly restarted
# 5) Wait for source to finish the execution of statements
# 6) Wait for replica to catch-up with latest changes
# 7) Check whether the replica replicated table correctly
# 8) Clean up
#
# ==== Requirements ====
#
# The replica should be able to restart its connection while semi-synchronous
# replication is set-up in the GTID mode, and the
# replication_sender_observe_commit_only=ON on the source.
#
# ==== References ====
#
# Bug#33534218: replication_sender_observe_commit_only=ON leads to break
# semisync replication

--source include/have_binlog_format_row.inc

--echo # 1. Set-up semi sync replication environment

--echo # a) Setup a 2 layered topology with the given topology 1->2

--let $rpl_topology = 1->2
--source include/rpl_init.inc

--let $rpl_connection_name= server_1
--source include/rpl_connection.inc
# Enable the sender options under test on the source before the semisync
# source plugin is installed below.
--eval SET GLOBAL replication_sender_observe_commit_only = 1
--eval SET GLOBAL replication_optimize_for_static_plugin_config = 1
# Register the error-log patterns this test tolerates on the source.
CALL mtr.add_suppression("Semi-sync master failed on net_flush.*");
CALL mtr.add_suppression("Read semi-sync reply magic number error.*");
CALL mtr.add_suppression("A message intended for a client cannot be sent there as no client-session is attached.*");
CALL mtr.add_suppression("Timeout waiting for reply of binlog.*");
--source include/install_semisync_source.inc

--let $rpl_connection_name= server_2
--source include/rpl_connection.inc
--source include/install_semisync_replica.inc
# Register the error-log pattern this test tolerates on the replica.
CALL mtr.add_suppression("The slave coordinator and worker threads are stopped.*");

--echo # b) Ensure that replication_sender_observe_commit_only is turned on

--let $rpl_connection_name= server_1
--source include/rpl_connection.inc
--let $current_observe_commit_only= `SELECT @@GLOBAL.replication_sender_observe_commit_only`
--echo # replication_sender_observe_commit_only: $current_observe_commit_only

--let $assert_text= replication_sender_observe_commit_only should be ON (source)
--let $assert_variable_name= replication_sender_observe_commit_only
--let $assert_variable_value= 1
--source include/assert_variable.inc

--echo # 2. Prepare a testing environment and execute statements in the background

CREATE TABLE t (a int);
# $iters is reused in step 5 as the expected number of rows in t.
--let $iters = 500
# The inserts are issued in the background so that the replica restart in
# step 3 happens while the source is still executing statements.
--exec_in_background $MYSQL_SLAP --create-schema=test --delimiter=";" --iterations=$iters --query="INSERT INTO t VALUES (1)" --concurrency=1 --silent 2>&1

--echo # 3. Issue the replica restart while the source is running statements

--let $rpl_connection_name= server_2
--source include/rpl_connection.inc

# Restart server 2 with skip_slave_start=0 and the semisync replica option
# enabled, so replication is expected to come back without a manual start
# (checked in step 4).
--let $rpl_server_parameters= --skip_slave_start=0 --rpl_semi_sync_replica_enabled=1
--let $rpl_server_number=2
--let $rpl_start_with_gtids= 1
--source include/rpl_restart_server.inc

--echo # 4. Sanity check - check whether semi-sync replication was correctly restarted

--source include/check_slave_is_running.inc
--source include/check_slave_no_error.inc
--let $assert_text= Semi sync should be enabled on the replica
--let $assert_variable_name= rpl_semi_sync_replica_enabled
--let $assert_variable_value= 1
--source include/assert_variable.inc

--echo # 5. Wait for source to finish the execution of statements

--let $rpl_connection_name= server_1
--source include/rpl_connection.inc
# The background load is considered finished once t holds exactly $iters rows;
# the wait is bounded by $wait_timeout.
--let $wait_condition=SELECT COUNT(*)=$iters FROM t
--let $wait_timeout= 120
--source include/wait_condition.inc

--echo # 6. Wait for replica to catch-up with latest changes

--let $rpl_connection_name= server_2
--source include/rpl_connection.inc
# NOTE(review): presumably a raised sync timeout so the replica can apply the
# backlog built up during its restart - confirm against include/rpl_sync.inc.
--let $slave_timeout=600
--source include/rpl_sync.inc

--echo # 7. Check whether the replica replicated table correctly

--let $diff_tables = server_1:t, server_2:t
--source include/diff_tables.inc

--echo # 8. Clean up

--let $rpl_connection_name= server_2
--source include/rpl_connection.inc
--source include/uninstall_semisync_replica.inc

--let $rpl_connection_name= server_1
--source include/rpl_connection.inc
# Turn off the two source settings that were enabled in step 1.
--eval SET GLOBAL replication_sender_observe_commit_only = OFF
--eval SET GLOBAL replication_optimize_for_static_plugin_config = OFF
--source include/uninstall_semisync_source.inc
DROP TABLE t;
--source include/rpl_end.inc
16 changes: 8 additions & 8 deletions sql/rpl_binlog_sender.cc
Expand Up @@ -608,14 +608,9 @@ int Binlog_sender::send_events(File_reader &reader, my_off_t end_pos) {
});

Sender_context_guard ctx_guard(*this, event_type);
Observe_transmission_guard obs_guard(
m_observe_transmission, event_type,
const_cast<const char *>(reinterpret_cast<char *>(event_ptr)),
m_event_checksum_alg, m_prev_event_type);

log_pos = reader.position();

if (before_send_hook(log_file, log_pos)) return 1;
/*
TODO: Set m_exclude_gtid to NULL if all gtids in m_exclude_gtid has
be skipped. and maybe removing the gtid from m_exclude_gtid will make
Expand Down Expand Up @@ -661,11 +656,16 @@ int Binlog_sender::send_events(File_reader &reader, my_off_t end_pos) {
m_packet.length(tmp.length());
}

Observe_transmission_guard obs_guard(
m_observe_transmission, event_type,
const_cast<const char *>(reinterpret_cast<char *>(event_ptr)),
m_event_checksum_alg, m_prev_event_type);

if (before_send_hook(log_file, log_pos)) return 1;
if (unlikely(send_packet())) return 1;
if (unlikely(after_send_hook(log_file, in_exclude_group ? log_pos : 0)))
return 1;
}

if (unlikely(after_send_hook(log_file, in_exclude_group ? log_pos : 0)))
return 1;
}

/*
Expand Down

0 comments on commit 650d2f7

Please sign in to comment.