Permalink
Browse files

BUG#18731252 SLAVES WITH SAME SERVER_ID / SERVER_UUID COMPETE FOR

             MASTER CONNECTION

When two slaves with the same UUID are configured to replicate from a
single master, the I/O threads of the slaves keep reconnecting and
generating new relay log files with almost no new content.

Analysis:

The master server does not allow two slave connections with the same UUID.

Once a new connection is made from a slave with a given UUID, the
master disconnects the older connection, keeping only one slave
connection per slave UUID.

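On the other hand, because the master doesn't report any error to
the slave, the I/O thread of the disconnected slave will try to
reconnect, generating a new relay log file. That reconnection in turn
makes the master drop the other slave's connection, so the two slaves
keep displacing each other.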

Fix:

The master now sends an error to the slave whose connection it drops
when it detects a duplicate UUID. On receiving this error from the
master, that slave's I/O thread stops and no longer tries to reconnect.
  • Loading branch information...
Joao Gramacho
Joao Gramacho committed Mar 3, 2015
1 parent 70fd349 commit 751a3da76dfd66b92395f90f11fce6bd890c9db5
@@ -9,11 +9,12 @@ CALL mtr.add_suppression(".*master and slave have equal MySQL server UUIDs.*");
CALL mtr.add_suppression("Master's UUID has changed, although this should not happen unless you have changed it manually");
CALL mtr.add_suppression("Slave I/O: SET @master_heartbeat_period to master failed with error: Lost connection to MySQL server during query");
CALL mtr.add_suppression("Notifying master by SET @master_binlog_checksum= @@global.binlog_checksum failed with error");
CALL mtr.add_suppression("Slave I/O: Got fatal error .* slave with the same server_uuid as this slave has connected to the master");
include/sync_slave_sql_with_master.inc
# Case 1:
# Master's UUID appears in the result of 'SHOW SLAVE STATUS'.
# Slave's UUID appears in the resule of 'SHOW SLAVE HOSTS'.
# Slave's UUID appears in the result of 'SHOW SLAVE HOSTS'.
-----------------------------------------------------------------------------
SHOW SLAVE HOSTS;
Server_id Host Port Master_id Slave_UUID
@@ -35,7 +36,7 @@ include/stop_slave.inc
include/check_slave_param.inc [Slave_IO_Running]
# Case 3:
# Slave generates an errror and aborts, if master's UUID is
# Slave generates an error and aborts, if master's UUID is
# equal to slave's UUID unless --replicate-same-server-id
# option is set.
-----------------------------------------------------------------------------
@@ -121,21 +122,19 @@ include/start_slave.inc
# Case 6:
# In an existing master-slave replication forum (M->S1), if another
# slave (S2) with the same UUID as S1 joins the forum and connects
# to Master(M), then there will be ping-pong reconnections happens
# between M->S2, M->S1, M->S2, M->S1,.... And both slave1 and slave2
# will be informed about this UUID misconfiguration in their error
# log file. And Master log will be populated with information that
# it found a zombie dump thread and it is killing it(only when
# log_warnings is greater than 1)
# to Master(M), the master will throw an error to the first slave
# connection that will not try to reconnect.
-----------------------------------------------------------------------------
include/rpl_restart_server.inc [server_number=3]
CREATE TABLE t1(i int);
DROP TABLE t1;
include/sync_slave_sql_with_master.inc
include/sync_slave_sql_with_master.inc
Pattern "found a zombie dump thread with the same UUID" found
Pattern "master receives a binlog send request from a duplicate server UUID" found
Pattern "master receives a binlog send request from a duplicate server UUID" found
[connection server_2]
include/wait_for_slave_io_error.inc [errno=1236]
include/assert_grep.inc [Found the expected line in master's error log for server 2 disconnection]
include/start_slave_io.inc
[connection server_3]
include/wait_for_slave_io_error.inc [errno=1236]
include/assert_grep.inc [Found the expected line in master's error log for server 3 disconnection]
include/assert_grep.inc [Found the expected line in server 2 error log]
include/assert_grep.inc [Found the expected line in server 3 error log]
include/rpl_restart_server.inc [server_number=3]
include/stop_slave.inc
include/rpl_end.inc
@@ -2,15 +2,11 @@
[mysqld.1]
log-warnings=2
log-error=../../tmp/rpl_server_uuid_test.mysqld.1.err
[mysqld.2]
log-error=../../tmp/rpl_server_uuid_test.mysqld.2.err
[mysqld.3]
replicate-same-server-id=1
gtid-mode=off
log-error=../../tmp/rpl_server_uuid_test.mysqld.3.err
[ENV]
SERVER_MYPORT_3= @mysqld.3.port
@@ -8,19 +8,21 @@
# --replicate-same-server-id is set.
#
##############################################################################
--let $rpl_server_count= 3
--source include/not_gtid_enabled.inc
--source include/master-slave.inc
--source include/not_embedded.inc
# This test case is binlog_format agnostic
--source include/have_binlog_format_mixed.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--let $rpl_server_count= 3
--source include/master-slave.inc
call mtr.add_suppression("Slave I/O thread .* register on master");
call mtr.add_suppression("Slave I/O: Master command COM_REGISTER_SLAVE failed: .*");
CALL mtr.add_suppression(".*master and slave have equal MySQL server UUIDs.*");
CALL mtr.add_suppression("Master's UUID has changed, although this should not happen unless you have changed it manually");
CALL mtr.add_suppression("Slave I/O: SET @master_heartbeat_period to master failed with error: Lost connection to MySQL server during query");
CALL mtr.add_suppression("Notifying master by SET @master_binlog_checksum= @@global.binlog_checksum failed with error");
CALL mtr.add_suppression("Slave I/O: Got fatal error .* slave with the same server_uuid as this slave has connected to the master");
--let $uuid_file= auto.cnf
@@ -39,7 +41,7 @@ connection master;
--echo
--echo # Case 1:
--echo # Master's UUID appears in the result of 'SHOW SLAVE STATUS'.
--echo # Slave's UUID appears in the resule of 'SHOW SLAVE HOSTS'.
--echo # Slave's UUID appears in the result of 'SHOW SLAVE HOSTS'.
--echo -----------------------------------------------------------------------------
connection master;
--let $master_uuid_on_master= query_get_value(SELECT @@SERVER_UUID, @@SERVER_UUID, 1)
@@ -79,7 +81,7 @@ source include/check_slave_param.inc;
--echo
--echo # Case 3:
--echo # Slave generates an errror and aborts, if master's UUID is
--echo # Slave generates an error and aborts, if master's UUID is
--echo # equal to slave's UUID unless --replicate-same-server-id
--echo # option is set.
--echo -----------------------------------------------------------------------------
@@ -241,12 +243,8 @@ eval CHANGE MASTER TO
--echo # Case 6:
--echo # In an existing master-slave replication forum (M->S1), if another
--echo # slave (S2) with the same UUID as S1 joins the forum and connects
--echo # to Master(M), then there will be ping-pong reconnections happens
--echo # between M->S2, M->S1, M->S2, M->S1,.... And both slave1 and slave2
--echo # will be informed about this UUID misconfiguration in their error
--echo # log file. And Master log will be populated with information that
--echo # it found a zombie dump thread and it is killing it(only when
--echo # log_warnings is greater than 1)
--echo # to Master(M), the master will throw an error to the first slave
--echo # connection that will not try to reconnect.
--echo -----------------------------------------------------------------------------
# Step-1: Copy server 2(S1)'s auto.cnf into server 3 (S2)'s data directory.
@@ -258,38 +256,50 @@ eval CHANGE MASTER TO
--remove_file $datadir3/auto.cnf
--copy_file $datadir2/auto.cnf $datadir3/auto.cnf
# Step-2: Restart the server(two servers with the same UUID)
# Step-2: Restart the server 3 (two servers with the same UUID)
--append_file $MYSQLTEST_VARDIR/log/mysqld.1.err
CASE 6 STEP 2
EOF
--let $rpl_server_number= 3
--source include/rpl_restart_server.inc
# Notice that the other slave has stopped with an error
--let $rpl_connection_name= server_2
--source include/rpl_connection.inc
--let $slave_io_errno= convert_error(ER_MASTER_FATAL_ERROR_READING_BINLOG)
--source include/wait_for_slave_io_error.inc
# Step-3: Just do some command on server_1
--connection server_1
CREATE TABLE t1(i int);
DROP TABLE t1;
# Step-4: Sync the data on both the slaves and observe that
# sync will be success at both slaves.
--let $sync_slave_connection= server_2
--source include/sync_slave_sql_with_master.inc
--connection server_1
--let $sync_slave_connection= server_3
--source include/sync_slave_sql_with_master.inc
# Step-5: Observe that misconfiguration messages are in their respective
# error log files.
--let $assert_file=$MYSQLTEST_VARDIR/log/mysqld.1.err
--let $assert_only_after=CASE 6 STEP 2
--let $assert_count= 1
--let $assert_select=found a zombie dump thread with the same UUID
--let $assert_text= Found the expected line in master's error log for server 2 disconnection
--source include/assert_grep.inc
# Step-3: Connect server 2 and notice that the slave server 3 will error
--append_file $MYSQLTEST_VARDIR/log/mysqld.1.err
CASE 6 STEP 3
EOF
--source include/start_slave_io.inc
# Notice that the other slave has stopped with an error
--let $rpl_connection_name= server_3
--source include/rpl_connection.inc
--let $slave_io_errno= convert_error(ER_MASTER_FATAL_ERROR_READING_BINLOG)
--source include/wait_for_slave_io_error.inc
--let SEARCH_FILE=$MYSQLTEST_VARDIR/tmp/rpl_server_uuid_test.mysqld.1.err
--let SEARCH_PATTERN=found a zombie dump thread with the same UUID
--source include/search_pattern.inc
--let $assert_only_after=CASE 6 STEP 3
--let $assert_text= Found the expected line in master's error log for server 3 disconnection
--source include/assert_grep.inc
--let SEARCH_FILE=$MYSQLTEST_VARDIR/tmp/rpl_server_uuid_test.mysqld.2.err
--let SEARCH_PATTERN=master receives a binlog send request from a duplicate server UUID
--source include/search_pattern.inc
# Step-4: Check for error messages on slaves
--let $assert_file=$MYSQLTEST_VARDIR/log/mysqld.2.err
--let $assert_only_after=CURRENT_TEST: rpl.rpl_server_uuid
--let $assert_select= Slave .* Got fatal error .* from master .* slave with the same server_uuid as this slave
--let $assert_text= Found the expected line in server 2 error log
--source include/assert_grep.inc
--let SEARCH_FILE=$MYSQLTEST_VARDIR/tmp/rpl_server_uuid_test.mysqld.3.err
--let SEARCH_PATTERN=master receives a binlog send request from a duplicate server UUID
--source include/search_pattern.inc
--let $assert_file=$MYSQLTEST_VARDIR/log/mysqld.3.err
--let $assert_text= Found the expected line in server 3 error log
--source include/assert_grep.inc
# Cleanup (restore the server 3's auto.cnf back to proper one and restart)
--remove_file $datadir3/auto.cnf
View
@@ -888,6 +888,7 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
Diagnostics_area temp_da;
Diagnostics_area *saved_da= thd->get_stmt_da();
thd->set_stmt_da(&temp_da);
bool was_killed_by_duplicate_slave_uuid= false;
DBUG_ENTER("mysql_binlog_send");
DBUG_PRINT("enter",("log_ident: '%s' pos: %ld", log_ident, (long) pos));
@@ -1945,6 +1946,21 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
}
end:
/*
If the dump thread was killed because of a duplicate slave UUID we
will fail throwing an error to the slave so it will not try to
reconnect anymore.
*/
mysql_mutex_lock(&thd->LOCK_thd_data);
was_killed_by_duplicate_slave_uuid= thd->duplicate_slave_uuid;
mysql_mutex_unlock(&thd->LOCK_thd_data);
if (was_killed_by_duplicate_slave_uuid)
{
errmsg= "A slave with the same server_uuid as this slave "
"has connected to the master";
my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG;
goto err;
}
thd->set_stmt_da(saved_da);
end_io_cache(&log);
mysql_file_close(file, MYF(MY_WME));
@@ -2087,6 +2103,7 @@ void kill_zombie_dump_threads(String *slave_uuid)
"UUID <%s>, found a zombie dump thread with "
"the same UUID. Master is killing the zombie dump "
"thread(%lu).", slave_uuid->c_ptr(), tmp->thread_id);
tmp->duplicate_slave_uuid= true;
tmp->awake(THD::KILL_QUERY);
mysql_mutex_unlock(&tmp->LOCK_thd_data);
}
View
@@ -920,7 +920,8 @@ THD::THD(bool enable_plugins)
m_enable_plugins(enable_plugins),
owned_gtid_set(global_sid_map),
main_da(0, false),
m_stmt_da(&main_da)
m_stmt_da(&main_da),
duplicate_slave_uuid(false)
{
ulong tmp;
View
@@ -4141,6 +4141,14 @@ class THD :public MDL_context_owner,
*/
LEX_STRING invoker_user;
LEX_STRING invoker_host;
public:
/**
This is only used by master dump threads.
When the master receives a new connection from a slave with a UUID that
is already connected, it will set this flag TRUE before killing the old
slave connection.
*/
bool duplicate_slave_uuid;
};

0 comments on commit 751a3da

Please sign in to comment.