Skip to content

Commit

Permalink
Bug#17280176 TRANSACTIONS SKIPPED ON SLAVE AFTER
Browse files Browse the repository at this point in the history
"STOP/START SLAVE" USING GTID REPLICATION

Analysis: Slave updates 'GTID_RETRIEVED' set upon receiving
GTID_LOG_EVENT for a particular transaction which is first
event in the event group. Say, I/O thread is stopped *after*
adding GTID number to 'gtid_trieved' set and *before* it
actually retrieves all the events from that GTID event
group. Next time when this I/O thread is reconnected,
it sends union of GTID_RETRIEVED + GTID_EXECUTED
set to master. So Master thinks that slave is having all the
events from this GTID set(which includes partially
retrieved GTID) and it will not resend them again.
Hence slave is missing some events for ever.

Fix: It is not easy to find the end of a group of events.
So mysql server is unsure whether I/O thread retrieved the
last gtid transaction events completely or not
(before it is going down because of a crash/normal
shutdown/normal stop slave io_thread). It is possible that
I/O thread would have retrieved and written only partial
transaction events. So Server will request Master to send
the last gtid event once again. We do this by removing
the last I/O thread retrieved gtid event from
"Retrieved_gtid_set".
Possible cases:
1) I/O thread would have retrieved full
transaction already in the first time itself, but retrieving
them again will not cause problem because GTID number is same,
Hence SQL thread will not commit it again.
2) I/O thread would have retrieved full transaction already and
SQL thread would have already executed it. In that case,
We are not going remove last retrieved gtid from
"Retrieved_gtid_set" otherwise we will see gaps in "Retrieved set".
  • Loading branch information
Venkatesh Duggirala committed Oct 30, 2013
1 parent 765b265 commit e7a4cbe
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 17 deletions.
2 changes: 1 addition & 1 deletion mysql-test/include/sync_slave_io.inc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

if ($use_gtids)
{
--let $slave_param= Retrieved_Gtid_set
--let $slave_param= Retrieved_Gtid_Set
--let $slave_param_value= $_saved_gtids
--source include/wait_for_slave_param.inc
}
Expand Down
43 changes: 30 additions & 13 deletions sql/binlog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2538,6 +2538,8 @@ bool MYSQL_BIN_LOG::open_index_file(const char *index_file_name_arg,
this object.
@param prev_gtids If not NULL, then the GTIDs from the
Previous_gtids_log_events are stored in this object.
@param last_gtid If not NULL, then the last GTID information from the
file will be stored in this object.
@param verify_checksum Set to true to verify event checksums.
@retval GOT_GTIDS The file was successfully read and it contains
Expand All @@ -2556,7 +2558,8 @@ enum enum_read_gtids_from_binlog_status
{ GOT_GTIDS, GOT_PREVIOUS_GTIDS, NO_GTIDS, ERROR, TRUNCATED };
static enum_read_gtids_from_binlog_status
read_gtids_from_binlog(const char *filename, Gtid_set *all_gtids,
Gtid_set *prev_gtids, bool verify_checksum)
Gtid_set *prev_gtids, Gtid *last_gtid,
bool verify_checksum)
{
DBUG_ENTER("read_gtids_from_binlog");
DBUG_PRINT("info", ("Opening file %s", filename));
Expand Down Expand Up @@ -2664,24 +2667,34 @@ read_gtids_from_binlog(const char *filename, Gtid_set *all_gtids,
at least one Gtid_log_event, so that we can distinguish the
return values GOT_GTID and GOT_PREVIOUS_GTIDS. We don't need
to read anything else from the binary log.
But if last_gtid is requested (i.e., NOT NULL), we should continue to
read all gtids. Otherwise, we are done.
*/
if (all_gtids == NULL)
if (all_gtids == NULL && last_gtid == NULL)
{
ret= GOT_GTIDS, done= true;
}
else
{
Gtid_log_event *gtid_ev= (Gtid_log_event *)ev;
rpl_sidno sidno= gtid_ev->get_sidno(false/*false=don't need lock*/);
if (sidno < 0)
ret= ERROR, done= true;
else
{
if (all_gtids->ensure_sidno(sidno) != RETURN_STATUS_OK)
ret= ERROR, done= true;
else if (all_gtids->_add_gtid(sidno, gtid_ev->get_gno()) !=
RETURN_STATUS_OK)
ret= ERROR, done= true;
if (all_gtids)
{
if (all_gtids->ensure_sidno(sidno) != RETURN_STATUS_OK)
ret= ERROR, done= true;
else if (all_gtids->_add_gtid(sidno, gtid_ev->get_gno()) !=
RETURN_STATUS_OK)
ret= ERROR, done= true;
DBUG_PRINT("info", ("Got Gtid from file '%s': Gtid(%d, %lld).",
filename, sidno, gtid_ev->get_gno()));
}
if (last_gtid)
last_gtid->set(sidno, gtid_ev->get_gno());
}
DBUG_PRINT("info", ("Got Gtid from file '%s': Gtid(%d, %lld).",
filename, sidno, gtid_ev->get_gno()));
}
break;
}
Expand Down Expand Up @@ -2778,7 +2791,7 @@ bool MYSQL_BIN_LOG::find_first_log_not_in_gtid_set(char *binlog_file_name,
const char *filename= rit->c_str();
DBUG_PRINT("info", ("Read Previous_gtids_log_event from filename='%s'",
filename));
switch (read_gtids_from_binlog(filename, NULL, &previous_gtid_set,
switch (read_gtids_from_binlog(filename, NULL, &previous_gtid_set, NULL,
opt_master_verify_checksum))
{
case ERROR:
Expand Down Expand Up @@ -2829,6 +2842,7 @@ bool MYSQL_BIN_LOG::find_first_log_not_in_gtid_set(char *binlog_file_name,
}

bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
Gtid *last_gtid,
bool verify_checksum, bool need_lock)
{
DBUG_ENTER("MYSQL_BIN_LOG::init_gtid_sets");
Expand Down Expand Up @@ -2888,15 +2902,17 @@ bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
reached_first_file= (rit == filename_list.rend());
DBUG_PRINT("info", ("filename='%s' reached_first_file=%d",
rit->c_str(), reached_first_file));
while (!got_gtids && !reached_first_file)
while ((!got_gtids || (last_gtid && last_gtid->empty()))
&& !reached_first_file)
{
const char *filename= rit->c_str();
rit++;
reached_first_file= (rit == filename_list.rend());
DBUG_PRINT("info", ("filename='%s' got_gtids=%d reached_first_file=%d",
filename, got_gtids, reached_first_file));
switch (read_gtids_from_binlog(filename, all_gtids,
switch (read_gtids_from_binlog(filename, got_gtids ? NULL : all_gtids,
reached_first_file ? lost_gtids : NULL,
last_gtid,
verify_checksum))
{
case ERROR:
Expand All @@ -2919,7 +2935,7 @@ bool MYSQL_BIN_LOG::init_gtid_sets(Gtid_set *all_gtids, Gtid_set *lost_gtids,
{
const char *filename= it->c_str();
DBUG_PRINT("info", ("filename='%s'", filename));
switch (read_gtids_from_binlog(filename, NULL, lost_gtids,
switch (read_gtids_from_binlog(filename, NULL, lost_gtids, NULL,
verify_checksum))
{
case ERROR:
Expand Down Expand Up @@ -4130,6 +4146,7 @@ int MYSQL_BIN_LOG::purge_logs(const char *to_log,
global_sid_lock->wrlock();
error= init_gtid_sets(NULL,
const_cast<Gtid_set *>(gtid_state->get_lost_gtids()),
NULL,
opt_master_verify_checksum,
false/*false=don't need lock*/);
global_sid_lock->unlock();
Expand Down
5 changes: 4 additions & 1 deletion sql/binlog.h
Original file line number Diff line number Diff line change
Expand Up @@ -482,14 +482,17 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
@param lost_groups Will be filled with all GTIDs in the
Previous_gtids_log_event of the first binary log that has a
Previous_gtids_log_event.
@param last_gtid Will be filled with the last availble GTID information
in the binary/relay log files.
@param verify_checksum If true, checksums will be checked.
@param need_lock If true, LOCK_log, LOCK_index, and
global_sid_lock->wrlock are acquired; otherwise they are asserted
to be taken already.
@return false on success, true on error.
*/
bool init_gtid_sets(Gtid_set *gtid_set, Gtid_set *lost_groups,
bool verify_checksum, bool need_lock);
Gtid *last_gtid, bool verify_checksum,
bool need_lock);

void set_previous_gtid_set(Gtid_set *previous_gtid_set_param)
{
Expand Down
1 change: 1 addition & 0 deletions sql/mysqld.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5461,6 +5461,7 @@ int mysqld_main(int argc, char **argv)
if (mysql_bin_log.init_gtid_sets(
const_cast<Gtid_set *>(gtid_state->get_logged_gtids()),
const_cast<Gtid_set *>(gtid_state->get_lost_gtids()),
NULL,
opt_master_verify_checksum,
true/*true=need lock*/))
unireg_abort(1);
Expand Down
14 changes: 14 additions & 0 deletions sql/rpl_gtid.h
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,10 @@ struct Gtid

/// Set both components to 0.
void clear() { sidno= 0; gno= 0; }
// Set both components to input values.
void set(rpl_sidno sno, rpl_gno gtidno) { sidno= sno; gno= gtidno; }
// check if both components are zero or not.
bool empty() const { return (sidno == 0) && (gno == 0); }
/**
The maximal length of the textual representation of a SID, not
including the terminating '\0'.
Expand Down Expand Up @@ -929,6 +933,16 @@ class Gtid_set
*/
enum_return_status _add_gtid(const Gtid &gtid)
{ return _add_gtid(gtid.sidno, gtid.gno); }
/**
Removes the given GTID from this Gtid_set.
@param gtid Gtid to remove.
@return RETURN_STATUS_OK or RETURN_STATUS_REPORTED_ERROR.
*/
enum_return_status _remove_gtid(const Gtid &gtid)
{
return _remove_gtid(gtid.sidno, gtid.gno);
}
/**
Adds all groups from the given Gtid_set to this Gtid_set.
Expand Down
11 changes: 10 additions & 1 deletion sql/rpl_rli.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery

relay_log.init_pthread_objects();
do_server_version_split(::server_version, slave_version_split);

last_retrieved_gtid.clear();
DBUG_VOID_RETURN;
}

Expand Down Expand Up @@ -176,6 +176,7 @@ Relay_log_info::~Relay_log_info()
my_atomic_rwlock_destroy(&slave_open_temp_tables_lock);
relay_log.cleanup();
set_rli_description_event(NULL);
last_retrieved_gtid.clear();

DBUG_VOID_RETURN;
}
Expand Down Expand Up @@ -1760,8 +1761,16 @@ a file name for --relay-log-index option.", opt_relaylog_index_name);
gtid_set.dbug_print("set of GTIDs in relay log before initialization");
global_sid_lock->unlock();
#endif
/*
Below init_gtid_sets() function will parse the available relay logs and
set I/O retrieved gtid event in gtid_state object. We dont need to find
last_retrieved_gtid_event if relay_log_recovery=1 (retrieved set will
be cleared off in that case).
*/
Gtid *last_retrieved_gtid= is_relay_log_recovery ? NULL : get_last_retrieved_gtid();
if (!current_thd &&
relay_log.init_gtid_sets(&gtid_set, NULL,
last_retrieved_gtid,
opt_slave_sql_verify_checksum,
true/*true=need lock*/))
{
Expand Down
4 changes: 4 additions & 0 deletions sql/rpl_rli.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,12 @@ class Relay_log_info : public Rpl_info

private:
Gtid_set gtid_set;
/* Last gtid retrieved by IO thread */
Gtid last_retrieved_gtid;

public:
Gtid *get_last_retrieved_gtid() { return &last_retrieved_gtid; }
void set_last_retrieved_gtid(Gtid gtid) { last_retrieved_gtid= gtid; }
int add_logged_gtid(rpl_sidno sidno, rpl_gno gno)
{
int ret= 0;
Expand Down
54 changes: 53 additions & 1 deletion sql/rpl_slave.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3087,6 +3087,45 @@ static int request_dump(THD *thd, MYSQL* mysql, Master_info* mi,
Gtid_set gtid_executed(&sid_map);
global_sid_lock->wrlock();
gtid_state->dbug_print();

/*
We are unsure whether I/O thread retrieved the last gtid transaction
completely or not (before it is going down because of a crash/normal
shutdown/normal stop slave io_thread). It is possible that I/O thread
would have retrieved and written only partial transaction events. So We
request Master to send the last gtid event once again. We do this by
removing the last I/O thread retrieved gtid event from
"Retrieved_gtid_set". Possible cases: 1) I/O thread would have
retrieved full transaction already in the first time itself, but
retrieving them again will not cause problem because GTID number is
same, Hence SQL thread will not commit it again. 2) I/O thread would
have retrieved full transaction already and SQL thread would have
already executed it. In that case, We are not going remove last
retrieved gtid from "Retrieved_gtid_set" otherwise we will see gaps in
"Retrieved set". The same case is handled in the below code. Please
note there will be paritial transactions written in relay log but they
will not cause any problem incase of transactional tables. But incase
of non-transaction tables, partial trx will create inconsistency
between master and slave. In that case, users need to check manually.
*/

Gtid_set * retrieved_set= (const_cast<Gtid_set *>(mi->rli->get_gtid_set()));
Gtid *last_retrieved_gtid= mi->rli->get_last_retrieved_gtid();

/*
Remove last_retrieved_gtid only if it is not part of
executed_gtid_set
*/
if (!last_retrieved_gtid->empty() &&
!gtid_state->get_logged_gtids()->contains_gtid(*last_retrieved_gtid))
{
if (retrieved_set->_remove_gtid(*last_retrieved_gtid) != RETURN_STATUS_OK)
{
global_sid_lock->unlock();
goto err;
}
}

if (gtid_executed.add_gtid_set(mi->rli->get_gtid_set()) != RETURN_STATUS_OK ||
gtid_executed.add_gtid_set(gtid_state->get_logged_gtids()) !=
RETURN_STATUS_OK)
Expand Down Expand Up @@ -4359,7 +4398,6 @@ Stopping slave I/O thread due to out-of-memory error from master");
"could not queue event from master");
goto err;
}

if (RUN_HOOK(binlog_relay_io, after_queue_event,
(thd, mi, event_buf, event_len, synced)))
{
Expand Down Expand Up @@ -4412,6 +4450,18 @@ ignore_log_space_limit=%d",
log space");
goto err;
}
DBUG_EXECUTE_IF("stop_io_after_reading_gtid_log_event",
if (event_buf[EVENT_TYPE_OFFSET] == GTID_LOG_EVENT)
thd->killed= THD::KILLED_NO_VALUE;
);
DBUG_EXECUTE_IF("stop_io_after_reading_query_log_event",
if (event_buf[EVENT_TYPE_OFFSET] == QUERY_EVENT)
thd->killed= THD::KILLED_NO_VALUE;
);
DBUG_EXECUTE_IF("stop_io_after_reading_xid_log_event",
if (event_buf[EVENT_TYPE_OFFSET] == XID_EVENT)
thd->killed= THD::KILLED_NO_VALUE;
);
}
}

Expand Down Expand Up @@ -6660,6 +6710,8 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
{
global_sid_lock->rdlock();
int ret= rli->add_logged_gtid(gtid.sidno, gtid.gno);
if (!ret)
rli->set_last_retrieved_gtid(gtid);
global_sid_lock->unlock();
if (ret != 0)
goto err;
Expand Down

0 comments on commit e7a4cbe

Please sign in to comment.