Skip to content

Commit

Permalink
Fix regression in retries.
Browse files Browse the repository at this point in the history
5314dbf aimed to make nested Wait objects more accurate with regard to wait time but it also got rid of the "bonus" retry that was implicit in the prior implementation. This meant that if an operation used up the entire allotted timeout, it would not be retried. Object stores especially are noisy places and some amount of retry should always be attempted. So even though removing the "bonus" retry was intended, it turned out not to be a good idea.

Instead of an implicit retry, formalize two retries in the Wait object even if the wait time has expired. Any number of retries are allowed during the wait period. Also remove waitRemaining() since it is no longer needed.

Adjust tests as needed to account for the extra timeouts.

Note that there may still be an underlying issue here that is simply being masked by retries. That is, the issue expressing was that waiting for a socket to be writable was timing out and without a retry that caused a hard error. This patch does nothing to address the source of the write timeout and perhaps there is nothing we can do about it. It does seem similar to the write issue we had with our blocking TLS implementation, but it was never clear if that was a problem with TLS, the kernel, or a bug in pgBackRest itself. It cropped up after a kernel update and we switched to non-blocking TLS to address the issue (c88684e).
  • Loading branch information
dwsteele committed Nov 9, 2023
1 parent 3c116e1 commit fa5b2d4
Show file tree
Hide file tree
Showing 11 changed files with 103 additions and 46 deletions.
24 changes: 24 additions & 0 deletions doc/xml/release/2023/2.49.xml
@@ -1,4 +1,28 @@
<release date="XXXX-XX-XX" version="2.49dev" title="UNDER DEVELOPMENT">
<release-core-list>
<release-bug-list>
<release-item>
<commit subject="Remove unused header."/>
<commit subject="Fix regression in retries.">
<github-issue id="2200"/>
<github-issue id="2214"/>
<github-pull-request id="2221"/>
</commit>

<release-item-contributor-list>
<release-item-ideator id="norman.adkins"/>
<release-item-ideator id="tanel.suurhans"/>
<release-item-ideator id="jordan.english"/>
<release-item-ideator id="timothee.peignier"/>
<release-item-contributor id="david.steele"/>
<release-item-reviewer id="stephen.frost"/>
</release-item-contributor-list>

<p>Fix regression in retries.</p>
</release-item>
</release-bug-list>
</release-core-list>

<release-doc-list>
<release-feature-list>
<release-item>
Expand Down
15 changes: 15 additions & 0 deletions doc/xml/release/contributor.xml
Expand Up @@ -525,6 +525,11 @@
<contributor-id type="github">precision-software</contributor-id>
</contributor>

<contributor id="jordan.english">
<contributor-name-display>Jordan English</contributor-name-display>
<contributor-id type="github">jordanenglish</contributor-id>
</contributor>

<contributor id="julian.zhang">
<contributor-name-display>Julian Zhang</contributor-name-display>
<contributor-id type="github">julianzhang98</contributor-id>
Expand Down Expand Up @@ -760,6 +765,11 @@
<contributor-id type="github">nj3110</contributor-id>
</contributor>

<contributor id="norman.adkins">
<contributor-name-display>Norman Adkins</contributor-name-display>
<contributor-id type="github">adkinsn</contributor-id>
</contributor>

<contributor id="nuno.pires">
<contributor-name-display>Nuno Pires</contributor-name-display>
<contributor-id type="github">nunopi</contributor-id>
Expand Down Expand Up @@ -905,6 +915,11 @@
<contributor-id type="github">Tiago-Anastacio</contributor-id>
</contributor>

<contributor id="tanel.suurhans">
<contributor-name-display>Tanel Suurhans</contributor-name-display>
<contributor-id type="github">tanelsuurhans</contributor-id>
</contributor>

<contributor id="tim.garton">
<contributor-name-display>Tim Garton</contributor-name-display>
<contributor-id type="github">ralfthewise</contributor-id>
Expand Down
7 changes: 3 additions & 4 deletions src/common/io/socket/client.c
Expand Up @@ -97,7 +97,7 @@ sckClientOpen(THIS_VOID)
THROW_ON_SYS_ERROR(fd == -1, HostConnectError, "unable to create socket");

sckOptionSet(fd);
sckConnect(fd, this->host, this->port, addressFound, waitRemaining(wait));
sckConnect(fd, this->host, this->port, addressFound, this->timeoutConnect);

// Create the session
MEM_CONTEXT_PRIOR_BEGIN()
Expand All @@ -119,8 +119,7 @@ sckClientOpen(THIS_VOID)
errRetryAdd(errRetry);

// Increment address info index and reset if the end has been reached. When the end has been reached sleep for a bit
// to hopefully have better chance at succeeding, otherwise continue right to the next address as long as there is
// some time left.
// to hopefully have better chance at succeeding, otherwise continue right to the next address.
addrInfoIdx++;

if (addrInfoIdx >= addrInfoSize(addrInfo))
Expand All @@ -129,7 +128,7 @@ sckClientOpen(THIS_VOID)
retry = waitMore(wait);
}
else
retry = waitRemaining(wait) > 0;
retry = true;

// Error when no retries remain
if (!retry)
Expand Down
48 changes: 19 additions & 29 deletions src/common/wait.c
Expand Up @@ -16,6 +16,7 @@ struct Wait
TimeMSec sleepTime; // Next sleep time (in usec)
TimeMSec sleepPrevTime; // Previous time slept (in usec)
TimeMSec beginTime; // Time the wait began (in epoch usec)
unsigned int retry; // Retries remaining
};

/**********************************************************************************************************************************/
Expand All @@ -33,6 +34,7 @@ waitNew(const TimeMSec waitTime)
*this = (Wait)
{
.waitTime = waitTime,
.retry = 2,
};

// Calculate first sleep time -- start with 1/10th of a second for anything >= 1 second
Expand All @@ -52,31 +54,6 @@ waitNew(const TimeMSec waitTime)
FUNCTION_LOG_RETURN(WAIT, this);
}

/**********************************************************************************************************************************/
FN_EXTERN TimeMSec
waitRemaining(Wait *const this)
{
FUNCTION_TEST_BEGIN();
FUNCTION_TEST_PARAM(WAIT, this);
FUNCTION_TEST_END();

TimeMSec result = 0;

// If any wait time remains
if (this->sleepTime > 0)
{
// Returning remaining time, if any, else set sleepTime to 0 so next call to waitMore will return false
const TimeMSec elapsedTime = timeMSec() - this->beginTime;

if (elapsedTime < this->waitTime)
result = this->waitTime - elapsedTime;
else
this->sleepTime = 0;
}

FUNCTION_TEST_RETURN(TIME_MSEC, result);
}

/**********************************************************************************************************************************/
FN_EXTERN bool
waitMore(Wait *const this)
Expand Down Expand Up @@ -114,13 +91,26 @@ waitMore(Wait *const this)
// Store new sleep times
this->sleepPrevTime = this->sleepTime;
this->sleepTime = sleepTime;

// Need to wait more
result = true;
}
// Else set sleep to zero so next call will return false
// Else are there retries left?
else if (this->retry != 0)
{
// Sleep using the last calculated time
sleepMSec(this->sleepTime);
}
// Else set sleep to zero so call will return false
else
this->sleepTime = 0;

// Caller can continue processing
if (this->sleepTime > 0)
{
// Decrement retries
if (this->retry != 0)
this->retry--;

result = true;
}
}

FUNCTION_LOG_RETURN(BOOL, result);
Expand Down
13 changes: 6 additions & 7 deletions src/common/wait.h
@@ -1,5 +1,10 @@
/***********************************************************************************************************************************
Wait Handler
Used for operations that may fail due to an error or some unexpected condition such as file missing. When waitMore() is called it
will wait (based on a Fibonacci backoff) before returning to give the error or condition time to clear. Even when the wait time has
expired before waitMore() is called, there will still be two retries to compensate for operations that use up the entire time limit.
Any number of retries are allowed within the time limit.
***********************************************************************************************************************************/
#ifndef COMMON_WAIT_H
#define COMMON_WAIT_H
Expand All @@ -17,16 +22,10 @@ Constructors
***********************************************************************************************************************************/
FN_EXTERN Wait *waitNew(TimeMSec waitTime);

/***********************************************************************************************************************************
Getters/Setters
***********************************************************************************************************************************/
// How much time is remaining? Recalculated each time waitMore() is called.
FN_EXTERN TimeMSec waitRemaining(Wait *this);

/***********************************************************************************************************************************
Functions
***********************************************************************************************************************************/
// Wait and return whether the caller has more time left
// Wait and return true if the caller has more time/retries left
FN_EXTERN bool waitMore(Wait *this);

/***********************************************************************************************************************************
Expand Down
1 change: 1 addition & 0 deletions test/src/module/common/ioHttpTest.c
Expand Up @@ -312,6 +312,7 @@ testRun(void)
TEST_ERROR_FMT(
httpRequestResponse(httpRequestNewP(client, STRDEF("GET"), STRDEF("/")), false), HostConnectError,
"unable to connect to 'localhost:%u (127.0.0.1)': [111] Connection refused\n"
"[RETRY DETAIL OMITTED]\n"
"[RETRY DETAIL OMITTED]",
hrnServerPort(0));

Expand Down
6 changes: 5 additions & 1 deletion test/src/module/common/ioTlsTest.c
Expand Up @@ -375,7 +375,11 @@ testRun(void)

// This address should not be in use in a test environment -- if it is the test will fail
TEST_ASSIGN(client, sckClientNew(STRDEF("172.31.255.255"), hrnServerPort(0), 100, 100), "new client");
TEST_ERROR_FMT(ioClientOpen(client), HostConnectError, "timeout connecting to '172.31.255.255:%u'", hrnServerPort(0));
TEST_ERROR_FMT(
ioClientOpen(client), HostConnectError,
"timeout connecting to '172.31.255.255:%u'\n"
"[RETRY DETAIL OMITTED]",
hrnServerPort(0));

// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("sckServerAccept() returns NULL on interrupt");
Expand Down
18 changes: 13 additions & 5 deletions test/src/module/common/waitTest.c
Expand Up @@ -23,32 +23,40 @@ testRun(void)
TimeMSec begin = timeMSec();

TEST_ASSIGN(wait, waitNew(0), "new wait");
TEST_RESULT_UINT(waitRemaining(wait), 0, " check remaining time");
TEST_RESULT_UINT(wait->waitTime, 0, " check wait time");
TEST_RESULT_UINT(wait->sleepTime, 0, " check sleep time");
TEST_RESULT_UINT(wait->sleepPrevTime, 0, " check sleep prev time");
TEST_RESULT_BOOL(waitMore(wait), false, " no wait more");

// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("100ms using only waitRemaining()");
TEST_TITLE("100ms with retries after time expired");

TEST_ASSIGN(wait, waitNew(100), "new wait");
sleepMSec(100);
TEST_RESULT_UINT(waitRemaining(wait), 0, " check remaining time");

TEST_RESULT_BOOL(waitMore(wait), true, " time expired, first retry");
TEST_RESULT_BOOL(waitMore(wait), true, " time expired, second retry");
TEST_RESULT_BOOL(waitMore(wait), false, " time expired, retries expired");

// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("200ms wait");

begin = timeMSec();

TEST_ASSIGN(wait, waitNew(200), "new wait = 0.2 sec");
TEST_RESULT_BOOL(waitRemaining(wait) <= 200, true, " check remaining time upper range");
TEST_RESULT_BOOL(waitRemaining(wait) >= 150, true, " check remaining time lowe range");
TEST_RESULT_UINT(wait->waitTime, 200, " check wait time");
TEST_RESULT_UINT(wait->sleepTime, 20, " check sleep time");
TEST_RESULT_UINT(wait->sleepPrevTime, 0, " check sleep prev time");
TEST_RESULT_BOOL(wait->beginTime > (TimeMSec)1483228800000, true, " check begin time");

TEST_RESULT_BOOL(waitMore(wait), true, " first retry");
TEST_RESULT_UINT(wait->retry, 1, " check retry");

TEST_RESULT_BOOL(waitMore(wait), true, " second retry");
TEST_RESULT_UINT(wait->retry, 0, " check retry");

TEST_RESULT_BOOL(waitMore(wait), true, " still going because of time");

while (waitMore(wait));
TimeMSec end = timeMSec();

Expand Down
1 change: 1 addition & 0 deletions test/src/module/db/dbTest.c
Expand Up @@ -525,6 +525,7 @@ testRun(void)
HRN_PQ_SCRIPT_CHECKPOINT(2),
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),

// Wait for standby to sync
HRN_PQ_SCRIPT_REPLAY_TARGET_REACHED_GE_10(2, "5/5", false, "5/3"),
Expand Down
10 changes: 10 additions & 0 deletions test/src/module/postgres/clientTest.c
Expand Up @@ -153,6 +153,10 @@ testRun(void)
{.function = HRN_PQ_SENDQUERY, .param = "[\"" TEST_QUERY "\"]", .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 600},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_GETCANCEL},
{.function = HRN_PQ_CANCEL, .resultInt = 1},
{.function = HRN_PQ_FREECANCEL},
Expand Down Expand Up @@ -180,6 +184,8 @@ testRun(void)
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 300},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_GETCANCEL},
{.function = HRN_PQ_CANCEL, .resultInt = 0, .resultZ = TEST_PQ_ERROR},
{.function = HRN_PQ_FREECANCEL});
Expand All @@ -202,6 +208,10 @@ testRun(void)
{.function = HRN_PQ_SENDQUERY, .param = "[\"" TEST_QUERY "\"]", .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 600},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_GETCANCEL, .resultNull = true});

TEST_ERROR(
Expand Down
6 changes: 6 additions & 0 deletions test/src/module/storage/sftpTest.c
Expand Up @@ -2051,6 +2051,12 @@ testRun(void)
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
// Path missing
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "\",0]",
.attrPerms = LIBSSH2_SFTP_S_IFDIR, .resultInt = LIBSSH2_ERROR_NONE},
Expand Down

0 comments on commit fa5b2d4

Please sign in to comment.