Skip to content

Commit

Permalink
holddown times for certain backend connection errors
Browse files Browse the repository at this point in the history
This is similar to the vca pace: Depending on the backend connection
error, it does not make sense to re-try in rapid succession, instead
not attempting the failed connection again for some time will save
resources both locally and remotely, where applicable and should thus
help improve the overall situation.

Should fix or at least mitigate varnishcache#2622
  • Loading branch information
nigoroll committed Apr 4, 2018
1 parent 4708235 commit d291941
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 0 deletions.
39 changes: 39 additions & 0 deletions bin/varnishd/cache/cache_tcp_pool.c
Expand Up @@ -94,6 +94,8 @@ struct conn_pool {
struct lock mtx;
struct VSC_vcp *stats;
struct vsc_seg *vsc;
double holddown;
int holddown_errno;

/* length: stat->n_conn */
VTAILQ_HEAD(, pfd) connlist;
Expand Down Expand Up @@ -243,6 +245,7 @@ VCP_New(struct conn_pool *cp, const void *id, void *priv,
cp->priv = priv;
cp->methods = cm;
cp->refcnt = 1;
cp->holddown = 0;
Lck_New(&cp->mtx, lck_tcp_pool);
VTAILQ_INIT(&cp->connlist);
VTAILQ_INIT(&cp->killlist);
Expand Down Expand Up @@ -347,27 +350,52 @@ VCP_Open(struct conn_pool *cp, double tmo, const void **privp)

CHECK_OBJ_NOTNULL(cp, CONN_POOL_MAGIC);

while (cp->holddown > 0) {
Lck_Lock(&cp->mtx);
if (cp->holddown == 0) {
Lck_Unlock(&cp->mtx);
break;
}

if (VTIM_mono() >= cp->holddown) {
cp->holddown = 0;
Lck_Unlock(&cp->mtx);
break;
}

cp->stats->helddown++;
errno = cp->holddown_errno;
Lck_Unlock(&cp->mtx);
return (-1);
}

r = cp->methods->open(cp, tmo, privp);

if (r >= 0) {
cp->stats->opened++;
return (r);
}

h = 0;

/* stats access unprotected */
switch (errno) {
case EACCES:
case EPERM:
cp->stats->fail_eacces++;
h = cache_param->backend_local_error_holddown;
break;
case EADDRNOTAVAIL:
cp->stats->fail_eaddrnotavail++;
h = cache_param->backend_local_error_holddown;
break;
case ECONNREFUSED:
cp->stats->fail_econnrefused++;
h = cache_param->backend_remote_error_holddown;
break;
case ENETUNREACH:
cp->stats->fail_enetunreach++;
h = cache_param->backend_remote_error_holddown;
break;
case ETIMEDOUT:
cp->stats->fail_etimedout++;
Expand All @@ -380,6 +408,17 @@ VCP_Open(struct conn_pool *cp, double tmo, const void **privp)
}
cp->stats->fail++;

if (h == 0)
return (r);

Lck_Lock(&cp->mtx);
h += VTIM_mono();
if (cp->holddown == 0 || h < cp->holddown) {
cp->holddown = h;
cp->holddown_errno = errno;
}

Lck_Unlock(&cp->mtx);
return (r);
}

Expand Down
39 changes: 39 additions & 0 deletions include/tbl/params.h
Expand Up @@ -285,6 +285,45 @@ PARAM(
/* func */ NULL
)

PARAM(
/* name */ backend_local_error_holddown,
/* typ */ timeout,
/* min */ "0.000",
/* max */ NULL,
/* default */ "10.000",
/* units */ "seconds",
/* flags */ EXPERIMENTAL,
/* s-text */
"When connecting to backends, certain error codes "
"(EADDRNOTAVAIL, EACCESS, EPERM) signal a local resource shortage "
"or configuration issue for which retrying connection attempts "
"may worsen the situation due to the complexity of the operations "
"involved in the kernel.\n"
"This parameter prevents repeated connection attempts for the "
"configured duration.",
/* l-text */ "",
/* func */ NULL
)

PARAM(
/* name */ backend_remote_error_holddown,
/* typ */ timeout,
/* min */ "0.000",
/* max */ NULL,
/* default */ "0.250",
/* units */ "seconds",
/* flags */ EXPERIMENTAL,
/* s-text */
"When connecting to backends, certain error codes (ECONNREFUSED, "
"ENETUNREACH) signal fundamental connection issues such as the backend "
"not accepting connections or routing problems for which repeated "
"connection attempts are considered useless\n"
"This parameter prevents repeated connection attempts for the "
"configured duration.",
/* l-text */ "",
/* func */ NULL
)

PARAM(
/* name */ cli_limit,
/* typ */ bytes_u,
Expand Down

0 comments on commit d291941

Please sign in to comment.