-
Notifications
You must be signed in to change notification settings - Fork 286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support retrying downloads if they transiently fail #1594
Changes from 7 commits
73226ae
5b78601
63b1ba4
30b6c08
3c51ee4
d8ad578
ba8a265
e283a05
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,15 +52,15 @@ fetch_uri_sync_on_complete (GObject *object, | |
data->done = TRUE; | ||
} | ||
|
||
gboolean | ||
_ostree_fetcher_mirrored_request_to_membuf (OstreeFetcher *fetcher, | ||
GPtrArray *mirrorlist, | ||
const char *filename, | ||
OstreeFetcherRequestFlags flags, | ||
GBytes **out_contents, | ||
guint64 max_size, | ||
GCancellable *cancellable, | ||
GError **error) | ||
static gboolean | ||
_ostree_fetcher_mirrored_request_to_membuf_once (OstreeFetcher *fetcher, | ||
GPtrArray *mirrorlist, | ||
const char *filename, | ||
OstreeFetcherRequestFlags flags, | ||
GBytes **out_contents, | ||
guint64 max_size, | ||
GCancellable *cancellable, | ||
GError **error) | ||
{ | ||
gboolean ret = FALSE; | ||
g_autoptr(GMainContext) mainctx = NULL; | ||
|
@@ -108,11 +108,42 @@ _ostree_fetcher_mirrored_request_to_membuf (OstreeFetcher *fetcher, | |
return ret; | ||
} | ||
|
||
gboolean | ||
_ostree_fetcher_mirrored_request_to_membuf (OstreeFetcher *fetcher, | ||
GPtrArray *mirrorlist, | ||
const char *filename, | ||
OstreeFetcherRequestFlags flags, | ||
guint n_network_retries, | ||
GBytes **out_contents, | ||
guint64 max_size, | ||
GCancellable *cancellable, | ||
GError **error) | ||
{ | ||
g_autoptr(GError) local_error = NULL; | ||
guint n_retries_remaining = n_network_retries; | ||
|
||
do | ||
{ | ||
g_clear_error (&local_error); | ||
if (_ostree_fetcher_mirrored_request_to_membuf_once (fetcher, mirrorlist, | ||
filename, flags, | ||
out_contents, max_size, | ||
cancellable, &local_error)) | ||
return TRUE; | ||
} | ||
while (_ostree_fetcher_should_retry_request (local_error, n_retries_remaining--)); | ||
|
||
g_assert (local_error != NULL); | ||
g_propagate_error (error, g_steal_pointer (&local_error)); | ||
return FALSE; | ||
} | ||
|
||
/* Helper for callers who just want to fetch single one-off URIs */ | ||
gboolean | ||
_ostree_fetcher_request_uri_to_membuf (OstreeFetcher *fetcher, | ||
OstreeFetcherURI *uri, | ||
OstreeFetcherRequestFlags flags, | ||
guint n_network_retries, | ||
GBytes **out_contents, | ||
guint64 max_size, | ||
GCancellable *cancellable, | ||
|
@@ -121,7 +152,7 @@ _ostree_fetcher_request_uri_to_membuf (OstreeFetcher *fetcher, | |
g_autoptr(GPtrArray) mirrorlist = g_ptr_array_new (); | ||
g_ptr_array_add (mirrorlist, uri); /* no transfer */ | ||
return _ostree_fetcher_mirrored_request_to_membuf (fetcher, mirrorlist, NULL, flags, | ||
out_contents, max_size, | ||
n_network_retries, out_contents, max_size, | ||
cancellable, error); | ||
} | ||
|
||
|
@@ -144,3 +175,65 @@ _ostree_fetcher_journal_failure (const char *remote_name, | |
NULL); | ||
#endif | ||
} | ||
|
||
/* Check whether a particular operation should be retried. This is entirely | ||
* based on how it failed (if at all) last time, and whether the operation has | ||
* some retries left. The retry count is set when the operation is first | ||
* created, and must be decremented by the caller. (@n_retries_remaining == 0) | ||
* will always return %FALSE from this function. | ||
* | ||
* FIXME: In future, we may decide to use transient failures like this as a hint | ||
* to prioritise other mirrors for a particular pull operation (for example). */ | ||
gboolean | ||
_ostree_fetcher_should_retry_request (const GError *error, | ||
guint n_retries_remaining) | ||
{ | ||
if (error == NULL) | ||
g_debug ("%s: error: unset, n_retries_remaining: %u", | ||
G_STRFUNC, n_retries_remaining); | ||
else | ||
g_debug ("%s: error: %u:%u %s, n_retries_remaining: %u", | ||
G_STRFUNC, error->domain, error->code, error->message, | ||
n_retries_remaining); | ||
|
||
if (error == NULL || n_retries_remaining == 0) | ||
return FALSE; | ||
|
||
/* Return TRUE for transient errors. */ | ||
if (g_error_matches (error, G_IO_ERROR, G_IO_ERROR_TIMED_OUT) || | ||
g_error_matches (error, G_IO_ERROR, G_IO_ERROR_HOST_NOT_FOUND) || | ||
g_error_matches (error, G_IO_ERROR, G_IO_ERROR_HOST_UNREACHABLE) || | ||
#if !GLIB_CHECK_VERSION(2, 44, 0) | ||
g_error_matches (error, G_IO_ERROR, G_IO_ERROR_BROKEN_PIPE) || | ||
#else | ||
g_error_matches (error, G_IO_ERROR, G_IO_ERROR_CONNECTION_CLOSED) || | ||
#endif | ||
g_error_matches (error, G_RESOLVER_ERROR, G_RESOLVER_ERROR_NOT_FOUND) || | ||
g_error_matches (error, G_RESOLVER_ERROR, G_RESOLVER_ERROR_TEMPORARY_FAILURE)) | ||
{ | ||
g_debug ("Should retry request (remaining: %u retries), due to transient error: %s", | ||
n_retries_remaining, error->message); | ||
return TRUE; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor: while we're at it, maybe let's also add a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That’s already covered by one of the
|
||
|
||
return FALSE; | ||
} | ||
|
||
/* Convert a HTTP status code representing an error from libsoup or libcurl to | ||
* a #GIOError. This will return %G_IO_ERROR_FAILED if the status code is | ||
* unknown or otherwise unhandled. */ | ||
GIOError | ||
_ostree_fetcher_http_status_code_to_io_error (guint status_code) | ||
{ | ||
switch (status_code) | ||
{ | ||
case 403: /* SOUP_STATUS_FORBIDDEN */ | ||
case 404: /* SOUP_STATUS_NOT_FOUND */ | ||
case 410: /* SOUP_STATUS_GONE */ | ||
return G_IO_ERROR_NOT_FOUND; | ||
case 408: /* SOUP_STATUS_REQUEST_TIMEOUT */ | ||
return G_IO_ERROR_TIMED_OUT; | ||
default: | ||
return G_IO_ERROR_FAILED; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One thing that feels odd to me is that the way this is structured, we'll be retrying the whole set of mirrors if the last one on the list returns a transient error. So e.g. the first N-1 mirrors could return
NOT_FOUND
and if the Nth mirror times out, we'll be retrying all N mirrors, right? Whereas I think what we want is to retry only the Nth mirror. It seems like fixing this properly would probably require indeed teaching each backend-specific fetcher about retries.Anyway, I think it's definitely worth getting the current approach in and iterate on that since AFAIK mirrorlists are not widely used yet.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I’d definitely prefer to get the current approach in first. That said, I’m not sure whether I will have time afterwards to work on improvements to mirror handling (I’m working on this to fix a problem with our Endless downloads timing out, and mirrors don’t really impact on that).
However, I’m not entirely convinced about the situation you’re considering. Are we ever expecting the majority of mirrors to return
NOT_FOUND
? Sounds like an unlikely corner case to me. Even so,OstreeFetcherSoup
maintains state about its mirrors, so I think the problem could be addressed by storing a bit more state there, rather than by rearchitecting things. Specifically, instead of storing a singlemirrorlist_idx
pointing to the best mirror to try, store some flags for each mirror, and track whether each mirror has returnedNOT_FOUND
, or returned a transient error, at all. Prioritise requesting from mirrors which haven’t. (This could also be implemented by storing a priority for each mirror, and demoting it onNOT_FOUND
or transient errors.)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To clarify, I wasn't suggesting you would fix those up. Just something that'd need to be done down the road. As I said, I'm not aware of any big users of mirrorlists yet, so it's not a pressing concern.
Heh, sure. The
NOT_FOUND
was an example; anything "non-transient" would work. Anyway, I do agree with you that it could be done without lowering the retry logic down a level.