Skip to content

Commit

Permalink
lib/repo-pull: Support retrying requests on transient network errors
Browse files Browse the repository at this point in the history
Allow network requests to be re-queued if they failed with a transient
error, such as a socket timeout. Retry each request up to a limit
(default: 5), and only then fail the entire pull and propagate the error
to the caller.

Add a new ostree_repo_pull_with_options() option, n-network-retries, to
control the number of retries (including setting it back to the old
default of 0, if the caller wants).

Currently, retries are not supported for FetchDeltaSuperData requests,
as they are not queued. Once they are queued, adding support for retries
should be trivial. A FIXME comment has been left for this.

Signed-off-by: Philip Withnall <withnall@endlessm.com>
  • Loading branch information
pwithnall committed May 22, 2018
1 parent ebad101 commit 433259a
Showing 1 changed file with 85 additions and 8 deletions.
93 changes: 85 additions & 8 deletions src/libostree/ostree-repo-pull.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@
#define OSTREE_REPO_PULL_CONTENT_PRIORITY (OSTREE_FETCHER_DEFAULT_PRIORITY)
#define OSTREE_REPO_PULL_METADATA_PRIORITY (OSTREE_REPO_PULL_CONTENT_PRIORITY - 100)

/* Arbitrarily chosen number of retries for all download operations when they
* receive a transient network error (such as a socket timeout) — see
* should_retry_request(). This is the default value for the `n-network-retries`
* pull option. */
#define DEFAULT_N_NETWORK_RETRIES 5

typedef enum {
OSTREE_FETCHER_SECURITY_STATE_CA_PINNED,
OSTREE_FETCHER_SECURITY_STATE_TLS,
Expand Down Expand Up @@ -92,6 +98,7 @@ typedef struct {
gboolean dry_run;
gboolean dry_run_emitted_progress;
gboolean legacy_transaction_resuming;
guint n_network_retries;
enum {
OSTREE_PULL_PHASE_FETCHING_REFS,
OSTREE_PULL_PHASE_FETCHING_OBJECTS
Expand Down Expand Up @@ -177,6 +184,7 @@ typedef struct {
gboolean object_is_stored;

OstreeCollectionRef *requested_ref; /* (nullable) */
guint n_retries_remaining;
} FetchObjectData;

typedef struct {
Expand All @@ -187,6 +195,7 @@ typedef struct {
char *to_revision;
guint i;
guint64 size;
guint n_retries_remaining;
} FetchStaticDeltaData;

typedef struct {
Expand All @@ -195,6 +204,7 @@ typedef struct {
OstreeObjectType objtype;
guint recursion_depth; /* NB: not used anymore, though might be nice to print */
OstreeCollectionRef *requested_ref; /* (nullable) */
guint n_retries_remaining;
} ScanObjectQueueData;

static void
Expand Down Expand Up @@ -478,6 +488,33 @@ scan_object_queue_data_free (ScanObjectQueueData *scan_data)
g_free (scan_data);
}

/* Check whether a particular operation should be retried. This is entirely
* based on how it failed (if at all) last time, and whether the operation has
* some retries left. The retry count is set when the operation is first
* created, and must be decremented by the caller. (@n_retries_remaining == 0)
* will always return %FALSE from this function. */
static gboolean
should_retry_request (const GError *error,
guint n_retries_remaining)
{
if (error == NULL || n_retries_remaining == 0)
return FALSE;

/* Return TRUE for transient errors. */
if (g_error_matches (error, G_IO_ERROR, G_IO_ERROR_TIMED_OUT) ||
g_error_matches (error, G_IO_ERROR, G_IO_ERROR_HOST_NOT_FOUND) ||
g_error_matches (error, G_IO_ERROR, G_IO_ERROR_HOST_UNREACHABLE) ||
g_error_matches (error, G_RESOLVER_ERROR, G_RESOLVER_ERROR_NOT_FOUND) ||
g_error_matches (error, G_RESOLVER_ERROR, G_RESOLVER_ERROR_TEMPORARY_FAILURE))
{
g_debug ("Should retry request (remaining: %u retries), due to transient error: %s",
n_retries_remaining, error->message);
return TRUE;
}

return FALSE;
}

/* Called out of the main loop to process the "scan object queue", which is a
* queue of metadata objects (commits and dirtree, but not dirmeta) to parse to
* look for further objects. Basically wraps execution of
Expand All @@ -502,8 +539,13 @@ idle_worker (gpointer user_data)
scan_one_metadata_object (pull_data, checksum, scan_data->objtype,
scan_data->path, scan_data->recursion_depth,
scan_data->requested_ref, pull_data->cancellable, &error);
check_outstanding_requests_handle_error (pull_data, &error);
scan_object_queue_data_free (scan_data);

if (should_retry_request (error, scan_data->n_retries_remaining--))
queue_scan_one_metadata_object_s (pull_data, g_steal_pointer (&scan_data));
else
check_outstanding_requests_handle_error (pull_data, &error);

g_clear_pointer (&scan_data, scan_object_queue_data_free);

return G_SOURCE_CONTINUE;
}
Expand Down Expand Up @@ -718,6 +760,7 @@ on_local_object_imported (GObject *object,
pull_data->n_imported_content++;
g_assert_cmpint (pull_data->n_outstanding_content_write_requests, >, 0);
pull_data->n_outstanding_content_write_requests--;
/* No retries for local reads. */
check_outstanding_requests_handle_error (pull_data, &local_error);
}

Expand Down Expand Up @@ -1017,6 +1060,7 @@ content_fetch_on_write_complete (GObject *object,
pull_data->n_fetched_deltapart_fallbacks++;
out:
pull_data->n_outstanding_content_write_requests--;
/* No retries for local writes. */
check_outstanding_requests_handle_error (pull_data, &local_error);
fetch_object_data_free (fetch_data);
}
Expand Down Expand Up @@ -1102,9 +1146,14 @@ content_fetch_on_complete (GObject *object,

out:
pull_data->n_outstanding_content_fetches--;
check_outstanding_requests_handle_error (pull_data, &local_error);

if (should_retry_request (local_error, fetch_data->n_retries_remaining--))
enqueue_one_object_request_s (pull_data, g_steal_pointer (&fetch_data));
else
check_outstanding_requests_handle_error (pull_data, &local_error);

if (free_fetch_data)
fetch_object_data_free (fetch_data);
g_clear_pointer (&fetch_data, fetch_object_data_free);
}

static void
Expand Down Expand Up @@ -1148,6 +1197,7 @@ on_metadata_written (GObject *object,
pull_data->n_outstanding_metadata_write_requests--;
fetch_object_data_free (fetch_data);

/* No need to retry local write operations. */
check_outstanding_requests_handle_error (pull_data, &local_error);
}

Expand Down Expand Up @@ -1286,9 +1336,14 @@ meta_fetch_on_complete (GObject *object,
g_assert (pull_data->n_outstanding_metadata_fetches > 0);
pull_data->n_outstanding_metadata_fetches--;
pull_data->n_fetched_metadata++;
check_outstanding_requests_handle_error (pull_data, &local_error);

if (should_retry_request (local_error, fetch_data->n_retries_remaining--))
enqueue_one_object_request_s (pull_data, g_steal_pointer (&fetch_data));
else
check_outstanding_requests_handle_error (pull_data, &local_error);

if (free_fetch_data)
fetch_object_data_free (fetch_data);
g_clear_pointer (&fetch_data, fetch_object_data_free);
}

static void
Expand Down Expand Up @@ -1320,6 +1375,7 @@ on_static_delta_written (GObject *object,
out:
g_assert (pull_data->n_outstanding_deltapart_write_requests > 0);
pull_data->n_outstanding_deltapart_write_requests--;
/* No need to retry on failure to write locally. */
check_outstanding_requests_handle_error (pull_data, &local_error);
/* Always free state */
fetch_static_delta_data_free (fetch_data);
Expand Down Expand Up @@ -1366,9 +1422,14 @@ static_deltapart_fetch_on_complete (GObject *object,
g_assert (pull_data->n_outstanding_deltapart_fetches > 0);
pull_data->n_outstanding_deltapart_fetches--;
pull_data->n_fetched_deltaparts++;
check_outstanding_requests_handle_error (pull_data, &local_error);

if (should_retry_request (local_error, fetch_data->n_retries_remaining--))
enqueue_one_static_delta_part_request_s (pull_data, g_steal_pointer (&fetch_data));
else
check_outstanding_requests_handle_error (pull_data, &local_error);

if (free_fetch_data)
fetch_static_delta_data_free (fetch_data);
g_clear_pointer (&fetch_data, fetch_static_delta_data_free);
}

static gboolean
Expand Down Expand Up @@ -1794,6 +1855,7 @@ queue_scan_one_metadata_object_c (OtPullData *pull_data,
scan_data->path = g_strdup (path);
scan_data->recursion_depth = recursion_depth;
scan_data->requested_ref = (ref != NULL) ? ostree_collection_ref_dup (ref) : NULL;
scan_data->n_retries_remaining = pull_data->n_network_retries;

queue_scan_one_metadata_object_s (pull_data, g_steal_pointer (&scan_data));
}
Expand Down Expand Up @@ -1971,6 +2033,7 @@ enqueue_one_object_request (OtPullData *pull_data,
fetch_data->is_detached_meta = is_detached_meta;
fetch_data->object_is_stored = object_is_stored;
fetch_data->requested_ref = (ref != NULL) ? ostree_collection_ref_dup (ref) : NULL;
fetch_data->n_retries_remaining = pull_data->n_network_retries;

gboolean is_meta = OSTREE_OBJECT_TYPE_IS_META (objtype);

Expand Down Expand Up @@ -2237,6 +2300,7 @@ process_one_static_delta (OtPullData *pull_data,
fetch_data->is_detached_meta = FALSE;
fetch_data->object_is_stored = FALSE;
fetch_data->requested_ref = (ref != NULL) ? ostree_collection_ref_dup (ref) : NULL;
fetch_data->n_retries_remaining = pull_data->n_network_retries;

ostree_repo_write_metadata_async (pull_data->repo, OSTREE_OBJECT_TYPE_COMMIT, to_checksum,
to_commit,
Expand Down Expand Up @@ -2310,6 +2374,7 @@ process_one_static_delta (OtPullData *pull_data,
fetch_data->expected_checksum = ostree_checksum_from_bytes_v (csum_v);
fetch_data->size = size;
fetch_data->i = i;
fetch_data->n_retries_remaining = pull_data->n_network_retries;

if (inline_part_bytes != NULL)
{
Expand Down Expand Up @@ -2593,6 +2658,8 @@ on_superblock_fetched (GObject *src,
g_assert (pull_data->n_outstanding_metadata_fetches > 0);
pull_data->n_outstanding_metadata_fetches--;
pull_data->n_fetched_metadata++;

/* FIXME: This should check should_retry_request(). */
check_outstanding_requests_handle_error (pull_data, &local_error);

g_clear_pointer (&fetch_data, fetch_delta_super_data_free);
Expand Down Expand Up @@ -3277,6 +3344,9 @@ initiate_request (OtPullData *pull_data,
* * update-frequency (u): Frequency to call the async progress callback in milliseconds, if any; only values higher than 0 are valid
* * localcache-repos (as): File paths for local repos to use as caches when doing remote fetches
* * append-user-agent (s): Additional string to append to the user agent
* * n-network-retries (u): Number of times to retry each download on receiving
* a transient network error, such as a socket timeout; default is 5, 0
* means return errors without retrying
*/
gboolean
ostree_repo_pull_with_options (OstreeRepo *self,
Expand Down Expand Up @@ -3310,6 +3380,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
gboolean opt_gpg_verify_set = FALSE;
gboolean opt_gpg_verify_summary_set = FALSE;
gboolean opt_collection_refs_set = FALSE;
gboolean opt_n_network_retries_set = FALSE;
const char *main_collection_id = NULL;
const char *url_override = NULL;
gboolean inherit_transaction = FALSE;
Expand Down Expand Up @@ -3349,6 +3420,8 @@ ostree_repo_pull_with_options (OstreeRepo *self,
(void) g_variant_lookup (options, "localcache-repos", "^a&s", &opt_localcache_repos);
(void) g_variant_lookup (options, "timestamp-check", "b", &pull_data->timestamp_check);
(void) g_variant_lookup (options, "append-user-agent", "s", &pull_data->append_user_agent);
opt_n_network_retries_set =
g_variant_lookup (options, "n-network-retries", "u", &pull_data->n_network_retries);

if (pull_data->remote_refspec_name != NULL)
pull_data->remote_name = g_strdup (pull_data->remote_refspec_name);
Expand Down Expand Up @@ -3389,6 +3462,9 @@ ostree_repo_pull_with_options (OstreeRepo *self,
pull_data->main_context = g_main_context_ref_thread_default ();
pull_data->flags = flags;

if (!opt_n_network_retries_set)
pull_data->n_network_retries = DEFAULT_N_NETWORK_RETRIES;

pull_data->repo = self;
pull_data->progress = progress;

Expand Down Expand Up @@ -5597,6 +5673,7 @@ ostree_repo_pull_from_remotes_async (OstreeRepo *self,
copy_option (&options_dict, &local_options_dict, "subdirs", G_VARIANT_TYPE ("as"));
copy_option (&options_dict, &local_options_dict, "update-frequency", G_VARIANT_TYPE ("u"));
copy_option (&options_dict, &local_options_dict, "append-user-agent", G_VARIANT_TYPE ("s"));
copy_option (&options_dict, &local_options_dict, "n-network-retries", G_VARIANT_TYPE ("u"));

local_options = g_variant_dict_end (&local_options_dict);

Expand Down

0 comments on commit 433259a

Please sign in to comment.