Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add backoff jitter #10486

Merged
merged 10 commits into from Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions mlflow/environment_variables.py
Expand Up @@ -101,6 +101,12 @@ def get(self):
"MLFLOW_HTTP_REQUEST_BACKOFF_FACTOR", int, 2
)

#: Specifies the backoff jitter between MLflow HTTP request failures
#: (default: ``1.0``)
MLFLOW_HTTP_REQUEST_BACKOFF_JITTER = _EnvironmentVariable(
"MLFLOW_HTTP_REQUEST_BACKOFF_JITTER", float, 1.0
)

#: Specifies the timeout in seconds for MLflow HTTP requests
#: (default: ``120``)
MLFLOW_HTTP_REQUEST_TIMEOUT = _EnvironmentVariable("MLFLOW_HTTP_REQUEST_TIMEOUT", int, 120)
Expand Down
20 changes: 17 additions & 3 deletions mlflow/utils/request_utils.py
Expand Up @@ -71,6 +71,7 @@ def download_chunk(*, range_start, range_end, headers, download_path, http_uri):
def _cached_get_request_session(
max_retries,
backoff_factor,
backoff_jitter,
retry_codes,
raise_on_status,
# To create a new Session object for each process, we use the process id as the cache key.
Expand All @@ -92,6 +93,7 @@ def _cached_get_request_session(
"status": max_retries,
"status_forcelist": retry_codes,
"backoff_factor": backoff_factor,
"backoff_jitter": backoff_jitter,
"raise_on_status": raise_on_status,
}
if Version(urllib3.__version__) >= Version("1.26.0"):
Expand All @@ -106,14 +108,15 @@ def _cached_get_request_session(
return session


def _get_request_session(max_retries, backoff_factor, retry_codes, raise_on_status):
def _get_request_session(max_retries, backoff_factor, backoff_jitter, retry_codes, raise_on_status):
"""
Returns a `Requests.Session` object for making an HTTP request.

:param max_retries: Maximum total number of retries.
:param backoff_factor: a time factor for exponential backoff. e.g. value 5 means the HTTP
request will be retried with interval 5, 10, 20... seconds. A value of 0 turns off the
exponential backoff.
:param backoff_jitter: A random jitter to add to the backoff interval.
:param retry_codes: a list of HTTP response error codes that qualifies for retry.
:param raise_on_status: whether to raise an exception, or return a response, if status falls
in retry_codes range and retries have been exhausted.
Expand All @@ -122,14 +125,22 @@ def _get_request_session(max_retries, backoff_factor, retry_codes, raise_on_stat
return _cached_get_request_session(
max_retries,
backoff_factor,
backoff_jitter,
retry_codes,
raise_on_status,
_pid=os.getpid(),
)


def _get_http_response_with_retries(
method, url, max_retries, backoff_factor, retry_codes, raise_on_status=True, **kwargs
method,
url,
max_retries,
backoff_factor,
backoff_jitter,
retry_codes,
raise_on_status=True,
**kwargs,
):
"""
Performs an HTTP request using Python's `requests` module with an automatic retry policy.
Expand All @@ -140,14 +151,17 @@ def _get_http_response_with_retries(
:param backoff_factor: a time factor for exponential backoff. e.g. value 5 means the HTTP
request will be retried with interval 5, 10, 20... seconds. A value of 0 turns off the
exponential backoff.
:param backoff_jitter: A random jitter to add to the backoff interval.
:param retry_codes: a list of HTTP response error codes that qualifies for retry.
:param raise_on_status: whether to raise an exception, or return a response, if status falls
in retry_codes range and retries have been exhausted.
:param kwargs: Additional keyword arguments to pass to `requests.Session.request()`

:return: requests.Response object.
"""
session = _get_request_session(max_retries, backoff_factor, retry_codes, raise_on_status)
session = _get_request_session(
max_retries, backoff_factor, backoff_jitter, retry_codes, raise_on_status
)
return session.request(method, url, **kwargs)


Expand Down
14 changes: 11 additions & 3 deletions mlflow/utils/rest_utils.py
Expand Up @@ -5,6 +5,7 @@

from mlflow.environment_variables import (
MLFLOW_HTTP_REQUEST_BACKOFF_FACTOR,
MLFLOW_HTTP_REQUEST_BACKOFF_JITTER,
MLFLOW_HTTP_REQUEST_MAX_RETRIES,
MLFLOW_HTTP_REQUEST_TIMEOUT,
)
Expand All @@ -30,6 +31,7 @@ def http_request(
method,
max_retries=None,
backoff_factor=None,
backoff_jitter=None,
extra_headers=None,
retry_codes=_TRANSIENT_FAILURE_RESPONSE_CODES,
timeout=None,
Expand Down Expand Up @@ -60,9 +62,14 @@ def http_request(

:return: requests.Response object.
"""
max_retries = max_retries or MLFLOW_HTTP_REQUEST_MAX_RETRIES.get()
backoff_factor = backoff_factor or MLFLOW_HTTP_REQUEST_BACKOFF_FACTOR.get()
timeout = timeout or MLFLOW_HTTP_REQUEST_TIMEOUT.get()
max_retries = MLFLOW_HTTP_REQUEST_MAX_RETRIES.get() if max_retries is None else max_retries
Comment on lines -63 to +66
Copy link
Member

@harupy harupy Nov 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the original code, max_retries = 0 defaults to MLFLOW_HTTP_REQUEST_MAX_RETRIES.get() but it should not.

backoff_factor = (
MLFLOW_HTTP_REQUEST_BACKOFF_FACTOR.get() if backoff_factor is None else backoff_factor
)
backoff_jitter = (
MLFLOW_HTTP_REQUEST_BACKOFF_JITTER.get() if backoff_jitter is None else backoff_jitter
)
timeout = MLFLOW_HTTP_REQUEST_TIMEOUT.get() if timeout is None else timeout
hostname = host_creds.host
auth_str = None
if host_creds.username and host_creds.password:
Expand Down Expand Up @@ -101,6 +108,7 @@ def http_request(
url,
max_retries,
backoff_factor,
backoff_jitter,
retry_codes,
raise_on_status,
headers=headers,
Expand Down