Skip to content

Commit

Permalink
[SDK] Fix fail to send request to MLRun API with `ConnectionResetErro…
Browse files Browse the repository at this point in the history
…r` [1.0.x] (#2235)
  • Loading branch information
quaark committed Aug 7, 2022
1 parent d5f4553 commit 99e0020
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 3 deletions.
1 change: 1 addition & 0 deletions mlrun/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@
"max_workers": 64,
# See mlrun.api.schemas.APIStates for options
"state": "online",
"retry_api_call_on_exception": "enabled",
"db": {
"commit_retry_timeout": 30,
"commit_retry_interval": 3,
Expand Down
67 changes: 64 additions & 3 deletions mlrun/db/httpdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,23 @@ def bool2str(val):
return "yes" if val else "no"


HTTP_RETRY_COUNT = 3
HTTP_RETRY_BACKOFF_FACTOR = 1

# make sure to only add exceptions that are raised early in the request. For example, ConnectionError can be raised
# during the handling of a request, and therefore should not be retried, as the request might not be idempotent.
HTTP_RETRYABLE_EXCEPTIONS = {
# ConnectionResetError is raised when the server closes the connection prematurely during TCP handshake.
ConnectionResetError: ["Connection reset by peer", "Connection aborted"],
# "Connection aborted" and "Connection refused" happen when the server doesn't respond at all.
ConnectionError: ["Connection aborted", "Connection refused"],
ConnectionRefusedError: ["Connection refused"],
}

http_adapter = HTTPAdapter(
max_retries=Retry(
total=3,
backoff_factor=1,
total=HTTP_RETRY_COUNT,
backoff_factor=HTTP_RETRY_BACKOFF_FACTOR,
status_forcelist=[500, 502, 503, 504],
# we want to retry but not to raise since we do want that last response (to parse details on the
# error from response body) we'll handle raising ourselves
Expand Down Expand Up @@ -140,6 +153,54 @@ def get_base_api_url(self, path: str, version: str = None) -> str:
url = f"{self.base_url}/{path_prefix}/{path}"
return url

def request_with_retry(self, method, url, **kwargs):
max_retries = (
HTTP_RETRY_COUNT
if config.httpdb.retry_api_call_on_exception == "enabled"
else 0
)
retry_count = 0
while True:
try:
response = self.session.request(method, url, **kwargs)
return response
except tuple(HTTP_RETRYABLE_EXCEPTIONS.keys()) as exc:
if retry_count >= max_retries:
logger.warning(
f"Maximum retries exhausted for {method} {url} request",
exception_type=type(exc),
exception_message=str(exc),
retry_interval=HTTP_RETRY_BACKOFF_FACTOR,
retry_count=retry_count,
max_retries=max_retries,
)
raise exc

# only retry on exceptions with the right message
exception_is_retryable = any(
[msg in str(exc) for msg in HTTP_RETRYABLE_EXCEPTIONS[type(exc)]]
)

if not exception_is_retryable:
logger.warning(
f"{method} {url} request failed on non-retryable exception",
exception_type=type(exc),
exception_message=str(exc),
)
raise exc

logger.debug(
f"{method} {url} request failed on retryable exception, "
f"retrying in {HTTP_RETRY_BACKOFF_FACTOR} seconds",
exception_type=type(exc),
exception_message=str(exc),
retry_interval=HTTP_RETRY_BACKOFF_FACTOR,
retry_count=retry_count,
max_retries=max_retries,
)
retry_count += 1
time.sleep(HTTP_RETRY_BACKOFF_FACTOR)

def api_call(
self,
method,
Expand Down Expand Up @@ -217,7 +278,7 @@ def api_call(
self.session.mount("https://", http_adapter)

try:
response = self.session.request(
response = self.request_with_retry(
method, url, timeout=timeout, verify=False, **kw
)
except requests.RequestException as exc:
Expand Down
66 changes: 66 additions & 0 deletions tests/rundb/test_unit_httpdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import enum
import unittest.mock

import pytest

import mlrun.config
import mlrun.db.httpdb


Expand All @@ -27,3 +30,66 @@ def test_api_call_enum_conversion():
for dict_key in ["headers", "params"]:
for value in db.session.request.call_args_list[1][1][dict_key].values():
assert type(value) == str


@pytest.mark.parametrize(
"feature_config,exception_type,exception_message,call_amount",
[
# feature enabled
("enabled", Exception, "some-error", 1),
("enabled", ConnectionError, "some-error", 1),
("enabled", ConnectionResetError, "some-error", 1),
(
"enabled",
ConnectionError,
"Connection aborted",
# one try + the max retries
1 + mlrun.db.httpdb.HTTP_RETRY_COUNT,
),
(
"enabled",
ConnectionResetError,
"Connection reset by peer",
# one try + the max retries
1 + mlrun.db.httpdb.HTTP_RETRY_COUNT,
),
(
"enabled",
ConnectionRefusedError,
"Connection refused",
# one try + the max retries
1 + mlrun.db.httpdb.HTTP_RETRY_COUNT,
),
# feature disabled
("disabled", Exception, "some-error", 1),
("disabled", ConnectionError, "some-error", 1),
("disabled", ConnectionResetError, "some-error", 1),
("disabled", ConnectionError, "Connection aborted", 1),
(
"disabled",
ConnectionResetError,
"Connection reset by peer",
1,
),
(
"disabled",
ConnectionRefusedError,
"Connection refused",
1,
),
],
)
def test_connection_reset_causes_retries(
feature_config, exception_type, exception_message, call_amount
):
mlrun.config.config.httpdb.retry_api_call_on_exception = feature_config
db = mlrun.db.httpdb.HTTPRunDB("fake-url")
db.session = unittest.mock.Mock()
db.session.request.side_effect = exception_type(exception_message)

# patch sleep to make test faster
with unittest.mock.patch("time.sleep"):
with pytest.raises(exception_type):
db.api_call("GET", "some-path")

assert db.session.request.call_count == call_amount

0 comments on commit 99e0020

Please sign in to comment.