Skip to content

Commit

Permalink
Make statsd errors correspond to 5xx only
Browse files Browse the repository at this point in the history
The goal is to make the successful statsd buckets
(e.g. "object-server.GET.timing") have timing information for all the
requests that the server handled correctly, while the error buckets
(e.g. "object-server.GET.errors.timing") have the rest.

Currently, we don't do that great a job of it. We special-case a few
4xx status codes (404, 412, 416) to not count as errors, but we leave
some pretty large holes. If you're graphing errors, you'll see spikes
when client is sending bogus requests (400) or failing to
re-authenticate (403). You'll also see spikes when your drives are
unmounted (507) and when there's bugs that need fixing (500).

This commit makes .errors.timing be just 5xx in the hope that its
graph will be more useful.

Change-Id: I92b41bcbb880c0688c37ab231c19ebe984b18215
  • Loading branch information
smerritt committed Jan 23, 2018
1 parent cbcbc77 commit 38ada71
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 22 deletions.
31 changes: 10 additions & 21 deletions swift/common/utils.py
Expand Up @@ -75,8 +75,7 @@

from swift import gettext_ as _
import swift.common.exceptions
from swift.common.http import is_success, is_redirection, HTTP_NOT_FOUND, \
HTTP_PRECONDITION_FAILED, HTTP_REQUESTED_RANGE_NOT_SATISFIABLE
from swift.common.http import is_server_error
from swift.common.header_key_dict import HeaderKeyDict
from swift.common.linkat import linkat

Expand Down Expand Up @@ -1635,24 +1634,6 @@ def transfer_rate(self, metric, elapsed_time, byte_xfer, sample_rate=None):
sample_rate)


def server_handled_successfully(status_int):
"""
True for successful responses *or* error codes that are not Swift's fault,
False otherwise. For example, 500 is definitely the server's fault, but
412 is an error code (4xx are all errors) that is due to a header the
client sent.
If one is tracking error rates to monitor server health, one would be
advised to use a function like this one, lest a client cause a flurry of
404s or 416s and make a spurious spike in your errors graph.
"""
return (is_success(status_int) or
is_redirection(status_int) or
status_int == HTTP_NOT_FOUND or
status_int == HTTP_PRECONDITION_FAILED or
status_int == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)


def timing_stats(**dec_kwargs):
"""
Returns a decorator that logs timing events or errors for public methods in
Expand All @@ -1665,7 +1646,15 @@ def decorating_func(func):
def _timing_stats(ctrl, *args, **kwargs):
start_time = time.time()
resp = func(ctrl, *args, **kwargs)
if server_handled_successfully(resp.status_int):
# .timing is for successful responses *or* error codes that are
# not Swift's fault. For example, 500 is definitely the server's
# fault, but 412 is an error code (4xx are all errors) that is
# due to a header the client sent.
#
# .errors.timing is for failures that *are* Swift's fault.
# Examples include 507 for an unmounted drive or 500 for an
# unhandled exception.
if not is_server_error(resp.status_int):
ctrl.logger.timing_since(method + '.timing',
start_time, **dec_kwargs)
else:
Expand Down
16 changes: 15 additions & 1 deletion test/unit/common/test_utils.py
Expand Up @@ -4809,6 +4809,13 @@ def METHOD(controller):
self.assertEqual(mock_controller.args[0], 'METHOD.timing')
self.assertTrue(mock_controller.args[1] > 0)

mock_controller = MockController(400)
METHOD(mock_controller)
self.assertEqual(len(mock_controller.args), 2)
self.assertEqual(mock_controller.called, 'timing')
self.assertEqual(mock_controller.args[0], 'METHOD.timing')
self.assertTrue(mock_controller.args[1] > 0)

mock_controller = MockController(404)
METHOD(mock_controller)
self.assertEqual(len(mock_controller.args), 2)
Expand All @@ -4830,7 +4837,14 @@ def METHOD(controller):
self.assertEqual(mock_controller.args[0], 'METHOD.timing')
self.assertTrue(mock_controller.args[1] > 0)

mock_controller = MockController(401)
mock_controller = MockController(500)
METHOD(mock_controller)
self.assertEqual(len(mock_controller.args), 2)
self.assertEqual(mock_controller.called, 'timing')
self.assertEqual(mock_controller.args[0], 'METHOD.errors.timing')
self.assertTrue(mock_controller.args[1] > 0)

mock_controller = MockController(507)
METHOD(mock_controller)
self.assertEqual(len(mock_controller.args), 2)
self.assertEqual(mock_controller.called, 'timing')
Expand Down

0 comments on commit 38ada71

Please sign in to comment.