Skip to content

Commit

Permalink
Add request timeout handling for Mellanox Neutron Agent
Browse files Browse the repository at this point in the history
Add request timeout handling for messages sent to eswitch Daemon.
Using configurable number of retries and increasing waiting interval
between retries resend the message.
If request timeout persists, eswitch daemon is not reachable.
In such case, exit the agent process.

Closes-Bug: #1228827

Change-Id: If1290bedafe7a0dd8a61cbe328510075edeb1e5b
(cherry picked from commit 793c763)
  • Loading branch information
irenaber committed Mar 31, 2014
1 parent f52449e commit eab05c3
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 5 deletions.
10 changes: 9 additions & 1 deletion etc/neutron/plugins/mlnx/mlnx_conf.ini
Expand Up @@ -34,12 +34,20 @@
# vnic_type = mlnx_direct

# (StrOpt) Eswitch daemon end point connection url
# daemon_endpoint = 'tcp://127.0.0.1:5001'
# daemon_endpoint = 'tcp://127.0.0.1:60001'

# The number of milliseconds the agent will wait for
# response on request to daemon
# request_timeout = 3000

# The number of retries the agent will send request
# to daemon before giving up
# retries = 3

# The backoff rate multiplier for waiting period between retries
# on request to daemon, i.e. value of 2 will double
# the request timeout each retry
# backoff_rate = 2

[agent]
# Agent's polling interval in seconds
Expand Down
4 changes: 4 additions & 0 deletions neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
Expand Up @@ -392,6 +392,10 @@ def daemon_loop(self):
# If treat devices fails - must resync with plugin
sync = self.process_network_ports(port_info)
ports = port_info['current']
except exceptions.RequestTimeout:
LOG.exception(_("Request timeout in agent event loop "
"eSwitchD is not responding - exiting..."))
raise SystemExit(1)
except Exception:
LOG.exception(_("Error in agent event loop"))
sync = True
Expand Down
8 changes: 4 additions & 4 deletions neutron/plugins/mlnx/agent/utils.py
@@ -1,5 +1,3 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright 2013 Mellanox Technologies, Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -19,6 +17,7 @@

from neutron.openstack.common import jsonutils
from neutron.openstack.common import log as logging
from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
from neutron.plugins.mlnx.common import exceptions

LOG = logging.getLogger(__name__)
Expand All @@ -42,6 +41,7 @@ def _conn(self):
self.poller.register(self._conn, zmq.POLLIN)
return self.__conn

@RetryDecorator(exceptions.RequestTimeout)
def send_msg(self, msg):
self._conn.send(msg)

Expand All @@ -55,7 +55,7 @@ def send_msg(self, msg):
self._conn.close()
self.poller.unregister(self._conn)
self.__conn = None
raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
raise exceptions.RequestTimeout()

def parse_response_msg(self, recv_msg):
msg = jsonutils.loads(recv_msg)
Expand All @@ -69,7 +69,7 @@ def parse_response_msg(self, recv_msg):
else:
error_msg = _("Unknown operation status %s") % msg['status']
LOG.error(error_msg)
raise exceptions.MlnxException(error_msg)
raise exceptions.OperationFailed(err_msg=error_msg)

def get_attached_vnics(self):
LOG.debug(_("get_attached_vnics"))
Expand Down
66 changes: 66 additions & 0 deletions neutron/plugins/mlnx/common/comm_utils.py
@@ -0,0 +1,66 @@
# Copyright 2013 Mellanox Technologies, Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import time

from oslo.config import cfg

from neutron.openstack.common import log as logging
from neutron.plugins.mlnx.common import config # noqa

LOG = logging.getLogger(__name__)


class RetryDecorator(object):
"""Retry decorator reruns a method 'retries' times if an exception occurs.
Decorator for retrying a method if exceptionToCheck exception occurs
If method raises exception, retries 'retries' times with increasing
back off period between calls with 'interval' multiplier
:param exceptionToCheck: the exception to check
:param interval: initial delay between retries in seconds
:param retries: number of times to try before giving up
:raises: exceptionToCheck
"""
sleep_fn = time.sleep

def __init__(self, exception_to_check,
interval=cfg.CONF.ESWITCH.request_timeout / 1000,
retries=cfg.CONF.ESWITCH.retries,
backoff_rate=cfg.CONF.ESWITCH.backoff_rate):
self.exc = exception_to_check
self.interval = interval
self.retries = retries
self.backoff_rate = backoff_rate

def __call__(self, original_func):
@functools.wraps(original_func)
def decorated(*args, **kwargs):
sleep_interval = self.interval
num_of_iter = self.retries
while num_of_iter > 0:
try:
return original_func(*args, **kwargs)
except self.exc:
LOG.debug(_("Request timeout - call again after "
"%s seconds"), sleep_interval)
RetryDecorator.sleep_fn(sleep_interval)
num_of_iter -= 1
sleep_interval *= self.backoff_rate

return original_func(*args, **kwargs)
return decorated
7 changes: 7 additions & 0 deletions neutron/plugins/mlnx/common/config.py
Expand Up @@ -48,6 +48,13 @@
cfg.IntOpt('request_timeout', default=3000,
help=_("The number of milliseconds the agent will wait for "
"response on request to daemon.")),
cfg.IntOpt('retries', default=3,
help=_("The number of retries the agent will send request "
"to daemon before giving up")),
cfg.IntOpt('backoff_rate', default=2,
help=_("backoff rate multiplier for waiting period between "
"retries for request to daemon, i.e. value of 2 will "
" double the request timeout each retry")),
]

agent_opts = [
Expand Down
8 changes: 8 additions & 0 deletions neutron/plugins/mlnx/common/exceptions.py
Expand Up @@ -20,3 +20,11 @@

class MlnxException(qexc.NeutronException):
message = _("Mlnx Exception: %(err_msg)s")


class RequestTimeout(qexc.NeutronException):
message = _("Request Timeout: no response from eSwitchD")


class OperationFailed(qexc.NeutronException):
message = _("Operation Failed: %(err_msg)s")
139 changes: 139 additions & 0 deletions neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
@@ -0,0 +1,139 @@
# Copyright (c) 2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import mock
from oslo.config import cfg

from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
from neutron.plugins.mlnx.common import config # noqa
from neutron.plugins.mlnx.common import exceptions
from neutron.tests import base


class WrongException(Exception):
pass


class TestRetryDecorator(base.BaseTestCase):
def setUp(self):
super(TestRetryDecorator, self).setUp()
self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
self.sleep_fn = self.sleep_fn_p.start()
self.addCleanup(self.sleep_fn_p.stop)

def test_no_retry_required(self):
self.counter = 0

@RetryDecorator(exceptions.RequestTimeout, interval=2,
retries=3, backoff_rate=2)
def succeeds():
self.counter += 1
return 'success'

ret = succeeds()
self.assertFalse(self.sleep_fn.called)
self.assertEqual(ret, 'success')
self.assertEqual(self.counter, 1)

def test_retry_zero_times(self):
self.counter = 0
interval = 2
backoff_rate = 2
retries = 0

@RetryDecorator(exceptions.RequestTimeout, interval,
retries, backoff_rate)
def always_fails():
self.counter += 1
raise exceptions.RequestTimeout()

self.assertRaises(exceptions.RequestTimeout, always_fails)
self.assertEqual(self.counter, 1)
self.assertFalse(self.sleep_fn.called)

def test_retries_once(self):
self.counter = 0
interval = 2
backoff_rate = 2
retries = 3

@RetryDecorator(exceptions.RequestTimeout, interval,
retries, backoff_rate)
def fails_once():
self.counter += 1
if self.counter < 2:
raise exceptions.RequestTimeout()
else:
return 'success'

ret = fails_once()
self.assertEqual(ret, 'success')
self.assertEqual(self.counter, 2)
self.assertEqual(self.sleep_fn.call_count, 1)
self.sleep_fn.assert_called_with(interval)

def test_limit_is_reached(self):
self.counter = 0
retries = 3
interval = 2
backoff_rate = 4

@RetryDecorator(exceptions.RequestTimeout, interval,
retries, backoff_rate)
def always_fails():
self.counter += 1
raise exceptions.RequestTimeout()

self.assertRaises(exceptions.RequestTimeout, always_fails)
self.assertEqual(self.counter, retries + 1)
self.assertEqual(self.sleep_fn.call_count, retries)

expected_sleep_fn_arg = []
for i in range(retries):
expected_sleep_fn_arg.append(interval)
interval *= backoff_rate

self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))

def test_limit_is_reached_with_conf(self):
self.counter = 0

@RetryDecorator(exceptions.RequestTimeout)
def always_fails():
self.counter += 1
raise exceptions.RequestTimeout()

retry = cfg.CONF.ESWITCH.retries
interval = cfg.CONF.ESWITCH.request_timeout / 1000
delay_rate = cfg.CONF.ESWITCH.backoff_rate

expected_sleep_fn_arg = []
for i in range(retry):
expected_sleep_fn_arg.append(interval)
interval *= delay_rate

self.assertRaises(exceptions.RequestTimeout, always_fails)
self.assertEqual(self.counter, retry + 1)
self.assertEqual(self.sleep_fn.call_count, retry)
self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))

def test_wrong_exception_no_retry(self):

@RetryDecorator(exceptions.RequestTimeout)
def raise_unexpected_error():
raise WrongException("wrong exception")

self.assertRaises(WrongException, raise_unexpected_error)
self.assertFalse(self.sleep_fn.called)

0 comments on commit eab05c3

Please sign in to comment.