Skip to content

Commit

Permalink
Live migration failure in API leaves VM in MIGRATING state
Browse files Browse the repository at this point in the history
When nova-api calls nova-conductor, an RPC MessagingTimeout might
occur. In such a case we shouldn't leave the VM in the MIGRATING state.
Possible scenarios are:

* nova-conductor received message but failed to respond, no additional
exceptions raised - live migration will start, VM will be moved to
destination host
* nova-conductor received message but failed to respond, additional
exception raised (e.g., LibvirtError) - LM will not start
* nova-api couldn't reach nova-conductor - LM will not start

Because we can't predict in the API layer what happened below, this patch
writes an instance fault to the database when MessagingTimeout is caught.

Co-Authored-By: Pawel Koniszewski <pawel.koniszewski@intel.com>
                Bartosz Fic <bartosz.fic@intel.com>
Closes-Bug: #1276214
Change-Id: Id800e925fbb689d20e7907b698b67c92fd3da979
(cherry picked from commit f2a1f00)
  • Loading branch information
macsz authored and lyarwood committed Apr 12, 2016
1 parent 5c77410 commit 0c4fc78
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 1 deletion.
12 changes: 11 additions & 1 deletion nova/compute/api.py
Expand Up @@ -27,6 +27,7 @@
import uuid

from oslo_log import log as logging
from oslo_messaging import exceptions as oslo_exceptions
from oslo_serialization import jsonutils
from oslo_utils import excutils
from oslo_utils import strutils
Expand Down Expand Up @@ -3358,10 +3359,19 @@ def live_migrate(self, context, instance, block_migration,
# Some old instances can still have no RequestSpec object attached
# to them, we need to support the old way
request_spec = None
self.compute_task_api.live_migrate_instance(context, instance,
try:
self.compute_task_api.live_migrate_instance(context, instance,
host_name, block_migration=block_migration,
disk_over_commit=disk_over_commit,
request_spec=request_spec)
except oslo_exceptions.MessagingTimeout as messaging_timeout:
with excutils.save_and_reraise_exception():
# NOTE(pkoniszewski): It is possible that MessagingTimeout
# occurs, but LM will still be in progress, so write
# instance fault to database
compute_utils.add_instance_fault_from_exc(context,
instance,
messaging_timeout)

@check_instance_lock
@check_instance_cell
Expand Down
25 changes: 25 additions & 0 deletions nova/tests/unit/compute/test_compute_api.py
Expand Up @@ -19,6 +19,7 @@
import iso8601
import mock
from mox3 import mox
from oslo_messaging import exceptions as oslo_exceptions
from oslo_policy import policy as oslo_policy
from oslo_serialization import jsonutils
from oslo_utils import fixture as utils_fixture
Expand Down Expand Up @@ -1872,6 +1873,30 @@ def test_live_migrate_paused_vm_state(self):
instance = self._create_instance_obj(params=paused_state)
self._live_migrate_instance(instance)

# Verifies that when live_migrate_instance raises MessagingTimeout, the
# exception propagates out of compute_api.live_migrate and
# add_instance_fault_from_exc is called exactly once with the context, the
# instance and an exception argument.
#
# mock.patch.object decorators are applied bottom-up, so the mock arguments
# arrive in this order: Instance.save, InstanceAction.action_start,
# RequestSpec.get_by_instance_uuid, compute_utils.add_instance_fault_from_exc.
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(objects.RequestSpec, 'get_by_instance_uuid')
@mock.patch.object(objects.InstanceAction, 'action_start')
@mock.patch.object(objects.Instance, 'save')
def test_live_migrate_messaging_timeout(self, _save, _action, get_spec,
add_instance_fault_from_exc):
instance = self._create_instance_obj()
# Choose the object whose live_migrate_instance gets patched: the cells RPC
# API when cell_type is 'api', otherwise the conductor ComputeTaskAPI class.
if self.cell_type == 'api':
api = self.compute_api.cells_rpcapi
else:
api = conductor.api.ComputeTaskAPI

# Make the patched call raise MessagingTimeout and check that live_migrate
# lets it reach the caller.
with mock.patch.object(api, 'live_migrate_instance',
side_effect=oslo_exceptions.MessagingTimeout):
self.assertRaises(oslo_exceptions.MessagingTimeout,
self.compute_api.live_migrate,
self.context, instance,
host_name='fake_dest_host',
block_migration=True, disk_over_commit=True)
# mock.ANY matches any exception argument; only context and instance are
# compared.
# NOTE(review): indentation is lost in this view -- confirm whether this
# assertion sits inside the `with` block above.
add_instance_fault_from_exc.assert_called_once_with(
self.context,
instance,
mock.ANY)

@mock.patch.object(objects.RequestSpec, 'get_by_instance_uuid')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(objects.InstanceAction, 'action_start')
Expand Down

0 comments on commit 0c4fc78

Please sign in to comment.