Skip to content

Commit

Permalink
compute: Ensure pre-migrating instances are destroyed during init_host
Browse files Browse the repository at this point in the history
Previously _destroy_evacuated_instances would not remove instances
associated with evacuation migration records in a pre-migrating state.
This could lead to a race between the original source host and the new
destination if the source returned early, calling init_instance during
the evacuation process.

This change now includes pre-migrating migration records when looking
for active evacuations. Additionally the dict of evacuating instances is
then returned to init_host and used to skip running init_instance
against such instances ensuring no race occurs between the two computes.

Closes-bug: #1764883
Change-Id: I379678dfdb2609f12a572d4f99c8e9da4deab803
  • Loading branch information
lyarwood authored and mriedem committed Jul 20, 2018
1 parent b921339 commit c4988cd
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 19 deletions.
27 changes: 20 additions & 7 deletions nova/compute/manager.py
Expand Up @@ -624,18 +624,25 @@ def _destroy_evacuated_instances(self, context):
While nova-compute was down, the instances running on it could be
evacuated to another host. This method looks for evacuation migration
records where this is the source host and which were either started
(accepted) or complete (done). From those migration records, local
instances reported by the hypervisor are compared to the instances
for the migration records and those local guests are destroyed, along
with instance allocation records in Placement for this node.
(accepted), in-progress (pre-migrating) or migrated (done). From those
migration records, local instances reported by the hypervisor are
compared to the instances for the migration records and those local
guests are destroyed, along with instance allocation records in
Placement for this node.
"""
filters = {
'source_compute': self.host,
# NOTE(mriedem): Migration records that have been accepted are
# included in case the source node comes back up while instances
# are being evacuated to another host. We don't want the same
# instance being reported from multiple hosts.
'status': ['accepted', 'done'],
# NOTE(lyarwood): pre-migrating is also included here as the
# source compute can come back online shortly after the RT
# claims on the destination that in-turn moves the migration to
# pre-migrating. If the evacuate fails on the destination host,
# the user can rebuild the instance (in ERROR state) on the source
# host.
'status': ['accepted', 'pre-migrating', 'done'],
'migration_type': 'evacuation',
}
with utils.temporary_mutation(context, read_deleted='yes'):
Expand Down Expand Up @@ -698,6 +705,7 @@ def _destroy_evacuated_instances(self, context):

migration.status = 'completed'
migration.save()
return evacuations

def _is_instance_storage_shared(self, context, instance, host=None):
shared_storage = True
Expand Down Expand Up @@ -1133,9 +1141,14 @@ def init_host(self):

try:
# checking that instance was not already evacuated to other host
self._destroy_evacuated_instances(context)
evacuated_instances = self._destroy_evacuated_instances(context)

# Initialise instances on the host that are not evacuating
for instance in instances:
self._init_instance(context, instance)
if (not evacuated_instances or
instance.uuid not in evacuated_instances):
self._init_instance(context, instance)

finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()
Expand Down
25 changes: 14 additions & 11 deletions nova/tests/functional/regressions/test_bug_1764883.py
Expand Up @@ -25,13 +25,9 @@ class TestEvacuationWithSourceReturningDuringRebuild(
test.TestCase, integrated_helpers.InstanceHelperMixin):
"""Assert the behaviour of evacuating instances when the src returns early.
This test asserts that evacuating instances end up in an ERROR state on the
source when that host comes back online during an evacuation while the
migration record is in a pre-migrating state.
This currently leads to a race between the source initialising the instance
and destination claim during the rebuild, both of which attempt to update
the underlying task_state of the instance.
This test asserts that evacuating instances end up in an ACTIVE state on
the destination even when the source host comes back online during an
evacuation while the migration record is in a pre-migrating state.
"""

def setUp(self):
Expand Down Expand Up @@ -113,8 +109,15 @@ def test_evacuation_with_source_compute_returning_during_rebuild(self):
# Start evacuating the instance from the source_host
self.api.post_server_action(server['id'], {'evacuate': {}})

# FIXME(lyarwood): Assert that the evacuation fails at present with the
# instance remaining on the source in an ERROR state.
self._wait_for_state_change(self.api, server, 'ERROR')
# Wait for the instance to go into an ACTIVE state
self._wait_for_state_change(self.api, server, 'ACTIVE')
server = self.api.get_server(server['id'])
self.assertEqual(self.source_compute, server['OS-EXT-SRV-ATTR:host'])
host = server['OS-EXT-SRV-ATTR:host']
migrations = self.api.get_migrations()

# Assert that we have a single `done` migration record after the evac
self.assertEqual(1, len(migrations))
self.assertEqual('done', migrations[0]['status'])

# Assert that the instance is now on the dest
self.assertNotEqual(self.source_compute, host)
5 changes: 4 additions & 1 deletion nova/tests/unit/compute/test_compute.py
Expand Up @@ -7500,7 +7500,10 @@ def test_destroy_evacuated_instance_on_shared_storage(self, mock_save,

mock_get_filter.assert_called_once_with(fake_context,
{'source_compute': self.compute.host,
'status': ['accepted', 'done'],
'status': [
'accepted',
'pre-migrating',
'done'],
'migration_type': 'evacuation'})
mock_get_inst.assert_called_once_with(fake_context)
mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
Expand Down
30 changes: 30 additions & 0 deletions nova/tests/unit/compute/test_compute_mgr.py
Expand Up @@ -766,6 +766,36 @@ def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
mock.ANY, mock.ANY, mock.ANY)
mock_save.assert_called_once_with()

@mock.patch.object(context, 'get_admin_context')
@mock.patch.object(objects.InstanceList, 'get_by_host')
@mock.patch.object(fake_driver.FakeDriver, 'init_host')
@mock.patch('nova.compute.manager.ComputeManager._init_instance')
@mock.patch('nova.compute.manager.ComputeManager.'
'_destroy_evacuated_instances')
def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
mock_init_instance, mock_init_host, mock_host_get,
mock_admin_ctxt):
"""Assert that init_instance is not called for instances that are
evacuating from the host during init_host.
"""
active_instance = fake_instance.fake_instance_obj(
self.context, host=self.compute.host, uuid=uuids.active_instance)
evacuating_instance = fake_instance.fake_instance_obj(
self.context, host=self.compute.host, uuid=uuids.evac_instance)
instance_list = objects.InstanceList(self.context,
objects=[active_instance, evacuating_instance])

mock_host_get.return_value = instance_list
mock_admin_ctxt.return_value = self.context
mock_destroy_evac.return_value = {
uuids.evac_instance: evacuating_instance
}

self.compute.init_host()

mock_init_instance.assert_called_once_with(
self.context, active_instance)

def test_init_instance_with_binding_failed_vif_type(self):
# this instance will plug a 'binding_failed' vif
instance = fake_instance.fake_instance_obj(
Expand Down

0 comments on commit c4988cd

Please sign in to comment.