From c4988cdabf311d29cf64af732091068cfabeedaa Mon Sep 17 00:00:00 2001 From: Lee Yarwood Date: Wed, 18 Apr 2018 14:35:07 +0100 Subject: [PATCH] compute: Ensure pre-migrating instances are destroyed during init_host Previously _destroy_evacuated_instances would not remove instances associated with evacuation migration records in a pre-migrating state. This could lead to a race between the original source host and the new destination if the source returned early, calling init_instance during the evacuation process. This change now includes pre-migrating migration records when looking for active evacuations. Additionally the dict of evacuating instances is then returned to init_host and used to skip running init_instance against such instances ensuring no race occurs between the two computes. Closes-bug: #1764883 Change-Id: I379678dfdb2609f12a572d4f99c8e9da4deab803 --- nova/compute/manager.py | 27 ++++++++++++----- .../regressions/test_bug_1764883.py | 25 +++++++++------- nova/tests/unit/compute/test_compute.py | 5 +++- nova/tests/unit/compute/test_compute_mgr.py | 30 +++++++++++++++++++ 4 files changed, 68 insertions(+), 19 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 093a37e6d0e..73c21a1ccae 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -624,10 +624,11 @@ def _destroy_evacuated_instances(self, context): While nova-compute was down, the instances running on it could be evacuated to another host. This method looks for evacuation migration records where this is the source host and which were either started - (accepted) or complete (done). From those migration records, local - instances reported by the hypervisor are compared to the instances - for the migration records and those local guests are destroyed, along - with instance allocation records in Placement for this node. + (accepted), in-progress (pre-migrating) or migrated (done). From those + migration records, local instances reported by the hypervisor are + compared to the instances for the migration records and those local + guests are destroyed, along with instance allocation records in + Placement for this node. """ filters = { 'source_compute': self.host, @@ -635,7 +636,13 @@ def _destroy_evacuated_instances(self, context): # included in case the source node comes back up while instances # are being evacuated to another host. We don't want the same # instance being reported from multiple hosts. - 'status': ['accepted', 'done'], + # NOTE(lyarwood): pre-migrating is also included here as the + # source compute can come back online shortly after the RT + # claims on the destination that in-turn moves the migration to + # pre-migrating. If the evacuate fails on the destination host, + # the user can rebuild the instance (in ERROR state) on the source + # host. + 'status': ['accepted', 'pre-migrating', 'done'], 'migration_type': 'evacuation', } with utils.temporary_mutation(context, read_deleted='yes'): @@ -698,6 +705,7 @@ def _destroy_evacuated_instances(self, context): migration.status = 'completed' migration.save() + return evacuations def _is_instance_storage_shared(self, context, instance, host=None): shared_storage = True @@ -1133,9 +1141,14 @@ def init_host(self): try: # checking that instance was not already evacuated to other host - self._destroy_evacuated_instances(context) + evacuated_instances = self._destroy_evacuated_instances(context) + + # Initialise instances on the host that are not evacuating for instance in instances: - self._init_instance(context, instance) + if (not evacuated_instances or + instance.uuid not in evacuated_instances): + self._init_instance(context, instance) + finally: if CONF.defer_iptables_apply: self.driver.filter_defer_apply_off() diff --git a/nova/tests/functional/regressions/test_bug_1764883.py b/nova/tests/functional/regressions/test_bug_1764883.py index 9e51c3c2941..8025a50c660 100644 --- a/nova/tests/functional/regressions/test_bug_1764883.py +++ b/nova/tests/functional/regressions/test_bug_1764883.py @@ -25,13 +25,9 @@ class TestEvacuationWithSourceReturningDuringRebuild( test.TestCase, integrated_helpers.InstanceHelperMixin): """Assert the behaviour of evacuating instances when the src returns early. - This test asserts that evacuating instances end up in an ERROR state on the - source when that host comes back online during an evacuation while the - migration record is in a pre-migrating state. - - This currently leads to a race between the source initialising the instance - and destination claim during the rebuild, both of which attempt to update - the underlying task_state of the instance. + This test asserts that evacuating instances end up in an ACTIVE state on + the destination even when the source host comes back online during an + evacuation while the migration record is in a pre-migrating state. """ def setUp(self): @@ -113,8 +109,15 @@ def test_evacuation_with_source_compute_returning_during_rebuild(self): # Start evacuating the instance from the source_host self.api.post_server_action(server['id'], {'evacuate': {}}) - # FIXME(lyarwood): Assert that the evacuation fails at present with the - # instance remaining on the source in an ERROR state. - self._wait_for_state_change(self.api, server, 'ERROR') + # Wait for the instance to go into an ACTIVE state + self._wait_for_state_change(self.api, server, 'ACTIVE') server = self.api.get_server(server['id']) - self.assertEqual(self.source_compute, server['OS-EXT-SRV-ATTR:host']) + host = server['OS-EXT-SRV-ATTR:host'] + migrations = self.api.get_migrations() + + # Assert that we have a single `done` migration record after the evac + self.assertEqual(1, len(migrations)) + self.assertEqual('done', migrations[0]['status']) + + # Assert that the instance is now on the dest + self.assertNotEqual(self.source_compute, host) diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 28a87d97e5d..86cec87d763 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -7500,7 +7500,10 @@ def test_destroy_evacuated_instance_on_shared_storage(self, mock_save, mock_get_filter.assert_called_once_with(fake_context, {'source_compute': self.compute.host, - 'status': ['accepted', 'done'], + 'status': [ + 'accepted', + 'pre-migrating', + 'done'], 'migration_type': 'evacuation'}) mock_get_inst.assert_called_once_with(fake_context) mock_get_nw.assert_called_once_with(fake_context, evacuated_instance) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index cc17cf83a35..4a3cefe9a60 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -766,6 +766,36 @@ def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get, mock.ANY, mock.ANY, mock.ANY) mock_save.assert_called_once_with() + @mock.patch.object(context, 'get_admin_context') + @mock.patch.object(objects.InstanceList, 'get_by_host') + @mock.patch.object(fake_driver.FakeDriver, 'init_host') + @mock.patch('nova.compute.manager.ComputeManager._init_instance') + @mock.patch('nova.compute.manager.ComputeManager.' + '_destroy_evacuated_instances') + def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac, + mock_init_instance, mock_init_host, mock_host_get, + mock_admin_ctxt): + """Assert that init_instance is not called for instances that are + evacuating from the host during init_host. + """ + active_instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, uuid=uuids.active_instance) + evacuating_instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, uuid=uuids.evac_instance) + instance_list = objects.InstanceList(self.context, + objects=[active_instance, evacuating_instance]) + + mock_host_get.return_value = instance_list + mock_admin_ctxt.return_value = self.context + mock_destroy_evac.return_value = { + uuids.evac_instance: evacuating_instance + } + + self.compute.init_host() + + mock_init_instance.assert_called_once_with( + self.context, active_instance) + def test_init_instance_with_binding_failed_vif_type(self): # this instance will plug a 'binding_failed' vif instance = fake_instance.fake_instance_obj(