compute: Ensure pre-migrating instances are destroyed during init_host

Previously _destroy_evacuated_instances would not remove instances associated with evacuation migration records in a pre-migrating state. This could lead to a race between the original source host and the new destination if the source returned early, calling init_instance during the evacuation process. This change now includes pre-migrating migration records when looking for active evacuations. Additionally the dict of evacuating instances is then returned to init_host and used to skip running init_instance against such instances ensuring no race occurs between the two computes. Closes-bug: #1764883 Change-Id: I379678dfdb2609f12a572d4f99c8e9da4deab803
openstack · Jul 20, 2018 · c4988cd · c4988cd
1 parent b921339
commit c4988cd
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 19 deletions.
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
@@ -624,18 +624,25 @@ def _destroy_evacuated_instances(self, context):
         While nova-compute was down, the instances running on it could be
         evacuated to another host. This method looks for evacuation migration
         records where this is the source host and which were either started
-        (accepted) or complete (done). From those migration records, local
-        instances reported by the hypervisor are compared to the instances
-        for the migration records and those local guests are destroyed, along
-        with instance allocation records in Placement for this node.
+        (accepted), in-progress (pre-migrating) or migrated (done). From those
+        migration records, local instances reported by the hypervisor are
+        compared to the instances for the migration records and those local
+        guests are destroyed, along with instance allocation records in
+        Placement for this node.
         """
         filters = {
             'source_compute': self.host,
             # NOTE(mriedem): Migration records that have been accepted are
             # included in case the source node comes back up while instances
             # are being evacuated to another host. We don't want the same
             # instance being reported from multiple hosts.
-            'status': ['accepted', 'done'],
+            # NOTE(lyarwood): pre-migrating is also included here as the
+            # source compute can come back online shortly after the RT
+            # claims on the destination that in-turn moves the migration to
+            # pre-migrating. If the evacuate fails on the destination host,
+            # the user can rebuild the instance (in ERROR state) on the source
+            # host.
+            'status': ['accepted', 'pre-migrating', 'done'],
             'migration_type': 'evacuation',
         }
         with utils.temporary_mutation(context, read_deleted='yes'):
@@ -698,6 +705,7 @@ def _destroy_evacuated_instances(self, context):
 
             migration.status = 'completed'
             migration.save()
+        return evacuations
 
     def _is_instance_storage_shared(self, context, instance, host=None):
         shared_storage = True
@@ -1133,9 +1141,14 @@ def init_host(self):
 
         try:
             # checking that instance was not already evacuated to other host
-            self._destroy_evacuated_instances(context)
+            evacuated_instances = self._destroy_evacuated_instances(context)
+
+            # Initialise instances on the host that are not evacuating
             for instance in instances:
-                self._init_instance(context, instance)
+                if (not evacuated_instances or
+                        instance.uuid not in evacuated_instances):
+                    self._init_instance(context, instance)
+
         finally:
             if CONF.defer_iptables_apply:
                 self.driver.filter_defer_apply_off()

diff --git a/nova/tests/functional/regressions/test_bug_1764883.py b/nova/tests/functional/regressions/test_bug_1764883.py
@@ -25,13 +25,9 @@ class TestEvacuationWithSourceReturningDuringRebuild(
         test.TestCase, integrated_helpers.InstanceHelperMixin):
     """Assert the behaviour of evacuating instances when the src returns early.
 
-    This test asserts that evacuating instances end up in an ERROR state on the
-    source when that host comes back online during an evacuation while the
-    migration record is in a pre-migrating state.
-
-    This currently leads to a race between the source initialising the instance
-    and destination claim during the rebuild, both of which attempt to update
-    the underlying task_state of the instance.
+    This test asserts that evacuating instances end up in an ACTIVE state on
+    the destination even when the source host comes back online during an
+    evacuation while the migration record is in a pre-migrating state.
     """
 
     def setUp(self):
@@ -113,8 +109,15 @@ def test_evacuation_with_source_compute_returning_during_rebuild(self):
         # Start evacuating the instance from the source_host
         self.api.post_server_action(server['id'], {'evacuate': {}})
 
-        # FIXME(lyarwood): Assert that the evacuation fails at present with the
-        # instance remaining on the source in an ERROR state.
-        self._wait_for_state_change(self.api, server, 'ERROR')
+        # Wait for the instance to go into an ACTIVE state
+        self._wait_for_state_change(self.api, server, 'ACTIVE')
         server = self.api.get_server(server['id'])
-        self.assertEqual(self.source_compute, server['OS-EXT-SRV-ATTR:host'])
+        host = server['OS-EXT-SRV-ATTR:host']
+        migrations = self.api.get_migrations()
+
+        # Assert that we have a single `done` migration record after the evac
+        self.assertEqual(1, len(migrations))
+        self.assertEqual('done', migrations[0]['status'])
+
+        # Assert that the instance is now on the dest
+        self.assertNotEqual(self.source_compute, host)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
@@ -7500,7 +7500,10 @@ def test_destroy_evacuated_instance_on_shared_storage(self, mock_save,
 
         mock_get_filter.assert_called_once_with(fake_context,
                                          {'source_compute': self.compute.host,
-                                          'status': ['accepted', 'done'],
+                                          'status': [
+                                              'accepted',
+                                              'pre-migrating',
+                                              'done'],
                                           'migration_type': 'evacuation'})
         mock_get_inst.assert_called_once_with(fake_context)
         mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)

diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
@@ -766,6 +766,36 @@ def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
                                              mock.ANY, mock.ANY, mock.ANY)
         mock_save.assert_called_once_with()
 
+    @mock.patch.object(context, 'get_admin_context')
+    @mock.patch.object(objects.InstanceList, 'get_by_host')
+    @mock.patch.object(fake_driver.FakeDriver, 'init_host')
+    @mock.patch('nova.compute.manager.ComputeManager._init_instance')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                  '_destroy_evacuated_instances')
+    def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
+            mock_init_instance, mock_init_host, mock_host_get,
+            mock_admin_ctxt):
+        """Assert that init_instance is not called for instances that are
+           evacuating from the host during init_host.
+        """
+        active_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.active_instance)
+        evacuating_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.evac_instance)
+        instance_list = objects.InstanceList(self.context,
+            objects=[active_instance, evacuating_instance])
+
+        mock_host_get.return_value = instance_list
+        mock_admin_ctxt.return_value = self.context
+        mock_destroy_evac.return_value = {
+            uuids.evac_instance: evacuating_instance
+        }
+
+        self.compute.init_host()
+
+        mock_init_instance.assert_called_once_with(
+            self.context, active_instance)
+
     def test_init_instance_with_binding_failed_vif_type(self):
         # this instance will plug a 'binding_failed' vif
         instance = fake_instance.fake_instance_obj(