Skip to content

Commit

Permalink
Merge "Error out interrupted builds" into stable/rocky
Browse files Browse the repository at this point in the history
  • Loading branch information
Zuul authored and openstack-gerrit committed Oct 26, 2019
2 parents 9bd300a + 13bb7ed commit a526901
Show file tree
Hide file tree
Showing 3 changed files with 278 additions and 27 deletions.
113 changes: 104 additions & 9 deletions nova/compute/manager.py
Expand Up @@ -645,6 +645,10 @@ def _destroy_evacuated_instances(self, context):
Then allocations are removed from Placement for every instance that is
evacuated from this host regardless if the instance is reported by the
hypervisor or not.
:param context: The request context
:return: A dict keyed by instance uuid mapped to Migration objects
for instances that were migrated away from this host
"""
filters = {
'source_compute': self.host,
Expand All @@ -665,7 +669,7 @@ def _destroy_evacuated_instances(self, context):
evacuations = objects.MigrationList.get_by_filters(context,
filters)
if not evacuations:
return
return {}
evacuations = {mig.instance_uuid: mig for mig in evacuations}

# The instances might be deleted in which case we need to avoid
Expand Down Expand Up @@ -849,9 +853,8 @@ def _init_instance(self, context, instance):
# instance has already been scheduled to this particular host.
LOG.debug("Instance failed to spawn correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
self._set_instance_obj_error_state(
context, instance, clean_task_state=True)
return

if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
Expand All @@ -862,9 +865,8 @@ def _init_instance(self, context, instance):
# spawned so set to ERROR state. This is consistent to BUILD
LOG.debug("Instance failed to rebuild correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
self._set_instance_obj_error_state(
context, instance, clean_task_state=True)
return

if (instance.vm_state != vm_states.ERROR and
Expand Down Expand Up @@ -1241,10 +1243,24 @@ def init_host(self):

# Initialise instances on the host that are not evacuating
for instance in instances:
if (not evacuated_instances or
instance.uuid not in evacuated_instances):
if instance.uuid not in evacuated_instances:
self._init_instance(context, instance)

# NOTE(gibi): collect all the instance uuids that were in some way
# already handled above, either by _init_instance or by
# _destroy_evacuated_instances. This way we can limit the scope of
# the _error_out_instances_whose_build_was_interrupted call to look
# only at instances that have allocations on this node and were not
# handled by the above calls.
already_handled = {instance.uuid for instance in instances}.union(
evacuated_instances)
# NOTE(gibi): If ironic and vcenter virt driver slow start time
# becomes problematic here then we should consider adding a config
# option or a driver flag to tell us if we should thread this out
# in the background on startup
self._error_out_instances_whose_build_was_interrupted(
context, already_handled)

finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()
Expand All @@ -1257,6 +1273,85 @@ def init_host(self):
# _sync_scheduler_instance_info periodic task will.
self._update_scheduler_instance_info(context, instances)

def _error_out_instances_whose_build_was_interrupted(
        self, context, already_handled_instances):
    """Push leftover BUILDING instances on this compute to ERROR state.

    An instance that holds an allocation in placement against one of
    this host's compute nodes, but was not assigned to this host and was
    not handled during init_host, was still waiting for its resource
    claim (and the _set_instance_host_and_node() update of
    instance.host) when the nova-compute service restarted. Such an
    instance would otherwise remain in BUILDING state forever, so it is
    moved to ERROR state here.

    :param context: The request context
    :param already_handled_instances: The set of instance UUIDs that the
        host initialization process already handled in some way.
    """
    # Strategy:
    # 1) Fetch the placement allocations for each compute node managed
    #    by this host's virt driver.
    # 2) Drop the consumers that host initialization already took care
    #    of; they are either initialized or must be skipped.
    # 3) Any remaining consumer that is an instance in BUILDING state
    #    gets pushed to ERROR.

    LOG.info(
        "Looking for unclaimed instances stuck in BUILDING status for "
        "nodes managed by this host")

    try:
        nodes = self.driver.get_available_nodes()
    except exception.VirtDriverNotReady:
        # Nothing we can do yet; a fresh install legitimately hits this.
        LOG.warning(
            "Virt driver is not ready. Therefore unable to error out any "
            "instances stuck in BUILDING state on this node. If this is "
            "the first time this service is starting on this host, then "
            "you can ignore this warning.")
        return

    for node in nodes:
        try:
            compute_node = objects.ComputeNode.get_by_host_and_nodename(
                context, self.host, node)
        except exception.ComputeHostNotFound:
            # The node record may not exist yet on a brand new host.
            LOG.warning(
                "Compute node %s not found in the database and therefore "
                "unable to error out any instances stuck in BUILDING "
                "state on this node. If this is the first time this "
                "service is starting on this host, then you can ignore "
                "this warning.", node)
            continue

        allocations = (
            self.reportclient.get_allocations_for_resource_provider(
                context, compute_node.uuid))
        if not allocations:
            LOG.error(
                "Could not retrieve compute node resource provider %s and "
                "therefore unable to error out any instances stuck in "
                "BUILDING state.", compute_node.uuid)
            continue

        # Consumers with allocations here that nothing above accounted
        # for are the candidates for being stuck.
        unhandled_consumers = set(allocations) - already_handled_instances
        if not unhandled_consumers:
            continue

        stuck_instances = objects.InstanceList.get_by_filters(
            context,
            {
                'vm_state': vm_states.BUILDING,
                'uuid': unhandled_consumers,
            },
            expected_attrs=[])

        for instance in stuck_instances:
            LOG.debug(
                "Instance spawn was interrupted before instance_claim, "
                "setting instance to ERROR state", instance=instance)
            self._set_instance_obj_error_state(
                context, instance, clean_task_state=True)

def cleanup_host(self):
self.driver.register_event_listener(None)
self.instance_events.cancel_all_events()
Expand Down
21 changes: 7 additions & 14 deletions nova/tests/functional/compute/test_init_host.py
Expand Up @@ -175,21 +175,14 @@ def sleep_forever(*args, **kwargs):
# instance_claim call we can wait for in the test
fake_notifier.wait_for_versioned_notifications(
'instance.create.start')
self.restart_compute_service(self.compute1)

# This is bug 1833581 as the server remains in BUILD state after the
# compute restart.
self._wait_for_state_change(self.admin_api, server, 'BUILD')

# Not even the periodic task push this server to ERROR because the
# server host is still None since the instance_claim didn't set it.
self.flags(instance_build_timeout=1)
self.compute1.manager._check_instance_build_time(
nova_context.get_admin_context())
server = self.admin_api.get_server(server['id'])
self.assertEqual('BUILD', server['status'])
self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
self.restart_compute_service(self.compute1)

# We expect that the instance is pushed to ERROR state during the
# compute restart.
# self._wait_for_state_change(self.admin_api, server, 'ERROR')
self._wait_for_state_change(self.admin_api, server, 'ERROR')
mock_log.assert_called_with(
'Instance spawn was interrupted before instance_claim, setting '
'instance to ERROR state',
instance=mock.ANY)

0 comments on commit a526901

Please sign in to comment.