Skip to content

Commit

Permalink
Merge "Error out interrupted builds" into stable/rocky
Browse files Browse the repository at this point in the history
  • Loading branch information
Zuul authored and openstack-gerrit committed Oct 26, 2019
2 parents 9bd300a + 13bb7ed commit a526901
Show file tree
Hide file tree
Showing 3 changed files with 278 additions and 27 deletions.
113 changes: 104 additions & 9 deletions nova/compute/manager.py
Expand Up @@ -645,6 +645,10 @@ def _destroy_evacuated_instances(self, context):
Then allocations are removed from Placement for every instance that is
evacuated from this host regardless if the instance is reported by the
hypervisor or not.
:param context: The request context
:return: A dict keyed by instance uuid mapped to Migration objects
for instances that were migrated away from this host
"""
filters = {
'source_compute': self.host,
Expand All @@ -665,7 +669,7 @@ def _destroy_evacuated_instances(self, context):
evacuations = objects.MigrationList.get_by_filters(context,
filters)
if not evacuations:
return
return {}
evacuations = {mig.instance_uuid: mig for mig in evacuations}

# The instances might be deleted in which case we need to avoid
Expand Down Expand Up @@ -849,9 +853,8 @@ def _init_instance(self, context, instance):
# instance has already been scheduled to this particular host.
LOG.debug("Instance failed to spawn correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
self._set_instance_obj_error_state(
context, instance, clean_task_state=True)
return

if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
Expand All @@ -862,9 +865,8 @@ def _init_instance(self, context, instance):
# spawned so set to ERROR state. This is consistent to BUILD
LOG.debug("Instance failed to rebuild correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
self._set_instance_obj_error_state(
context, instance, clean_task_state=True)
return

if (instance.vm_state != vm_states.ERROR and
Expand Down Expand Up @@ -1241,10 +1243,24 @@ def init_host(self):

# Initialise instances on the host that are not evacuating
for instance in instances:
if (not evacuated_instances or
instance.uuid not in evacuated_instances):
if instance.uuid not in evacuated_instances:
self._init_instance(context, instance)

# NOTE(gibi): collect all the instance uuids that were in some way
# already handled above, either by _init_instance or by
# _destroy_evacuated_instances. This way we can limit the scope of
# the _error_out_instances_whose_build_was_interrupted call to look
# only at instances that have allocations on this node and were not
# handled by the above calls.
already_handled = {instance.uuid for instance in instances}.union(
evacuated_instances)
# NOTE(gibi): If ironic and vcenter virt driver slow start time
# becomes problematic here then we should consider adding a config
# option or a driver flag to tell us if we should thread this out
# in the background on startup
self._error_out_instances_whose_build_was_interrupted(
context, already_handled)

finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()
Expand All @@ -1257,6 +1273,85 @@ def init_host(self):
# _sync_scheduler_instance_info periodic task will.
self._update_scheduler_instance_info(context, instances)

def _error_out_instances_whose_build_was_interrupted(
        self, context, already_handled_instances):
    """Push leftover BUILDING instances on this compute to ERROR state.

    An instance that holds an allocation in placement against one of
    this host's compute nodes, but was not assigned to this host and was
    not handled during init_host, was still waiting for its resource
    claim (and the _set_instance_host_and_node() update of
    instance.host) when the nova-compute service restarted. Such an
    instance would otherwise remain in BUILDING state forever, so it is
    moved to ERROR state here.

    :param context: The request context
    :param already_handled_instances: The set of instance UUIDs that the
        host initialization process already handled in some way.
    """
    # Strategy:
    # 1) Fetch the placement allocations for each compute node managed
    #    by this host's virt driver.
    # 2) Drop the consumers that host initialization already took care
    #    of; they are either initialized or must be skipped.
    # 3) Any remaining consumer that is an instance in BUILDING state
    #    gets pushed to ERROR.

    LOG.info(
        "Looking for unclaimed instances stuck in BUILDING status for "
        "nodes managed by this host")

    try:
        nodes = self.driver.get_available_nodes()
    except exception.VirtDriverNotReady:
        # Nothing we can do yet; a fresh install legitimately hits this.
        LOG.warning(
            "Virt driver is not ready. Therefore unable to error out any "
            "instances stuck in BUILDING state on this node. If this is "
            "the first time this service is starting on this host, then "
            "you can ignore this warning.")
        return

    for node in nodes:
        try:
            compute_node = objects.ComputeNode.get_by_host_and_nodename(
                context, self.host, node)
        except exception.ComputeHostNotFound:
            # The node record may not exist yet on a brand new host.
            LOG.warning(
                "Compute node %s not found in the database and therefore "
                "unable to error out any instances stuck in BUILDING "
                "state on this node. If this is the first time this "
                "service is starting on this host, then you can ignore "
                "this warning.", node)
            continue

        allocations = (
            self.reportclient.get_allocations_for_resource_provider(
                context, compute_node.uuid))
        if not allocations:
            LOG.error(
                "Could not retrieve compute node resource provider %s and "
                "therefore unable to error out any instances stuck in "
                "BUILDING state.", compute_node.uuid)
            continue

        # Consumers with allocations here that nothing above accounted
        # for are the candidates for being stuck.
        unhandled_consumers = set(allocations) - already_handled_instances
        if not unhandled_consumers:
            continue

        stuck_instances = objects.InstanceList.get_by_filters(
            context,
            {
                'vm_state': vm_states.BUILDING,
                'uuid': unhandled_consumers,
            },
            expected_attrs=[])

        for instance in stuck_instances:
            LOG.debug(
                "Instance spawn was interrupted before instance_claim, "
                "setting instance to ERROR state", instance=instance)
            self._set_instance_obj_error_state(
                context, instance, clean_task_state=True)

def cleanup_host(self):
self.driver.register_event_listener(None)
self.instance_events.cancel_all_events()
Expand Down
21 changes: 7 additions & 14 deletions nova/tests/functional/compute/test_init_host.py
Expand Up @@ -175,21 +175,14 @@ def sleep_forever(*args, **kwargs):
# instance_claim call we can wait for in the test
fake_notifier.wait_for_versioned_notifications(
'instance.create.start')
self.restart_compute_service(self.compute1)

# This is bug 1833581 as the server remains in BUILD state after the
# compute restart.
self._wait_for_state_change(self.admin_api, server, 'BUILD')

# Not even the periodic task push this server to ERROR because the
# server host is still None since the instance_claim didn't set it.
self.flags(instance_build_timeout=1)
self.compute1.manager._check_instance_build_time(
nova_context.get_admin_context())
server = self.admin_api.get_server(server['id'])
self.assertEqual('BUILD', server['status'])
self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
self.restart_compute_service(self.compute1)

# We expect that the instance is pushed to ERROR state during the
# compute restart.
# self._wait_for_state_change(self.admin_api, server, 'ERROR')
self._wait_for_state_change(self.admin_api, server, 'ERROR')
mock_log.assert_called_with(
'Instance spawn was interrupted before instance_claim, setting '
'instance to ERROR state',
instance=mock.ANY)

0 comments on commit a526901

Please sign in to comment.