Permalink
Find file Copy path
4a5267f Jan 23, 2019
384 contributors

Users who have contributed to this file

@kk7ds @russellb @mriedem @vishvananda @tr3buchet @comstud @rconradharris @djipko @EdLeafe @jichenjc @gkotton @mikalstill @hanlind @jerdfelt @dprince @markmc @gibizer @justinsb @jogo @bdelliott @xtoddx @natsumetakashi @DragonDM @gaborantal @ameade @sleepsonthefloor
8264 lines (7350 sloc) 389 KB
# Copyright 2010 United States Government as represented by the
# Administrator of the National Aeronautics and Space Administration.
# Copyright 2011 Justin Santa Barbara
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Handles all processes relating to instances (guest vms).
The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
handles RPC calls relating to creating instances. It is responsible for
building a disk image, launching it via the underlying virtualization driver,
responding to calls to check its state, attaching persistent storage, and
terminating it.
"""
import base64
import binascii
import contextlib
import functools
import inspect
import sys
import time
import traceback
from cinderclient import exceptions as cinder_exception
from cursive import exception as cursive_exception
import eventlet.event
from eventlet import greenthread
import eventlet.semaphore
import eventlet.timeout
import futurist
from keystoneauth1 import exceptions as keystone_exception
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_serialization import jsonutils
from oslo_service import loopingcall
from oslo_service import periodic_task
from oslo_utils import excutils
from oslo_utils import strutils
from oslo_utils import timeutils
import six
from six.moves import range
from nova import block_device
from nova.cells import rpcapi as cells_rpcapi
from nova import compute
from nova.compute import build_results
from nova.compute import claims
from nova.compute import power_state
from nova.compute import resource_tracker
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import task_states
from nova.compute import utils as compute_utils
from nova.compute.utils import wrap_instance_event
from nova.compute import vm_states
from nova import conductor
import nova.conf
from nova.console import rpcapi as console_rpcapi
import nova.context
from nova import exception
from nova import exception_wrapper
from nova import hooks
from nova.i18n import _
from nova import image
from nova import manager
from nova import network
from nova.network import base_api as base_net_api
from nova.network import model as network_model
from nova.network.security_group import openstack_driver
from nova import objects
from nova.objects import base as obj_base
from nova.objects import fields
from nova.objects import instance as obj_instance
from nova.objects import migrate_data as migrate_data_obj
from nova.pci import whitelist
from nova import rpc
from nova import safe_utils
from nova.scheduler import client as scheduler_client
from nova import utils
from nova.virt import block_device as driver_block_device
from nova.virt import configdrive
from nova.virt import driver
from nova.virt import event as virtevent
from nova.virt import storage_users
from nova.virt import virtapi
from nova.volume import cinder
CONF = nova.conf.CONF
LOG = logging.getLogger(__name__)
get_notifier = functools.partial(rpc.get_notifier, service='compute')
wrap_exception = functools.partial(exception_wrapper.wrap_exception,
get_notifier=get_notifier,
binary='nova-compute')
@contextlib.contextmanager
def errors_out_migration_ctxt(migration):
"""Context manager to error out migration on failure."""
try:
yield
except Exception:
with excutils.save_and_reraise_exception():
if migration:
# We may have been passed None for our migration if we're
# receiving from an older client. The migration will be
# errored via the legacy path.
migration.status = 'error'
try:
with migration.obj_as_admin():
migration.save()
except Exception:
LOG.debug(
'Error setting migration status for instance %s.',
migration.instance_uuid, exc_info=True)
@utils.expects_func_args('migration')
def errors_out_migration(function):
"""Decorator to error out migration on failure."""
@functools.wraps(function)
def decorated_function(self, context, *args, **kwargs):
wrapped_func = safe_utils.get_wrapped_function(function)
keyed_args = inspect.getcallargs(wrapped_func, self, context,
*args, **kwargs)
migration = keyed_args['migration']
with errors_out_migration_ctxt(migration):
return function(self, context, *args, **kwargs)
return decorated_function
@utils.expects_func_args('instance')
def reverts_task_state(function):
"""Decorator to revert task_state on failure."""
@functools.wraps(function)
def decorated_function(self, context, *args, **kwargs):
try:
return function(self, context, *args, **kwargs)
except exception.UnexpectedTaskStateError as e:
# Note(maoy): unexpected task state means the current
# task is preempted. Do not clear task state in this
# case.
with excutils.save_and_reraise_exception():
LOG.info("Task possibly preempted: %s",
e.format_message())
except Exception:
with excutils.save_and_reraise_exception():
wrapped_func = safe_utils.get_wrapped_function(function)
keyed_args = inspect.getcallargs(wrapped_func, self, context,
*args, **kwargs)
# NOTE(mriedem): 'instance' must be in keyed_args because we
# have utils.expects_func_args('instance') decorating this
# method.
instance = keyed_args['instance']
original_task_state = instance.task_state
try:
self._instance_update(context, instance, task_state=None)
LOG.info("Successfully reverted task state from %s on "
"failure for instance.",
original_task_state, instance=instance)
except exception.InstanceNotFound:
# We might delete an instance that failed to build shortly
# after it errored out this is an expected case and we
# should not trace on it.
pass
except Exception as e:
LOG.warning("Failed to revert task state for instance. "
"Error: %s", e, instance=instance)
return decorated_function
@utils.expects_func_args('instance')
def wrap_instance_fault(function):
"""Wraps a method to catch exceptions related to instances.
This decorator wraps a method to catch any exceptions having to do with
an instance that may get thrown. It then logs an instance fault in the db.
"""
@functools.wraps(function)
def decorated_function(self, context, *args, **kwargs):
try:
return function(self, context, *args, **kwargs)
except exception.InstanceNotFound:
raise
except Exception as e:
# NOTE(gtt): If argument 'instance' is in args rather than kwargs,
# we will get a KeyError exception which will cover up the real
# exception. So, we update kwargs with the values from args first.
# then, we can get 'instance' from kwargs easily.
kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))
with excutils.save_and_reraise_exception():
compute_utils.add_instance_fault_from_exc(context,
kwargs['instance'], e, sys.exc_info())
return decorated_function
@utils.expects_func_args('image_id', 'instance')
def delete_image_on_error(function):
"""Used for snapshot related method to ensure the image created in
compute.api is deleted when an error occurs.
"""
@functools.wraps(function)
def decorated_function(self, context, image_id, instance,
*args, **kwargs):
try:
return function(self, context, image_id, instance,
*args, **kwargs)
except Exception:
with excutils.save_and_reraise_exception():
LOG.debug("Cleaning up image %s", image_id,
exc_info=True, instance=instance)
try:
self.image_api.delete(context, image_id)
except exception.ImageNotFound:
# Since we're trying to cleanup an image, we don't care if
# if it's already gone.
pass
except Exception:
LOG.exception("Error while trying to clean up image %s",
image_id, instance=instance)
return decorated_function
# TODO(danms): Remove me after Icehouse
# TODO(alaski): Actually remove this after Newton, assuming a major RPC bump
# NOTE(mikal): if the method being decorated has more than one decorator, then
# put this one first. Otherwise the various exception handling decorators do
# not function correctly.
def object_compat(function):
"""Wraps a method that expects a new-world instance
This provides compatibility for callers passing old-style dict
instances.
"""
@functools.wraps(function)
def decorated_function(self, context, *args, **kwargs):
def _load_instance(instance_or_dict):
if isinstance(instance_or_dict, dict):
# try to get metadata and system_metadata for most cases but
# only attempt to load those if the db instance already has
# those fields joined
metas = [meta for meta in ('metadata', 'system_metadata')
if meta in instance_or_dict]
instance = objects.Instance._from_db_object(
context, objects.Instance(), instance_or_dict,
expected_attrs=metas)
instance._context = context
return instance
return instance_or_dict
try:
kwargs['instance'] = _load_instance(kwargs['instance'])
except KeyError:
args = (_load_instance(args[0]),) + args[1:]
migration = kwargs.get('migration')
if isinstance(migration, dict):
migration = objects.Migration._from_db_object(
context.elevated(), objects.Migration(),
migration)
kwargs['migration'] = migration
return function(self, context, *args, **kwargs)
return decorated_function
class InstanceEvents(object):
def __init__(self):
self._events = {}
@staticmethod
def _lock_name(instance):
return '%s-%s' % (instance.uuid, 'events')
def prepare_for_instance_event(self, instance, name, tag):
"""Prepare to receive an event for an instance.
This will register an event for the given instance that we will
wait on later. This should be called before initiating whatever
action will trigger the event. The resulting eventlet.event.Event
object should be wait()'d on to ensure completion.
:param instance: the instance for which the event will be generated
:param name: the name of the event we're expecting
:param tag: the tag associated with the event we're expecting
:returns: an event object that should be wait()'d on
"""
if self._events is None:
# NOTE(danms): We really should have a more specific error
# here, but this is what we use for our default error case
raise exception.NovaException('In shutdown, no new events '
'can be scheduled')
@utils.synchronized(self._lock_name(instance))
def _create_or_get_event():
instance_events = self._events.setdefault(instance.uuid, {})
return instance_events.setdefault((name, tag),
eventlet.event.Event())
LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
{'name': name, 'tag': tag}, instance=instance)
return _create_or_get_event()
def pop_instance_event(self, instance, event):
"""Remove a pending event from the wait list.
This will remove a pending event from the wait list so that it
can be used to signal the waiters to wake up.
:param instance: the instance for which the event was generated
:param event: the nova.objects.external_event.InstanceExternalEvent
that describes the event
:returns: the eventlet.event.Event object on which the waiters
are blocked
"""
no_events_sentinel = object()
no_matching_event_sentinel = object()
@utils.synchronized(self._lock_name(instance))
def _pop_event():
if self._events is None:
LOG.debug('Unexpected attempt to pop events during shutdown',
instance=instance)
return no_events_sentinel
events = self._events.get(instance.uuid)
if not events:
return no_events_sentinel
_event = events.pop((event.name, event.tag), None)
if not events:
del self._events[instance.uuid]
if _event is None:
return no_matching_event_sentinel
return _event
result = _pop_event()
if result is no_events_sentinel:
LOG.debug('No waiting events found dispatching %(event)s',
{'event': event.key},
instance=instance)
return None
elif result is no_matching_event_sentinel:
LOG.debug('No event matching %(event)s in %(events)s',
{'event': event.key,
'events': self._events.get(instance.uuid, {}).keys()},
instance=instance)
return None
else:
return result
def clear_events_for_instance(self, instance):
"""Remove all pending events for an instance.
This will remove all events currently pending for an instance
and return them (indexed by event name).
:param instance: the instance for which events should be purged
:returns: a dictionary of {event_name: eventlet.event.Event}
"""
@utils.synchronized(self._lock_name(instance))
def _clear_events():
if self._events is None:
LOG.debug('Unexpected attempt to clear events during shutdown',
instance=instance)
return dict()
# NOTE(danms): We have historically returned the raw internal
# format here, which is {event.key: [events, ...])} so just
# trivially convert it here.
return {'%s-%s' % k: e
for k, e in self._events.pop(instance.uuid, {}).items()}
return _clear_events()
def cancel_all_events(self):
if self._events is None:
LOG.debug('Unexpected attempt to cancel events during shutdown.')
return
our_events = self._events
# NOTE(danms): Block new events
self._events = None
for instance_uuid, events in our_events.items():
for (name, tag), eventlet_event in events.items():
LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
'instance %(instance_uuid)s',
{'name': name,
'tag': tag,
'instance_uuid': instance_uuid})
event = objects.InstanceExternalEvent(
instance_uuid=instance_uuid,
name=name, status='failed',
tag=tag, data={})
eventlet_event.send(event)
class ComputeVirtAPI(virtapi.VirtAPI):
def __init__(self, compute):
super(ComputeVirtAPI, self).__init__()
self._compute = compute
def _default_error_callback(self, event_name, instance):
raise exception.NovaException(_('Instance event failed'))
@contextlib.contextmanager
def wait_for_instance_event(self, instance, event_names, deadline=300,
error_callback=None):
"""Plan to wait for some events, run some code, then wait.
This context manager will first create plans to wait for the
provided event_names, yield, and then wait for all the scheduled
events to complete.
Note that this uses an eventlet.timeout.Timeout to bound the
operation, so callers should be prepared to catch that
failure and handle that situation appropriately.
If the event is not received by the specified timeout deadline,
eventlet.timeout.Timeout is raised.
If the event is received but did not have a 'completed'
status, a NovaException is raised. If an error_callback is
provided, instead of raising an exception as detailed above
for the failure case, the callback will be called with the
event_name and instance, and can return True to continue
waiting for the rest of the events, False to stop processing,
or raise an exception which will bubble up to the waiter.
:param instance: The instance for which an event is expected
:param event_names: A list of event names. Each element is a
tuple of strings to indicate (name, tag),
where name is required, but tag may be None.
:param deadline: Maximum number of seconds we should wait for all
of the specified events to arrive.
:param error_callback: A function to be called if an event arrives
"""
if error_callback is None:
error_callback = self._default_error_callback
events = {}
for event_name in event_names:
name, tag = event_name
event_name = objects.InstanceExternalEvent.make_key(name, tag)
try:
events[event_name] = (
self._compute.instance_events.prepare_for_instance_event(
instance, name, tag))
except exception.NovaException:
error_callback(event_name, instance)
# NOTE(danms): Don't wait for any of the events. They
# should all be canceled and fired immediately below,
# but don't stick around if not.
deadline = 0
yield
with eventlet.timeout.Timeout(deadline):
for event_name, event in events.items():
actual_event = event.wait()
if actual_event.status == 'completed':
continue
decision = error_callback(event_name, instance)
if decision is False:
break
class ComputeManager(manager.Manager):
"""Manages the running instances from creation to destruction."""
target = messaging.Target(version='5.1')
def __init__(self, compute_driver=None, *args, **kwargs):
"""Load configuration options and connect to the hypervisor."""
self.virtapi = ComputeVirtAPI(self)
self.network_api = network.API()
self.volume_api = cinder.API()
self.image_api = image.API()
self._last_host_check = 0
self._last_bw_usage_poll = 0
self._bw_usage_supported = True
self._last_bw_usage_cell_update = 0
self.compute_api = compute.API()
self.compute_rpcapi = compute_rpcapi.ComputeAPI()
self.conductor_api = conductor.API()
self.compute_task_api = conductor.ComputeTaskAPI()
self.is_neutron_security_groups = (
openstack_driver.is_neutron_security_groups())
self.cells_rpcapi = cells_rpcapi.CellsAPI()
self.scheduler_client = scheduler_client.SchedulerClient()
self.reportclient = self.scheduler_client.reportclient
self._resource_tracker = None
self.instance_events = InstanceEvents()
self._sync_power_pool = eventlet.GreenPool(
size=CONF.sync_power_state_pool_size)
self._syncs_in_progress = {}
self.send_instance_updates = (
CONF.filter_scheduler.track_instance_changes)
if CONF.max_concurrent_builds != 0:
self._build_semaphore = eventlet.semaphore.Semaphore(
CONF.max_concurrent_builds)
else:
self._build_semaphore = compute_utils.UnlimitedSemaphore()
if max(CONF.max_concurrent_live_migrations, 0) != 0:
self._live_migration_executor = futurist.GreenThreadPoolExecutor(
max_workers=CONF.max_concurrent_live_migrations)
else:
self._live_migration_executor = futurist.GreenThreadPoolExecutor()
# This is a dict, keyed by instance uuid, to a two-item tuple of
# migration object and Future for the queued live migration.
self._waiting_live_migrations = {}
super(ComputeManager, self).__init__(service_name="compute",
*args, **kwargs)
# NOTE(russellb) Load the driver last. It may call back into the
# compute manager via the virtapi, so we want it to be fully
# initialized before that happens.
self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
self.use_legacy_block_device_info = \
self.driver.need_legacy_block_device_info
def reset(self):
LOG.info('Reloading compute RPC API')
compute_rpcapi.LAST_VERSION = None
self.compute_rpcapi = compute_rpcapi.ComputeAPI()
self._get_resource_tracker().reportclient.clear_provider_cache()
def _get_resource_tracker(self):
if not self._resource_tracker:
rt = resource_tracker.ResourceTracker(self.host, self.driver)
self._resource_tracker = rt
return self._resource_tracker
def _update_resource_tracker(self, context, instance):
"""Let the resource tracker know that an instance has changed state."""
if instance.host == self.host:
rt = self._get_resource_tracker()
rt.update_usage(context, instance, instance.node)
def _instance_update(self, context, instance, **kwargs):
"""Update an instance in the database using kwargs as value."""
for k, v in kwargs.items():
setattr(instance, k, v)
instance.save()
self._update_resource_tracker(context, instance)
def _nil_out_instance_obj_host_and_node(self, instance):
# NOTE(jwcroppe): We don't do instance.save() here for performance
# reasons; a call to this is expected to be immediately followed by
# another call that does instance.save(), thus avoiding two writes
# to the database layer.
instance.host = None
instance.node = None
# If the instance is not on a host, it's not in an aggregate and
# therefore is not in an availability zone.
instance.availability_zone = None
def _set_instance_obj_error_state(self, context, instance,
clean_task_state=False):
try:
instance.vm_state = vm_states.ERROR
if clean_task_state:
instance.task_state = None
instance.save()
except exception.InstanceNotFound:
LOG.debug('Instance has been destroyed from under us while '
'trying to set it to ERROR', instance=instance)
def _get_instances_on_driver(self, context, filters=None):
"""Return a list of instance records for the instances found
on the hypervisor which satisfy the specified filters. If filters=None
return a list of instance records for all the instances found on the
hypervisor.
"""
if not filters:
filters = {}
try:
driver_uuids = self.driver.list_instance_uuids()
if len(driver_uuids) == 0:
# Short circuit, don't waste a DB call
return objects.InstanceList()
filters['uuid'] = driver_uuids
local_instances = objects.InstanceList.get_by_filters(
context, filters, use_slave=True)
return local_instances
except NotImplementedError:
pass
# The driver doesn't support uuids listing, so we'll have
# to brute force.
driver_instances = self.driver.list_instances()
# NOTE(mjozefcz): In this case we need to apply host filter.
# Without this all instance data would be fetched from db.
filters['host'] = self.host
instances = objects.InstanceList.get_by_filters(context, filters,
use_slave=True)
name_map = {instance.name: instance for instance in instances}
local_instances = []
for driver_instance in driver_instances:
instance = name_map.get(driver_instance)
if not instance:
continue
local_instances.append(instance)
return local_instances
def _destroy_evacuated_instances(self, context):
"""Destroys evacuated instances.
While nova-compute was down, the instances running on it could be
evacuated to another host. This method looks for evacuation migration
records where this is the source host and which were either started
(accepted), in-progress (pre-migrating) or migrated (done). From those
migration records, local instances reported by the hypervisor are
compared to the instances for the migration records and those local
guests are destroyed, along with instance allocation records in
Placement for this node.
"""
filters = {
'source_compute': self.host,
# NOTE(mriedem): Migration records that have been accepted are
# included in case the source node comes back up while instances
# are being evacuated to another host. We don't want the same
# instance being reported from multiple hosts.
# NOTE(lyarwood): pre-migrating is also included here as the
# source compute can come back online shortly after the RT
# claims on the destination that in-turn moves the migration to
# pre-migrating. If the evacuate fails on the destination host,
# the user can rebuild the instance (in ERROR state) on the source
# host.
'status': ['accepted', 'pre-migrating', 'done'],
'migration_type': 'evacuation',
}
with utils.temporary_mutation(context, read_deleted='yes'):
evacuations = objects.MigrationList.get_by_filters(context,
filters)
if not evacuations:
return
evacuations = {mig.instance_uuid: mig for mig in evacuations}
# TODO(mriedem): We could optimize by pre-loading the joined fields
# we know we'll use, like info_cache and flavor.
local_instances = self._get_instances_on_driver(context)
evacuated = [inst for inst in local_instances
if inst.uuid in evacuations]
# NOTE(gibi): We are called from init_host and at this point the
# compute_nodes of the resource tracker has not been populated yet so
# we cannot rely on the resource tracker here.
compute_nodes = {}
for instance in evacuated:
migration = evacuations[instance.uuid]
LOG.info('Deleting instance as it has been evacuated from '
'this host', instance=instance)
try:
network_info = self.network_api.get_instance_nw_info(
context, instance)
bdi = self._get_instance_block_device_info(context,
instance)
destroy_disks = not (self._is_instance_storage_shared(
context, instance))
except exception.InstanceNotFound:
network_info = network_model.NetworkInfo()
bdi = {}
LOG.info('Instance has been marked deleted already, '
'removing it from the hypervisor.',
instance=instance)
# always destroy disks if the instance was deleted
destroy_disks = True
self.driver.destroy(context, instance,
network_info,
bdi, destroy_disks)
# delete the allocation of the evacuated instance from this host
if migration.source_node not in compute_nodes:
try:
cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
context, self.host, migration.source_node).uuid
compute_nodes[migration.source_node] = cn_uuid
except exception.ComputeHostNotFound:
LOG.error("Failed to clean allocation of evacuated "
"instance as the source node %s is not found",
migration.source_node, instance=instance)
continue
cn_uuid = compute_nodes[migration.source_node]
# If the instance was deleted in the interim, assume its
# allocations were properly cleaned up (either by its hosting
# compute service or the API).
if (not instance.deleted and
not self.reportclient.
remove_provider_tree_from_instance_allocation(
context, instance.uuid, cn_uuid)):
LOG.error("Failed to clean allocation of evacuated instance "
"on the source node %s",
cn_uuid, instance=instance)
migration.status = 'completed'
migration.save()
return evacuations
def _is_instance_storage_shared(self, context, instance, host=None):
shared_storage = True
data = None
try:
data = self.driver.check_instance_shared_storage_local(context,
instance)
if data:
shared_storage = (self.compute_rpcapi.
check_instance_shared_storage(context,
instance, data, host=host))
except NotImplementedError:
LOG.debug('Hypervisor driver does not support '
'instance shared storage check, '
'assuming it\'s not on shared storage',
instance=instance)
shared_storage = False
except Exception:
LOG.exception('Failed to check if instance shared',
instance=instance)
finally:
if data:
self.driver.check_instance_shared_storage_cleanup(context,
data)
return shared_storage
def _complete_partial_deletion(self, context, instance):
"""Complete deletion for instances in DELETED status but not marked as
deleted in the DB
"""
instance.destroy()
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
self._complete_deletion(context,
instance)
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
def _complete_deletion(self, context, instance):
self._update_resource_tracker(context, instance)
rt = self._get_resource_tracker()
rt.reportclient.delete_allocation_for_instance(context, instance.uuid)
self._clean_instance_console_tokens(context, instance)
self._delete_scheduler_instance_info(context, instance.uuid)
def _init_instance(self, context, instance):
"""Initialize this instance during service init."""
# NOTE(danms): If the instance appears to not be owned by this
# host, it may have been evacuated away, but skipped by the
# evacuation cleanup code due to configuration. Thus, if that
# is a possibility, don't touch the instance in any way, but
# log the concern. This will help avoid potential issues on
# startup due to misconfiguration.
if instance.host != self.host:
LOG.warning('Instance %(uuid)s appears to not be owned '
'by this host, but by %(host)s. Startup '
'processing is being skipped.',
{'uuid': instance.uuid,
'host': instance.host})
return
# Instances that are shut down, or in an error state can not be
# initialized and are not attempted to be recovered. The exception
# to this are instances that are in RESIZE_MIGRATING or DELETING,
# which are dealt with further down.
if (instance.vm_state == vm_states.SOFT_DELETED or
(instance.vm_state == vm_states.ERROR and
instance.task_state not in
(task_states.RESIZE_MIGRATING, task_states.DELETING))):
LOG.debug("Instance is in %s state.",
instance.vm_state, instance=instance)
return
if instance.vm_state == vm_states.DELETED:
try:
self._complete_partial_deletion(context, instance)
except Exception:
# we don't want that an exception blocks the init_host
LOG.exception('Failed to complete a deletion',
instance=instance)
return
if (instance.vm_state == vm_states.BUILDING or
instance.task_state in [task_states.SCHEDULING,
task_states.BLOCK_DEVICE_MAPPING,
task_states.NETWORKING,
task_states.SPAWNING]):
# NOTE(dave-mcnally) compute stopped before instance was fully
# spawned so set to ERROR state. This is safe to do as the state
# may be set by the api but the host is not so if we get here the
# instance has already been scheduled to this particular host.
LOG.debug("Instance failed to spawn correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
return
if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
instance.task_state in [task_states.REBUILDING,
task_states.REBUILD_BLOCK_DEVICE_MAPPING,
task_states.REBUILD_SPAWNING]):
# NOTE(jichenjc) compute stopped before instance was fully
# spawned so set to ERROR state. This is consistent to BUILD
LOG.debug("Instance failed to rebuild correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
return
if (instance.vm_state != vm_states.ERROR and
instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
task_states.IMAGE_PENDING_UPLOAD,
task_states.IMAGE_UPLOADING,
task_states.IMAGE_SNAPSHOT]):
LOG.debug("Instance in transitional state %s at start-up "
"clearing task state",
instance.task_state, instance=instance)
try:
self._post_interrupted_snapshot_cleanup(context, instance)
except Exception:
# we don't want that an exception blocks the init_host
LOG.exception('Failed to cleanup snapshot.', instance=instance)
instance.task_state = None
instance.save()
if (instance.vm_state != vm_states.ERROR and
instance.task_state in [task_states.RESIZE_PREP]):
LOG.debug("Instance in transitional state %s at start-up "
"clearing task state",
instance['task_state'], instance=instance)
instance.task_state = None
instance.save()
if instance.task_state == task_states.DELETING:
try:
LOG.info('Service started deleting the instance during '
'the previous run, but did not finish. Restarting'
' the deletion now.', instance=instance)
instance.obj_load_attr('metadata')
instance.obj_load_attr('system_metadata')
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
self._delete_instance(context, instance, bdms)
except Exception:
# we don't want that an exception blocks the init_host
LOG.exception('Failed to complete a deletion',
instance=instance)
self._set_instance_obj_error_state(context, instance)
return
current_power_state = self._get_power_state(context, instance)
try_reboot, reboot_type = self._retry_reboot(context, instance,
current_power_state)
if try_reboot:
LOG.debug("Instance in transitional state (%(task_state)s) at "
"start-up and power state is (%(power_state)s), "
"triggering reboot",
{'task_state': instance.task_state,
'power_state': current_power_state},
instance=instance)
# NOTE(mikal): if the instance was doing a soft reboot that got as
# far as shutting down the instance but not as far as starting it
# again, then we've just become a hard reboot. That means the
# task state for the instance needs to change so that we're in one
# of the expected task states for a hard reboot.
if (instance.task_state in task_states.soft_reboot_states and
reboot_type == 'HARD'):
instance.task_state = task_states.REBOOT_PENDING_HARD
instance.save()
self.reboot_instance(context, instance, block_device_info=None,
reboot_type=reboot_type)
return
elif (current_power_state == power_state.RUNNING and
instance.task_state in [task_states.REBOOT_STARTED,
task_states.REBOOT_STARTED_HARD,
task_states.PAUSING,
task_states.UNPAUSING]):
LOG.warning("Instance in transitional state "
"(%(task_state)s) at start-up and power state "
"is (%(power_state)s), clearing task state",
{'task_state': instance.task_state,
'power_state': current_power_state},
instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ACTIVE
instance.save()
elif (current_power_state == power_state.PAUSED and
instance.task_state == task_states.UNPAUSING):
LOG.warning("Instance in transitional state "
"(%(task_state)s) at start-up and power state "
"is (%(power_state)s), clearing task state "
"and unpausing the instance",
{'task_state': instance.task_state,
'power_state': current_power_state},
instance=instance)
try:
self.unpause_instance(context, instance)
except NotImplementedError:
# Some virt driver didn't support pause and unpause
pass
except Exception:
LOG.exception('Failed to unpause instance', instance=instance)
return
if instance.task_state == task_states.POWERING_OFF:
try:
LOG.debug("Instance in transitional state %s at start-up "
"retrying stop request",
instance.task_state, instance=instance)
self.stop_instance(context, instance, True)
except Exception:
# we don't want that an exception blocks the init_host
LOG.exception('Failed to stop instance', instance=instance)
return
if instance.task_state == task_states.POWERING_ON:
try:
LOG.debug("Instance in transitional state %s at start-up "
"retrying start request",
instance.task_state, instance=instance)
self.start_instance(context, instance)
except Exception:
# we don't want that an exception blocks the init_host
LOG.exception('Failed to start instance', instance=instance)
return
net_info = instance.get_network_info()
try:
self.driver.plug_vifs(instance, net_info)
except NotImplementedError as e:
LOG.debug(e, instance=instance)
except exception.VirtualInterfacePlugException:
# NOTE(mriedem): If we get here, it could be because the vif_type
# in the cache is "binding_failed" or "unbound". The only way to
# fix this is to try and bind the ports again, which would be
# expensive here on host startup. We could add a check to
# _heal_instance_info_cache to handle this, but probably only if
# the instance task_state is None.
LOG.exception('Virtual interface plugging failed for instance. '
'The port binding:host_id may need to be manually '
'updated.', instance=instance)
self._set_instance_obj_error_state(context, instance)
return
if instance.task_state == task_states.RESIZE_MIGRATING:
# We crashed during resize/migration, so roll back for safety
try:
# NOTE(mriedem): check old_vm_state for STOPPED here, if it's
# not in system_metadata we default to True for backwards
# compatibility
power_on = (instance.system_metadata.get('old_vm_state') !=
vm_states.STOPPED)
block_dev_info = self._get_instance_block_device_info(context,
instance)
self.driver.finish_revert_migration(context,
instance, net_info, block_dev_info, power_on)
except Exception:
LOG.exception('Failed to revert crashed migration',
instance=instance)
finally:
LOG.info('Instance found in migrating state during '
'startup. Resetting task_state',
instance=instance)
instance.task_state = None
instance.save()
if instance.task_state == task_states.MIGRATING:
# Live migration did not complete, but instance is on this
# host, so reset the state.
instance.task_state = None
instance.save(expected_task_state=[task_states.MIGRATING])
db_state = instance.power_state
drv_state = self._get_power_state(context, instance)
expect_running = (db_state == power_state.RUNNING and
drv_state != db_state)
LOG.debug('Current state is %(drv_state)s, state in DB is '
'%(db_state)s.',
{'drv_state': drv_state, 'db_state': db_state},
instance=instance)
if expect_running and CONF.resume_guests_state_on_host_boot:
self._resume_guests_state(context, instance, net_info)
elif drv_state == power_state.RUNNING:
# VMwareAPI drivers will raise an exception
try:
self.driver.ensure_filtering_rules_for_instance(
instance, net_info)
except NotImplementedError:
LOG.debug('Hypervisor driver does not support '
'firewall rules', instance=instance)
def _resume_guests_state(self, context, instance, net_info):
LOG.info('Rebooting instance after nova-compute restart.',
instance=instance)
block_device_info = \
self._get_instance_block_device_info(context, instance)
try:
self.driver.resume_state_on_host_boot(
context, instance, net_info, block_device_info)
except NotImplementedError:
LOG.warning('Hypervisor driver does not support '
'resume guests', instance=instance)
except Exception:
# NOTE(vish): The instance failed to resume, so we set the
# instance to error and attempt to continue.
LOG.warning('Failed to resume instance',
instance=instance)
self._set_instance_obj_error_state(context, instance)
def _retry_reboot(self, context, instance, current_power_state):
current_task_state = instance.task_state
retry_reboot = False
reboot_type = compute_utils.get_reboot_type(current_task_state,
current_power_state)
pending_soft = (current_task_state == task_states.REBOOT_PENDING and
instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
pending_hard = (current_task_state == task_states.REBOOT_PENDING_HARD
and instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
started_not_running = (current_task_state in
[task_states.REBOOT_STARTED,
task_states.REBOOT_STARTED_HARD] and
current_power_state != power_state.RUNNING)
if pending_soft or pending_hard or started_not_running:
retry_reboot = True
return retry_reboot, reboot_type
def handle_lifecycle_event(self, event):
LOG.info("VM %(state)s (Lifecycle Event)",
{'state': event.get_name()},
instance_uuid=event.get_instance_uuid())
context = nova.context.get_admin_context(read_deleted='yes')
vm_power_state = None
event_transition = event.get_transition()
if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
vm_power_state = power_state.SHUTDOWN
elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
vm_power_state = power_state.RUNNING
elif event_transition in (
virtevent.EVENT_LIFECYCLE_PAUSED,
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
vm_power_state = power_state.PAUSED
elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
vm_power_state = power_state.RUNNING
elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
vm_power_state = power_state.SUSPENDED
else:
LOG.warning("Unexpected lifecycle event: %d", event_transition)
migrate_finish_statuses = {
# This happens on the source node and indicates live migration
# entered post-copy mode.
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
# Suspended for offline migration.
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
}
expected_attrs = []
if event_transition in migrate_finish_statuses:
# Join on info_cache since that's needed in migrate_instance_start.
expected_attrs.append('info_cache')
instance = objects.Instance.get_by_uuid(context,
event.get_instance_uuid(),
expected_attrs=expected_attrs)
# Note(lpetrut): The event may be delayed, thus not reflecting
# the current instance power state. In that case, ignore the event.
current_power_state = self._get_power_state(context, instance)
if current_power_state == vm_power_state:
LOG.debug('Synchronizing instance power state after lifecycle '
'event "%(event)s"; current vm_state: %(vm_state)s, '
'current task_state: %(task_state)s, current DB '
'power_state: %(db_power_state)s, VM power_state: '
'%(vm_power_state)s',
{'event': event.get_name(),
'vm_state': instance.vm_state,
'task_state': instance.task_state,
'db_power_state': instance.power_state,
'vm_power_state': vm_power_state},
instance_uuid=instance.uuid)
self._sync_instance_power_state(context,
instance,
vm_power_state)
# The following checks are for live migration. We want to activate
# the port binding for the destination host before the live migration
# is resumed on the destination host in order to reduce network
# downtime. Otherwise the ports are bound to the destination host
# in post_live_migration_at_destination.
# TODO(danms): Explore options for using a different live migration
# specific callback for this instead of piggy-backing on the
# handle_lifecycle_event callback.
if (instance.task_state == task_states.MIGRATING and
event_transition in migrate_finish_statuses):
status = migrate_finish_statuses[event_transition]
try:
migration = objects.Migration.get_by_instance_and_status(
context, instance.uuid, status)
LOG.debug('Binding ports to destination host: %s',
migration.dest_compute, instance=instance)
# For neutron, migrate_instance_start will activate the
# destination host port bindings, if there are any created by
# conductor before live migration started.
self.network_api.migrate_instance_start(
context, instance, migration)
except exception.MigrationNotFoundByStatus:
LOG.warning("Unable to find migration record with status "
"'%s' for instance. Port binding will happen in "
"post live migration.", status, instance=instance)
def handle_events(self, event):
if isinstance(event, virtevent.LifecycleEvent):
try:
self.handle_lifecycle_event(event)
except exception.InstanceNotFound:
LOG.debug("Event %s arrived for non-existent instance. The "
"instance was probably deleted.", event)
else:
LOG.debug("Ignoring event %s", event)
def init_virt_events(self):
if CONF.workarounds.handle_virt_lifecycle_events:
self.driver.register_event_listener(self.handle_events)
else:
# NOTE(mriedem): If the _sync_power_states periodic task is
# disabled we should emit a warning in the logs.
if CONF.sync_power_state_interval < 0:
LOG.warning('Instance lifecycle events from the compute '
'driver have been disabled. Note that lifecycle '
'changes to an instance outside of the compute '
'service will not be synchronized '
'automatically since the _sync_power_states '
'periodic task is also disabled.')
else:
LOG.info('Instance lifecycle events from the compute '
'driver have been disabled. Note that lifecycle '
'changes to an instance outside of the compute '
'service will only be synchronized by the '
'_sync_power_states periodic task.')
def init_host(self):
"""Initialization for a standalone compute service."""
if CONF.pci.passthrough_whitelist:
# Simply loading the PCI passthrough whitelist will do a bunch of
# validation that would otherwise wait until the PciDevTracker is
# constructed when updating available resources for the compute
# node(s) in the resource tracker, effectively killing that task.
# So load up the whitelist when starting the compute service to
# flush any invalid configuration early so we can kill the service
# if the configuration is wrong.
whitelist.Whitelist(CONF.pci.passthrough_whitelist)
nova.conf.neutron.register_dynamic_opts(CONF)
# Override the number of concurrent disk operations allowed if the
# user has specified a limit.
if CONF.compute.max_concurrent_disk_ops != 0:
compute_utils.disk_ops_semaphore = \
eventlet.semaphore.BoundedSemaphore(
CONF.compute.max_concurrent_disk_ops)
self.driver.init_host(host=self.host)
context = nova.context.get_admin_context()
instances = objects.InstanceList.get_by_host(
context, self.host, expected_attrs=['info_cache', 'metadata'])
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_on()
self.init_virt_events()
try:
# checking that instance was not already evacuated to other host
evacuated_instances = self._destroy_evacuated_instances(context)
# Initialise instances on the host that are not evacuating
for instance in instances:
if (not evacuated_instances or
instance.uuid not in evacuated_instances):
self._init_instance(context, instance)
finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()
if instances:
# We only send the instance info to the scheduler on startup
# if there is anything to send, otherwise this host might
# not be mapped yet in a cell and the scheduler may have
# issues dealing with the information. Later changes to
# instances on this host will update the scheduler, or the
# _sync_scheduler_instance_info periodic task will.
self._update_scheduler_instance_info(context, instances)
def cleanup_host(self):
self.driver.register_event_listener(None)
self.instance_events.cancel_all_events()
self.driver.cleanup_host(host=self.host)
self._cleanup_live_migrations_in_pool()
def _cleanup_live_migrations_in_pool(self):
# Shutdown the pool so we don't get new requests.
self._live_migration_executor.shutdown(wait=False)
# For any queued migrations, cancel the migration and update
# its status.
for migration, future in self._waiting_live_migrations.values():
# If we got here before the Future was submitted then we need
# to move on since there isn't anything we can do.
if future is None:
continue
if future.cancel():
self._set_migration_status(migration, 'cancelled')
LOG.info('Successfully cancelled queued live migration.',
instance_uuid=migration.instance_uuid)
else:
LOG.warning('Unable to cancel live migration.',
instance_uuid=migration.instance_uuid)
self._waiting_live_migrations.clear()
def pre_start_hook(self):
"""After the service is initialized, but before we fully bring
the service up by listening on RPC queues, make sure to update
our available resources (and indirectly our available nodes).
"""
self.update_available_resource(nova.context.get_admin_context(),
startup=True)
def _get_power_state(self, context, instance):
"""Retrieve the power state for the given instance."""
LOG.debug('Checking state', instance=instance)
try:
return self.driver.get_info(instance).state
except exception.InstanceNotFound:
return power_state.NOSTATE
def get_console_topic(self, context):
"""Retrieves the console host for a project on this host.
Currently this is just set in the flags for each compute host.
"""
# TODO(mdragon): perhaps make this variable by console_type?
return '%s.%s' % (console_rpcapi.RPC_TOPIC, CONF.console_host)
@wrap_exception()
def get_console_pool_info(self, context, console_type):
return self.driver.get_console_pool_info(console_type)
@wrap_exception()
def refresh_instance_security_rules(self, context, instance):
"""Tell the virtualization driver to refresh security rules for
an instance.
Passes straight through to the virtualization driver.
Synchronize the call because we may still be in the middle of
creating the instance.
"""
@utils.synchronized(instance.uuid)
def _sync_refresh():
try:
return self.driver.refresh_instance_security_rules(instance)
except NotImplementedError:
LOG.debug('Hypervisor driver does not support '
'security groups.', instance=instance)
return _sync_refresh()
def _await_block_device_map_created(self, context, vol_id):
# TODO(yamahata): creating volume simultaneously
# reduces creation time?
# TODO(yamahata): eliminate dumb polling
start = time.time()
retries = CONF.block_device_allocate_retries
if retries < 0:
LOG.warning("Treating negative config value (%(retries)s) for "
"'block_device_retries' as 0.",
{'retries': retries})
# (1) treat negative config value as 0
# (2) the configured value is 0, one attempt should be made
# (3) the configured value is > 0, then the total number attempts
# is (retries + 1)
attempts = 1
if retries >= 1:
attempts = retries + 1
for attempt in range(1, attempts + 1):
volume = self.volume_api.get(context, vol_id)
volume_status = volume['status']
if volume_status not in ['creating', 'downloading']:
if volume_status == 'available':
return attempt
LOG.warning("Volume id: %(vol_id)s finished being "
"created but its status is %(vol_status)s.",
{'vol_id': vol_id,
'vol_status': volume_status})
break
greenthread.sleep(CONF.block_device_allocate_retries_interval)
raise exception.VolumeNotCreated(volume_id=vol_id,
seconds=int(time.time() - start),
attempts=attempt,
volume_status=volume_status)
def _decode_files(self, injected_files):
"""Base64 decode the list of files to inject."""
if not injected_files:
return []
def _decode(f):
path, contents = f
# Py3 raises binascii.Error instead of TypeError as in Py27
try:
decoded = base64.b64decode(contents)
return path, decoded
except (TypeError, binascii.Error):
raise exception.Base64Exception(path=path)
return [_decode(f) for f in injected_files]
def _validate_instance_group_policy(self, context, instance,
scheduler_hints):
# NOTE(russellb) Instance group policy is enforced by the scheduler.
# However, there is a race condition with the enforcement of
# the policy. Since more than one instance may be scheduled at the
# same time, it's possible that more than one instance with an
# anti-affinity policy may end up here. It's also possible that
# multiple instances with an affinity policy could end up on different
# hosts. This is a validation step to make sure that starting the
# instance here doesn't violate the policy.
group_hint = scheduler_hints.get('group')
if not group_hint:
return
# The RequestSpec stores scheduler_hints as key=list pairs so we need
# to check the type on the value and pull the single entry out. The
# API request schema validates that the 'group' hint is a single value.
if isinstance(group_hint, list):
group_hint = group_hint[0]
@utils.synchronized(group_hint)
def _do_validation(context, instance, group_hint):
group = objects.InstanceGroup.get_by_hint(context, group_hint)
if group.policy and 'anti-affinity' == group.policy:
instances_uuids = objects.InstanceList.get_uuids_by_host(
context, self.host)
ins_on_host = set(instances_uuids)
members = set(group.members)
# Determine the set of instance group members on this host
# which are not the instance in question. This is used to
# determine how many other members from the same anti-affinity
# group can be on this host.
members_on_host = ins_on_host & members - set([instance.uuid])
rules = group.rules
if rules and 'max_server_per_host' in rules:
max_server = rules['max_server_per_host']
else:
max_server = 1
if len(members_on_host) >= max_server:
msg = _("Anti-affinity instance group policy "
"was violated.")
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
elif group.policy and 'affinity' == group.policy:
group_hosts = group.get_hosts(exclude=[instance.uuid])
if group_hosts and self.host not in group_hosts:
msg = _("Affinity instance group policy was violated.")
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
if not CONF.workarounds.disable_group_policy_check_upcall:
_do_validation(context, instance, group_hint)
def _log_original_error(self, exc_info, instance_uuid):
LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
exc_info=exc_info)
# TODO(mriedem): This method is confusing and only ever used for resize
# reschedules; remove it and merge into _reschedule_resize_or_reraise.
def _reschedule(self, context, request_spec, filter_properties,
instance, reschedule_method, method_args, task_state,
exc_info=None, host_list=None):
"""Attempt to re-schedule a compute operation."""
instance_uuid = instance.uuid
retry = filter_properties.get('retry')
if not retry:
# no retry information, do not reschedule.
LOG.debug("Retry info not present, will not reschedule",
instance_uuid=instance_uuid)
return
LOG.debug("Re-scheduling %(method)s: attempt %(num)d",
{'method': reschedule_method.__name__,
'num': retry['num_attempts']}, instance_uuid=instance_uuid)
# reset the task state:
self._instance_update(context, instance, task_state=task_state)
if exc_info:
# stringify to avoid circular ref problem in json serialization:
retry['exc'] = traceback.format_exception_only(exc_info[0],
exc_info[1])
reschedule_method(context, *method_args, request_spec=request_spec,
host_list=host_list)
return True
@periodic_task.periodic_task
def _check_instance_build_time(self, context):
"""Ensure that instances are not stuck in build."""
timeout = CONF.instance_build_timeout
if timeout == 0:
return
filters = {'vm_state': vm_states.BUILDING,
'host': self.host}
building_insts = objects.InstanceList.get_by_filters(context,
filters, expected_attrs=[], use_slave=True)
for instance in building_insts:
if timeutils.is_older_than(instance.created_at, timeout):
self._set_instance_obj_error_state(context, instance)
LOG.warning("Instance build timed out. Set to error "
"state.", instance=instance)
def _check_instance_exists(self, context, instance):
"""Ensure an instance with the same name is not already present."""
if self.driver.instance_exists(instance):
raise exception.InstanceExists(name=instance.name)
def _allocate_network_async(self, context, instance, requested_networks,
macs, security_groups, is_vpn):
"""Method used to allocate networks in the background.
Broken out for testing.
"""
# First check to see if we're specifically not supposed to allocate
# networks because if so, we can exit early.
if requested_networks and requested_networks.no_allocate:
LOG.debug("Not allocating networking since 'none' was specified.",
instance=instance)
return network_model.NetworkInfo([])
LOG.debug("Allocating IP information in the background.",
instance=instance)
retries = CONF.network_allocate_retries
attempts = retries + 1
retry_time = 1
bind_host_id = self.driver.network_binding_host_id(context, instance)
for attempt in range(1, attempts + 1):
try:
nwinfo = self.network_api.allocate_for_instance(
context, instance, vpn=is_vpn,
requested_networks=requested_networks,
macs=macs,
security_groups=security_groups,
bind_host_id=bind_host_id)
LOG.debug('Instance network_info: |%s|', nwinfo,
instance=instance)
instance.system_metadata['network_allocated'] = 'True'
# NOTE(JoshNang) do not save the instance here, as it can cause
# races. The caller shares a reference to instance and waits
# for this async greenthread to finish before calling
# instance.save().
return nwinfo
except Exception:
exc_info = sys.exc_info()
log_info = {'attempt': attempt,
'attempts': attempts}
if attempt == attempts:
LOG.exception('Instance failed network setup '
'after %(attempts)d attempt(s)',
log_info)
six.reraise(*exc_info)
LOG.warning('Instance failed network setup '
'(attempt %(attempt)d of %(attempts)d)',
log_info, instance=instance)
time.sleep(retry_time)
retry_time *= 2
if retry_time > 30:
retry_time = 30
# Not reached.
def _build_networks_for_instance(self, context, instance,
requested_networks, security_groups):
# If we're here from a reschedule the network may already be allocated.
if strutils.bool_from_string(
instance.system_metadata.get('network_allocated', 'False')):
# NOTE(alex_xu): The network_allocated is True means the network
# resource already allocated at previous scheduling, and the
# network setup is cleanup at previous. After rescheduling, the
# network resource need setup on the new host.
self.network_api.setup_instance_network_on_host(
context, instance, instance.host)
return self.network_api.get_instance_nw_info(context, instance)
if not self.is_neutron_security_groups:
security_groups = []
macs = self.driver.macs_for_instance(instance)
network_info = self._allocate_network(context, instance,
requested_networks, macs, security_groups)
return network_info
def _allocate_network(self, context, instance, requested_networks, macs,
security_groups):
"""Start network allocation asynchronously. Return an instance
of NetworkInfoAsyncWrapper that can be used to retrieve the
allocated networks when the operation has finished.
"""
# NOTE(comstud): Since we're allocating networks asynchronously,
# this task state has little meaning, as we won't be in this
# state for very long.
instance.vm_state = vm_states.BUILDING
instance.task_state = task_states.NETWORKING
instance.save(expected_task_state=[None])
is_vpn = False
return network_model.NetworkInfoAsyncWrapper(
self._allocate_network_async, context, instance,
requested_networks, macs, security_groups, is_vpn)
def _default_root_device_name(self, instance, image_meta, root_bdm):
try:
return self.driver.default_root_device_name(instance,
image_meta,
root_bdm)
except NotImplementedError:
return compute_utils.get_next_device_name(instance, [])
def _default_device_names_for_instance(self, instance,
root_device_name,
*block_device_lists):
try:
self.driver.default_device_names_for_instance(instance,
root_device_name,
*block_device_lists)
except NotImplementedError:
compute_utils.default_device_names_for_instance(
instance, root_device_name, *block_device_lists)
def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
# NOTE(ndipanov): Copy obj to avoid changing the original
block_device_obj = block_device_obj.obj_clone()
try:
return self.driver.get_device_name_for_instance(
instance, bdms, block_device_obj)
except NotImplementedError:
return compute_utils.get_device_name_for_instance(
instance, bdms, block_device_obj.get("device_name"))
def _default_block_device_names(self, instance, image_meta, block_devices):
"""Verify that all the devices have the device_name set. If not,
provide a default name.
It also ensures that there is a root_device_name and is set to the
first block device in the boot sequence (boot_index=0).
"""
root_bdm = block_device.get_root_bdm(block_devices)
if not root_bdm:
return
# Get the root_device_name from the root BDM or the instance
root_device_name = None
update_root_bdm = False
if root_bdm.device_name:
root_device_name = root_bdm.device_name
instance.root_device_name = root_device_name
elif instance.root_device_name:
root_device_name = instance.root_device_name
root_bdm.device_name = root_device_name
update_root_bdm = True
else:
root_device_name = self._default_root_device_name(instance,
image_meta,
root_bdm)
instance.root_device_name = root_device_name
root_bdm.device_name = root_device_name
update_root_bdm = True
if update_root_bdm:
root_bdm.save()
ephemerals = list(filter(block_device.new_format_is_ephemeral,
block_devices))
swap = list(filter(block_device.new_format_is_swap,
block_devices))
block_device_mapping = list(filter(
driver_block_device.is_block_device_mapping, block_devices))
self._default_device_names_for_instance(instance,
root_device_name,
ephemerals,
swap,
block_device_mapping)
def _block_device_info_to_legacy(self, block_device_info):
"""Convert BDI to the old format for drivers that need it."""
if self.use_legacy_block_device_info:
ephemerals = driver_block_device.legacy_block_devices(
driver.block_device_info_get_ephemerals(block_device_info))
mapping = driver_block_device.legacy_block_devices(
driver.block_device_info_get_mapping(block_device_info))
swap = block_device_info['swap']
if swap:
swap = swap.legacy()
block_device_info.update({
'ephemerals': ephemerals,
'swap': swap,
'block_device_mapping': mapping})
def _add_missing_dev_names(self, bdms, instance):
for bdm in bdms:
if bdm.device_name is not None:
continue
device_name = self._get_device_name_for_instance(instance,
bdms, bdm)
values = {'device_name': device_name}
bdm.update(values)
bdm.save()
def _prep_block_device(self, context, instance, bdms):
"""Set up the block device for an instance with error logging."""
try:
self._add_missing_dev_names(bdms, instance)
block_device_info = driver.get_block_device_info(instance, bdms)
mapping = driver.block_device_info_get_mapping(block_device_info)
driver_block_device.attach_block_devices(
mapping, context, instance, self.volume_api, self.driver,
wait_func=self._await_block_device_map_created)
self._block_device_info_to_legacy(block_device_info)
return block_device_info
except exception.OverQuota as e:
LOG.warning('Failed to create block device for instance due'
' to exceeding volume related resource quota.'
' Error: %s', e.message, instance=instance)
raise
except Exception as ex:
LOG.exception('Instance failed block device setup',
instance=instance)
# InvalidBDM will eventually result in a BuildAbortException when
# booting from volume, and will be recorded as an instance fault.
# Maintain the original exception message which most likely has
# useful details which the standard InvalidBDM error message lacks.
raise exception.InvalidBDM(six.text_type(ex))
def _update_instance_after_spawn(self, context, instance):
instance.power_state = self._get_power_state(context, instance)
instance.vm_state = vm_states.ACTIVE
instance.task_state = None
instance.launched_at = timeutils.utcnow()
configdrive.update_instance(instance)
def _update_scheduler_instance_info(self, context, instance):
"""Sends an InstanceList with created or updated Instance objects to
the Scheduler client.
In the case of init_host, the value passed will already be an
InstanceList. Other calls will send individual Instance objects that
have been created or resized. In this case, we create an InstanceList
object containing that Instance.
"""
if not self.send_instance_updates:
return
if isinstance(instance, obj_instance.Instance):
instance = objects.InstanceList(objects=[instance])
context = context.elevated()
self.scheduler_client.update_instance_info(context, self.host,
instance)
def _delete_scheduler_instance_info(self, context, instance_uuid):
"""Sends the uuid of the deleted Instance to the Scheduler client."""
if not self.send_instance_updates:
return
context = context.elevated()
self.scheduler_client.delete_instance_info(context, self.host,
instance_uuid)
@periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
def _sync_scheduler_instance_info(self, context):
if not self.send_instance_updates:
return
context = context.elevated()
instances = objects.InstanceList.get_by_host(context, self.host,
expected_attrs=[],
use_slave=True)
uuids = [instance.uuid for instance in instances]
self.scheduler_client.sync_instance_info(context, self.host, uuids)
def _notify_about_instance_usage(self, context, instance, event_suffix,
network_info=None, extra_usage_info=None,
fault=None):
compute_utils.notify_about_instance_usage(
self.notifier, context, instance, event_suffix,
network_info=network_info,
extra_usage_info=extra_usage_info, fault=fault)
def _deallocate_network(self, context, instance,
requested_networks=None):
# If we were told not to allocate networks let's save ourselves
# the trouble of calling the network API.
if requested_networks and requested_networks.no_allocate:
LOG.debug("Skipping network deallocation for instance since "
"networking was not requested.", instance=instance)
return
LOG.debug('Deallocating network for instance', instance=instance)
with timeutils.StopWatch() as timer:
self.network_api.deallocate_for_instance(
context, instance, requested_networks=requested_networks)
# nova-network does an rpc call so we're OK tracking time spent here
LOG.info('Took %0.2f seconds to deallocate network for instance.',
timer.elapsed(), instance=instance)
def _get_instance_block_device_info(self, context, instance,
refresh_conn_info=False,
bdms=None):
"""Transform block devices to the driver block_device format."""
if not bdms:
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
block_device_info = driver.get_block_device_info(instance, bdms)
if not refresh_conn_info:
# if the block_device_mapping has no value in connection_info
# (returned as None), don't include in the mapping
block_device_info['block_device_mapping'] = [
bdm for bdm in driver.block_device_info_get_mapping(
block_device_info)
if bdm.get('connection_info')]
else:
driver_block_device.refresh_conn_infos(
driver.block_device_info_get_mapping(block_device_info),
context, instance, self.volume_api, self.driver)
self._block_device_info_to_legacy(block_device_info)
return block_device_info
def _build_failed(self, node):
if CONF.compute.consecutive_build_service_disable_threshold:
rt = self._get_resource_tracker()
# NOTE(danms): Update our counter, but wait for the next
# update_available_resource() periodic to flush it to the DB
rt.build_failed(node)
def _build_succeeded(self, node):
rt = self._get_resource_tracker()
rt.build_succeeded(node)
@wrap_exception()
@reverts_task_state
@wrap_instance_fault
def build_and_run_instance(self, context, instance, image, request_spec,
filter_properties, admin_password=None,
injected_files=None, requested_networks=None,
security_groups=None, block_device_mapping=None,
node=None, limits=None, host_list=None):
@utils.synchronized(instance.uuid)
def _locked_do_build_and_run_instance(*args, **kwargs):
# NOTE(danms): We grab the semaphore with the instance uuid
# locked because we could wait in line to build this instance
# for a while and we want to make sure that nothing else tries
# to do anything with this instance while we wait.
with self._build_semaphore:
try:
result = self._do_build_and_run_instance(*args, **kwargs)
except Exception:
# NOTE(mriedem): This should really only happen if
# _decode_files in _do_build_and_run_instance fails, and
# that's before a guest is spawned so it's OK to remove
# allocations for the instance for this node from Placement
# below as there is no guest consuming resources anyway.
# The _decode_files case could be handled more specifically
# but that's left for another day.
result = build_results.FAILED
raise
finally:
if result == build_results.FAILED:
# Remove the allocation records from Placement for the
# instance if the build failed. The instance.host is
# likely set to None in _do_build_and_run_instance
# which means if the user deletes the instance, it
# will be deleted in the API, not the compute service.
# Setting the instance.host to None in
# _do_build_and_run_instance means that the
# ResourceTracker will no longer consider this instance
# to be claiming resources against it, so we want to
# reflect that same thing in Placement. No need to
# call this for a reschedule, as the allocations will
# have already been removed in
# self._do_build_and_run_instance().
self._delete_allocation_for_instance(context,
instance.uuid)
if result in (build_results.FAILED,
build_results.RESCHEDULED):
self._build_failed(node)
else:
self._build_succeeded(node)
# NOTE(danms): We spawn here to return the RPC worker thread back to
# the pool. Since what follows could take a really long time, we don't
# want to tie up RPC workers.
utils.spawn_n(_locked_do_build_and_run_instance,
context, instance, image, request_spec,
filter_properties, admin_password, injected_files,
requested_networks, security_groups,
block_device_mapping, node, limits, host_list)
def _delete_allocation_for_instance(self, context, instance_uuid):
rt = self._get_resource_tracker()
rt.reportclient.delete_allocation_for_instance(context, instance_uuid)
def _check_device_tagging(self, requested_networks, block_device_mapping):
tagging_requested = False
if requested_networks:
for net in requested_networks:
if 'tag' in net and net.tag is not None:
tagging_requested = True
break
if block_device_mapping and not tagging_requested:
for bdm in block_device_mapping:
if 'tag' in bdm and bdm.tag is not None:
tagging_requested = True
break
if (tagging_requested and
not self.driver.capabilities.get('supports_device_tagging',
False)):
raise exception.BuildAbortException('Attempt to boot guest with '
'tagged devices on host that '
'does not support tagging.')
def _check_trusted_certs(self, instance):
if (instance.trusted_certs and
not self.driver.capabilities.get('supports_trusted_certs',
False)):
raise exception.BuildAbortException(
'Trusted image certificates provided on host that does not '
'support certificate validation.')
@hooks.add_hook('build_instance')
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def _do_build_and_run_instance(self, context, instance, image,
request_spec, filter_properties, admin_password, injected_files,
requested_networks, security_groups, block_device_mapping,
node=None, limits=None, host_list=None):
try:
LOG.debug('Starting instance...', instance=instance)
instance.vm_state = vm_states.BUILDING
instance.task_state = None
instance.save(expected_task_state=
(task_states.SCHEDULING, None))
except exception.InstanceNotFound:
msg = 'Instance disappeared before build.'
LOG.debug(msg, instance=instance)
return build_results.FAILED
except exception.UnexpectedTaskStateError as e:
LOG.debug(e.format_message(), instance=instance)
return build_results.FAILED
# b64 decode the files to inject:
decoded_files = self._decode_files(injected_files)
if limits is None:
limits = {}
if node is None:
node = self._get_nodename(instance, refresh=True)
try:
with timeutils.StopWatch() as timer:
self._build_and_run_instance(context, instance, image,
decoded_files, admin_password, requested_networks,
security_groups, block_device_mapping, node, limits,
filter_properties, request_spec)
LOG.info('Took %0.2f seconds to build instance.',
timer.elapsed(), instance=instance)
return build_results.ACTIVE
except exception.RescheduledException as e:
retry = filter_properties.get('retry')
if not retry:
# no retry information, do not reschedule.
LOG.debug("Retry info not present, will not reschedule",
instance=instance)
self._cleanup_allocated_networks(context, instance,
requested_networks)
self._cleanup_volumes(context, instance,
block_device_mapping, raise_exc=False)
compute_utils.add_instance_fault_from_exc(context,
instance, e, sys.exc_info(),
fault_message=e.kwargs['reason'])
self._nil_out_instance_obj_host_and_node(instance)
self._set_instance_obj_error_state(context, instance,
clean_task_state=True)
return build_results.FAILED
LOG.debug(e.format_message(), instance=instance)
# This will be used for logging the exception
retry['exc'] = traceback.format_exception(*sys.exc_info())
# This will be used for setting the instance fault message
retry['exc_reason'] = e.kwargs['reason']
# NOTE(comstud): Deallocate networks if the driver wants
# us to do so.
# NOTE(mriedem): Always deallocate networking when using Neutron.
# This is to unbind any ports that the user supplied in the server
# create request, or delete any ports that nova created which were
# meant to be bound to this host. This check intentionally bypasses
# the result of deallocate_networks_on_reschedule because the
# default value in the driver is False, but that method was really
# only meant for Ironic and should be removed when nova-network is
# removed (since is_neutron() will then always be True).
# NOTE(vladikr): SR-IOV ports should be deallocated to
# allow new sriov pci devices to be allocated on a new host.
# Otherwise, if devices with pci addresses are already allocated
# on the destination host, the instance will fail to spawn.
# info_cache.network_info should be present at this stage.
if (self.driver.deallocate_networks_on_reschedule(instance) or
utils.is_neutron() or
self.deallocate_sriov_ports_on_reschedule(instance)):
self._cleanup_allocated_networks(context, instance,
requested_networks)
else:
# NOTE(alex_xu): Network already allocated and we don't
# want to deallocate them before rescheduling. But we need
# to cleanup those network resources setup on this host before
# rescheduling.
self.network_api.cleanup_instance_network_on_host(
context, instance, self.host)
self._nil_out_instance_obj_host_and_node(instance)
instance.task_state = task_states.SCHEDULING
instance.save()
# The instance will have already claimed resources from this host
# before this build was attempted. Now that it has failed, we need
# to unclaim those resources before casting to the conductor, so
# that if there are alternate hosts available for a retry, it can
# claim resources on that new host for the instance.
self._delete_allocation_for_instance(context, instance.uuid)
self.compute_task_api.build_instances(context, [instance],
image, filter_properties, admin_password,
injected_files, requested_networks, security_groups,
block_device_mapping, request_spec=request_spec,
host_lists=[host_list])
return build_results.RESCHEDULED
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError):
msg = 'Instance disappeared during build.'
LOG.debug(msg, instance=instance)
self._cleanup_allocated_networks(context, instance,
requested_networks)
return build_results.FAILED
except Exception as e:
if isinstance(e, exception.BuildAbortException):
LOG.error(e.format_message(), instance=instance)
else:
# Should not reach here.
LOG.exception('Unexpected build failure, not rescheduling '
'build.', instance=instance)
self._cleanup_allocated_networks(context, instance,
requested_networks)
self._cleanup_volumes(context, instance,
block_device_mapping, raise_exc=False)
compute_utils.add_instance_fault_from_exc(context, instance,
e, sys.exc_info())
self._nil_out_instance_obj_host_and_node(instance)
self._set_instance_obj_error_state(context, instance,
clean_task_state=True)
return build_results.FAILED
def deallocate_sriov_ports_on_reschedule(self, instance):
"""Determine if networks are needed to be deallocated before reschedule
Check the cached network info for any assigned SR-IOV ports.
SR-IOV ports should be deallocated prior to rescheduling
in order to allow new sriov pci devices to be allocated on a new host.
"""
info_cache = instance.info_cache
def _has_sriov_port(vif):
return vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV
if (info_cache and info_cache.network_info):
for vif in info_cache.network_info:
if _has_sriov_port(vif):
return True
return False
@staticmethod
def _get_scheduler_hints(filter_properties, request_spec=None):
"""Helper method to get scheduler hints.
This method prefers to get the hints out of the request spec, but that
might not be provided. Conductor will pass request_spec down to the
first compute chosen for a build but older computes will not pass
the request_spec to conductor's build_instances method for a
a reschedule, so if we're on a host via a retry, request_spec may not
be provided so we need to fallback to use the filter_properties
to get scheduler hints.
"""
hints = {}
if request_spec is not None and 'scheduler_hints' in request_spec:
hints = request_spec.scheduler_hints
if not hints:
hints = filter_properties.get('scheduler_hints') or {}
return hints
def _build_and_run_instance(self, context, instance, image, injected_files,
admin_password, requested_networks, security_groups,
block_device_mapping, node, limits, filter_properties,
request_spec=None):
image_name = image.get('name')
self._notify_about_instance_usage(context, instance, 'create.start',
extra_usage_info={'image_name': image_name})
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.START,
bdms=block_device_mapping)
# NOTE(mikal): cache the keystone roles associated with the instance
# at boot time for later reference
instance.system_metadata.update(
{'boot_roles': ','.join(context.roles)})
self._check_device_tagging(requested_networks, block_device_mapping)
self._check_trusted_certs(instance)
try:
scheduler_hints = self._get_scheduler_hints(filter_properties,
request_spec)
rt = self._get_resource_tracker()
with rt.instance_claim(context, instance, node, limits):
# NOTE(russellb) It's important that this validation be done
# *after* the resource tracker instance claim, as that is where
# the host is set on the instance.
self._validate_instance_group_policy(context, instance,
scheduler_hints)
image_meta = objects.ImageMeta.from_dict(image)
with self._build_resources(context, instance,
requested_networks, security_groups, image_meta,
block_device_mapping) as resources:
instance.vm_state = vm_states.BUILDING
instance.task_state = task_states.SPAWNING
# NOTE(JoshNang) This also saves the changes to the
# instance from _allocate_network_async, as they aren't
# saved in that function to prevent races.
instance.save(expected_task_state=
task_states.BLOCK_DEVICE_MAPPING)
block_device_info = resources['block_device_info']
network_info = resources['network_info']
allocs = resources['allocations']
LOG.debug('Start spawning the instance on the hypervisor.',
instance=instance)
with timeutils.StopWatch() as timer:
self.driver.spawn(context, instance, image_meta,
injected_files, admin_password,
allocs, network_info=network_info,
block_device_info=block_device_info)
LOG.info('Took %0.2f seconds to spawn the instance on '
'the hypervisor.', timer.elapsed(),
instance=instance)
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
with excutils.save_and_reraise_exception():
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
except exception.ComputeResourcesUnavailable as e:
LOG.debug(e.format_message(), instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=e.format_message())
except exception.BuildAbortException as e:
with excutils.save_and_reraise_exception():
LOG.debug(e.format_message(), instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
except (exception.FixedIpLimitExceeded,
exception.NoMoreNetworks, exception.NoMoreFixedIps) as e:
LOG.warning('No more network or fixed IP to be allocated',
instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
msg = _('Failed to allocate the network(s) with error %s, '
'not rescheduling.') % e.format_message()
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
except (exception.VirtualInterfaceCreateException,
exception.VirtualInterfaceMacAddressException,
exception.FixedIpInvalidOnHost,
exception.UnableToAutoAllocateNetwork) as e:
LOG.exception('Failed to allocate network(s)',
instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
msg = _('Failed to allocate the network(s), not rescheduling.')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
except (exception.FlavorDiskTooSmall,
exception.FlavorMemoryTooSmall,
exception.ImageNotActive,
exception.ImageUnacceptable,
exception.InvalidDiskInfo,
exception.InvalidDiskFormat,
cursive_exception.SignatureVerificationError,
exception.CertificateValidationFailed,
exception.VolumeEncryptionNotSupported,
exception.InvalidInput,
# TODO(mriedem): We should be validating RequestedVRamTooHigh
# in the API during server create and rebuild.
exception.RequestedVRamTooHigh) as e:
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=e.format_message())
except Exception as e:
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=six.text_type(e))
# NOTE(alaski): This is only useful during reschedules, remove it now.
instance.system_metadata.pop('network_allocated', None)
# If CONF.default_access_ip_network_name is set, grab the
# corresponding network and set the access ip values accordingly.
network_name = CONF.default_access_ip_network_name
if (network_name and not instance.access_ip_v4 and
not instance.access_ip_v6):
# Note that when there are multiple ips to choose from, an
# arbitrary one will be chosen.
for vif in network_info:
if vif['network']['label'] == network_name:
for ip in vif.fixed_ips():
if not instance.access_ip_v4 and ip['version'] == 4:
instance.access_ip_v4 = ip['address']
if not instance.access_ip_v6 and ip['version'] == 6:
instance.access_ip_v6 = ip['address']
break
self._update_instance_after_spawn(context, instance)
try:
instance.save(expected_task_state=task_states.SPAWNING)
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
with excutils.save_and_reraise_exception():
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
tb = traceback.format_exc()
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping, tb=tb)
self._update_scheduler_instance_info(context, instance)
self._notify_about_instance_usage(context, instance, 'create.end',
extra_usage_info={'message': _('Success')},
network_info=network_info)
compute_utils.notify_about_instance_create(context, instance,
self.host, phase=fields.NotificationPhase.END,
bdms=block_device_mapping)
@contextlib.contextmanager
def _build_resources(self, context, instance, requested_networks,
security_groups, image_meta, block_device_mapping):
resources = {}
network_info = None
try:
LOG.debug('Start building networks asynchronously for instance.',
instance=instance)
network_info = self._build_networks_for_instance(context, instance,
requested_networks, security_groups)
resources['network_info'] = network_info
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError):
raise
except exception.UnexpectedTaskStateError as e:
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=e.format_message())
except Exception:
# Because this allocation is async any failures are likely to occur
# when the driver accesses network_info during spawn().
LOG.exception('Failed to allocate network(s)',
instance=instance)
msg = _('Failed to allocate the network(s), not rescheduling.')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
try:
# Perform any driver preparation work for the driver.
self.driver.prepare_for_spawn(instance)
# Depending on a virt driver, some network configuration is
# necessary before preparing block devices.
self.driver.prepare_networks_before_block_device_mapping(
instance, network_info)
# Verify that all the BDMs have a device_name set and assign a
# default to the ones missing it with the help of the driver.
self._default_block_device_names(instance, image_meta,
block_device_mapping)
LOG.debug('Start building block device mappings for instance.',
instance=instance)
instance.vm_state = vm_states.BUILDING
instance.task_state = task_states.BLOCK_DEVICE_MAPPING
instance.save()
block_device_info = self._prep_block_device(context, instance,
block_device_mapping)
resources['block_device_info'] = block_device_info
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError):
with excutils.save_and_reraise_exception():
# Make sure the async call finishes
if network_info is not None:
network_info.wait(do_raise=False)
self.driver.clean_networks_preparation(instance,
network_info)
self.driver.failed_spawn_cleanup(instance)
except (exception.UnexpectedTaskStateError,
exception.OverQuota, exception.InvalidBDM) as e:
# Make sure the async call finishes
if network_info is not None:
network_info.wait(do_raise=False)
self.driver.clean_networks_preparation(instance, network_info)
self.driver.failed_spawn_cleanup(instance)
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=e.format_message())
except Exception:
LOG.exception('Failure prepping block device',
instance=instance)
# Make sure the async call finishes
if network_info is not None:
network_info.wait(do_raise=False)
self.driver.clean_networks_preparation(instance, network_info)
self.driver.failed_spawn_cleanup(instance)
msg = _('Failure prepping block device.')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
try:
resources['allocations'] = (
self.reportclient.get_allocations_for_consumer(context,
instance.uuid))
except Exception:
LOG.exception('Failure retrieving placement allocations',
instance=instance)
# Make sure the async call finishes
if network_info is not None:
network_info.wait(do_raise=False)
self.driver.failed_spawn_cleanup(instance)
msg = _('Failure retrieving placement allocations')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
try:
yield resources
except Exception as exc:
with excutils.save_and_reraise_exception() as ctxt:
if not isinstance(exc, (
exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError)):
LOG.exception('Instance failed to spawn',
instance=instance)
# Make sure the async call finishes
if network_info is not None:
network_info.wait(do_raise=False)
# if network_info is empty we're likely here because of
# network allocation failure. Since nothing can be reused on
# rescheduling it's better to deallocate network to eliminate
# the chance of orphaned ports in neutron
deallocate_networks = False if network_info else True
try:
self._shutdown_instance(context, instance,
block_device_mapping, requested_networks,
try_deallocate_networks=deallocate_networks)
except Exception as exc2:
ctxt.reraise = False
LOG.warning('Could not clean up failed build,'
' not rescheduling. Error: %s',
six.text_type(exc2))
raise exception.BuildAbortException(
instance_uuid=instance.uuid,
reason=six.text_type(exc))
def _cleanup_allocated_networks(self, context, instance,
requested_networks):
try:
self._deallocate_network(context, instance, requested_networks)
except Exception:
LOG.exception('Failed to deallocate networks', instance=instance)
return
instance.system_metadata['network_allocated'] = 'False'
try:
instance.save()
except exception.InstanceNotFound:
# NOTE(alaski): It's possible that we're cleaning up the networks
# because the instance was deleted. If that's the case then this
# exception will be raised by instance.save()
pass
def _try_deallocate_network(self, context, instance,
requested_networks=None):
# During auto-scale cleanup, we could be deleting a large number
# of servers at the same time and overloading parts of the system,
# so we retry a few times in case of connection failures to the
# networking service.
@loopingcall.RetryDecorator(
max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
exceptions=(keystone_exception.connection.ConnectFailure,))
def _deallocate_network_with_retries():
try:
self._deallocate_network(
context, instance, requested_networks)
except keystone_exception.connection.ConnectFailure as e:
# Provide a warning that something is amiss.
with excutils.save_and_reraise_exception():
LOG.warning('Failed to deallocate network for instance; '
'retrying. Error: %s', six.text_type(e),
instance=instance)
try:
# tear down allocated network structure
_deallocate_network_with_retries()
except Exception as ex:
with excutils.save_and_reraise_exception():
LOG.error('Failed to deallocate network for instance. '
'Error: %s', ex, instance=instance)
self._set_instance_obj_error_state(context, instance)
def _get_power_off_values(self, context, instance, clean_shutdown):
"""Get the timing configuration for powering down this instance."""
if clean_shutdown:
timeout = compute_utils.get_value_from_system_metadata(instance,
key='image_os_shutdown_timeout', type=int,
default=CONF.shutdown_timeout)
retry_interval = CONF.compute.shutdown_retry_interval
else:
timeout = 0
retry_interval = 0
return timeout, retry_interval
def _power_off_instance(self, context, instance, clean_shutdown=True):
"""Power off an instance on this host."""
timeout, retry_interval = self._get_power_off_values(context,
instance, clean_shutdown)
self.driver.power_off(instance, timeout, retry_interval)
def _shutdown_instance(self, context, instance,
bdms, requested_networks=None, notify=True,
try_deallocate_networks=True):
"""Shutdown an instance on this host.
:param:context: security context
:param:instance: a nova.objects.Instance object
:param:bdms: the block devices for the instance to be torn
down
:param:requested_networks: the networks on which the instance
has ports
:param:notify: true if a final usage notification should be
emitted
:param:try_deallocate_networks: false if we should avoid
trying to teardown networking
"""
context = context.elevated()
LOG.info('Terminating instance', instance=instance)
if notify:
self._notify_about_instance_usage(context, instance,
"shutdown.start")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.SHUTDOWN,
phase=fields.NotificationPhase.START, bdms=bdms)
network_info = instance.get_network_info()
# NOTE(vish) get bdms before destroying the instance
vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
block_device_info = self._get_instance_block_device_info(
context, instance, bdms=bdms)
# NOTE(melwitt): attempt driver destroy before releasing ip, may
# want to keep ip allocated for certain failures
try:
LOG.debug('Start destroying the instance on the hypervisor.',
instance=instance)
with timeutils.StopWatch() as timer:
self.driver.destroy(context, instance, network_info,
block_device_info)
LOG.info('Took %0.2f seconds to destroy the instance on the '
'hypervisor.', timer.elapsed(), instance=instance)
except exception.InstancePowerOffFailure:
# if the instance can't power off, don't release the ip
with excutils.save_and_reraise_exception():
pass
except Exception:
with excutils.save_and_reraise_exception():
# deallocate ip and fail without proceeding to
# volume api calls, preserving current behavior
if try_deallocate_networks:
self._try_deallocate_network(context, instance,
requested_networks)
if try_deallocate_networks:
self._try_deallocate_network(context, instance, requested_networks)
timer.restart()
for bdm in vol_bdms:
try:
if bdm.attachment_id:
self.volume_api.attachment_delete(context,
bdm.attachment_id)
else:
# NOTE(vish): actual driver detach done in driver.destroy,
# so just tell cinder that we are done with it.
connector = self.driver.get_volume_connector(instance)
self.volume_api.terminate_connection(context,
bdm.volume_id,
connector)
self.volume_api.detach(context, bdm.volume_id,
instance.uuid)
except exception.VolumeAttachmentNotFound as exc:
LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
instance=instance)
except exception.DiskNotFound as exc:
LOG.debug('Ignoring DiskNotFound: %s', exc,
instance=instance)
except exception.VolumeNotFound as exc:
LOG.debug('Ignoring VolumeNotFound: %s', exc,
instance=instance)
except (cinder_exception.EndpointNotFound,
keystone_exception.EndpointNotFound) as exc:
LOG.warning('Ignoring EndpointNotFound for '
'volume %(volume_id)s: %(exc)s',
{'exc': exc, 'volume_id': bdm.volume_id},
instance=instance)
except cinder_exception.ClientException as exc:
LOG.warning('Ignoring unknown cinder exception for '
'volume %(volume_id)s: %(exc)s',
{'exc': exc, 'volume_id': bdm.volume_id},
instance=instance)
except Exception as exc:
LOG.warning('Ignoring unknown exception for '
'volume %(volume_id)s: %(exc)s',
{'exc': exc, 'volume_id': bdm.volume_id},
instance=instance)
if vol_bdms:
LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
'for instance.',
{'time': timer.elapsed(), 'num': len(vol_bdms)},
instance=instance)
if notify:
self._notify_about_instance_usage(context, instance,
"shutdown.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.SHUTDOWN,
phase=fields.NotificationPhase.END, bdms=bdms)
def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
detach=True):
exc_info = None
for bdm in bdms:
if detach and bdm.volume_id:
try:
LOG.debug("Detaching volume: %s", bdm.volume_id,
instance_uuid=instance.uuid)
destroy = bdm.delete_on_termination
self._detach_volume(context, bdm, instance,
destroy_bdm=destroy)
except Exception as exc:
exc_info = sys.exc_info()
LOG.warning('Failed to detach volume: %(volume_id)s '
'due to %(exc)s',
{'volume_id': bdm.volume_id, 'exc': exc})
if bdm.volume_id and bdm.delete_on_termination:
try:
LOG.debug("Deleting volume: %s", bdm.volume_id,
instance_uuid=instance.uuid)
self.volume_api.delete(context, bdm.volume_id)
except Exception as exc:
exc_info = sys.exc_info()
LOG.warning('Failed to delete volume: %(volume_id)s '
'due to %(exc)s',
{'volume_id': bdm.volume_id, 'exc': exc})
if exc_info is not None and raise_exc:
six.reraise(exc_info[0], exc_info[1], exc_info[2])
@hooks.add_hook("delete_instance")
def _delete_instance(self, context, instance, bdms):
"""Delete an instance on this host.
:param context: nova request context
:param instance: nova.objects.instance.Instance object
:param bdms: nova.objects.block_device.BlockDeviceMappingList object
"""
events = self.instance_events.clear_events_for_instance(instance)
if events:
LOG.debug('Events pending at deletion: %(events)s',
{'events': ','.join(events.keys())},
instance=instance)
self._notify_about_instance_usage(context, instance,
"delete.start")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.START, bdms=bdms)
self._shutdown_instance(context, instance, bdms)
# NOTE(vish): We have already deleted the instance, so we have
# to ignore problems cleaning up the volumes. It
# would be nice to let the user know somehow that
# the volume deletion failed, but it is not
# acceptable to have an instance that can not be
# deleted. Perhaps this could be reworked in the
# future to set an instance fault the first time
# and to only ignore the failure if the instance
# is already in ERROR.
# NOTE(ameeda): The volumes already detached during the above
# _shutdown_instance() call and this is why
# detach is not requested from _cleanup_volumes()
# in this case
self._cleanup_volumes(context, instance, bdms,
raise_exc=False, detach=False)
# if a delete task succeeded, always update vm state and task
# state without expecting task state to be DELETING
instance.vm_state = vm_states.DELETED
instance.task_state = None
instance.power_state = power_state.NOSTATE
instance.terminated_at = timeutils.utcnow()
instance.save()
self._complete_deletion(context, instance)
# only destroy the instance in the db if the _complete_deletion
# doesn't raise and therefore allocation is successfully
# deleted in placement
instance.destroy()
self._notify_about_instance_usage(context, instance, "delete.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.DELETE,
phase=fields.NotificationPhase.END, bdms=bdms)
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def terminate_instance(self, context, instance, bdms):
"""Terminate an instance on this host."""
@utils.synchronized(instance.uuid)
def do_terminate_instance(instance, bdms):
# NOTE(mriedem): If we are deleting the instance while it was
# booting from volume, we could be racing with a database update of
# the BDM volume_id. Since the compute API passes the BDMs over RPC
# to compute here, the BDMs may be stale at this point. So check
# for any volume BDMs that don't have volume_id set and if we
# detect that, we need to refresh the BDM list before proceeding.
# TODO(mriedem): Move this into _delete_instance and make the bdms
# parameter optional.
for bdm in list(bdms):
if bdm.is_volume and not bdm.volume_id:
LOG.debug('There are potentially stale BDMs during '
'delete, refreshing the BlockDeviceMappingList.',
instance=instance)
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
break
try:
self._delete_instance(context, instance, bdms)
except exception.InstanceNotFound:
LOG.info("Instance disappeared during terminate",
instance=instance)
except Exception:
# As we're trying to delete always go to Error if something
# goes wrong that _delete_instance can't handle.
with excutils.save_and_reraise_exception():
LOG.exception('Setting instance vm_state to ERROR',
instance=instance)
self._set_instance_obj_error_state(context, instance)
do_terminate_instance(instance, bdms)
# NOTE(johannes): This is probably better named power_off_instance
# so it matches the driver method, but because of other issues, we
# can't use that name in grizzly.
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def stop_instance(self, context, instance, clean_shutdown):
"""Stopping an instance on this host."""
@utils.synchronized(instance.uuid)
def do_stop_instance():
current_power_state = self._get_power_state(context, instance)
LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
'current task_state: %(task_state)s, current DB '
'power_state: %(db_power_state)s, current VM '
'power_state: %(current_power_state)s',
{'vm_state': instance.vm_state,
'task_state': instance.task_state,
'db_power_state': instance.power_state,
'current_power_state': current_power_state},
instance_uuid=instance.uuid)
# NOTE(mriedem): If the instance is already powered off, we are
# possibly tearing down and racing with other operations, so we can
# expect the task_state to be None if something else updates the
# instance and we're not locking it.
expected_task_state = [task_states.POWERING_OFF]
# The list of power states is from _sync_instance_power_state.
if current_power_state in (power_state.NOSTATE,
power_state.SHUTDOWN,
power_state.CRASHED):
LOG.info('Instance is already powered off in the '
'hypervisor when stop is called.',
instance=instance)
expected_task_state.append(None)
self._notify_about_instance_usage(context, instance,
"power_off.start")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.POWER_OFF,
phase=fields.NotificationPhase.START)
self._power_off_instance(context, instance, clean_shutdown)
instance.power_state = self._get_power_state(context, instance)
instance.vm_state = vm_states.STOPPED
instance.task_state = None
instance.save(expected_task_state=expected_task_state)
self._notify_about_instance_usage(context, instance,
"power_off.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.POWER_OFF,
phase=fields.NotificationPhase.END)
do_stop_instance()
def _power_on(self, context, instance):
network_info = self.network_api.get_instance_nw_info(context, instance)
block_device_info = self._get_instance_block_device_info(context,
instance)
self.driver.power_on(context, instance,
network_info,
block_device_info)
def _delete_snapshot_of_shelved_instance(self, context, instance,
snapshot_id):
"""Delete snapshot of shelved instance."""
try:
self.image_api.delete(context, snapshot_id)
except (exception.ImageNotFound,
exception.ImageNotAuthorized) as exc:
LOG.warning("Failed to delete snapshot "
"from shelved instance (%s).",
exc.format_message(), instance=instance)
except Exception:
LOG.exception("Something wrong happened when trying to "
"delete snapshot from shelved instance.",
instance=instance)
# NOTE(johannes): This is probably better named power_on_instance
# so it matches the driver method, but because of other issues, we
# can't use that name in grizzly.
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def start_instance(self, context, instance):
"""Starting an instance on this host."""
self._notify_about_instance_usage(context, instance, "power_on.start")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.POWER_ON,
phase=fields.NotificationPhase.START)
self._power_on(context, instance)
instance.power_state = self._get_power_state(context, instance)
instance.vm_state = vm_states.ACTIVE
instance.task_state = None
# Delete an image(VM snapshot) for a shelved instance
snapshot_id = instance.system_metadata.get('shelved_image_id')
if snapshot_id:
self._delete_snapshot_of_shelved_instance(context, instance,
snapshot_id)
# Delete system_metadata for a shelved instance
compute_utils.remove_shelved_keys_from_system_metadata(instance)
instance.save(expected_task_state=task_states.POWERING_ON)
self._notify_about_instance_usage(context, instance, "power_on.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.POWER_ON,
phase=fields.NotificationPhase.END)
@messaging.expected_exceptions(NotImplementedError,
exception.TriggerCrashDumpNotSupported,
exception.InstanceNotRunning)
@wrap_exception()
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def trigger_crash_dump(self, context, instance):
"""Trigger crash dump in an instance."""
self._notify_about_instance_usage(context, instance,
"trigger_crash_dump.start")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
phase=fields.NotificationPhase.START)
# This method does not change task_state and power_state because the
# effect of a trigger depends on user's configuration.
self.driver.trigger_crash_dump(instance)
self._notify_about_instance_usage(context, instance,
"trigger_crash_dump.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
phase=fields.NotificationPhase.END)
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def soft_delete_instance(self, context, instance):
"""Soft delete an instance on this host."""
with compute_utils.notify_about_instance_delete(
self.notifier, context, instance, 'soft_delete',
source=fields.NotificationSource.COMPUTE):
try:
self.driver.soft_delete(instance)
except NotImplementedError:
# Fallback to just powering off the instance if the
# hypervisor doesn't implement the soft_delete method
self.driver.power_off(instance)
instance.power_state = self._get_power_state(context, instance)
instance.vm_state = vm_states.SOFT_DELETED
instance.task_state = None
instance.save(expected_task_state=[task_states.SOFT_DELETING])
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def restore_instance(self, context, instance):
"""Restore a soft-deleted instance on this host."""
self._notify_about_instance_usage(context, instance, "restore.start")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.RESTORE,
phase=fields.NotificationPhase.START)
try:
self.driver.restore(instance)
except NotImplementedError:
# Fallback to just powering on the instance if the hypervisor
# doesn't implement the restore method
self._power_on(context, instance)
instance.power_state = self._get_power_state(context, instance)
instance.vm_state = vm_states.ACTIVE
instance.task_state = None
instance.save(expected_task_state=task_states.RESTORING)
self._notify_about_instance_usage(context, instance, "restore.end")
compute_utils.notify_about_instance_action(context, instance,
self.host, action=fields.NotificationAction.RESTORE,
phase=fields.NotificationPhase.END)
@staticmethod
def _set_migration_status(migration, status):
"""Set the status, and guard against a None being passed in.
This is useful as some of the compute RPC calls will not pass
a migration object in older versions. The check can be removed when
we move past 4.x major version of the RPC API.
"""
if migration:
migration.status = status
migration.save()
def _rebuild_default_impl(self, context, instance, image_meta,
injected_files, admin_password, allocations,
bdms, detach_block_devices, attach_block_devices,
network_info=None,
evacuate=False, block_device_info=None,
preserve_ephemeral=False):
if preserve_ephemeral:
# The default code path does not support preserving ephemeral
# partitions.
raise exception.PreserveEphemeralNotSupported()
if evacuate:
detach_block_devices(context, bdms)
else:
self._power_off_instance(context, instance, clean_shutdown=True)
detach_block_devices(context, bdms)
self.driver.destroy(context, instance,
network_info=network_info,
block_device_info=block_device_info)
instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
instance.save(expected_task_state=[task_states.REBUILDING])
new_block_device_info = attach_block_devices(context, instance, bdms)
instance.task_state = task_states.REBUILD_SPAWNING
instance.save(
expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])
with instance.mutated_migration_context():
self.driver.spawn(context, instance, image_meta, injected_files,
admin_password, allocations,
network_info=network_info,
block_device_info=new_block_device_info)
def _notify_instance_rebuild_error(self, context, instance, error, bdms):
tb = traceback.format_exc()
self._notify_about_instance_usage(context, instance,
'rebuild.error', fault=error)
compute_utils.notify_about_instance_rebuild(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms,
tb=tb)
@messaging.expected_exceptions(exception.PreserveEphemeralNotSupported)
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
injected_files, new_pass, orig_sys_metadata,
bdms, recreate, on_shared_storage,
preserve_ephemeral, migration,
scheduled_node, limits, request_spec):
"""Destroy and re-make this instance.
A 'rebuild' effectively purges all existing data from the system and
remakes the VM with given 'metadata' and 'personalities'.
:param context: `nova.RequestContext` object
:param instance: Instance object
:param orig_image_ref: Original image_ref before rebuild
:param image_ref: New image_ref for rebuild
:param injected_files: Files to inject
:param new_pass: password to set on rebuilt instance
:param orig_sys_metadata: instance system metadata from pre-rebuild
:param bdms: block-device-mappings to use for rebuild
:param recreate: True if the instance is being recreated (e.g. the
hypervisor it was on failed) - cleanup of old state will be
skipped.
:param on_shared_storage: True if instance files on shared storage.
If not provided then information from the
driver will be used to decide if the instance
files are available or not on the target host
:param preserve_ephemeral: True if the default ephemeral storage
partition must be preserved on rebuild
:param migration: a Migration object if one was created for this
rebuild operation (if it's a part of evacuate)
:param scheduled_node: A node of the host chosen by the scheduler. If a
host was specified by the user, this will be
None
:param limits: Overcommit limits set by the scheduler. If a host was
specified by the user, this will be None
:param request_spec: a RequestSpec object used to schedule the instance
"""
# recreate=True means the instance is being evacuated from a failed
# host to a new destination host (this host). The 'recreate' variable
# name is confusing, so rename it to evacuate here at the top, which
# is simpler than renaming a parameter in an RPC versioned method.
evacuate = recreate
context = context.elevated()
if evacuate:
LOG.info("Evacuating instance", instance=instance)
else:
LOG.info("Rebuilding instance", instance=instance)
rt = self._get_resource_tracker()
if evacuate:
# This is an evacuation to a new host, so we need to perform a
# resource claim.
rebuild_claim = rt.rebuild_claim
else:
# This is a rebuild to the same host, so we don't need to make
# a claim since the instance is already on this host.
rebuild_claim = claims.NopClaim
if image_ref:
image_meta = objects.ImageMeta.from_image_ref(
context, self.image_api, image_ref)
elif evacuate:
# For evacuate the API does not send down the image_ref since the
# image does not change so just get it from what was stashed in
# the instance system_metadata when the instance was created (or
# last rebuilt). This also works for volume-backed instances.
image_meta = instance.image_meta
else:
image_meta = objects.ImageMeta()
# NOTE(mriedem): On an evacuate, we need to update
# the instance's host and node properties to reflect it's
# destination node for the evacuate.
if not scheduled_node:
if evacuate:
try:
compute_node = self._get_compute_info(context, self.host)
scheduled_node = compute_node.hypervisor_hostname
except exception.ComputeHostNotFound:
LOG.exception('Failed to get compute_info for %s',
self.host)
else:
scheduled_node = instance.node
with self._error_out_instance_on_exception(context, instance):
try:
claim_ctxt = rebuild_claim(
context, instance, scheduled_node,
limits=limits, image_meta=image_meta,
migration=migration)
self._do_rebuild_instance_with_claim(
claim_ctxt, context, instance, orig_image_ref,
image_meta, injected_files, new_pass, orig_sys_metadata,
bdms, evacuate, on_shared_storage, preserve_ephemeral,
migration, request_spec)
except (exception.ComputeResourcesUnavailable,
exception.RescheduledException) as e:
if isinstance(e, exception.ComputeResourcesUnavailable):
LOG.debug("Could not rebuild instance on this host, not "
"enough resources available.", instance=instance)
else:
# RescheduledException is raised by the late server group
# policy check during evacuation if a parallel scheduling
# violated the policy.
# We catch the RescheduledException here but we don't have
# the plumbing to do an actual reschedule so we abort the
# operation.
LOG.debug("Could not rebuild instance on this host, "
"late server group check failed.",
instance=instance)
# NOTE(ndipanov): We just abort the build for now and leave a
# migration record for potential cleanup later
self._set_migration_status(migration, 'failed')
# Since the claim failed, we need to remove the allocation
# created against the destination node. Note that we can only
# get here when evacuating to a destination node. Rebuilding
# on the same host (not evacuate) uses the NopClaim which will
# not raise ComputeResourcesUnavailable.
rt.delete_allocation_for_evacuated_instance(
context, instance, scheduled_node, node_type='destination')
self._notify_instance_rebuild_error(context, instance, e, bdms)
raise exception.BuildAbortException(
instance_uuid=instance.uuid, reason=e.format_message())
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
LOG.debug('Instance was deleted while rebuilding',
instance=instance)
self._set_migration_status(migration, 'failed')
self._notify_instance_rebuild_error(context, instance, e, bdms)
except Exception as e:
self._set_migration_status(migration, 'failed')
if evacuate or scheduled_node is not None:
rt.delete_allocation_for_evacuated_instance(
context, instance, scheduled_node,
node_type='destination')
self._notify_instance_rebuild_error(context, instance, e, bdms)
raise
else:
instance.apply_migration_context()
# NOTE (ndipanov): This save will now update the host and node
# attributes making sure that next RT pass is consistent since
# it will be based on the instance and not the migration DB
# entry.
instance.host = self.host
instance.node = scheduled_node
instance.save()
instance.drop_migration_context()
# NOTE (ndipanov): Mark the migration as done only after we
# mark the instance as belonging to this host.
self._set_migration_status(migration, 'done')
def _do_rebuild_instance_with_claim(self, claim_context, *args, **kwargs):
"""Helper to avoid deep nesting in the top-level method."""
with claim_context:
self._do_rebuild_instance(*args, **kwargs)
@staticmethod
def _get_image_name(image_meta):
if image_meta.obj_attr_is_set("name"):
return image_meta.name
else:
return ''
def _do_rebuild_instance(self, context, instance, orig_image_ref,
image_meta, injected_files, new_pass,
orig_sys_metadata, bdms, evacuate,
on_shared_storage, preserve_ephemeral,
migration, request_spec):
orig_vm_state = instance.vm_state
if evacuate:
if request_spec:
# NOTE(gibi): Do a late check of server group policy as
# parallel scheduling could violate such policy. This will
# cause the evacuate to fail as rebuild does not implement
# reschedule.
hints = self._get_scheduler_hints({}, request_spec)
self._validate_instance_group_policy(context, instance, hints)
if not self.driver.capabilities.get("supports_evacuate", False):
raise exception.InstanceEvacuateNotSupported
self._check_instance_exists(context, instance)
if on_shared_storage is None:
LOG.debug('on_shared_storage is not provided, using driver '
'information to decide if the instance needs to '
'be evacuated')
on_shared_storage = self.driver.instance_on_disk(instance)
elif (on_shared_storage !=
self.driver.instance_on_disk(instance)):
# To cover case when admin expects that instance files are
# on shared storage, but not accessible and vice versa
raise exception.InvalidSharedStorage(
_("Invalid state of instance files on shared"
" storage"))
if on_shared_storage:
LOG.info('disk on shared storage, evacuating using'
' existing disk')
elif instance.image_ref:
orig_image_ref = instance.image_ref
LOG.info("disk not on shared storage, evacuating from "
"image: '%s'", str(orig_image_ref))
else:
LOG.info('disk on volume, evacuating using existing '
'volume')
# We check trusted certs capabilities for both evacuate (rebuild on
# another host) and rebuild (rebuild on the same host) because for
# evacuate we need to make sure an instance with trusted certs can
# have the image verified with those certs during rebuild, and for
# rebuild we could be rebuilding a server that started out with no
# trusted certs on this host, and then was rebuilt with trusted certs
# for a new image, in which case we need to validate that new image
# with the trusted certs during the rebuild.
self._check_trusted_certs(instance)
# This instance.exists message should contain the original
# image_ref, not the new one. Since the DB has been updated
# to point to the new one... we have to override it.
orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref,
context)
extra_usage_info = {'image_ref_url': orig_image_ref_url}
compute_utils.notify_usage_exists(
self.notifier, context, instance, self.host,
current_period=True, system_metadata=orig_sys_metadata,
extra_usage_info=extra_usage_info)
# This message should contain the new image_ref
extra_usage_info = {'image_name': self._get_image_name(image_meta)}
self._notify_about_instance_usage(context, instance,
"rebuild.start", extra_usage_info=extra_usage_info)
# NOTE: image_name is not included in the versioned notification
# because we already provide the image_uuid in the notification
# payload and the image details can be looked up via the uuid.
compute_utils.notify_about_instance_rebuild(
context, instance, self.host,
phase=fields.NotificationPhase.START,
bdms=bdms)
instance.power_state = self._get_power_state(context, instance)
instance.task_state = task_states.REBUILDING
instance.save(expected_task_state=[task_states.REBUILDING])
if evacuate:
self.network_api.setup_networks_on_host(
context, instance, self.host)
# For nova-network this is needed to move floating IPs
# For neutron this updates the host in the port binding
# TODO(cfriesen): this network_api call and the one above
# are so similar, we should really try to unify them.
self.network_api.setup_instance_network_on_host(
context, instance, self.host, migration)
# TODO(mriedem): Consider decorating setup_instance_network_on_host
# with @base_api.refresh_cache and then we wouldn't need this
# explicit call to get_instance_nw_info.
network_info = self.network_api.get_instance_nw_info(context,
instance)
else:
network_info = instance.get_network_info()
allocations = self.reportclient.get_allocations_for_consumer(
context, instance.uuid)
if bdms is None:
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
block_device_info = \
self._get_instance_block_device_info(
context, instance, bdms=bdms)
def detach_block_devices(context, bdms):
for bdm in bdms:
if bdm.is_volume:
# NOTE (ildikov): Having the attachment_id set in the BDM
# means that it's the new Cinder attach/detach flow