Vm state management and error states

this implements the blueprint nova-vm-state-management It implements the following functionalities: - Filter compute api calls according to state of the VM (defined in compute/state_checker). - Sets error state if the scheduler cannot allocate the VM in any host - Handles the create/delete concurrency in the compute manager Change-Id: Ie6d016b7d4781f70bb5967f204fa88a6412bd727
openstack · Dec 13, 2011 · ff753cd · ff753cd
1 parent d3b75b7
commit ff753cd
Show file tree

Hide file tree

Showing 11 changed files with 533 additions and 100 deletions.
diff --git a/nova/compute/api.py b/nova/compute/api.py
@@ -35,6 +35,7 @@
 from nova import volume
 from nova.compute import instance_types
 from nova.compute import power_state
+from nova.compute import state_checker
 from nova.compute import task_states
 from nova.compute import vm_states
 from nova.scheduler import api as scheduler_api
@@ -49,25 +50,8 @@
 flags.DECLARE('vncproxy_topic', 'nova.vnc')
 flags.DEFINE_integer('find_host_timeout', 30,
                      'Timeout after NN seconds when looking for a host.')
-
-
-def _is_able_to_shutdown(instance):
-    vm_state = instance["vm_state"]
-    instance_uuid = instance["uuid"]
-
-    valid_shutdown_states = [
-        vm_states.ACTIVE,
-        vm_states.REBUILDING,
-        vm_states.BUILDING,
-        vm_states.ERROR,
-    ]
-
-    if vm_state not in valid_shutdown_states:
-        LOG.warn(_("Instance %(instance_uuid)s cannot be shutdown from "
-                   "its current state: %(vm_state)s.") % locals())
-        return False
-
-    return True
+flags.DEFINE_boolean('api_check_vm_states', True,
+                     'Filter calls by vm state')
 
 
 def _is_queued_delete(instance):
@@ -83,6 +67,27 @@ def _is_queued_delete(instance):
     return True
 
 
+class check_vm_state(object):
+    """Class to wrap API functions that are sensitive to the VM state.
+
+    If the instance is in the wrong state, the wrapper will raise an exception.
+    It uses state_checker to decide if the call is allowed or not.
+    """
+
+    def __init__(self, method_name):
+        self.method_name = method_name
+
+    def __call__(self, f):
+        def _state_checker_wrap(api, context, instance, *args, **kw):
+            if FLAGS.api_check_vm_states and \
+            state_checker.is_blocked(self.method_name, context, instance):
+                raise exception.InstanceInvalidState(\
+                    instance_uuid=instance['uuid'], method=self.method_name)
+            else:
+                return f(api, context, instance, *args, **kw)
+        return _state_checker_wrap
+
+
 class API(base.Base):
     """API for interacting with the compute manager."""
 
@@ -766,15 +771,13 @@ def update(self, context, instance, **kwargs):
         rv = self.db.instance_update(context, instance["id"], kwargs)
         return dict(rv.iteritems())
 
-    @scheduler_api.reroute_compute("soft_delete")
+    @check_vm_state(state_checker.SOFT_DELETE)
+    @scheduler_api.reroute_compute(state_checker.SOFT_DELETE)
     def soft_delete(self, context, instance):
         """Terminate an instance."""
         instance_uuid = instance["uuid"]
         LOG.debug(_("Going to try to soft delete %s"), instance_uuid)
 
-        if not _is_able_to_shutdown(instance):
-            return
-
         # NOTE(jerdfelt): The compute daemon handles reclaiming instances
         # that are in soft delete. If there is no host assigned, there is
         # no daemon to reclaim, so delete it immediately.
@@ -806,20 +809,18 @@ def _delete(self, context, instance):
         else:
             self.db.instance_destroy(context, instance['id'])
 
-    @scheduler_api.reroute_compute("delete")
+    @check_vm_state(state_checker.DELETE)
+    @scheduler_api.reroute_compute(state_checker.DELETE)
     def delete(self, context, instance):
         """Terminate an instance."""
         LOG.debug(_("Going to try to terminate %s"), instance["uuid"])
 
-        if not _is_able_to_shutdown(instance):
-            return
-
         self._delete(context, instance)
 
-    @scheduler_api.reroute_compute("restore")
+    @check_vm_state(state_checker.RESTORE)
+    @scheduler_api.reroute_compute(state_checker.RESTORE)
     def restore(self, context, instance):
         """Restore a previously deleted (but not reclaimed) instance."""
-
         if not _is_queued_delete(instance):
             return
 
@@ -837,7 +838,8 @@ def restore(self, context, instance):
             self._cast_compute_message('power_on_instance', context,
                                        instance['uuid'], host)
 
-    @scheduler_api.reroute_compute("force_delete")
+    @check_vm_state(state_checker.FORCE_DELETE)
+    @scheduler_api.reroute_compute(state_checker.FORCE_DELETE)
     def force_delete(self, context, instance):
         """Force delete a previously deleted (but not reclaimed) instance."""
 
@@ -846,15 +848,13 @@ def force_delete(self, context, instance):
 
         self._delete(context, instance)
 
-    @scheduler_api.reroute_compute("stop")
+    @check_vm_state(state_checker.STOP)
+    @scheduler_api.reroute_compute(state_checker.STOP)
     def stop(self, context, instance):
         """Stop an instance."""
         instance_uuid = instance["uuid"]
         LOG.debug(_("Going to try to stop %s"), instance_uuid)
 
-        if not _is_able_to_shutdown(instance):
-            return
-
         self.update(context,
                     instance,
                     vm_state=vm_states.ACTIVE,
@@ -867,6 +867,7 @@ def stop(self, context, instance):
             self._cast_compute_message('stop_instance', context,
                     instance_uuid, host)
 
+    @check_vm_state(state_checker.START)
     def start(self, context, instance):
         """Start an instance."""
         vm_state = instance["vm_state"]
@@ -1078,7 +1079,8 @@ def _find_host(self, context, instance_uuid):
         raise exception.Error(_("Unable to find host for Instance %s")
                                 % instance_uuid)
 
-    @scheduler_api.reroute_compute("backup")
+    @check_vm_state(state_checker.BACKUP)
+    @scheduler_api.reroute_compute(state_checker.BACKUP)
     def backup(self, context, instance, name, backup_type, rotation,
                extra_properties=None):
         """Backup the given instance
@@ -1095,7 +1097,8 @@ def backup(self, context, instance, name, backup_type, rotation,
                             extra_properties=extra_properties)
         return recv_meta
 
-    @scheduler_api.reroute_compute("snapshot")
+    @check_vm_state(state_checker.SNAPSHOT)
+    @scheduler_api.reroute_compute(state_checker.SNAPSHOT)
     def snapshot(self, context, instance, name, extra_properties=None):
         """Snapshot the given instance.
 
@@ -1125,12 +1128,6 @@ def _create_image(self, context, instance, name, image_type,
         task_state = instance["task_state"]
         instance_uuid = instance['uuid']
 
-        if task_state == task_states.IMAGE_BACKUP:
-            raise exception.InstanceBackingUp(instance_uuid=instance_uuid)
-
-        if task_state == task_states.IMAGE_SNAPSHOT:
-            raise exception.InstanceSnapshotting(instance_uuid=instance_uuid)
-
         properties = {
             'instance_uuid': instance_uuid,
             'user_id': str(context.user_id),
@@ -1150,7 +1147,8 @@ def _create_image(self, context, instance, name, image_type,
                                    params=params)
         return recv_meta
 
-    @scheduler_api.reroute_compute("reboot")
+    @check_vm_state(state_checker.REBOOT)
+    @scheduler_api.reroute_compute(state_checker.REBOOT)
     def reboot(self, context, instance, reboot_type):
         """Reboot the given instance."""
         state = {'SOFT': task_states.REBOOTING,
@@ -1164,16 +1162,13 @@ def reboot(self, context, instance, reboot_type):
                                    instance['uuid'],
                                    params={'reboot_type': reboot_type})
 
-    @scheduler_api.reroute_compute("rebuild")
+    @check_vm_state(state_checker.REBUILD)
+    @scheduler_api.reroute_compute(state_checker.REBUILD)
     def rebuild(self, context, instance, image_href, admin_password,
                 name=None, metadata=None, files_to_inject=None):
         """Rebuild the given instance with the provided metadata."""
         name = name or instance["display_name"]
 
-        if instance["vm_state"] != vm_states.ACTIVE:
-            msg = _("Instance must be active to rebuild.")
-            raise exception.RebuildRequiresActiveInstance(msg)
-
         files_to_inject = files_to_inject or []
         metadata = metadata or {}
 
@@ -1199,7 +1194,8 @@ def rebuild(self, context, instance, image_href, admin_password,
                                    instance["uuid"],
                                    params=rebuild_params)
 
-    @scheduler_api.reroute_compute("revert_resize")
+    @check_vm_state(state_checker.REVERT_RESIZE)
+    @scheduler_api.reroute_compute(state_checker.REVERT_RESIZE)
     def revert_resize(self, context, instance):
         """Reverts a resize, deleting the 'new' instance in the process."""
         context = context.elevated()
@@ -1223,7 +1219,8 @@ def revert_resize(self, context, instance):
         self.db.migration_update(context, migration_ref['id'],
                                  {'status': 'reverted'})
 
-    @scheduler_api.reroute_compute("confirm_resize")
+    @check_vm_state(state_checker.CONFIRM_RESIZE)
+    @scheduler_api.reroute_compute(state_checker.CONFIRM_RESIZE)
     def confirm_resize(self, context, instance):
         """Confirms a migration/resize and deletes the 'old' instance."""
         context = context.elevated()
@@ -1249,7 +1246,8 @@ def confirm_resize(self, context, instance):
         self.db.instance_update(context, instance['uuid'],
                 {'host': migration_ref['dest_compute'], })
 
-    @scheduler_api.reroute_compute("resize")
+    @check_vm_state(state_checker.RESIZE)
+    @scheduler_api.reroute_compute(state_checker.RESIZE)
     def resize(self, context, instance, flavor_id=None):
         """Resize (ie, migrate) a running instance.
 
@@ -1330,7 +1328,8 @@ def add_network_to_project(self, context, project_id):
         # didn't raise so this is the correct zone
         self.network_api.add_network_to_project(context, project_id)
 
-    @scheduler_api.reroute_compute("pause")
+    @check_vm_state(state_checker.PAUSE)
+    @scheduler_api.reroute_compute(state_checker.PAUSE)
     def pause(self, context, instance):
         """Pause the given instance."""
         instance_uuid = instance["uuid"]
@@ -1340,7 +1339,8 @@ def pause(self, context, instance):
                     task_state=task_states.PAUSING)
         self._cast_compute_message('pause_instance', context, instance_uuid)
 
-    @scheduler_api.reroute_compute("unpause")
+    @check_vm_state(state_checker.UNPAUSE)
+    @scheduler_api.reroute_compute(state_checker.UNPAUSE)
     def unpause(self, context, instance):
         """Unpause the given instance."""
         instance_uuid = instance["uuid"]
@@ -1377,7 +1377,8 @@ def get_actions(self, context, instance):
         """Retrieve actions for the given instance."""
         return self.db.instance_get_actions(context, instance['id'])
 
-    @scheduler_api.reroute_compute("suspend")
+    @check_vm_state(state_checker.SUSPEND)
+    @scheduler_api.reroute_compute(state_checker.SUSPEND)
     def suspend(self, context, instance):
         """Suspend the given instance."""
         instance_uuid = instance["uuid"]
@@ -1387,7 +1388,8 @@ def suspend(self, context, instance):
                     task_state=task_states.SUSPENDING)
         self._cast_compute_message('suspend_instance', context, instance_uuid)
 
-    @scheduler_api.reroute_compute("resume")
+    @check_vm_state(state_checker.RESUME)
+    @scheduler_api.reroute_compute(state_checker.RESUME)
     def resume(self, context, instance):
         """Resume the given instance."""
         instance_uuid = instance["uuid"]
@@ -1397,7 +1399,8 @@ def resume(self, context, instance):
                     task_state=task_states.RESUMING)
         self._cast_compute_message('resume_instance', context, instance_uuid)
 
-    @scheduler_api.reroute_compute("rescue")
+    @check_vm_state(state_checker.RESCUE)
+    @scheduler_api.reroute_compute(state_checker.RESCUE)
     def rescue(self, context, instance, rescue_password=None):
         """Rescue the given instance."""
         self.update(context,
@@ -1412,7 +1415,8 @@ def rescue(self, context, instance, rescue_password=None):
                                    instance['uuid'],
                                    params=rescue_params)
 
-    @scheduler_api.reroute_compute("unrescue")
+    @check_vm_state(state_checker.UNRESCUE)
+    @scheduler_api.reroute_compute(state_checker.UNRESCUE)
     def unrescue(self, context, instance):
         """Unrescue the given instance."""
         self.update(context,

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
@@ -297,6 +297,34 @@ def _setup_block_device_mapping(self, context, instance):
 
         return (swap, ephemerals, block_device_mapping)
 
+    def _is_instance_terminated(self, instance_uuid):
+        """Instance in DELETING task state or not found in DB"""
+        context = nova.context.get_admin_context()
+        try:
+            instance = self.db.instance_get_by_uuid(context, instance_uuid)
+            if instance['task_state'] == task_states.DELETING:
+                return True
+            return False
+        except:
+            return True
+
+    def _shutdown_instance_even_if_deleted(self, context, instance_uuid):
+        """Call terminate_instance even for already deleted instances"""
+        LOG.info(_("Going to force the deletion of the vm %(instance_uuid)s, "
+                   "even if it is deleted") % locals())
+        try:
+            try:
+                self.terminate_instance(context, instance_uuid)
+            except exception.InstanceNotFound:
+                LOG.info(_("Instance %(instance_uuid)s did not exist in the "
+                         "DB, but I will shut it down anyway using a special "
+                         "context") % locals())
+                ctxt = nova.context.get_admin_context(True)
+                self.terminate_instance(ctxt, instance_uuid)
+        except Exception as ex:
+            LOG.info(_("exception terminating the instance "
+                     "%(instance_id)s") % locals())
+
     def _run_instance(self, context, instance_uuid,
                       requested_networks=None,
                       injected_files=[],
@@ -320,9 +348,14 @@ def _run_instance(self, context, instance_uuid,
                 with utils.save_and_reraise_exception():
                     self._deallocate_network(context, instance)
             self._notify_about_instance_usage(instance)
+            if self._is_instance_terminated(instance_uuid):
+                raise exception.InstanceNotFound
         except exception.InstanceNotFound:
             LOG.exception(_("Instance %s not found.") % instance_uuid)
-            return  # assuming the instance was already deleted
+            # assuming the instance was already deleted, run "delete" again
+            # just in case
+            self._shutdown_instance_even_if_deleted(context, instance_uuid)
+            return
         except Exception as e:
             with utils.save_and_reraise_exception():
                 self._instance_update(context, instance_uuid,