Skip to content

Commit

Permalink
Make error notifications more consistent.
Browse files Browse the repository at this point in the history
The error notifications sent by wrap_exception are a good source of
information when tracking down errors in the system. It is easy to spot
patterns in the errors by looking at similarities in the exceptions
raised.

This change looks at making the in-line error notifications
follow the same format, so they become equally useful.

Closes-Bug: #1251602
Change-Id: I1cd64a90ecefc64d12af05487eb6e45aa0362d69
  • Loading branch information
JohnGarbutt authored and John Garbutt committed Feb 28, 2014
1 parent cb31014 commit f35c63d
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 70 deletions.
91 changes: 44 additions & 47 deletions nova/compute/manager.py
Expand Up @@ -958,13 +958,13 @@ def _run_instance(self, context, request_spec,

extra_usage_info = {}

def notify(status, msg="", **kwargs):
def notify(status, msg="", fault=None, **kwargs):
"""Send a create.{start,error,end} notification."""
type_ = "create.%(status)s" % dict(status=status)
info = extra_usage_info.copy()
info['message'] = unicode(msg)
self._notify_about_instance_usage(context, instance, type_,
extra_usage_info=info, **kwargs)
extra_usage_info=info, fault=fault, **kwargs)

try:
self._prebuild_instance(context, instance)
Expand All @@ -985,7 +985,7 @@ def notify(status, msg="", **kwargs):

except exception.RescheduledException as e:
# Instance build encountered an error, and has been rescheduled.
notify("error", msg=unicode(e)) # notify that build failed
notify("error", fault=e)

except exception.BuildAbortException as e:
# Instance build aborted due to a non-failure
Expand All @@ -996,7 +996,7 @@ def notify(status, msg="", **kwargs):
# Instance build encountered a non-recoverable error:
with excutils.save_and_reraise_exception():
self._set_instance_error_state(context, instance['uuid'])
notify("error", msg=unicode(e)) # notify that build failed
notify("error", fault=e) # notify that build failed

def _prebuild_instance(self, context, instance):
self._check_instance_exists(context, instance)
Expand Down Expand Up @@ -1128,6 +1128,8 @@ def _reschedule_or_error(self, context, instance, exc_info,

compute_utils.add_instance_fault_from_exc(context, self.conductor_api,
instance, exc_info[1], exc_info=exc_info)
self._notify_about_instance_usage(context, instance,
'instance.create.error', fault=exc_info[1])

try:
LOG.debug(_("Clean up resource before rescheduling."),
Expand Down Expand Up @@ -1515,12 +1517,12 @@ def _set_access_ip_values():

def _notify_about_instance_usage(self, context, instance, event_suffix,
network_info=None, system_metadata=None,
extra_usage_info=None):
extra_usage_info=None, fault=None):
compute_utils.notify_about_instance_usage(
self.notifier, context, instance, event_suffix,
network_info=network_info,
system_metadata=system_metadata,
extra_usage_info=extra_usage_info)
extra_usage_info=extra_usage_info, fault=fault)

def _deallocate_network(self, context, instance,
requested_networks=None):
Expand Down Expand Up @@ -1697,45 +1699,36 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
'create.end',
extra_usage_info={'message': _('Success')},
network_info=network_info)
except exception.InstanceNotFound as e:
with excutils.save_and_reraise_exception():
self._notify_about_instance_usage(context, instance,
'create.end',
extra_usage_info={'message': e.format_message()})
except exception.UnexpectedDeletingTaskStateError as e:
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
with excutils.save_and_reraise_exception():
msg = e.format_message()
self._notify_about_instance_usage(context, instance,
'create.end', extra_usage_info={'message': msg})
'create.end', fault=e)
except exception.ComputeResourcesUnavailable as e:
LOG.debug(e.format_message(), instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error',
extra_usage_info={'message': e.format_message()})
'create.error', fault=e)
raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=e.format_message())
except exception.BuildAbortException as e:
with excutils.save_and_reraise_exception():
LOG.debug(e.format_message, instance=instance)
LOG.debug(e.format_message(), instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', extra_usage_info={'message':
e.format_message()})
'create.error', fault=e)
except (exception.VirtualInterfaceCreateException,
exception.VirtualInterfaceMacAddressException,
exception.FixedIpLimitExceeded,
exception.NoMoreNetworks) as e:
LOG.exception(_('Failed to allocate network(s)'),
instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', extra_usage_info={'message':
e.format_message()})
'create.error', fault=e)
msg = _('Failed to allocate the network(s), not rescheduling.')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
except Exception as e:
self._notify_about_instance_usage(context, instance,
'create.error',
extra_usage_info={'message': str(e)})
'create.error', fault=e)
raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=str(e))

Expand Down Expand Up @@ -2400,25 +2393,23 @@ def bad_volumes_callback(bad_devices):
bad_volumes_callback=bad_volumes_callback)

except Exception as error:
# Can't use save_and_reraise as we don't know yet if we
# will re-raise or not
type_, value, tb = sys.exc_info()

compute_utils.add_instance_fault_from_exc(context,
self.conductor_api, instance, error,
sys.exc_info())

# if the reboot failed but the VM is running don't
# put it into an error state
new_power_state = self._get_power_state(context, instance)
if new_power_state == power_state.RUNNING:
LOG.warning(_('Reboot failed but instance is running'),
context=context, instance=instance)
else:
LOG.error(_('Cannot reboot instance: %s'), error,
context=context, instance=instance)
self._set_instance_obj_error_state(context, instance)
raise type_, value, tb
with excutils.save_and_reraise_exception() as ctxt:
exc_info = sys.exc_info()
# if the reboot failed but the VM is running don't
# put it into an error state
new_power_state = self._get_power_state(context, instance)
if new_power_state == power_state.RUNNING:
LOG.warning(_('Reboot failed but instance is running'),
context=context, instance=instance)
compute_utils.add_instance_fault_from_exc(context,
self.conductor_api, instance, error, exc_info)
self._notify_about_instance_usage(context, instance,
'reboot.error', fault=error)
ctxt.reraise = False
else:
LOG.error(_('Cannot reboot instance: %s'), error,
context=context, instance=instance)
self._set_instance_obj_error_state(context, instance)

if not new_power_state:
new_power_state = self._get_power_state(context, instance)
Expand Down Expand Up @@ -3127,9 +3118,6 @@ def _reschedule_resize_or_reraise(self, context, image, instance, exc_info,
rescheduled = False
instance_uuid = instance['uuid']

compute_utils.add_instance_fault_from_exc(context, self.conductor_api,
instance, exc_info[0], exc_info=exc_info)

try:
# NOTE(comstud): remove the scheduler RPCAPI method when
# this is adjusted to send to conductor... and then
Expand All @@ -3143,14 +3131,23 @@ def _reschedule_resize_or_reraise(self, context, image, instance, exc_info,
rescheduled = self._reschedule(context, request_spec,
filter_properties, instance_uuid, scheduler_method,
method_args, task_state, exc_info)
except Exception:
except Exception as error:
rescheduled = False
LOG.exception(_("Error trying to reschedule"),
instance_uuid=instance_uuid)
compute_utils.add_instance_fault_from_exc(context,
self.conductor_api, instance, error,
exc_info=sys.exc_info())
self._notify_about_instance_usage(context, instance,
'resize.error', fault=error)

if rescheduled:
# log the original build error
self._log_original_error(exc_info, instance_uuid)
compute_utils.add_instance_fault_from_exc(context,
self.conductor_api, instance, exc_info[1],
exc_info=exc_info)
self._notify_about_instance_usage(context, instance,
'resize.error', fault=exc_info[1])
else:
# not re-scheduling
raise exc_info[0], exc_info[1], exc_info[2]
Expand Down
44 changes: 34 additions & 10 deletions nova/compute/utils.py
Expand Up @@ -39,12 +39,11 @@
LOG = log.getLogger(__name__)


def add_instance_fault_from_exc(context, conductor,
instance, fault, exc_info=None):
"""Adds the specified fault to the database."""
def exception_to_dict(fault):
"""Converts exceptions to a dict for use in notifications."""
#TODO(johngarbutt) move to nova/exception.py to share with wrap_exception

code = 500

if hasattr(fault, "kwargs"):
code = fault.kwargs.get('code', 500)

Expand All @@ -65,17 +64,35 @@ def add_instance_fault_from_exc(context, conductor,
# MySQL silently truncates overly long messages, but PostgreSQL throws an
# error if we don't truncate it.
u_message = unicode(message)[:255]
details = ''

if exc_info and code == 500:
fault_dict = dict(exception=fault)
fault_dict["message"] = u_message
fault_dict["code"] = code
return fault_dict


def _get_fault_details(exc_info, error_code):
details = ''
if exc_info and error_code == 500:
tb = exc_info[2]
details += ''.join(traceback.format_tb(tb))
if tb:
details = ''.join(traceback.format_tb(tb))
return unicode(details)


def add_instance_fault_from_exc(context, conductor,
instance, fault, exc_info=None):
"""Adds the specified fault to the database."""

fault_dict = exception_to_dict(fault)
code = fault_dict["code"]
details = _get_fault_details(exc_info, code)

values = {
'instance_uuid': instance['uuid'],
'code': code,
'message': u_message,
'details': unicode(details),
'message': fault_dict["message"],
'details': details,
'host': CONF.host
}
conductor.instance_fault_create(context, values)
Expand Down Expand Up @@ -294,7 +311,7 @@ def notify_usage_exists(notifier, context, instance_ref, current_period=False,

def notify_about_instance_usage(notifier, context, instance, event_suffix,
network_info=None, system_metadata=None,
extra_usage_info=None):
extra_usage_info=None, fault=None):
"""Send a notification about an instance.
:param notifier: a messaging.Notifier
Expand All @@ -311,6 +328,13 @@ def notify_about_instance_usage(notifier, context, instance, event_suffix,
usage_info = notifications.info_from_instance(context, instance,
network_info, system_metadata, **extra_usage_info)

if fault:
# NOTE(johngarbutt) mirrors the format in wrap_exception
fault_payload = exception_to_dict(fault)
LOG.debug(fault_payload["message"], instance=instance,
exc_info=True)
usage_info.update(fault_payload)

if event_suffix.endswith("error"):
method = notifier.error
else:
Expand Down
14 changes: 9 additions & 5 deletions nova/tests/compute/test_compute.py
Expand Up @@ -2337,6 +2337,7 @@ def _test_reboot(self, soft,
'args': (econtext, instance, expected_nw_info,
reboot_type),
'kwargs': {'block_device_info': fake_block_dev_info}}
fault = exception.InstanceNotFound(instance_id='instance-0000')

def fake_reboot(*args, **kwargs):
reboot_call_info['args'] = args
Expand All @@ -2347,7 +2348,7 @@ def fake_reboot(*args, **kwargs):
# can't stub it out, thus we skip that comparison.
kwargs.pop('bad_volumes_callback')
if fail_reboot:
raise exception.InstanceNotFound(instance_id='instance-0000')
raise fault

self.stubs.Set(self.compute.driver, 'reboot', fake_reboot)

Expand All @@ -2362,15 +2363,16 @@ def fake_reboot(*args, **kwargs):
instance).AndReturn(fake_power_state3)

if test_delete:
fault = exception.InstanceNotFound(
instance_id=instance['uuid'])
db.instance_update_and_get_original(
econtext, updated_dbinstance1['uuid'],
{'power_state': new_power_state,
'task_state': None,
'vm_state': vm_states.ACTIVE},
update_cells=False,
columns_to_join=['system_metadata'],
).AndRaise(exception.InstanceNotFound(
instance_id=instance['uuid']))
).AndRaise(fault)
self.compute._notify_about_instance_usage(
econtext,
instance,
Expand All @@ -2381,8 +2383,7 @@ def fake_reboot(*args, **kwargs):
{'vm_state': vm_states.ERROR},
update_cells=False,
columns_to_join=['system_metadata'],
).AndRaise(exception.InstanceNotFound(
instance_id=instance['uuid']))
).AndRaise(fault)
else:
db.instance_update_and_get_original(
econtext, updated_dbinstance1['uuid'],
Expand All @@ -2392,6 +2393,9 @@ def fake_reboot(*args, **kwargs):
update_cells=False,
columns_to_join=['system_metadata'],
).AndReturn((None, updated_dbinstance2))
if fail_running:
self.compute._notify_about_instance_usage(econtext, instance,
'reboot.error', fault=fault)
self.compute._notify_about_instance_usage(
econtext,
instance,
Expand Down
15 changes: 7 additions & 8 deletions nova/tests/compute/test_compute_mgr.py
Expand Up @@ -1138,7 +1138,7 @@ def test_instance_not_found(self):
network_info=self.network_info,
block_device_info=self.block_device_info).AndRaise(exc)
self._notify_about_instance_usage('create.end',
extra_usage_info={'message': exc.format_message()}, stub=False)
fault=exc, stub=False)
conductor_rpcapi.ConductorAPI.instance_update(
self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor')
self.mox.ReplayAll()
Expand All @@ -1161,16 +1161,15 @@ def test_reschedule_on_exception(self):
self._notify_about_instance_usage('create.start',
extra_usage_info={'image_name': self.image.get('name')})
self._build_and_run_instance_update()
exc = test.TestingException()
self.compute.driver.spawn(self.context, self.instance, self.image,
self.injected_files, self.admin_pass,
network_info=self.network_info,
block_device_info=self.block_device_info).AndRaise(
test.TestingException())
block_device_info=self.block_device_info).AndRaise(exc)
conductor_rpcapi.ConductorAPI.instance_update(
self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor')
self._notify_about_instance_usage('create.error',
extra_usage_info={'message': str(test.TestingException())},
stub=False)
fault=exc, stub=False)
self.mox.ReplayAll()

self.assertRaises(exception.RescheduledException,
Expand Down Expand Up @@ -1214,7 +1213,7 @@ def test_spawn_network_alloc_failure(self):
mock.call(self.context, self.instance, 'create.start',
extra_usage_info={'image_name': self.image.get('name')}),
mock.call(self.context, self.instance, 'create.error',
extra_usage_info={'message': exc.format_message()})])
fault=exc)])

save.assert_has_calls([
mock.call(),
Expand Down Expand Up @@ -1249,7 +1248,7 @@ def instance_claim(self, context, instance, limits):
self._notify_about_instance_usage('create.start',
extra_usage_info={'image_name': self.image.get('name')})
self._notify_about_instance_usage('create.error',
extra_usage_info={'message': exc.format_message()}, stub=False)
fault=exc, stub=False)
self.compute.compute_task_api.build_instances(self.context,
[self.instance], self.image, [], self.admin_pass,
self.injected_files, self.requested_networks,
Expand Down Expand Up @@ -1283,7 +1282,7 @@ def test_build_resources_buildabort_reraise(self):
self.requested_networks, self.security_groups, self.image,
self.block_device_mapping).AndRaise(exc)
self._notify_about_instance_usage('create.error',
extra_usage_info={'message': exc.format_message()}, stub=False)
fault=exc, stub=False)
self.mox.ReplayAll()
self.assertRaises(exception.BuildAbortException,
self.compute._build_and_run_instance, self.context,
Expand Down

0 comments on commit f35c63d

Please sign in to comment.