rt: use a single ResourceTracker object instance
This patch removes the ResourceTracker.nodename attribute and switches the
compute manager to create a single ResourceTracker object instance, which
itself now holds an in-memory dict of the ComputeNode objects managed by
the nova-compute daemon.

This isolates the code that manages ComputeNode objects and resources in
just the resource tracker, which will make it possible for the scheduler
report client to manage Ironic nodes and custom resource classes
properly. The scheduler report client contains a cache of
ResourceProvider, Inventory, and Allocation records. We definitely did
not want to have multiple ResourceTracker object instances, each with
its own cache of ResourceProvider objects. Having a single
ResourceTracker and single scheduler report client is both more
efficient and a cleaner interface into the compute manager.
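
For orientation, here is a minimal sketch (illustrative only, not the code in
this patch; class and attribute names are simplified stand-ins) of the
resulting shape: one ResourceTracker per compute manager, created lazily and
holding an in-memory dict of ComputeNode records keyed by nodename.

# Illustrative sketch only -- simplified stand-ins, not the real nova classes.

class ComputeNode(object):
    """Minimal stand-in for nova.objects.ComputeNode."""

    def __init__(self, nodename):
        self.hypervisor_hostname = nodename


class ResourceTracker(object):
    """One tracker per nova-compute host; no per-node tracker instances."""

    def __init__(self, host, driver):
        self.host = host
        self.driver = driver
        # In-memory map of nodename -> ComputeNode managed by this daemon.
        self.compute_nodes = {}

    def update_available_resource(self, context, nodename):
        # Look up (or lazily create) the ComputeNode record for this node.
        cn = self.compute_nodes.setdefault(nodename, ComputeNode(nodename))
        # A real tracker would refresh inventory and usage for `cn` here,
        # via the single scheduler report client.
        return cn


class ComputeManager(object):
    """Compute manager keeps exactly one tracker, created lazily."""

    def __init__(self, host, driver):
        self.host = host
        self.driver = driver
        self._resource_tracker = None

    def _get_resource_tracker(self):
        if not self._resource_tracker:
            self._resource_tracker = ResourceTracker(self.host, self.driver)
        return self._resource_tracker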

Co-Authored-By: Chris Dent <cdent@anticdent.org>
Change-Id: I6827137f35c0cb4f9fc4c6f753d9a035326ed01b
blueprint: custom-resource-classes
jaypipes and cdent committed Jan 16, 2017
1 parent f24af19 commit 1c96759
Showing 10 changed files with 342 additions and 494 deletions.
15 changes: 8 additions & 7 deletions nova/compute/claims.py
@@ -74,11 +74,12 @@ class Claim(NopClaim):
     correct decisions with respect to host selection.
     """
 
-    def __init__(self, context, instance, tracker, resources, pci_requests,
-                 overhead=None, limits=None):
+    def __init__(self, context, instance, nodename, tracker, resources,
+                 pci_requests, overhead=None, limits=None):
         super(Claim, self).__init__()
         # Stash a copy of the instance at the current point of time
         self.instance = instance.obj_clone()
+        self.nodename = nodename
         self._numa_topology_loaded = False
         self.tracker = tracker
         self._pci_requests = pci_requests
@@ -122,7 +123,7 @@ def abort(self):
         """
         LOG.debug("Aborting claim: %s", self, instance=self.instance)
         self.tracker.abort_instance_claim(self.context, self.instance,
-                                          self.instance.node)
+                                          self.nodename)
 
     def _claim_test(self, resources, limits=None):
         """Test if this claim can be satisfied given available resources and
@@ -260,14 +261,14 @@ class MoveClaim(Claim):
     Move can be either a migrate/resize, live-migrate or an evacuate operation.
     """
-    def __init__(self, context, instance, instance_type, image_meta, tracker,
-                 resources, pci_requests, overhead=None, limits=None):
+    def __init__(self, context, instance, nodename, instance_type, image_meta,
+                 tracker, resources, pci_requests, overhead=None, limits=None):
         self.context = context
         self.instance_type = instance_type
         if isinstance(image_meta, dict):
             image_meta = objects.ImageMeta.from_dict(image_meta)
         self.image_meta = image_meta
-        super(MoveClaim, self).__init__(context, instance, tracker,
+        super(MoveClaim, self).__init__(context, instance, nodename, tracker,
                                         resources, pci_requests,
                                         overhead=overhead, limits=limits)
         self.migration = None
@@ -298,6 +299,6 @@ def abort(self):
         LOG.debug("Aborting claim: %s", self, instance=self.instance)
         self.tracker.drop_move_claim(
             self.context,
-            self.instance, self.instance.node,
+            self.instance, self.nodename,
             instance_type=self.instance_type)
         self.instance.drop_migration_context()
56 changes: 21 additions & 35 deletions nova/compute/manager.py
@@ -514,7 +514,7 @@ def __init__(self, compute_driver=None, *args, **kwargs):
         self.consoleauth_rpcapi = consoleauth.rpcapi.ConsoleAuthAPI()
         self.cells_rpcapi = cells_rpcapi.CellsAPI()
         self.scheduler_client = scheduler_client.SchedulerClient()
-        self._resource_tracker_dict = {}
+        self._resource_tracker = None
         self.instance_events = InstanceEvents()
         self._sync_power_pool = eventlet.GreenPool(
             size=CONF.sync_power_state_pool_size)
@@ -547,26 +547,17 @@ def reset(self):
         compute_rpcapi.LAST_VERSION = None
         self.compute_rpcapi = compute_rpcapi.ComputeAPI()
 
-    def _get_resource_tracker(self, nodename):
-        rt = self._resource_tracker_dict.get(nodename)
-        if not rt:
-            if not self.driver.node_is_available(nodename):
-                raise exception.NovaException(
-                    _("%s is not a valid node managed by this "
-                      "compute host.") % nodename)
-
-            rt = resource_tracker.ResourceTracker(self.host,
-                                                  self.driver,
-                                                  nodename)
-            self._resource_tracker_dict[nodename] = rt
-        return rt
+    def _get_resource_tracker(self):
+        if not self._resource_tracker:
+            rt = resource_tracker.ResourceTracker(self.host, self.driver)
+            self._resource_tracker = rt
+        return self._resource_tracker
 
     def _update_resource_tracker(self, context, instance):
         """Let the resource tracker know that an instance has changed state."""
 
-        if (instance.host == self.host and
-                self.driver.node_is_available(instance.node)):
-            rt = self._get_resource_tracker(instance.node)
+        if instance.host == self.host:
+            rt = self._get_resource_tracker()
             rt.update_usage(context, instance, instance.node)
 
     def _instance_update(self, context, instance, **kwargs):
@@ -1899,7 +1890,7 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
         self._check_device_tagging(requested_networks, block_device_mapping)
 
         try:
-            rt = self._get_resource_tracker(node)
+            rt = self._get_resource_tracker()
             with rt.instance_claim(context, instance, node, limits):
                 # NOTE(russellb) It's important that this validation be done
                 # *after* the resource tracker instance claim, as that is where
@@ -2713,7 +2704,7 @@ def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
 
         LOG.info(_LI("Rebuilding instance"), instance=instance)
         if scheduled_node is not None:
-            rt = self._get_resource_tracker(scheduled_node)
+            rt = self._get_resource_tracker()
             rebuild_claim = rt.rebuild_claim
         else:
             rebuild_claim = claims.NopClaim
@@ -3505,7 +3496,7 @@ def _confirm_resize(self, context, instance, quotas,
         with migration.obj_as_admin():
             migration.save()
 
-        rt = self._get_resource_tracker(migration.source_node)
+        rt = self._get_resource_tracker()
         rt.drop_move_claim(context, instance, migration.source_node,
                            old_instance_type, prefix='old_')
         instance.drop_migration_context()
@@ -3597,7 +3588,7 @@ def revert_resize(self, context, instance, migration, reservations):
         instance.revert_migration_context()
         instance.save()
 
-        rt = self._get_resource_tracker(instance.node)
+        rt = self._get_resource_tracker()
         rt.drop_move_claim(context, instance, instance.node)
 
         self.compute_rpcapi.finish_revert_resize(context, instance,
@@ -3715,7 +3706,7 @@ def _prep_resize(self, context, image, instance, instance_type,
         instance.save()
 
         limits = filter_properties.get('limits', {})
-        rt = self._get_resource_tracker(node)
+        rt = self._get_resource_tracker()
         with rt.resize_claim(context, instance, instance_type, node,
                              image_meta=image, limits=limits) as claim:
             LOG.info(_LI('Migrating'), instance=instance)
@@ -4436,7 +4427,7 @@ def _unshelve_instance(self, context, instance, image, filter_properties,
             LOG.debug('No node specified, defaulting to %s', node,
                       instance=instance)
 
-        rt = self._get_resource_tracker(node)
+        rt = self._get_resource_tracker()
         limits = filter_properties.get('limits', {})
 
         shelved_image_ref = instance.image_ref
@@ -6532,7 +6523,7 @@ def _reclaim_queued_deletes(self, context):
 
     def update_available_resource_for_node(self, context, nodename):
 
-        rt = self._get_resource_tracker(nodename)
+        rt = self._get_resource_tracker()
         try:
             rt.update_available_resource(context, nodename)
         except exception.ComputeHostNotFound:
Expand All @@ -6544,18 +6535,17 @@ def update_available_resource_for_node(self, context, nodename):
# that this will resolve itself on the next run.
LOG.info(_LI("Compute node '%s' not found in "
"update_available_resource."), nodename)
self._resource_tracker_dict.pop(nodename, None)
# TODO(jaypipes): Yes, this is inefficient to throw away all of the
# compute nodes to force a rebuild, but this is only temporary
# until Ironic baremetal node resource providers are tracked
# properly in the report client and this is a tiny edge case
# anyway.
self._resource_tracker = None
return
except Exception:
LOG.exception(_LE("Error updating resources for node "
"%(node)s."), {'node': nodename})

# NOTE(comstud): Replace the RT cache before looping through
# compute nodes to delete below, as we can end up doing greenthread
# switches there. Best to have everyone using the newest cache
# ASAP.
self._resource_tracker_dict[nodename] = rt

@periodic_task.periodic_task(spacing=CONF.update_resources_interval)
def update_available_resource(self, context):
"""See driver.get_available_resource()
Expand All @@ -6572,10 +6562,6 @@ def update_available_resource(self, context):
for nodename in nodenames:
self.update_available_resource_for_node(context, nodename)

self._resource_tracker_dict = {
k: v for k, v in self._resource_tracker_dict.items()
if k in nodenames}

# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames: