Make resource tracker use UUIDs instead of names
This makes the resource tracker look up and create ComputeNode objects
by uuid instead of nodename. For drivers like ironic that already
provide 'uuid' in the resources dict, we can use that. For those
that do not, we force the uuid to be the locally-persisted node
uuid, and use that to find/create the ComputeNode object.

A (happy) side-effect of this is that if we find a deleted compute
node record matching our hypervisor's node, we undelete it instead
of re-creating one with a new uuid, which could clash with the old
record. This means we can remove some of the special-casing of the
ironic rebalance path, although the tests for that still largely
stay the same.

Change-Id: I6a582a38c302fd1554a49abc38cfeda7c324d911
kk7ds committed Jan 30, 2023
1 parent 0efdbb3 commit 23c5f3d
Showing 9 changed files with 195 additions and 169 deletions.
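
At a high level, the change keys compute node lookup and creation on the node
uuid. A minimal sketch of that flow, assuming the nova modules used in the
diff below (the helper names resolve_node_uuid and lookup_compute_node are
illustrative, not part of the change):

    from nova import exception
    from nova import objects
    from nova.virt import node


    def resolve_node_uuid(resources):
        # Drivers such as ironic already report a per-node 'uuid' in the
        # resources dict; everyone else falls back to the locally-persisted
        # compute node uuid for this host.
        if 'uuid' not in resources:
            resources['uuid'] = node.get_local_node_uuid()
        return resources['uuid']


    def lookup_compute_node(context, node_uuid):
        # Look the record up by uuid rather than (host, nodename); not
        # finding one means a new ComputeNode record will be created with
        # that uuid.
        try:
            return objects.ComputeNode.get_by_uuid(context, node_uuid)
        except exception.NotFound:
            return None
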
31 changes: 16 additions & 15 deletions nova/compute/manager.py
@@ -97,6 +97,7 @@
from nova.virt import driver
from nova.virt import event as virtevent
from nova.virt import hardware
import nova.virt.node
from nova.virt import storage_users
from nova.virt import virtapi
from nova.volume import cinder
@@ -1471,27 +1472,27 @@ def _get_nodes(self, context):
:return: a dict of ComputeNode objects keyed by the UUID of the given
node.
"""
nodes_by_uuid = {}
try:
node_names = self.driver.get_available_nodes()
node_uuids = self.driver.get_available_node_uuids()
except exception.VirtDriverNotReady:
LOG.warning(
"Virt driver is not ready. If this is the first time this "
"service is starting on this host, then you can ignore this "
"warning.")
"service is starting on this host, then you can ignore "
"this warning.")
return {}

for node_name in node_names:
try:
node = objects.ComputeNode.get_by_host_and_nodename(
context, self.host, node_name)
nodes_by_uuid[node.uuid] = node
except exception.ComputeHostNotFound:
LOG.warning(
"Compute node %s not found in the database. If this is "
"the first time this service is starting on this host, "
"then you can ignore this warning.", node_name)
return nodes_by_uuid
nodes = objects.ComputeNodeList.get_all_by_uuids(context, node_uuids)
if not nodes:
# NOTE(danms): This should only happen if the compute_id is
# pre-provisioned on a host that has never started.
LOG.warning('Compute nodes %s for host %s were not found in the '
'database. If this is the first time this service is '
'starting on this host, then you can ignore this '
'warning.',
node_uuids, self.host)
return {}

return {n.uuid: n for n in nodes}

def _ensure_existing_node_identity(self, service_ref):
"""If we are upgrading from an older service version, we need
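
For context, the reworked _get_nodes() above replaces the per-node
get_by_host_and_nodename() calls with one bulk query keyed on uuid. A rough
standalone sketch of the same shape (illustrative only; the
VirtDriverNotReady handling shown in the hunk is omitted):

    from nova import objects


    def get_nodes_by_uuid(driver, context):
        # Ask the virt driver for the uuids of the nodes it manages, then
        # fetch the matching ComputeNode records in a single query and key
        # the result by uuid.
        node_uuids = driver.get_available_node_uuids()
        nodes = objects.ComputeNodeList.get_all_by_uuids(context, node_uuids)
        if not nodes:
            # Expected only the first time the service starts on this host.
            return {}
        return {n.uuid: n for n in nodes}
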
83 changes: 31 additions & 52 deletions nova/compute/resource_tracker.py
@@ -49,6 +49,7 @@
from nova.scheduler.client import report
from nova import utils
from nova.virt import hardware
from nova.virt import node


CONF = nova.conf.CONF
@@ -668,50 +669,6 @@ def disabled(self, nodename):
return (nodename not in self.compute_nodes or
not self.driver.node_is_available(nodename))

def _check_for_nodes_rebalance(self, context, resources, nodename):
"""Check if nodes rebalance has happened.
The ironic driver maintains a hash ring mapping bare metal nodes
to compute nodes. If a compute dies, the hash ring is rebuilt, and
some of its bare metal nodes (more precisely, those not in ACTIVE
state) are assigned to other computes.
This method checks for this condition and adjusts the database
accordingly.
:param context: security context
:param resources: initial values
:param nodename: node name
:returns: True if a suitable compute node record was found, else False
"""
if not self.driver.rebalances_nodes:
return False

# Its possible ironic just did a node re-balance, so let's
# check if there is a compute node that already has the correct
# hypervisor_hostname. We can re-use that rather than create a
# new one and have to move existing placement allocations
cn_candidates = objects.ComputeNodeList.get_by_hypervisor(
context, nodename)

if len(cn_candidates) == 1:
cn = cn_candidates[0]
LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
{"name": nodename, "old": cn.host, "new": self.host})
cn.host = self.host
self.compute_nodes[nodename] = cn
self._copy_resources(cn, resources)
self._setup_pci_tracker(context, cn, resources)
self._update(context, cn)
return True
elif len(cn_candidates) > 1:
LOG.error(
"Found more than one ComputeNode for nodename %s. "
"Please clean up the orphaned ComputeNode records in your DB.",
nodename)

return False

def _init_compute_node(self, context, resources):
"""Initialize the compute node if it does not already exist.
@@ -729,6 +686,7 @@ def _init_compute_node(self, context, resources):
False otherwise
"""
nodename = resources['hypervisor_hostname']
node_uuid = resources['uuid']

# if there is already a compute node just use resources
# to initialize
@@ -740,16 +698,30 @@

# now try to get the compute node record from the
# database. If we get one we use resources to initialize
cn = self._get_compute_node(context, nodename)

# We use read_deleted=True so that we will find and recover a deleted
# node object, if necessary.
with utils.temporary_mutation(context, read_deleted='yes'):
cn = self._get_compute_node(context, node_uuid)
if cn and cn.deleted:
# Undelete and save this right now so that everything below
# can continue without read_deleted=yes
LOG.info('Undeleting compute node %s', cn.uuid)
cn.deleted = False
cn.deleted_at = None
cn.save()
if cn:
if cn.host != self.host:
LOG.info("ComputeNode %(name)s moving from %(old)s to %(new)s",
{"name": nodename, "old": cn.host, "new": self.host})
cn.host = self.host
self._update(context, cn)

self.compute_nodes[nodename] = cn
self._copy_resources(cn, resources)
self._setup_pci_tracker(context, cn, resources)
return False

if self._check_for_nodes_rebalance(context, resources, nodename):
return False

# there was no local copy and none in the database
# so we need to create a new compute node. This needs
# to be initialized with resource values.
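
The read_deleted handling in the hunk above is what implements the "undelete"
behaviour from the commit message. A condensed sketch of just that piece
(simplified; find_or_undelete is an illustrative wrapper around the resource
tracker's own _get_compute_node):

    from nova import utils


    def find_or_undelete(resource_tracker, context, node_uuid):
        # Temporarily read deleted rows so that a soft-deleted ComputeNode
        # with a matching uuid is revived instead of being replaced by a new
        # record with a fresh uuid.
        with utils.temporary_mutation(context, read_deleted='yes'):
            cn = resource_tracker._get_compute_node(context, node_uuid)
            if cn and cn.deleted:
                cn.deleted = False
                cn.deleted_at = None
                cn.save()
        return cn
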
@@ -889,6 +861,14 @@ def update_available_resource(self, context, nodename, startup=False):
# contains a non-None value, even for non-Ironic nova-compute hosts. It
# is this value that will be populated in the compute_nodes table.
resources['host_ip'] = CONF.my_ip
if 'uuid' not in resources:
# NOTE(danms): Any driver that does not provide a uuid per
# node gets the locally-persistent compute_id. Only ironic
# should be setting the per-node uuid (and returning
# multiple nodes in general). If this is the first time we
# are creating a compute node on this host, we will
# generate and persist this uuid for the future.
resources['uuid'] = node.get_local_node_uuid()

# We want the 'cpu_info' to be None from the POV of the
# virt driver, but the DB requires it to be non-null so
@@ -1014,14 +994,13 @@ def _update_available_resource(self, context, resources, startup=False):
if startup:
self._check_resources(context)

def _get_compute_node(self, context, nodename):
def _get_compute_node(self, context, node_uuid):
"""Returns compute node for the host and nodename."""
try:
return objects.ComputeNode.get_by_host_and_nodename(
context, self.host, nodename)
return objects.ComputeNode.get_by_uuid(context, node_uuid)
except exception.NotFound:
LOG.warning("No compute node record for %(host)s:%(node)s",
{'host': self.host, 'node': nodename})
{'host': self.host, 'node': node_uuid})

def _report_hypervisor_resource_view(self, resources):
"""Log the hypervisor's view of free resources.
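
The 'uuid' fallback added to update_available_resource() above is the path
every non-ironic driver takes. A short usage sketch, assuming a made-up
minimal resources dict:

    from nova.virt import node

    # Hypothetical resources dict from a non-ironic driver.
    resources = {'hypervisor_hostname': 'compute-1.example.org'}
    if 'uuid' not in resources:
        # Non-ironic drivers report no per-node uuid, so reuse (or generate
        # and persist) the node-local uuid for this host.
        resources['uuid'] = node.get_local_node_uuid()
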
7 changes: 5 additions & 2 deletions nova/objects/compute_node.py
@@ -388,8 +388,11 @@ def update_from_virt_driver(self, resources):
# The uuid field is read-only so it should only be set when
# creating the compute node record for the first time. Ignore
# it otherwise.
if key == 'uuid' and 'uuid' in self:
continue
if (key == 'uuid' and 'uuid' in self and
resources[key] != self.uuid):
raise exception.InvalidNodeConfiguration(
reason='Attempt to overwrite node %s with %s!' % (
self.uuid, resources[key]))
setattr(self, key, resources[key])

# supported_instances has a different name in compute_node
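
The change to update_from_virt_driver() above turns a silent skip into a hard
failure when the driver reports a uuid that differs from the one already
recorded. A simplified sketch of the new guard (check_reported_uuid is
illustrative, not part of the object code):

    from nova import exception


    def check_reported_uuid(current_uuid, reported_uuid):
        # Refuse to overwrite an existing compute node uuid with a different
        # value coming out of the virt driver's resources dict.
        if current_uuid is not None and reported_uuid != current_uuid:
            raise exception.InvalidNodeConfiguration(
                reason='Attempt to overwrite node %s with %s!' % (
                    current_uuid, reported_uuid))
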
3 changes: 3 additions & 0 deletions nova/tests/functional/libvirt/base.py
@@ -181,6 +181,9 @@ def _start_compute(hostname, host_info):
with mock.patch('nova.virt.node.get_local_node_uuid') as m:
m.return_value = str(getattr(uuids, 'node_%s' % hostname))
self.computes[hostname] = _start_compute(hostname, host_info)
# We need to trigger libvirt.Host() to capture the node-local
# uuid while we have it mocked out.
self.computes[hostname].driver._host.get_node_uuid()

self.compute_rp_uuids[hostname] = self.placement.get(
'/resource_providers?name=%s' % hostname).body[
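
As the fixture change above shows, the node-local uuid must be mocked before
the compute service, and the libvirt Host object it creates, first reads it.
A trimmed sketch of that pattern (assumes the _start_compute helper and
host_info from the hunk above are in scope):

    from unittest import mock

    from oslo_utils.fixture import uuidsentinel as uuids

    with mock.patch('nova.virt.node.get_local_node_uuid') as m:
        m.return_value = str(uuids.node_compute1)
        compute = _start_compute('compute1', host_info)  # fixture helper above
        # Force libvirt.Host to read (and cache) the node uuid while the mock
        # is still in place.
        compute.driver._host.get_node_uuid()
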
39 changes: 17 additions & 22 deletions nova/tests/unit/compute/test_compute_mgr.py
@@ -1160,26 +1160,25 @@ def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
self.context, {active_instance.uuid, evacuating_instance.uuid},
mock_get_nodes.return_value.keys())

@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(objects.ComputeNodeList, 'get_all_by_uuids')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_get_nodes(self, mock_driver_get_nodes, mock_get_by_host_and_node):
def test_get_nodes(self, mock_driver_get_nodes, mock_get_by_uuid):
mock_driver_get_nodes.return_value = ['fake_node1', 'fake_node2']
# NOTE(danms): The fake driver, by default, uses
# uuidsentinel.$node_name, so we can predict the uuids it will
# return here.
cn1 = objects.ComputeNode(uuid=uuids.fake_node1)
cn2 = objects.ComputeNode(uuid=uuids.fake_node2)
mock_get_by_host_and_node.side_effect = [cn1, cn2]
mock_get_by_uuid.return_value = [cn1, cn2]

nodes = self.compute._get_nodes(self.context)

self.assertEqual({uuids.fake_node1: cn1, uuids.fake_node2: cn2}, nodes)

mock_driver_get_nodes.assert_called_once_with()
mock_get_by_host_and_node.assert_has_calls([
mock.call(self.context, self.compute.host, 'fake_node1'),
mock.call(self.context, self.compute.host, 'fake_node2')
])
mock_get_by_uuid.assert_called_once_with(self.context,
[uuids.fake_node1,
uuids.fake_node2])

@mock.patch.object(manager.LOG, 'warning')
@mock.patch.object(
@@ -1201,29 +1200,25 @@ def test_get_nodes_driver_not_ready(
"is starting on this host, then you can ignore this warning.")

@mock.patch.object(manager.LOG, 'warning')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
@mock.patch.object(objects.ComputeNodeList, 'get_all_by_uuids')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_node_uuids')
def test_get_nodes_node_not_found(
self, mock_driver_get_nodes, mock_get_by_host_and_node,
self, mock_driver_get_nodes, mock_get_all_by_uuids,
mock_log_warning):
mock_driver_get_nodes.return_value = ['fake-node1', 'fake-node2']
cn2 = objects.ComputeNode(uuid=uuids.cn2)
mock_get_by_host_and_node.side_effect = [
exception.ComputeHostNotFound(host='fake-node1'), cn2]
mock_driver_get_nodes.return_value = ['fake-node1']
mock_get_all_by_uuids.return_value = []

nodes = self.compute._get_nodes(self.context)

self.assertEqual({uuids.cn2: cn2}, nodes)
self.assertEqual({}, nodes)

mock_driver_get_nodes.assert_called_once_with()
mock_get_by_host_and_node.assert_has_calls([
mock.call(self.context, self.compute.host, 'fake-node1'),
mock.call(self.context, self.compute.host, 'fake-node2')
])
mock_get_all_by_uuids.assert_called_once_with(self.context,
['fake-node1'])
mock_log_warning.assert_called_once_with(
"Compute node %s not found in the database. If this is the first "
"time this service is starting on this host, then you can ignore "
"this warning.", 'fake-node1')
"Compute nodes %s for host %s were not found in the database. "
"If this is the first time this service is starting on this host, "
"then you can ignore this warning.", ['fake-node1'], 'fake-mini')

def test_init_host_disk_devices_configuration_failure(self):
self.flags(max_disk_devices_to_attach=0, group='compute')
