diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 0d1940b123b..0dfe7d44f60 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -180,6 +180,14 @@
                     'that its view of instances is in sync with nova. If the '
                     'CONF option `scheduler_tracks_instance_changes` is '
                     'False, changing this option will have no effect.'),
+    cfg.IntOpt('update_resources_interval',
+               default=0,
+               help='Interval in seconds for updating compute resources. A '
+                    'number less than 0 means to disable the task completely. '
+                    'Leaving this at the default of 0 will cause this to run '
+                    'at the default periodic interval. Setting it to any '
+                    'positive value will cause it to run at approximately '
+                    'that number of seconds.'),
 ]
 
 timeout_opts = [
@@ -6171,7 +6179,7 @@ def _reclaim_queued_deletes(self, context):
                                     "instance: %s"),
                                 e, instance=instance)
 
-    @periodic_task.periodic_task
+    @periodic_task.periodic_task(spacing=CONF.update_resources_interval)
     def update_available_resource(self, context):
         """See driver.get_available_resource()
 
@@ -6181,23 +6189,43 @@ def update_available_resource(self, context):
         :param context: security context
         """
         new_resource_tracker_dict = {}
+
+        # Delete orphan compute node not reported by driver but still in db
+        compute_nodes_in_db = self._get_compute_nodes_in_db(context,
+                                                            use_slave=True)
         nodenames = set(self.driver.get_available_nodes())
         for nodename in nodenames:
             rt = self._get_resource_tracker(nodename)
-            rt.update_available_resource(context)
+            try:
+                rt.update_available_resource(context)
+            except exception.ComputeHostNotFound:
+                # NOTE(comstud): We can get to this case if a node was
+                # marked 'deleted' in the DB and then re-added with a
+                # different auto-increment id. The cached resource
+                # tracker tried to update a deleted record and failed.
+                # Don't add this resource tracker to the new dict, so
+                # that this will resolve itself on the next run.
+                LOG.info(_LI("Compute node '%s' not found in "
+                             "update_available_resource."), nodename)
+                continue
+            except Exception as e:
+                LOG.error(_LE("Error updating resources for node "
+                              "%(node)s: %(e)s"),
+                          {'node': nodename, 'e': e})
             new_resource_tracker_dict[nodename] = rt
 
-        # Delete orphan compute node not reported by driver but still in db
-        compute_nodes_in_db = self._get_compute_nodes_in_db(context,
-                                                            use_slave=True)
+        # NOTE(comstud): Replace the RT cache before looping through
+        # compute nodes to delete below, as we can end up doing greenthread
+        # switches there. Best to have everyone using the newest cache
+        # ASAP.
+        self._resource_tracker_dict = new_resource_tracker_dict
 
+        # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
                 LOG.info(_LI("Deleting orphan compute node %s") % cn.id)
                 cn.destroy()
 
-        self._resource_tracker_dict = new_resource_tracker_dict
-
     def _get_compute_nodes_in_db(self, context, use_slave=False):
         try:
             return objects.ComputeNodeList.get_all_by_host(context, self.host,
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 066f287884a..81a8fb932d2 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -52,6 +52,7 @@
 from nova import utils
 from nova.virt import driver as virt_driver
 from nova.virt import event as virtevent
+from nova.virt import fake as fake_driver
 from nova.virt import hardware
 
 
@@ -122,6 +123,66 @@ def _mark_shutdown(*args, **kwargs):
                           '_shutdown_instance', 'delete'],
                          methods_called)
 
+    @mock.patch.object(manager.ComputeManager, '_get_resource_tracker')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
+                                       get_rt):
+        info = {'cn_id': 1}
+
+        def _make_compute_node(hyp_hostname):
+            cn = mock.Mock(spec_set=['hypervisor_hostname', 'id',
+                                     'destroy'])
+            cn.id = info['cn_id']
+            info['cn_id'] += 1
+            cn.hypervisor_hostname = hyp_hostname
+            return cn
+
+        def _make_rt(node):
+            n = mock.Mock(spec_set=['update_available_resource',
+                                    'nodename'])
+            n.nodename = node
+            return n
+
+        ctxt = mock.Mock()
+        db_nodes = [_make_compute_node('node1'),
+                    _make_compute_node('node2'),
+                    _make_compute_node('node3'),
+                    _make_compute_node('node4')]
+        avail_nodes = set(['node2', 'node3', 'node4', 'node5'])
+        avail_nodes_l = list(avail_nodes)
+        rts = [_make_rt(node) for node in avail_nodes_l]
+        # Make the 2nd and 3rd ones raise
+        exc = exception.ComputeHostNotFound(host='fake')
+        rts[1].update_available_resource.side_effect = exc
+        exc = test.TestingException()
+        rts[2].update_available_resource.side_effect = exc
+        rts_iter = iter(rts)
+
+        def _get_rt_side_effect(*args, **kwargs):
+            return next(rts_iter)
+
+        expected_rt_dict = {avail_nodes_l[0]: rts[0],
+                            avail_nodes_l[2]: rts[2],
+                            avail_nodes_l[3]: rts[3]}
+        get_db_nodes.return_value = db_nodes
+        get_avail_nodes.return_value = avail_nodes
+        get_rt.side_effect = _get_rt_side_effect
+        self.compute.update_available_resource(ctxt)
+        get_db_nodes.assert_called_once_with(ctxt, use_slave=True)
+        self.assertEqual([mock.call(node) for node in avail_nodes],
+                         get_rt.call_args_list)
+        for rt in rts:
+            rt.update_available_resource.assert_called_once_with(ctxt)
+        self.assertEqual(expected_rt_dict,
+                         self.compute._resource_tracker_dict)
+        # node1 is not reported by the driver, so it should be removed from DB
+        for db_node in db_nodes:
+            if db_node.hypervisor_hostname == 'node1':
+                db_node.destroy.assert_called_once_with()
+            else:
+                self.assertFalse(db_node.destroy.called)
+
     def test_allocate_network_succeeds_after_retries(self):
         self.flags(network_allocate_retries=8)
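
For context, the sketch below shows how an option like update_resources_interval feeds the periodic task machinery. It is illustrative only and not part of the patch: it assumes the oslo.service periodic_task module and a hypothetical ExampleManager class standing in for ComputeManager. Because the spacing argument is evaluated when the class body executes, the option's value is read at import time; per the option's help text, a negative value disables the task, 0 keeps the default periodic interval, and a positive value runs the task roughly every that many seconds.

    # Illustrative sketch only (not from the patch); assumes oslo.config and
    # oslo.service are installed. ExampleManager is a hypothetical stand-in
    # for nova.compute.manager.ComputeManager.
    from oslo_config import cfg
    from oslo_service import periodic_task

    CONF = cfg.CONF
    CONF.register_opts([cfg.IntOpt('update_resources_interval', default=0)])


    class ExampleManager(periodic_task.PeriodicTasks):
        def __init__(self):
            super(ExampleManager, self).__init__(CONF)

        # spacing < 0 disables the task, 0 uses the default periodic
        # interval, and a positive value runs it roughly every that many
        # seconds.
        @periodic_task.periodic_task(spacing=CONF.update_resources_interval)
        def update_available_resource(self, context):
            print('updating resources')


    if __name__ == '__main__':
        mgr = ExampleManager()
        # In a real service a looping call drives this repeatedly; one call
        # here just exercises the wiring.
        mgr.run_periodic_tasks(context=None)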