Override update_available_resources interval
Add a config option 'update_resources_interval' to be able to set how
often the update_available_resources periodic task should run.

When a single nova-compute is managing a lot of nodes, we need the
ability to tune the frequency of this task.

The default value of 0 means the task runs on every periodic task
interval (the current behavior). A negative value prevents it from
running at all, and a positive value sets the number of seconds
between calls.
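
As a rough sketch of these semantics (the helper name and the assumed
60-second default periodic tick are illustrative only, not nova's
actual periodic_task implementation):

    # Illustrative sketch of the 0 / negative / positive semantics above.
    DEFAULT_PERIODIC_INTERVAL = 60.0  # assumed default periodic tick, in seconds

    def effective_spacing(update_resources_interval):
        """Map the config value to a run interval; None means never run."""
        if update_resources_interval < 0:
            return None                          # negative: task disabled
        if update_resources_interval == 0:
            return DEFAULT_PERIODIC_INTERVAL     # 0: run on every periodic tick
        return float(update_resources_interval)  # positive: roughly that many seconds

    assert effective_spacing(-1) is None
    assert effective_spacing(0) == 60.0
    assert effective_spacing(600) == 600.0

For example, an operator who wants this task to run roughly every ten
minutes would set update_resources_interval = 600 in nova.conf.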

DocImpact
This adds a new CONF option 'update_resources_interval' that controls
how often the update_available_resource() method is called.

Co-Authored-By: Ed Leafe <ed@leafe.com>

Change-Id: I8986d8d97dbf348e219c5f1f0a62092115c096d5
EdLeafe committed Mar 23, 2015
1 parent fd6071c commit 0279506
Showing 2 changed files with 96 additions and 7 deletions.
42 changes: 35 additions & 7 deletions nova/compute/manager.py
@@ -180,6 +180,14 @@
'that its view of instances is in sync with nova. If the '
'CONF option `scheduler_tracks_instance_changes` is '
'False, changing this option will have no effect.'),
cfg.IntOpt('update_resources_interval',
default=0,
help='Interval in seconds for updating compute resources. A '
'number less than 0 means to disable the task completely. '
'Leaving this at the default of 0 will cause this to run '
'at the default periodic interval. Setting it to any '
'positive value will cause it to run at intervals of '
'approximately that many seconds.'),
]

timeout_opts = [
@@ -6171,7 +6179,7 @@ def _reclaim_queued_deletes(self, context):
"instance: %s"),
e, instance=instance)

@periodic_task.periodic_task(spacing=CONF.update_resources_interval)
def update_available_resource(self, context):
"""See driver.get_available_resource()
@@ -6181,23 +6189,43 @@ def update_available_resource(self, context):
:param context: security context
"""
new_resource_tracker_dict = {}

# Delete orphan compute node not reported by driver but still in db
compute_nodes_in_db = self._get_compute_nodes_in_db(context,
use_slave=True)
nodenames = set(self.driver.get_available_nodes())
for nodename in nodenames:
rt = self._get_resource_tracker(nodename)
try:
rt.update_available_resource(context)
except exception.ComputeHostNotFound:
# NOTE(comstud): We can get to this case if a node was
# marked 'deleted' in the DB and then re-added with a
# different auto-increment id. The cached resource
# tracker tried to update a deleted record and failed.
# Don't add this resource tracker to the new dict, so
# that this will resolve itself on the next run.
LOG.info(_LI("Compute node '%s' not found in "
"update_available_resource."), nodename)
continue
except Exception as e:
LOG.error(_LE("Error updating resources for node "
"%(node)%s: %(e)s"),
{'node': nodename, 'e': e})
new_resource_tracker_dict[nodename] = rt

# NOTE(comstud): Replace the RT cache before looping through
# compute nodes to delete below, as we can end up doing greenthread
# switches there. Best to have everyone using the newest cache
# ASAP.
self._resource_tracker_dict = new_resource_tracker_dict

# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames:
LOG.info(_LI("Deleting orphan compute node %s") % cn.id)
cn.destroy()

def _get_compute_nodes_in_db(self, context, use_slave=False):
try:
return objects.ComputeNodeList.get_all_by_host(context, self.host,
61 changes: 61 additions & 0 deletions nova/tests/unit/compute/test_compute_mgr.py
@@ -52,6 +52,7 @@
from nova import utils
from nova.virt import driver as virt_driver
from nova.virt import event as virtevent
from nova.virt import fake as fake_driver
from nova.virt import hardware


@@ -122,6 +123,66 @@ def _mark_shutdown(*args, **kwargs):
'_shutdown_instance', 'delete'],
methods_called)

@mock.patch.object(manager.ComputeManager, '_get_resource_tracker')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
get_rt):
info = {'cn_id': 1}

def _make_compute_node(hyp_hostname):
cn = mock.Mock(spec_set=['hypervisor_hostname', 'id',
'destroy'])
cn.id = info['cn_id']
info['cn_id'] += 1
cn.hypervisor_hostname = hyp_hostname
return cn

def _make_rt(node):
n = mock.Mock(spec_set=['update_available_resource',
'nodename'])
n.nodename = node
return n

ctxt = mock.Mock()
db_nodes = [_make_compute_node('node1'),
_make_compute_node('node2'),
_make_compute_node('node3'),
_make_compute_node('node4')]
avail_nodes = set(['node2', 'node3', 'node4', 'node5'])
avail_nodes_l = list(avail_nodes)
rts = [_make_rt(node) for node in avail_nodes_l]
# Make the 2nd and 3rd ones raise
exc = exception.ComputeHostNotFound(host='fake')
rts[1].update_available_resource.side_effect = exc
exc = test.TestingException()
rts[2].update_available_resource.side_effect = exc
rts_iter = iter(rts)

def _get_rt_side_effect(*args, **kwargs):
return next(rts_iter)

expected_rt_dict = {avail_nodes_l[0]: rts[0],
avail_nodes_l[2]: rts[2],
avail_nodes_l[3]: rts[3]}
get_db_nodes.return_value = db_nodes
get_avail_nodes.return_value = avail_nodes
get_rt.side_effect = _get_rt_side_effect
self.compute.update_available_resource(ctxt)
get_db_nodes.assert_called_once_with(ctxt, use_slave=True)
self.assertEqual([mock.call(node) for node in avail_nodes],
get_rt.call_args_list)
for rt in rts:
rt.update_available_resource.assert_called_once_with(ctxt)
self.assertEqual(expected_rt_dict,
self.compute._resource_tracker_dict)
# node1 is in the DB but not reported by the driver, so it
# should have been deleted
for db_node in db_nodes:
if db_node.hypervisor_hostname == 'node1':
db_node.destroy.assert_called_once_with()
else:
self.assertFalse(db_node.destroy.called)

def test_allocate_network_succeeds_after_retries(self):
self.flags(network_allocate_retries=8)

