Allocate mdevs when resizing or reverting resize
Now that allocations are passed down to the resize and revert methods, we can
determine whether the instance needs mediated devices.

Also add a large functional test verifying that it works.

Change-Id: I018762335b19c98045ad42147080203092b51c27
Closes-Bug: #1778563
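
The idea can be illustrated with a minimal sketch (not the actual Nova code;
the helper name and the allocation payload below are made up for
illustration): given the Placement allocations now handed to the resize and
revert paths, the driver can tell whether the instance needs any mediated
devices.

    # Minimal sketch, not the Nova implementation: decide from the Placement
    # allocations (keyed by resource provider UUID) whether the instance
    # needs mediated devices (vGPUs) on the destination host.
    def needs_mdevs(allocations):
        return any(
            provider.get('resources', {}).get('VGPU', 0) > 0
            for provider in allocations.values())

    # Example payload shape, with made-up provider keys.
    allocations = {
        'compute-rp-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 4096}},
        'pgpu-rp-uuid': {'resources': {'VGPU': 1}},
    }
    assert needs_mdevs(allocations)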
sbauza committed Apr 20, 2020
1 parent 580eedb commit d2e0afc
Showing 5 changed files with 217 additions and 63 deletions.
15 changes: 13 additions & 2 deletions doc/source/admin/virtual-gpu.rst
@@ -363,17 +363,27 @@ For libvirt:
  vGPU resources). The proposed workaround is to rebuild the instance after
  resizing it. The rebuild operation allocates vGPUs to the instance (a
  scripted sketch of this workaround follows this list).

.. versionchanged:: 21.0.0

This has been resolved in the Ussuri release. See `bug 1778563`_.

* Cold migrating an instance to another host will have the same problem as
resize. If you want to migrate an instance, make sure to rebuild it after the
migration.

.. versionchanged:: 21.0.0

This has been resolved in the Ussuri release. See `bug 1778563`_.

* Rescue images do not use vGPUs. An instance being rescued does not keep its
vGPUs during rescue. During that time, another instance can receive those
vGPUs. This is a known issue. The recommended workaround is to rebuild an
instance immediately after rescue. However, rebuilding the rescued instance
only helps if there are other free vGPUs on the host.

.. note:: This has been resolved in the Rocky release [#]_.
.. versionchanged:: 18.0.0

This has been resolved in the Rocky release. See `bug 1762688`_.
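
For releases that are still affected, the workaround can be scripted. Below is
a hedged sketch using openstacksdk; the cloud name, server name, flavor and
image IDs are placeholders, and the exact ``rebuild_server`` signature is an
assumption that may differ between SDK releases.

.. code-block:: python

    import openstack

    # Illustrative sketch of the pre-fix workaround: resize, confirm, then
    # rebuild so that the vGPU gets allocated again. All names and IDs are
    # placeholders.
    conn = openstack.connect(cloud='mycloud')
    server = conn.compute.find_server('vgpu-instance')

    conn.compute.resize_server(server, 'new-flavor-id')
    conn.compute.wait_for_server(server, status='VERIFY_RESIZE')
    conn.compute.confirm_server_resize(server)
    conn.compute.wait_for_server(server, status='ACTIVE')

    # Rebuilding re-runs the allocation path, so the vGPU is attached again.
    # NOTE: the rebuild_server signature is assumed here and varies across
    # SDK releases; check the version you have installed.
    conn.compute.rebuild_server(server, name='vgpu-instance',
                                admin_password='', image='image-uuid')
    conn.compute.wait_for_server(server, status='ACTIVE')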

For XenServer:

@@ -397,7 +407,8 @@ For XenServer:

* Multiple GPU types per compute is not supported by the XenServer driver.

.. [#] https://bugs.launchpad.net/nova/+bug/1762688
.. _bug 1778563: https://bugs.launchpad.net/nova/+bug/1778563
.. _bug 1762688: https://bugs.launchpad.net/nova/+bug/1762688

.. Links
.. _Intel GVT-g: https://01.org/igvt-g
71 changes: 42 additions & 29 deletions nova/tests/functional/integrated_helpers.py
@@ -370,6 +370,43 @@ def _delete_server(self, server):
self.api.delete_server(server['id'])
self._wait_until_deleted(server)

def _confirm_resize(self, server):
self.api.post_server_action(server['id'], {'confirmResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_instance_action_event(
server, instance_actions.CONFIRM_RESIZE,
'compute_confirm_resize', 'success')
return server

def _revert_resize(self, server):
        # NOTE(sbauza): This method requires the caller to set up a fake
# notifier by stubbing it.
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server

def _migrate_or_resize(self, server, request):
if not ('resize' in request or 'migrate' in request):
raise Exception('_migrate_or_resize only supports resize or '
'migrate requests.')
self.api.post_server_action(server['id'], request)
self._wait_for_state_change(server, 'VERIFY_RESIZE')

def _resize_server(self, server, new_flavor):
resize_req = {
'resize': {
'flavorRef': new_flavor
}
}
self._migrate_or_resize(server, resize_req)


class _IntegratedTestBase(test.TestCase, InstanceHelperMixin):
REQUIRES_LOCKING = True
@@ -393,6 +430,9 @@ def setUp(self):
self.placement_api = placement.api
self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self))

fake_notifier.stub_notifier(self)
self.addCleanup(fake_notifier.reset)

self._setup_services()

self.addCleanup(nova.tests.unit.image.fake.FakeImageService_reset)
@@ -914,8 +954,7 @@ def _delete_and_check_allocations(self, server):

def _move_and_check_allocations(self, server, request, old_flavor,
new_flavor, source_rp_uuid, dest_rp_uuid):
self.api.post_server_action(server['id'], request)
self._wait_for_state_change(server, 'VERIFY_RESIZE')
self._migrate_or_resize(server, request)

def _check_allocation():
self.assertFlavorMatchesUsage(source_rp_uuid, old_flavor)
@@ -971,13 +1010,7 @@ def _resize_to_same_host_and_check_allocations(self, server, old_flavor,
# Resize the server to the same host and check usages in VERIFY_RESIZE
# state
self.flags(allow_resize_to_same_host=True)
resize_req = {
'resize': {
'flavorRef': new_flavor['id']
}
}
self.api.post_server_action(server['id'], resize_req)
self._wait_for_state_change(server, 'VERIFY_RESIZE')
self._resize_server(server, new_flavor['id'])

self.assertFlavorMatchesUsage(rp_uuid, old_flavor, new_flavor)

@@ -1044,23 +1077,3 @@ def assert_hypervisor_usage(self, compute_node_uuid, flavor,
# Account for reserved_host_cpus.
expected_vcpu_usage = CONF.reserved_host_cpus + flavor['vcpus']
self.assertEqual(expected_vcpu_usage, hypervisor['vcpus_used'])

def _confirm_resize(self, server):
self.api.post_server_action(server['id'], {'confirmResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_instance_action_event(
server, instance_actions.CONFIRM_RESIZE,
'compute_confirm_resize', 'success')
return server

def _revert_resize(self, server):
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server
138 changes: 121 additions & 17 deletions nova/tests/functional/libvirt/test_vgpu.py
@@ -14,6 +14,7 @@
import fixtures
import re

import mock
import os_resource_classes as orc
from oslo_config import cfg
from oslo_log import log as logging
@@ -24,17 +25,25 @@
from nova import objects
from nova.tests.functional.libvirt import base
from nova.tests.unit.virt.libvirt import fakelibvirt
from nova.virt.libvirt import driver as libvirt_driver
from nova.virt.libvirt import utils as libvirt_utils

CONF = cfg.CONF
LOG = logging.getLogger(__name__)


_DEFAULT_HOST = 'host1'


class VGPUTestBase(base.ServersTestBase):

FAKE_LIBVIRT_VERSION = 5000000
FAKE_QEMU_VERSION = 3001000

    # Since we run all computes in a single process, we need to track which
    # compute service is currently in use.
_current_host = _DEFAULT_HOST

def setUp(self):
super(VGPUTestBase, self).setUp()
self.useFixture(fixtures.MockPatch(
@@ -45,6 +54,25 @@ def setUp(self):
self.useFixture(fixtures.MockPatch(
'nova.privsep.libvirt.create_mdev',
side_effect=self._create_mdev))

        # NOTE(sbauza): Since the fake create_mdev doesn't know which compute
        # service was called, we need to record a value just before the driver
        # calls create_mdev. That's why we wrap the method below: it gives us
        # the LibvirtDriver instance so we can update self._current_host.
orig_get_vgpu_type_per_pgpu = (
libvirt_driver.LibvirtDriver._get_vgpu_type_per_pgpu)

def fake_get_vgpu_type_per_pgpu(_self, *args):
# See, here we look at the hostname from the virt driver...
self._current_host = _self._host.get_hostname()
# ... and then we call the original method
return orig_get_vgpu_type_per_pgpu(_self, *args)

self.useFixture(fixtures.MockPatch(
'nova.virt.libvirt.LibvirtDriver._get_vgpu_type_per_pgpu',
new=fake_get_vgpu_type_per_pgpu))

self.context = context.get_admin_context()

def pci2libvirt_address(self, address):
@@ -61,24 +89,30 @@ def _create_mdev(self, physical_device, mdev_type, uuid=None):
uuid = uuidutils.generate_uuid()
mdev_name = libvirt_utils.mdev_uuid2name(uuid)
libvirt_parent = self.pci2libvirt_address(physical_device)
self.fake_connection.mdev_info.devices.update(
        # Here we get the right compute connection thanks to
        # self._current_host, which was updated just before.
connection = self.computes[
self._current_host].driver._host.get_connection()
connection.mdev_info.devices.update(
{mdev_name: fakelibvirt.FakeMdevDevice(dev_name=mdev_name,
type_id=mdev_type,
parent=libvirt_parent)})
return uuid

def _start_compute_service(self, hostname):
self.fake_connection = self._get_connection(
fake_connection = self._get_connection(
host_info=fakelibvirt.HostInfo(cpu_nodes=2, kB_mem=8192),
# We want to create two pGPUs but no other PCI devices
pci_info=fakelibvirt.HostPCIDevicesInfo(num_pci=0,
num_pfs=0,
num_vfs=0,
num_mdevcap=2),
hostname=hostname)

self.mock_conn.return_value = self.fake_connection
compute = self.start_service('compute', host=hostname)
with mock.patch('nova.virt.libvirt.host.Host.get_connection',
return_value=fake_connection):
# this method will update a self.computes dict keyed by hostname
compute = self._start_compute(hostname)
compute.driver._host.get_connection = lambda: fake_connection
rp_uuid = self._get_provider_uuid_by_name(hostname)
rp_uuids = self._get_all_rp_uuids_in_a_tree(rp_uuid)
for rp in rp_uuids:
@@ -94,6 +128,11 @@ def _start_compute_service(self, hostname):

class VGPUTests(VGPUTestBase):

    # We want to be able to target specific hosts for the created instances
api_major_version = 'v2.1'
ADMIN_API = True
microversion = 'latest'

def setUp(self):
super(VGPUTests, self).setUp()
extra_spec = {"resources:VGPU": "1"}
@@ -103,23 +142,86 @@ def setUp(self):
self.flags(
enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
group='devices')
self.compute1 = self._start_compute_service('host1')

# for the sake of resizing, we need to patch the two methods below
self.useFixture(fixtures.MockPatch(
'nova.virt.libvirt.LibvirtDriver._get_instance_disk_info',
return_value=[]))
self.useFixture(fixtures.MockPatch('os.rename'))

self.compute1 = self._start_compute_service(_DEFAULT_HOST)

def assert_vgpu_usage_for_compute(self, compute, expected):
total_usage = 0
# We only want to get mdevs that are assigned to instances
mdevs = compute.driver._get_all_assigned_mediated_devices()
for mdev in mdevs:
mdev_name = libvirt_utils.mdev_uuid2name(mdev)
mdev_info = compute.driver._get_mediated_device_information(
mdev_name)
parent_name = mdev_info['parent']
parent_rp_name = compute.host + '_' + parent_name
parent_rp_uuid = self._get_provider_uuid_by_name(parent_rp_name)
parent_usage = self._get_provider_usages(parent_rp_uuid)
if orc.VGPU in parent_usage:
total_usage += parent_usage[orc.VGPU]
self.assertEqual(expected, len(mdevs))
self.assertEqual(expected, total_usage)

def test_create_servers_with_vgpu(self):
self._create_server(
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
flavor_id=self.flavor, host=self.compute1.host,
expected_state='ACTIVE')
# Now we should find a new mdev
mdevs = self.compute1.driver._get_mediated_devices()
self.assertEqual(1, len(mdevs))
networks='auto', expected_state='ACTIVE')
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)

def _confirm_resize(self, server, host='host1'):
# NOTE(sbauza): Unfortunately, _cleanup_resize() in libvirt checks the
# host option to know the source hostname but given we have a global
# CONF, the value will be the hostname of the last compute service that
# was created, so we need to change it here.
# TODO(sbauza): Remove the below once we stop using CONF.host in
        # libvirt and instead look at the compute host value.
orig_host = CONF.host
self.flags(host=host)
super(VGPUTests, self)._confirm_resize(server)
self.flags(host=orig_host)
self._wait_for_state_change(server, 'ACTIVE')

def test_resize_servers_with_vgpu(self):
# Add another compute for the sake of resizing
self.compute2 = self._start_compute_service('host2')
server = self._create_server(
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
flavor_id=self.flavor, host=self.compute1.host,
networks='auto', expected_state='ACTIVE')
# Make sure we only have 1 vGPU for compute1
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=0)

# Checking also the allocations for the parent pGPU
parent_name = mdevs[0]['parent']
parent_rp_name = self.compute1.host + '_' + parent_name
parent_rp_uuid = self._get_provider_uuid_by_name(parent_rp_name)
usage = self._get_provider_usages(parent_rp_uuid)
self.assertEqual(1, usage[orc.VGPU])
extra_spec = {"resources:VGPU": "1"}
new_flavor = self._create_flavor(memory_mb=4096,
extra_spec=extra_spec)
# First, resize and then revert.
self._resize_server(server, new_flavor)
        # After resizing, we have two vGPUs, one on each compute
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=1)

self._revert_resize(server)
# We're back to the original resources usage
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=0)

# Now resize and then confirm it.
self._resize_server(server, new_flavor)
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=1)

self._confirm_resize(server)
# In the last case, the source guest disappeared so we only have 1 vGPU
self.assert_vgpu_usage_for_compute(self.compute1, expected=0)
self.assert_vgpu_usage_for_compute(self.compute2, expected=1)


class VGPUMultipleTypesTests(VGPUTestBase):
@@ -180,7 +282,9 @@ def test_create_servers_with_vgpu(self):

def test_create_servers_with_specific_type(self):
# Regenerate the PCI addresses so both pGPUs now support nvidia-12
self.fake_connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
connection = self.computes[
self.compute1.host].driver._host.get_connection()
connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
multiple_gpu_types=True)
# Make a restart to update the Resource Providers
