Allocate mdevs when resizing or reverting resize
Now that allocations are passed down to the resize and revert methods, we can
determine whether the instance needs mediated devices.

Also add a large functional test verifying that it works.

Change-Id: I018762335b19c98045ad42147080203092b51c27
Closes-Bug: #1778563
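
The idea can be illustrated with a minimal sketch (not the actual Nova code;
the helper name and the allocation payload below are made up for
illustration): given the Placement allocations now handed to the resize and
revert paths, the driver can tell whether the instance needs any mediated
devices.

    # Minimal sketch, not the Nova implementation: decide from the Placement
    # allocations (keyed by resource provider UUID) whether the instance
    # needs mediated devices (vGPUs) on the destination host.
    def needs_mdevs(allocations):
        return any(
            provider.get('resources', {}).get('VGPU', 0) > 0
            for provider in allocations.values())

    # Example payload shape, with made-up provider keys.
    allocations = {
        'compute-rp-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 4096}},
        'pgpu-rp-uuid': {'resources': {'VGPU': 1}},
    }
    assert needs_mdevs(allocations)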
sbauza committed Apr 20, 2020
1 parent 580eedb commit d2e0afc
Showing 5 changed files with 217 additions and 63 deletions.
15 changes: 13 additions & 2 deletions doc/source/admin/virtual-gpu.rst
@@ -363,17 +363,27 @@ For libvirt:
  vGPU resources). The proposed workaround is to rebuild the instance after
  resizing it. The rebuild operation allocates vGPUs to the instance (a
  scripted sketch of this workaround follows this list).

.. versionchanged:: 21.0.0

This has been resolved in the Ussuri release. See `bug 1778563`_.

* Cold migrating an instance to another host will have the same problem as
resize. If you want to migrate an instance, make sure to rebuild it after the
migration.

.. versionchanged:: 21.0.0

This has been resolved in the Ussuri release. See `bug 1778563`_.

* Rescue images do not use vGPUs. An instance being rescued does not keep its
vGPUs during rescue. During that time, another instance can receive those
vGPUs. This is a known issue. The recommended workaround is to rebuild an
instance immediately after rescue. However, rebuilding the rescued instance
only helps if there are other free vGPUs on the host.

.. note:: This has been resolved in the Rocky release [#]_.
.. versionchanged:: 18.0.0

This has been resolved in the Rocky release. See `bug 1762688`_.
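
For releases that are still affected, the workaround can be scripted. Below is
a hedged sketch using openstacksdk; the cloud name, server name, flavor and
image IDs are placeholders, and the exact ``rebuild_server`` signature is an
assumption that may differ between SDK releases.

.. code-block:: python

    import openstack

    # Illustrative sketch of the pre-fix workaround: resize, confirm, then
    # rebuild so that the vGPU gets allocated again. All names and IDs are
    # placeholders.
    conn = openstack.connect(cloud='mycloud')
    server = conn.compute.find_server('vgpu-instance')

    conn.compute.resize_server(server, 'new-flavor-id')
    conn.compute.wait_for_server(server, status='VERIFY_RESIZE')
    conn.compute.confirm_server_resize(server)
    conn.compute.wait_for_server(server, status='ACTIVE')

    # Rebuilding re-runs the allocation path, so the vGPU is attached again.
    # NOTE: the rebuild_server signature is assumed here and varies across
    # SDK releases; check the version you have installed.
    conn.compute.rebuild_server(server, name='vgpu-instance',
                                admin_password='', image='image-uuid')
    conn.compute.wait_for_server(server, status='ACTIVE')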

For XenServer:

@@ -397,7 +407,8 @@ For XenServer:

* Multiple GPU types per compute is not supported by the XenServer driver.

.. [#] https://bugs.launchpad.net/nova/+bug/1762688
.. _bug 1778563: https://bugs.launchpad.net/nova/+bug/1778563
.. _bug 1762688: https://bugs.launchpad.net/nova/+bug/1762688

.. Links
.. _Intel GVT-g: https://01.org/igvt-g
71 changes: 42 additions & 29 deletions nova/tests/functional/integrated_helpers.py
@@ -370,6 +370,43 @@ def _delete_server(self, server):
self.api.delete_server(server['id'])
self._wait_until_deleted(server)

def _confirm_resize(self, server):
self.api.post_server_action(server['id'], {'confirmResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_instance_action_event(
server, instance_actions.CONFIRM_RESIZE,
'compute_confirm_resize', 'success')
return server

def _revert_resize(self, server):
        # NOTE(sbauza): This method requires the caller to set up a fake
# notifier by stubbing it.
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server

def _migrate_or_resize(self, server, request):
if not ('resize' in request or 'migrate' in request):
raise Exception('_migrate_or_resize only supports resize or '
'migrate requests.')
self.api.post_server_action(server['id'], request)
self._wait_for_state_change(server, 'VERIFY_RESIZE')

def _resize_server(self, server, new_flavor):
resize_req = {
'resize': {
'flavorRef': new_flavor
}
}
self._migrate_or_resize(server, resize_req)


class _IntegratedTestBase(test.TestCase, InstanceHelperMixin):
REQUIRES_LOCKING = True
@@ -393,6 +430,9 @@ def setUp(self):
self.placement_api = placement.api
self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self))

fake_notifier.stub_notifier(self)
self.addCleanup(fake_notifier.reset)

self._setup_services()

self.addCleanup(nova.tests.unit.image.fake.FakeImageService_reset)
@@ -914,8 +954,7 @@ def _delete_and_check_allocations(self, server):

def _move_and_check_allocations(self, server, request, old_flavor,
new_flavor, source_rp_uuid, dest_rp_uuid):
self.api.post_server_action(server['id'], request)
self._wait_for_state_change(server, 'VERIFY_RESIZE')
self._migrate_or_resize(server, request)

def _check_allocation():
self.assertFlavorMatchesUsage(source_rp_uuid, old_flavor)
@@ -971,13 +1010,7 @@ def _resize_to_same_host_and_check_allocations(self, server, old_flavor,
# Resize the server to the same host and check usages in VERIFY_RESIZE
# state
self.flags(allow_resize_to_same_host=True)
resize_req = {
'resize': {
'flavorRef': new_flavor['id']
}
}
self.api.post_server_action(server['id'], resize_req)
self._wait_for_state_change(server, 'VERIFY_RESIZE')
self._resize_server(server, new_flavor['id'])

self.assertFlavorMatchesUsage(rp_uuid, old_flavor, new_flavor)

@@ -1044,23 +1077,3 @@ def assert_hypervisor_usage(self, compute_node_uuid, flavor,
# Account for reserved_host_cpus.
expected_vcpu_usage = CONF.reserved_host_cpus + flavor['vcpus']
self.assertEqual(expected_vcpu_usage, hypervisor['vcpus_used'])

def _confirm_resize(self, server):
self.api.post_server_action(server['id'], {'confirmResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_instance_action_event(
server, instance_actions.CONFIRM_RESIZE,
'compute_confirm_resize', 'success')
return server

def _revert_resize(self, server):
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server
138 changes: 121 additions & 17 deletions nova/tests/functional/libvirt/test_vgpu.py
@@ -14,6 +14,7 @@
import fixtures
import re

import mock
import os_resource_classes as orc
from oslo_config import cfg
from oslo_log import log as logging
@@ -24,17 +25,25 @@
from nova import objects
from nova.tests.functional.libvirt import base
from nova.tests.unit.virt.libvirt import fakelibvirt
from nova.virt.libvirt import driver as libvirt_driver
from nova.virt.libvirt import utils as libvirt_utils

CONF = cfg.CONF
LOG = logging.getLogger(__name__)


_DEFAULT_HOST = 'host1'


class VGPUTestBase(base.ServersTestBase):

FAKE_LIBVIRT_VERSION = 5000000
FAKE_QEMU_VERSION = 3001000

    # Since we run all computes in a single process, we need to track which
    # compute service is currently in use.
_current_host = _DEFAULT_HOST

def setUp(self):
super(VGPUTestBase, self).setUp()
self.useFixture(fixtures.MockPatch(
@@ -45,6 +54,25 @@ def setUp(self):
self.useFixture(fixtures.MockPatch(
'nova.privsep.libvirt.create_mdev',
side_effect=self._create_mdev))

        # NOTE(sbauza): Since the fake create_mdev doesn't know which compute
        # service was called, we need to record a value just before the driver
        # calls create_mdev. That's why we wrap the method below: it gives us
        # the LibvirtDriver instance so we can update self._current_host.
orig_get_vgpu_type_per_pgpu = (
libvirt_driver.LibvirtDriver._get_vgpu_type_per_pgpu)

def fake_get_vgpu_type_per_pgpu(_self, *args):
# See, here we look at the hostname from the virt driver...
self._current_host = _self._host.get_hostname()
# ... and then we call the original method
return orig_get_vgpu_type_per_pgpu(_self, *args)

self.useFixture(fixtures.MockPatch(
'nova.virt.libvirt.LibvirtDriver._get_vgpu_type_per_pgpu',
new=fake_get_vgpu_type_per_pgpu))

self.context = context.get_admin_context()

def pci2libvirt_address(self, address):
@@ -61,24 +89,30 @@ def _create_mdev(self, physical_device, mdev_type, uuid=None):
uuid = uuidutils.generate_uuid()
mdev_name = libvirt_utils.mdev_uuid2name(uuid)
libvirt_parent = self.pci2libvirt_address(physical_device)
self.fake_connection.mdev_info.devices.update(
        # Here we get the right compute connection thanks to
        # self._current_host, which was updated just before.
connection = self.computes[
self._current_host].driver._host.get_connection()
connection.mdev_info.devices.update(
{mdev_name: fakelibvirt.FakeMdevDevice(dev_name=mdev_name,
type_id=mdev_type,
parent=libvirt_parent)})
return uuid

def _start_compute_service(self, hostname):
self.fake_connection = self._get_connection(
fake_connection = self._get_connection(
host_info=fakelibvirt.HostInfo(cpu_nodes=2, kB_mem=8192),
# We want to create two pGPUs but no other PCI devices
pci_info=fakelibvirt.HostPCIDevicesInfo(num_pci=0,
num_pfs=0,
num_vfs=0,
num_mdevcap=2),
hostname=hostname)

self.mock_conn.return_value = self.fake_connection
compute = self.start_service('compute', host=hostname)
with mock.patch('nova.virt.libvirt.host.Host.get_connection',
return_value=fake_connection):
# this method will update a self.computes dict keyed by hostname
compute = self._start_compute(hostname)
compute.driver._host.get_connection = lambda: fake_connection
rp_uuid = self._get_provider_uuid_by_name(hostname)
rp_uuids = self._get_all_rp_uuids_in_a_tree(rp_uuid)
for rp in rp_uuids:
@@ -94,6 +128,11 @@ def _start_compute_service(self, hostname):

class VGPUTests(VGPUTestBase):

    # We want to be able to target specific hosts for the created instances
api_major_version = 'v2.1'
ADMIN_API = True
microversion = 'latest'

def setUp(self):
super(VGPUTests, self).setUp()
extra_spec = {"resources:VGPU": "1"}
@@ -103,23 +142,86 @@ def setUp(self):
self.flags(
enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
group='devices')
self.compute1 = self._start_compute_service('host1')

# for the sake of resizing, we need to patch the two methods below
self.useFixture(fixtures.MockPatch(
'nova.virt.libvirt.LibvirtDriver._get_instance_disk_info',
return_value=[]))
self.useFixture(fixtures.MockPatch('os.rename'))

self.compute1 = self._start_compute_service(_DEFAULT_HOST)

def assert_vgpu_usage_for_compute(self, compute, expected):
total_usage = 0
# We only want to get mdevs that are assigned to instances
mdevs = compute.driver._get_all_assigned_mediated_devices()
for mdev in mdevs:
mdev_name = libvirt_utils.mdev_uuid2name(mdev)
mdev_info = compute.driver._get_mediated_device_information(
mdev_name)
parent_name = mdev_info['parent']
parent_rp_name = compute.host + '_' + parent_name
parent_rp_uuid = self._get_provider_uuid_by_name(parent_rp_name)
parent_usage = self._get_provider_usages(parent_rp_uuid)
if orc.VGPU in parent_usage:
total_usage += parent_usage[orc.VGPU]
self.assertEqual(expected, len(mdevs))
self.assertEqual(expected, total_usage)

def test_create_servers_with_vgpu(self):
self._create_server(
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
flavor_id=self.flavor, host=self.compute1.host,
expected_state='ACTIVE')
# Now we should find a new mdev
mdevs = self.compute1.driver._get_mediated_devices()
self.assertEqual(1, len(mdevs))
networks='auto', expected_state='ACTIVE')
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)

def _confirm_resize(self, server, host='host1'):
# NOTE(sbauza): Unfortunately, _cleanup_resize() in libvirt checks the
# host option to know the source hostname but given we have a global
# CONF, the value will be the hostname of the last compute service that
# was created, so we need to change it here.
# TODO(sbauza): Remove the below once we stop using CONF.host in
        # libvirt and instead look at the compute host value.
orig_host = CONF.host
self.flags(host=host)
super(VGPUTests, self)._confirm_resize(server)
self.flags(host=orig_host)
self._wait_for_state_change(server, 'ACTIVE')

def test_resize_servers_with_vgpu(self):
# Add another compute for the sake of resizing
self.compute2 = self._start_compute_service('host2')
server = self._create_server(
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
flavor_id=self.flavor, host=self.compute1.host,
networks='auto', expected_state='ACTIVE')
# Make sure we only have 1 vGPU for compute1
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=0)

# Checking also the allocations for the parent pGPU
parent_name = mdevs[0]['parent']
parent_rp_name = self.compute1.host + '_' + parent_name
parent_rp_uuid = self._get_provider_uuid_by_name(parent_rp_name)
usage = self._get_provider_usages(parent_rp_uuid)
self.assertEqual(1, usage[orc.VGPU])
extra_spec = {"resources:VGPU": "1"}
new_flavor = self._create_flavor(memory_mb=4096,
extra_spec=extra_spec)
# First, resize and then revert.
self._resize_server(server, new_flavor)
        # After resizing, we have two vGPUs, one on each compute
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=1)

self._revert_resize(server)
# We're back to the original resources usage
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=0)

# Now resize and then confirm it.
self._resize_server(server, new_flavor)
self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
self.assert_vgpu_usage_for_compute(self.compute2, expected=1)

self._confirm_resize(server)
# In the last case, the source guest disappeared so we only have 1 vGPU
self.assert_vgpu_usage_for_compute(self.compute1, expected=0)
self.assert_vgpu_usage_for_compute(self.compute2, expected=1)


class VGPUMultipleTypesTests(VGPUTestBase):
@@ -180,7 +282,9 @@ def test_create_servers_with_vgpu(self):

def test_create_servers_with_specific_type(self):
# Regenerate the PCI addresses so both pGPUs now support nvidia-12
self.fake_connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
connection = self.computes[
self.compute1.host].driver._host.get_connection()
connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
multiple_gpu_types=True)
# Make a restart to update the Resource Providers
