/
test_vgpu.py
831 lines (725 loc) · 38.3 KB
/
test_vgpu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import fixtures
import re
import collections
import os_resource_classes as orc
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import uuidutils
from nova.compute import instance_actions
import nova.conf
from nova import context
from nova import objects
from nova.tests.fixtures import libvirt as fakelibvirt
from nova.tests.functional.api import client
from nova.tests.functional.libvirt import base
from nova.virt.libvirt import driver as libvirt_driver
from nova.virt.libvirt import utils as libvirt_utils
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
class VGPUTestBase(base.ServersTestBase):
    """Base class for the vGPU functional tests.

    Provides the common plumbing: a fake privileged ``create_mdev`` that
    registers mediated devices into the per-host fake libvirt connection,
    a helper to start a compute service exposing mdev-capable pGPUs, and
    assertion helpers for mdev/Placement usage.
    """

    # We want to target some hosts for some created instances
    api_major_version = 'v2.1'
    microversion = 'latest'
    ADMIN_API = True
    # Fake hypervisor versions reported by the fake libvirt connection.
    FAKE_LIBVIRT_VERSION = 7000000
    FAKE_QEMU_VERSION = 5002000
    # Since we run all computes by a single process, we need to identify which
    # current compute service we use at the moment.
    _current_host = 'host1'

    def setUp(self):
        super(VGPUTestBase, self).setUp()
        # Fake the local disk stats the libvirt driver reports for the host.
        libvirt_driver.LibvirtDriver._get_local_gb_info.return_value = {
            'total': 128,
            'used': 44,
            'free': 84,
        }
        # mdev creation is a privsep call; route it to our fake instead so
        # the device shows up in the fake libvirt connection.
        self.useFixture(fixtures.MockPatch(
            'nova.privsep.libvirt.create_mdev',
            side_effect=self._create_mdev))
        # for the sake of resizing, we need to patch the two methods below
        self.useFixture(fixtures.MockPatch(
            'nova.virt.libvirt.LibvirtDriver._get_instance_disk_info',
            return_value=[]))
        self.useFixture(fixtures.MockPatch('os.rename'))
        # Allow non-admins to see instance action events.
        self.policy.set_rules({
            'os_compute_api:os-instance-actions:events': 'rule:admin_or_owner'
        }, overwrite=False)
        # NOTE(sbauza): Since the fake create_mdev doesn't know which compute
        # was called, we need to look at a value that can be provided just
        # before the driver calls create_mdev. That's why we fake the below
        # method for having the LibvirtDriver instance so we could modify
        # the self.current_host value.
        orig_get_vgpu_type_per_pgpu = (
            libvirt_driver.LibvirtDriver._get_vgpu_type_per_pgpu)

        def fake_get_vgpu_type_per_pgpu(_self, *args):
            # See, here we look at the hostname from the virt driver...
            self._current_host = _self._host.get_hostname()
            # ... and then we call the original method
            return orig_get_vgpu_type_per_pgpu(_self, *args)

        self.useFixture(fixtures.MockPatch(
            'nova.virt.libvirt.LibvirtDriver._get_vgpu_type_per_pgpu',
            new=fake_get_vgpu_type_per_pgpu))
        self.context = context.get_admin_context()

    def pci2libvirt_address(self, address):
        """Convert a PCI address (e.g. 0000:81:00.0) into a libvirt nodedev
        name (e.g. pci_0000_81_00_0).
        """
        return "pci_{}_{}_{}_{}".format(*re.split("[.:]", address))

    def libvirt2pci_address(self, dev_name):
        """Convert a libvirt nodedev name (e.g. pci_0000_81_00_0) back into
        a PCI address (e.g. 0000:81:00.0).
        """
        return "{}:{}:{}.{}".format(*dev_name[4:].split('_'))

    def _create_mdev(self, physical_device, mdev_type, uuid=None):
        """Fake replacement for nova.privsep.libvirt.create_mdev.

        Registers a FakeMdevDevice on the current host's fake libvirt
        connection and returns the mdev UUID.
        """
        # We need to fake the newly created sysfs object by adding a new
        # FakeMdevDevice in the existing persisted Connection object so
        # when asking to get the existing mdevs, we would see it.
        if not uuid:
            uuid = uuidutils.generate_uuid()
        mdev_name = libvirt_utils.mdev_uuid2name(uuid)
        libvirt_parent = self.pci2libvirt_address(physical_device)
        # Here, we get the right compute thanks to the self._current_host
        # value that was modified just before by fake_get_vgpu_type_per_pgpu
        connection = self.computes[
            self._current_host].driver._host.get_connection()
        connection.mdev_info.devices.update(
            {mdev_name: fakelibvirt.FakeMdevDevice(dev_name=mdev_name,
                                                   type_id=mdev_type,
                                                   parent=libvirt_parent)})
        return uuid

    def start_compute_with_vgpu(self, hostname, pci_info=None):
        """Start a fake compute exposing mdev-capable pGPUs and verify its
        initial VGPU inventory exists and is unused.

        :param hostname: the compute hostname to start
        :param pci_info: optional HostPCIDevicesInfo; defaults to two
                         mdev-capable devices with no other PCI devices.
        :returns: the started compute service object
        """
        if not pci_info:
            pci_info = fakelibvirt.HostPCIDevicesInfo(
                num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            )
        hostname = self.start_compute(
            pci_info=pci_info,
            hostname=hostname,
            libvirt_version=self.FAKE_LIBVIRT_VERSION,
            qemu_version=self.FAKE_QEMU_VERSION
        )
        compute = self.computes[hostname]
        rp_uuid = self.compute_rp_uuids[hostname]
        rp_uuids = self._get_all_rp_uuids_in_a_tree(rp_uuid)
        for rp in rp_uuids:
            inventory = self._get_provider_inventory(rp)
            if orc.VGPU in inventory:
                usage = self._get_provider_usages(rp)
                # if multiple types, the inventories are different
                self.assertIn(inventory[orc.VGPU]['total'], [8, 16])
                self.assertEqual(0, usage[orc.VGPU])
        # Since we haven't created any mdevs yet, we shouldn't find them
        self.assertEqual([], compute.driver._get_mediated_devices())
        return compute

    def _confirm_resize(self, server, host='host1'):
        # NOTE(sbauza): Unfortunately, _cleanup_resize() in libvirt checks the
        # host option to know the source hostname but given we have a global
        # CONF, the value will be the hostname of the last compute service that
        # was created, so we need to change it here.
        # TODO(sbauza): Remove the below once we stop using CONF.host in
        # libvirt and rather looking at the compute host value.
        orig_host = CONF.host
        self.flags(host=host)
        super(VGPUTestBase, self)._confirm_resize(server)
        self.flags(host=orig_host)
        self._wait_for_state_change(server, 'ACTIVE')

    def assert_mdev_usage(self, compute, expected_amount, instance=None,
                          expected_rc=orc.VGPU, expected_rp_name=None):
        """Verify the allocations for either a whole compute or just a
        specific instance.

        :param compute: the internal compute object
        :param expected_amount: the expected amount of allocations
        :param instance: if not None, a specific Instance to lookup instead
                         of the whole compute allocations.
        :param expected_rc: the expected resource class
        :param expected_rp_name: the expected resource provider name if an
                                 instance is provided.
        """
        total_usages = collections.defaultdict(int)
        # We only want to get mdevs that are assigned to either all the
        # instances or just one.
        mdevs = compute.driver._get_all_assigned_mediated_devices(instance)
        for mdev in mdevs:
            mdev_name = libvirt_utils.mdev_uuid2name(mdev)
            mdev_info = compute.driver._get_mediated_device_information(
                mdev_name)
            parent_name = mdev_info['parent']
            # Parent pGPU RPs are named "<hostname>_<pci address>".
            parent_rp_name = compute.host + '_' + parent_name
            parent_rp_uuid = self._get_provider_uuid_by_name(parent_rp_name)
            parent_usage = self._get_provider_usages(parent_rp_uuid)
            if (expected_rc in parent_usage and
                parent_rp_name not in total_usages
            ):
                # We only set the total amount if we didn't have it already
                total_usages[parent_rp_name] = parent_usage[expected_rc]
            if expected_rp_name and instance is not None:
                # If this is for an instance, all the mdevs should be in the
                # same RP.
                self.assertEqual(expected_rp_name, parent_rp_name)
        self.assertEqual(expected_amount, len(mdevs))
        self.assertEqual(expected_amount,
                         sum(total_usages[k] for k in total_usages))
class VGPUTests(VGPUTestBase):
    """vGPU tests with a single compute (host1) only enabling nvidia-11."""

    def setUp(self):
        super(VGPUTests, self).setUp()
        extra_spec = {"resources:VGPU": "1"}
        self.flavor = self._create_flavor(extra_spec=extra_spec)
        # Start compute1 supporting only nvidia-11
        self.flags(
            enabled_mdev_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
            group='devices')
        self.compute1 = self.start_compute_with_vgpu('host1')

    def assert_vgpu_usage_for_compute(self, compute, expected):
        """Convenience wrapper asserting the whole compute's VGPU usage."""
        self.assert_mdev_usage(compute, expected_amount=expected)

    def test_create_servers_with_vgpu(self):
        # Booting a server with a 1-VGPU flavor should consume one mdev.
        self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, host=self.compute1.host,
            networks='auto', expected_state='ACTIVE')
        self.assert_vgpu_usage_for_compute(self.compute1, expected=1)

    def test_resize_servers_with_vgpu(self):
        # Add another compute for the sake of resizing
        self.compute2 = self.start_compute_with_vgpu('host2')
        server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, host=self.compute1.host,
            networks='auto', expected_state='ACTIVE')
        # Make sure we only have 1 vGPU for compute1
        self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
        self.assert_vgpu_usage_for_compute(self.compute2, expected=0)
        extra_spec = {"resources:VGPU": "1"}
        new_flavor = self._create_flavor(memory_mb=4096,
                                         extra_spec=extra_spec)
        # First, resize and then revert.
        self._resize_server(server, new_flavor)
        # After resizing, we then have two vGPUs, one for each compute
        # (the source keeps its allocation until the resize is confirmed).
        self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
        self.assert_vgpu_usage_for_compute(self.compute2, expected=1)
        self._revert_resize(server)
        # We're back to the original resources usage
        self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
        self.assert_vgpu_usage_for_compute(self.compute2, expected=0)
        # Now resize and then confirm it.
        self._resize_server(server, new_flavor)
        self.assert_vgpu_usage_for_compute(self.compute1, expected=1)
        self.assert_vgpu_usage_for_compute(self.compute2, expected=1)
        self._confirm_resize(server)
        # In the last case, the source guest disappeared so we only have 1 vGPU
        self.assert_vgpu_usage_for_compute(self.compute1, expected=0)
        self.assert_vgpu_usage_for_compute(self.compute2, expected=1)

    def test_multiple_instance_create(self):
        body = self._build_server(
            name=None, image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', az=None,
            host=self.compute1.host)
        # Asking to multicreate two instances, each of them asking for 1 vGPU
        body['min_count'] = 2
        # Asking to get the reservation ID so we find all the servers from it
        body['return_reservation_id'] = True
        # We ask for two servers but the API only returns the first.
        response = self.api.post_server({'server': body})
        self.assertIn('reservation_id', response)
        reservation_id = response['reservation_id']
        # Lookup servers created by the request
        servers = self.api.get_servers(detail=True,
            search_opts={'reservation_id': reservation_id})
        for server in servers:
            self._wait_for_state_change(server, 'ACTIVE')
        # Let's verify we created two mediated devices and we have a total of
        # 2 vGPUs
        self.assert_vgpu_usage_for_compute(self.compute1, expected=2)

    def test_multiple_instance_create_filling_up_capacity(self):
        # Each pGPU created by fakelibvirt defaults to a capacity of 16 vGPUs.
        # By default, we created a compute service with 2 pGPUs before, so we
        # have a total capacity of 32. In theory, we should be able to find
        # space for two instances asking for 16 vGPUs each.
        extra_spec = {"resources:VGPU": "16"}
        flavor = self._create_flavor(extra_spec=extra_spec)
        body = self._build_server(
            name=None, image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=flavor, networks='auto', az=None,
            host=self.compute1.host)
        # Asking to multicreate two instances, each of them asking for 16
        # vGPUs (matching the extra spec above)
        body['min_count'] = 2
        server = self.api.post_server({'server': body})
        # But... we fail miserably because of bug #1874664
        # FIXME(sbauza): Change this once we fix the above bug
        server = self._wait_for_state_change(server, 'ERROR')
        self.assertIn('fault', server)
        self.assertIn('No valid host', server['fault']['message'])
        self.assertEqual('', server['hostId'])
        # Assert the "create" instance action exists and is failed.
        actions = self.api.get_instance_actions(server['id'])
        self.assertEqual(1, len(actions), actions)
        action = actions[0]
        self.assertEqual(instance_actions.CREATE, action['action'])
        self.assertEqual('Error', action['message'])
        # Get the events. There should be one with an Error result.
        action = self.api.api_get(
            '/servers/%s/os-instance-actions/%s' %
            (server['id'], action['request_id'])).body['instanceAction']
        events = action['events']
        self.assertEqual(1, len(events), events)
        event = events[0]
        self.assertEqual('conductor_schedule_and_build_instances',
                         event['event'])
        self.assertEqual('Error', event['result'])
        # Normally non-admins cannot see the event traceback but we enabled
        # that via policy in setUp so assert something was recorded.
        self.assertIn('select_destinations', event['traceback'])
class VGPUMultipleTypesTests(VGPUTestBase):
    """vGPU tests with one compute whose two pGPUs are each configured for a
    different enabled mdev type (nvidia-11 vs. nvidia-12).
    """

    def setUp(self):
        super(VGPUMultipleTypesTests, self).setUp()
        extra_spec = {"resources:VGPU": "1"}
        self.flavor = self._create_flavor(extra_spec=extra_spec)
        self.flags(
            enabled_mdev_types=[fakelibvirt.NVIDIA_11_VGPU_TYPE,
                                fakelibvirt.NVIDIA_12_VGPU_TYPE],
            group='devices')
        # we need to call the below again to ensure the updated
        # 'device_addresses' value is read and the new groups created
        nova.conf.devices.register_dynamic_opts(CONF)
        # host1 will have 2 physical GPUs :
        #  - 0000:81:00.0 will only support nvidia-11
        #  - 0000:81:01.0 will only support nvidia-12
        MDEVCAP_DEV1_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
        MDEVCAP_DEV2_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        self.flags(device_addresses=[MDEVCAP_DEV1_PCI_ADDR],
                   group='mdev_nvidia-11')
        self.flags(device_addresses=[MDEVCAP_DEV2_PCI_ADDR],
                   group='mdev_nvidia-12')
        # Prepare traits for later on
        self._create_trait('CUSTOM_NVIDIA_11')
        self._create_trait('CUSTOM_NVIDIA_12')
        self.compute1 = self.start_compute_with_vgpu('host1')

    def test_create_servers_with_vgpu(self):
        self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.compute1.host)
        mdevs = self.compute1.driver._get_mediated_devices()
        self.assertEqual(1, len(mdevs))
        # We can be deterministic : since 0000:81:01.0 is asked to only support
        # nvidia-12 *BUT* doesn't actually have this type as a PCI capability,
        # we are sure that only 0000:81:00.0 is used.
        parent_name = mdevs[0]['parent']
        self.assertEqual(fakelibvirt.MDEVCAP_DEV1_PCI_ADDR, parent_name)
        # We are also sure that there is no RP for 0000:81:01.0 since there
        # is no inventory for nvidia-12
        root_rp_uuid = self._get_provider_uuid_by_name(self.compute1.host)
        rp_uuids = self._get_all_rp_uuids_in_a_tree(root_rp_uuid)
        # We only have 2 RPs : the root RP and only the pGPU1 RP...
        self.assertEqual(2, len(rp_uuids))
        # ... but we double-check by asking the RP by its expected name
        expected_pgpu2_rp_name = (self.compute1.host + '_' +
                                  fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        pgpu2_rp = self.placement.get(
            '/resource_providers?name=' + expected_pgpu2_rp_name).body[
            'resource_providers']
        # See, Placement API returned no RP for this name as it doesn't exist.
        self.assertEqual([], pgpu2_rp)

    def test_create_servers_with_specific_type(self):
        # Regenerate the PCI addresses so both pGPUs now support nvidia-12
        pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            multiple_gpu_types=True)
        # Make a restart to update the Resource Providers
        self.compute1 = self.restart_compute_service(
            self.compute1.host, pci_info=pci_info, keep_hypervisor_state=False)
        pgpu1_rp_uuid = self._get_provider_uuid_by_name(
            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
        pgpu2_rp_uuid = self._get_provider_uuid_by_name(
            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        pgpu1_inventory = self._get_provider_inventory(pgpu1_rp_uuid)
        self.assertEqual(16, pgpu1_inventory[orc.VGPU]['total'])
        pgpu2_inventory = self._get_provider_inventory(pgpu2_rp_uuid)
        self.assertEqual(8, pgpu2_inventory[orc.VGPU]['total'])
        # Attach traits to the pGPU RPs
        self._set_provider_traits(pgpu1_rp_uuid, ['CUSTOM_NVIDIA_11'])
        self._set_provider_traits(pgpu2_rp_uuid, ['CUSTOM_NVIDIA_12'])
        # Map each trait to the pGPU expected to serve it.
        expected = {'CUSTOM_NVIDIA_11': fakelibvirt.MDEVCAP_DEV1_PCI_ADDR,
                    'CUSTOM_NVIDIA_12': fakelibvirt.MDEVCAP_DEV2_PCI_ADDR}
        for trait in expected.keys():
            # Add a trait to the flavor
            extra_spec = {"resources:VGPU": "1",
                          "trait:%s" % trait: "required"}
            flavor = self._create_flavor(extra_spec=extra_spec)
            # Use the new flavor for booting
            server = self._create_server(
                image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
                flavor_id=flavor, networks='auto', host=self.compute1.host)
            # Get the instance we just created
            inst = objects.Instance.get_by_uuid(self.context, server['id'])
            # Get the mdevs that were allocated for this instance, we should
            # only have one
            mdevs = self.compute1.driver._get_all_assigned_mediated_devices(
                inst)
            self.assertEqual(1, len(mdevs))
            # It's a dict of mdev_uuid/instance_uuid pairs, we only care about
            # the keys
            mdevs = list(mdevs.keys())
            # Now get the detailed information about this single mdev
            mdev_info = self.compute1.driver._get_mediated_device_information(
                libvirt_utils.mdev_uuid2name(mdevs[0]))
            # We can be deterministic : since we asked for a specific type,
            # we know which pGPU we landed.
            self.assertEqual(expected[trait], mdev_info['parent'])
class VGPULimitMultipleTypesTests(VGPUTestBase):
    """vGPU tests exercising the per-type ``max_instances`` limit with two
    pGPUs supporting multiple mdev types.
    """

    def setUp(self):
        super(VGPULimitMultipleTypesTests, self).setUp()
        extra_spec = {"resources:VGPU": "1"}
        self.flavor = self._create_flavor(extra_spec=extra_spec)
        self.flags(
            enabled_mdev_types=[fakelibvirt.NVIDIA_11_VGPU_TYPE,
                                fakelibvirt.NVIDIA_12_VGPU_TYPE],
            group='devices')
        # we need to call the below again to ensure the updated
        # 'device_addresses' value is read and the new groups created
        nova.conf.devices.register_dynamic_opts(CONF)
        # host1 will have 2 physical GPUs :
        #  - 0000:81:00.0 will only support nvidia-11
        #  - 0000:81:01.0 will only support nvidia-12
        MDEVCAP_DEV1_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
        MDEVCAP_DEV2_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        self.flags(device_addresses=[MDEVCAP_DEV1_PCI_ADDR],
                   group='mdev_nvidia-11')
        self.flags(device_addresses=[MDEVCAP_DEV2_PCI_ADDR],
                   group='mdev_nvidia-12')
        # Start the compute by supporting both types
        pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            multiple_gpu_types=True)
        self.compute1 = self.start_compute_with_vgpu('host1', pci_info)

    def test_create_servers_with_vgpu(self):
        physdev1_rp_uuid = self._get_provider_uuid_by_name(
            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
        physdev2_rp_uuid = self._get_provider_uuid_by_name(
            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        # Just for asserting the inventories we currently have.
        physdev1_inventory = self._get_provider_inventory(physdev1_rp_uuid)
        self.assertEqual(16, physdev1_inventory[orc.VGPU]['total'])
        physdev2_inventory = self._get_provider_inventory(physdev2_rp_uuid)
        self.assertEqual(8, physdev2_inventory[orc.VGPU]['total'])
        # Now, let's limit the capacity for the first type to 2
        self.flags(max_instances=2, group='mdev_nvidia-11')
        # Make a restart to update the Resource Providers.
        # NOTE: the restarted service must be re-assigned to self.compute1
        # (like the sibling test classes do), otherwise self.compute1 would
        # keep pointing at the stale pre-restart service object while all the
        # assertions below use it.
        self.compute1 = self.restart_compute_service('host1')
        # Make sure we can still create an instance
        server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.compute1.host)
        mdevs = self.compute1.driver._get_mediated_devices()
        self.assertEqual(1, len(mdevs))
        # ... but actually looking at Placement, only now the 2nd GPU can be
        # used because nvidia-11 was limited to 2 while the GPU supporting it
        # had a capacity of 8.
        physdev2_inventory = self._get_provider_inventory(physdev2_rp_uuid)
        self.assertEqual(8, physdev2_inventory[orc.VGPU]['total'])
        # Get the instance we just created
        inst = objects.Instance.get_by_uuid(self.context, server['id'])
        expected_rp_name = (self.compute1.host + '_' +
                            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        # Yes, indeed we use the 2nd GPU
        self.assert_mdev_usage(self.compute1, expected_amount=1,
                               expected_rc=orc.VGPU, instance=inst,
                               expected_rp_name=expected_rp_name)
        # ... and what happened to the first GPU inventory ? Well, the whole
        # Resource Provider disappeared !
        provider = self._get_resource_provider_by_uuid(physdev1_rp_uuid)
        self.assertEqual(404, provider['errors'][0]['status'])
        self.assertIn(
            "No resource provider with uuid %s found" % physdev1_rp_uuid,
            provider['errors'][0]['detail'])
class VGPULiveMigrationTests(base.LibvirtMigrationMixin, VGPUTestBase):
    """Live-migration tests between two computes (src/dest), each with two
    pGPUs configured so that nvidia-11 is served by 81:00.0 and nvidia-12
    by 81:01.0.
    """

    # Use the right minimum versions for live-migration
    FAKE_LIBVIRT_VERSION = 8006000
    FAKE_QEMU_VERSION = 8001000

    def setUp(self):
        # Prepares two computes (src and dst), each of them having two GPUs
        # (81:00.0 and 81:01.0) with two types but where the operator only
        # wants to support nvidia-11 by 81:00.0 and nvidia-12 by 81:01.0
        super(VGPULiveMigrationTests, self).setUp()
        # Let's set the configuration correctly.
        self.flags(
            enabled_mdev_types=[fakelibvirt.NVIDIA_11_VGPU_TYPE,
                                fakelibvirt.NVIDIA_12_VGPU_TYPE],
            group='devices')
        # we need to call the below again to ensure the updated
        # 'device_addresses' value is read and the new groups created
        nova.conf.devices.register_dynamic_opts(CONF)
        MDEVCAP_DEV1_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
        MDEVCAP_DEV2_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        self.flags(device_addresses=[MDEVCAP_DEV1_PCI_ADDR],
                   group='mdev_nvidia-11')
        self.flags(device_addresses=[MDEVCAP_DEV2_PCI_ADDR],
                   group='mdev_nvidia-12')
        pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            multiple_gpu_types=True)
        self.src = self.start_compute_with_vgpu('src', pci_info=pci_info)
        self.dest = self.start_compute_with_vgpu('dest', pci_info=pci_info)
        # Add the custom traits to the 4 resource providers (two per host as
        # we have two pGPUs)
        self._create_trait('CUSTOM_NVIDIA_11')
        self._create_trait('CUSTOM_NVIDIA_12')
        for host in [self.src.host, self.dest.host]:
            nvidia11_rp_uuid = self._get_provider_uuid_by_name(
                host + '_' + fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
            nvidia12_rp_uuid = self._get_provider_uuid_by_name(
                host + '_' + fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
            self._set_provider_traits(nvidia11_rp_uuid, ['CUSTOM_NVIDIA_11'])
            self._set_provider_traits(nvidia12_rp_uuid, ['CUSTOM_NVIDIA_12'])
        # We will test to live-migrate an instance using nvidia-11 type.
        extra_spec = {"resources:VGPU": "1",
                      "trait:CUSTOM_NVIDIA_11": "required"}
        self.flavor = self._create_flavor(extra_spec=extra_spec)

    def test_live_migration_fails_on_old_source(self):
        # Restart the source with versions older than the live-migration
        # minimums so the pre-check fails.
        pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            multiple_gpu_types=True)
        self.src = self.restart_compute_service(
            self.src.host,
            pci_info=pci_info,
            keep_hypervisor_state=False,
            qemu_version=8000000,
            libvirt_version=8005000)
        server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.src.host)
        # now live migrate that server
        ex = self.assertRaises(
            client.OpenStackApiException,
            self._live_migrate,
            server, 'completed')
        self.assertEqual(500, ex.response.status_code)
        self.assertIn('NoValidHost', str(ex))
        log_out = self.stdlog.logger.output
        self.assertIn('Migration pre-check error: Unable to migrate %s: '
                      'Either libvirt or QEMU version for compute service '
                      'source are too old than the supported ones '
                      '' % server['id'], log_out)

    def test_live_migration_fails_on_old_destination(self):
        # To make sure we look at the dest object, we need to skip the
        # verification for whether the destination HV version is older
        self.flags(skip_hypervisor_version_check_on_lm=True,
                   group='workarounds')
        pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            multiple_gpu_types=True)
        self.dest = self.restart_compute_service(
            self.dest.host,
            pci_info=pci_info,
            keep_hypervisor_state=False,
            qemu_version=8000000,
            libvirt_version=8005000)
        server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.src.host)
        # now live migrate that server
        ex = self.assertRaises(
            client.OpenStackApiException,
            self._live_migrate,
            server, 'completed')
        self.assertEqual(500, ex.response.status_code)
        self.assertIn('NoValidHost', str(ex))
        log_out = self.stdlog.logger.output
        self.assertIn('Migration pre-check error: Unable to migrate %s: '
                      'Either libvirt or QEMU version for compute service '
                      'target are too old than the supported ones '
                      '' % server['id'],
                      log_out)

    def test_live_migration_fails_due_to_non_supported_mdev_types(self):
        # Configure the source and the destination with disjoint mdev types
        # so the migration pre-check rejects the move.
        self.flags(
            enabled_mdev_types=[fakelibvirt.NVIDIA_11_VGPU_TYPE],
            group='devices')
        self.src = self.restart_compute_service(self.src.host)
        self.flags(
            enabled_mdev_types=[fakelibvirt.NVIDIA_12_VGPU_TYPE],
            group='devices')
        self.dest = self.restart_compute_service(self.dest.host)
        # Force a periodic run in order to make sure all service resources
        # are changed before we call create_service()
        self._run_periodics()
        self.server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.src.host)
        # now live migrate that server
        ex = self.assertRaises(
            client.OpenStackApiException,
            self._live_migrate,
            self.server, 'completed')
        self.assertEqual(500, ex.response.status_code)
        self.assertIn('NoValidHost', str(ex))
        log_out = self.stdlog.logger.output
        # The log is fully JSON-serialized, so just check the phrase.
        self.assertIn('Unable to migrate %s: ' % self.server['id'], log_out)
        self.assertIn('Source mdev types ', log_out)
        self.assertIn('are not supported by this compute : ', log_out)

    def test_live_migrate_server(self):
        # Happy path: the mdev is reserved on dest and used after completion.
        self.server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.src.host)
        inst = objects.Instance.get_by_uuid(self.context, self.server['id'])
        mdevs = self.src.driver._get_all_assigned_mediated_devices(inst)
        self.assertEqual(1, len(mdevs))
        self._live_migrate(self.server, 'completed')
        # Now the destination XML is updated, so the destination mdev is
        # correctly used.
        self.assert_mdev_usage(self.dest, 1)
class VGPULiveMigrationTestsLMFailed(VGPULiveMigrationTests):
    """Tests that expect the live migration to fail, and exist to test the
    rollback code. Stubs out fakelibvirt's migrateToURI3() with a stub that
    "fails" the migration.
    """

    def _migrate_stub(self, domain, destination, params, flags):
        """Designed to stub fakelibvirt's migrateToURI3 and "fail" the
        live migration by monkeypatching jobStats() to return an error.
        """
        # During the migration, we reserved a mdev in the dest
        self.assert_mdev_usage(self.src, 1)
        self.assert_mdev_usage(self.dest, 1)
        # The resource update periodic task should not change the consumed
        # mdevs, as the migration is still happening. As usual, running
        # periodics is not necessary to make the test pass, but it's good to
        # make sure it does the right thing.
        self._run_periodics()
        self.assert_mdev_usage(self.src, 1)
        self.assert_mdev_usage(self.dest, 1)
        # Make the fake domain report a failed migration job so the
        # live migration rolls back.
        source = self.computes['src']
        conn = source.driver._host.get_connection()
        dom = conn.lookupByUUIDString(self.server['id'])
        dom.fail_job()
        self.migrate_stub_ran = True

    def test_live_migrate_server(self):
        self.server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.src.host)
        inst = objects.Instance.get_by_uuid(self.context, self.server['id'])
        mdevs = self.src.driver._get_all_assigned_mediated_devices(inst)
        self.assertEqual(1, len(mdevs))
        self._live_migrate(self.server, 'failed')
        # We released the reserved mdev after the migration failed.
        self.assert_mdev_usage(self.src, 1)
        self.assert_mdev_usage(self.dest, 0)
class DifferentMdevClassesTests(VGPUTestBase):
    """Tests mixing mdev types that report different resource classes:
    mlx5_core as CUSTOM_NOTVGPU and nvidia-12 as the standard VGPU class.
    """

    def setUp(self):
        super(DifferentMdevClassesTests, self).setUp()
        self.extra_spec = {"resources:CUSTOM_NOTVGPU": "1"}
        self.flavor = self._create_flavor(extra_spec=self.extra_spec)
        self.flags(
            enabled_mdev_types=[fakelibvirt.MLX5_CORE_TYPE,
                                fakelibvirt.NVIDIA_12_VGPU_TYPE],
            group='devices')
        # we need to call the below again to ensure the updated
        # 'device_addresses' value is read and the new groups created
        nova.conf.devices.register_dynamic_opts(CONF)
        # host1 will have 2 physical devices :
        #  - 0000:81:00.0 will only support mlx5_core
        #  - 0000:81:01.0 will only support nvidia-12
        MDEVCAP_DEV1_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
        MDEVCAP_DEV2_PCI_ADDR = self.libvirt2pci_address(
            fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        self.flags(device_addresses=[MDEVCAP_DEV1_PCI_ADDR],
                   group='mdev_mlx5_core')
        self.flags(device_addresses=[MDEVCAP_DEV2_PCI_ADDR],
                   group='mdev_nvidia-12')
        # Report the mlx5 devices with a non-VGPU custom resource class.
        self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_mlx5_core')
        self.compute1 = self.start_compute_with_vgpu('host1')
        # Regenerate the PCI addresses so they can support both mlx5 and
        # nvidia-12 types
        connection = self.computes[
            self.compute1.host].driver._host.get_connection()
        connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            generic_types=True)
        # Make a restart to update the Resource Providers
        self.compute1 = self.restart_compute_service('host1')

    def test_create_servers_with_different_mdev_classes(self):
        physdev1_rp_uuid = self._get_provider_uuid_by_name(
            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
        physdev2_rp_uuid = self._get_provider_uuid_by_name(
            self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV2_PCI_ADDR)
        # Remember, we asked to create 1st device inventory to use a
        # CUSTOM_NOTVGPU RC.
        physdev1_inventory = self._get_provider_inventory(physdev1_rp_uuid)
        self.assertEqual(16, physdev1_inventory['CUSTOM_NOTVGPU']['total'])
        # But, we didn't ask for the second device inventory...
        physdev2_inventory = self._get_provider_inventory(physdev2_rp_uuid)
        self.assertEqual(8, physdev2_inventory[orc.VGPU]['total'])
        # Map each resource class to the physical device expected to serve it.
        expected = {'CUSTOM_NOTVGPU': fakelibvirt.MDEVCAP_DEV1_PCI_ADDR,
                    orc.VGPU: fakelibvirt.MDEVCAP_DEV2_PCI_ADDR}
        for mdev_rc in expected.keys():
            # Use a specific mdev resource class for the flavor
            extra_spec = {"resources:%s" % mdev_rc: "1"}
            flavor = self._create_flavor(extra_spec=extra_spec)
            # Use the new flavor for booting
            server = self._create_server(
                image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
                flavor_id=flavor, networks='auto', host=self.compute1.host)
            # Get the instance we just created
            inst = objects.Instance.get_by_uuid(self.context, server['id'])
            expected_rp_name = self.compute1.host + '_' + expected[mdev_rc]
            self.assert_mdev_usage(self.compute1, expected_amount=1,
                                   expected_rc=mdev_rc, instance=inst,
                                   expected_rp_name=expected_rp_name)

    def test_resize_servers_with_mlx5(self):
        # Add another compute for the sake of resizing
        self.compute2 = self.start_compute_with_vgpu('host2')
        # Regenerate the PCI addresses so they can support both mlx5 and
        # nvidia-12 types
        connection = self.computes[
            self.compute2.host].driver._host.get_connection()
        connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            generic_types=True)
        # Make a restart to update the Resource Providers
        self.compute2 = self.restart_compute_service('host2')
        # Use the new flavor for booting
        server = self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, networks='auto', host=self.compute1.host)
        # Make sure we only have 1 mdev for compute1
        self.assert_mdev_usage(self.compute1, expected_amount=1,
                               expected_rc='CUSTOM_NOTVGPU')
        self.assert_mdev_usage(self.compute2, expected_amount=0,
                               expected_rc='CUSTOM_NOTVGPU')
        new_flavor = self._create_flavor(memory_mb=4096,
                                         extra_spec=self.extra_spec)
        # First, resize and then revert.
        self._resize_server(server, new_flavor)
        # After resizing, we then have two mdevs, one for each compute
        self.assert_mdev_usage(self.compute1, expected_amount=1,
                               expected_rc='CUSTOM_NOTVGPU')
        self.assert_mdev_usage(self.compute2, expected_amount=1,
                               expected_rc='CUSTOM_NOTVGPU')
        self._revert_resize(server)
        # We're back to the original resources usage
        self.assert_mdev_usage(self.compute1, expected_amount=1,
                               expected_rc='CUSTOM_NOTVGPU')
        self.assert_mdev_usage(self.compute2, expected_amount=0,
                               expected_rc='CUSTOM_NOTVGPU')
        # Now resize and then confirm it.
        self._resize_server(server, new_flavor)
        self.assert_mdev_usage(self.compute1, expected_amount=1,
                               expected_rc='CUSTOM_NOTVGPU')
        self.assert_mdev_usage(self.compute2, expected_amount=1,
                               expected_rc='CUSTOM_NOTVGPU')
        self._confirm_resize(server)
        # In the last case, the source guest disappeared so we only have 1 mdev
        self.assert_mdev_usage(self.compute1, expected_amount=0,
                               expected_rc='CUSTOM_NOTVGPU')
        self.assert_mdev_usage(self.compute2, expected_amount=1,
                               expected_rc='CUSTOM_NOTVGPU')