From 03c596a9f4324e572bc04d4bbad09a6d3d47366c Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Fri, 6 Jul 2018 09:09:05 -0700
Subject: [PATCH] Avoid requesting DISK_GB allocation for root_gb on BFV
 instances

Right now, we still ask placement for a disk allocation covering
swap, ephemeral, and root disks for the instance even if the instance
is going to be volume-backed. This patch makes us not include the
root size in that calculation for placement, avoiding a false
failure because the volume size is counted against the compute's
available space.

To do this, we need another flag in request_spec to track the
BFV-ness of the instance. Right now, this patch just sets that on
new builds and the scheduler client assumes a lack of said flag
as "I don't know, so assume not-BFV" for compatibility. A
subsequent patch can calculate that flag for existing instances
so that we will be able to heal over time by migrating instances
or re-writing their allocations to reflect reality.

Partial-Bug: #1469179
Change-Id: I9c2111f7377df65c1fc3c72323f85483b3295989
---
 nova/compute/api.py                           |  9 +++
 nova/objects/request_spec.py                  |  8 +-
 nova/scheduler/utils.py                       |  9 ++-
 nova/tests/functional/test_servers.py         | 79 +++++++++++++++++++
 nova/tests/unit/compute/test_compute_api.py   |  5 +-
 nova/tests/unit/objects/test_objects.py       |  2 +-
 nova/tests/unit/objects/test_request_spec.py  |  4 +-
 ...llocate-from-compute-95d048fbe9867c34.yaml | 13 +++
 8 files changed, 120 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/bfv-instances-no-longer-allocate-from-compute-95d048fbe9867c34.yaml

diff --git a/nova/compute/api.py b/nova/compute/api.py
index a14e1606267..b33cbd03aff 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -885,6 +885,15 @@ def _provision_instances(self, context, instance_type, min_count,
                         base_options['pci_requests'], filter_properties,
                         instance_group, base_options['availability_zone'],
                         security_groups=security_groups)
+
+                if block_device_mapping:
+                    # Record whether or not we are a BFV instance
+                    root = block_device_mapping.root_bdm()
+                    req_spec.is_bfv = bool(root and root.is_volume)
+                else:
+                    # If we have no BDMs, we're clearly not BFV
+                    req_spec.is_bfv = False
+
                 # NOTE(danms): We need to record num_instances on the request
                 # spec as this is how the conductor knows how many were in this
                 # batch.
diff --git a/nova/objects/request_spec.py b/nova/objects/request_spec.py
index a211edfd41d..7ba4a499804 100644
--- a/nova/objects/request_spec.py
+++ b/nova/objects/request_spec.py
@@ -44,7 +44,8 @@ class RequestSpec(base.NovaObject):
     # Version 1.8: Added security_groups
     # Version 1.9: Added user_id
     # Version 1.10: Added network_metadata
-    VERSION = '1.10'
+    # Version 1.11: Added is_bfv
+    VERSION = '1.11'
 
     fields = {
         'id': fields.IntegerField(),
@@ -82,11 +83,14 @@ class RequestSpec(base.NovaObject):
         'instance_uuid': fields.UUIDField(),
         'security_groups': fields.ObjectField('SecurityGroupList'),
         'network_metadata': fields.ObjectField('NetworkMetadata'),
+        'is_bfv': fields.BooleanField(),
     }
 
     def obj_make_compatible(self, primitive, target_version):
         super(RequestSpec, self).obj_make_compatible(primitive, target_version)
         target_version = versionutils.convert_version_to_tuple(target_version)
+        if target_version < (1, 11) and 'is_bfv' in primitive:
+            del primitive['is_bfv']
         if target_version < (1, 10):
             if 'network_metadata' in primitive:
                 del primitive['network_metadata']
@@ -473,7 +477,7 @@ def _from_db_object(context, spec, db_spec):
             # though they should match.
             if key in ['id', 'instance_uuid']:
                 setattr(spec, key, db_spec[key])
-            else:
+            elif key in spec_obj:
                 setattr(spec, key, getattr(spec_obj, key))
         spec._context = context
 
diff --git a/nova/scheduler/utils.py b/nova/scheduler/utils.py
index 5f35242daca..8e9b402da70 100644
--- a/nova/scheduler/utils.py
+++ b/nova/scheduler/utils.py
@@ -397,9 +397,14 @@ def resources_from_request_spec(spec_obj):
         fields.ResourceClass.MEMORY_MB: spec_obj.memory_mb,
     }
 
-    requested_disk_mb = (1024 * (spec_obj.root_gb +
-                                 spec_obj.ephemeral_gb) +
+    requested_disk_mb = ((1024 * spec_obj.ephemeral_gb) +
                          spec_obj.swap)
+
+    if 'is_bfv' not in spec_obj or not spec_obj.is_bfv:
+        # Either this is not a BFV instance, or we are not sure,
+        # so ask for root_gb allocation
+        requested_disk_mb += (1024 * spec_obj.root_gb)
+
     # NOTE(sbauza): Disk request is expressed in MB but we count
     # resources in GB. Since there could be a remainder of the division
     # by 1024, we need to ceil the result to the next bigger Gb so we
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index 4ec97875582..f1cf685aa3f 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -3578,6 +3578,85 @@ def test_soft_delete_then_restore(self):
         self._delete_and_check_allocations(server)
 
 
+class VolumeBackedServerTest(integrated_helpers.ProviderUsageBaseTestCase):
+    """Tests for volume-backed servers."""
+
+    compute_driver = 'fake.SmallFakeDriver'
+
+    def setUp(self):
+        super(VolumeBackedServerTest, self).setUp()
+        self.compute1 = self._start_compute('host1')
+        self.compute2 = self._start_compute('host2')
+        self.flavor_id = self._create_flavor()
+
+    def _create_flavor(self):
+        body = {
+            'flavor': {
+                'id': 'vbst',
+                'name': 'special',
+                'ram': 512,
+                'vcpus': 1,
+                'disk': 10,
+                'OS-FLV-EXT-DATA:ephemeral': 20,
+                'swap': 5 * 1024,
+                'rxtx_factor': 1.0,
+                'os-flavor-access:is_public': True,
+            },
+        }
+        self.admin_api.post_flavor(body)
+        return body['flavor']['id']
+
+    def _create_server(self):
+        with nova.utils.temporary_mutation(self.api, microversion='2.35'):
+            image_id = self.api.get_images()[0]['id']
+        server_req = self._build_minimal_create_server_request(
+            self.api, 'trait-based-server',
+            image_uuid=image_id,
+            flavor_id=self.flavor_id, networks='none')
+        server = self.api.post_server({'server': server_req})
+        server = self._wait_for_state_change(self.api, server, 'ACTIVE')
+        return server
+
+    def _create_volume_backed_server(self):
+        self.useFixture(nova_fixtures.CinderFixtureNewAttachFlow(self))
+        volume_id = nova_fixtures.CinderFixtureNewAttachFlow.IMAGE_BACKED_VOL
+        server_req_body = {
+            # There is no imageRef because this is boot from volume.
+            'server': {
+                'flavorRef': self.flavor_id,
+                'name': 'test_volume_backed',
+                # We don't care about networking for this test. This
+                # requires microversion >= 2.37.
+                'networks': 'none',
+                'block_device_mapping_v2': [{
+                    'boot_index': 0,
+                    'uuid': volume_id,
+                    'source_type': 'volume',
+                    'destination_type': 'volume'
+                }]
+            }
+        }
+        server = self.api.post_server(server_req_body)
+        server = self._wait_for_state_change(self.api, server, 'ACTIVE')
+        return server
+
+    def test_ephemeral_has_disk_allocation(self):
+        server = self._create_server()
+        allocs = self._get_allocations_by_server_uuid(server['id'])
+        resources = list(allocs.values())[0]['resources']
+        self.assertIn('MEMORY_MB', resources)
+        # 10gb root, 20gb ephemeral, 5gb swap
+        self.assertEqual(35, resources['DISK_GB'])
+
+    def test_volume_backed_no_disk_allocation(self):
+        server = self._create_volume_backed_server()
+        allocs = self._get_allocations_by_server_uuid(server['id'])
+        resources = list(allocs.values())[0]['resources']
+        self.assertIn('MEMORY_MB', resources)
+        # 0gb root, 20gb ephemeral, 5gb swap
+        self.assertEqual(25, resources['DISK_GB'])
+
+
 class TraitsBasedSchedulingTest(integrated_helpers.ProviderUsageBaseTestCase):
     """Tests for requesting a server with required traits in Placement"""
 
diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py
index 3848ed838c7..d879c36c2ee 100644
--- a/nova/tests/unit/compute/test_compute_api.py
+++ b/nova/tests/unit/compute/test_compute_api.py
@@ -4461,7 +4461,8 @@ def do_test(
                             'numa_topology': None,
                             'pci_requests': None}
             security_groups = {}
-            block_device_mapping = [objects.BlockDeviceMapping(
+            block_device_mapping = objects.BlockDeviceMappingList(
+                objects=[objects.BlockDeviceMapping(
                     **fake_block_device.FakeDbBlockDeviceDict(
                     {
                      'id': 1,
@@ -4470,7 +4471,7 @@ def do_test(
                      'destination_type': 'volume',
                      'device_name': 'vda',
                      'boot_index': 0,
-                     }))]
+                     }))])
             shutdown_terminate = True
             instance_group = None
             check_server_group_quota = False
diff --git a/nova/tests/unit/objects/test_objects.py b/nova/tests/unit/objects/test_objects.py
index 433cfa8b261..b88874870f5 100644
--- a/nova/tests/unit/objects/test_objects.py
+++ b/nova/tests/unit/objects/test_objects.py
@@ -1142,7 +1142,7 @@ def obj_name(cls):
     'PowerVMLiveMigrateData': '1.3-79c635ecf61d1d70b5b9fa04bf778a91',
     'Quotas': '1.3-40fcefe522111dddd3e5e6155702cf4e',
     'QuotasNoOp': '1.3-347a039fc7cfee7b225b68b5181e0733',
-    'RequestSpec': '1.10-afe714bc445ab7cb791150a775f3b779',
+    'RequestSpec': '1.11-851a690dbf116fb5165cc94ea6c85629',
     'S3ImageMapping': '1.0-7dd7366a890d82660ed121de9092276e',
     'SchedulerLimits': '1.0-249c4bd8e62a9b327b7026b7f19cc641',
     'SchedulerRetries': '1.1-3c9c8b16143ebbb6ad7030e999d14cc0',
diff --git a/nova/tests/unit/objects/test_request_spec.py b/nova/tests/unit/objects/test_request_spec.py
index 770f6b9d86a..4577d25409f 100644
--- a/nova/tests/unit/objects/test_request_spec.py
+++ b/nova/tests/unit/objects/test_request_spec.py
@@ -299,7 +299,7 @@ def test_from_primitives(self, mock_limits):
         spec = objects.RequestSpec.from_primitives(ctxt, spec_dict, filt_props)
         mock_limits.assert_called_once_with({})
         # Make sure that all fields are set using that helper method
-        skip = ['id', 'security_groups', 'network_metadata']
+        skip = ['id', 'security_groups', 'network_metadata', 'is_bfv']
         for field in [f for f in spec.obj_fields if f not in skip]:
             self.assertTrue(spec.obj_attr_is_set(field),
                              'Field: %s is not set' % field)
@@ -329,7 +329,7 @@ def test_from_components(self):
                 filter_properties, instance_group, instance.availability_zone,
                 objects.SecurityGroupList())
         # Make sure that all fields are set using that helper method
-        skip = ['id', 'network_metadata']
+        skip = ['id', 'network_metadata', 'is_bfv']
         for field in [f for f in spec.obj_fields if f not in skip]:
             self.assertTrue(spec.obj_attr_is_set(field),
                             'Field: %s is not set' % field)
diff --git a/releasenotes/notes/bfv-instances-no-longer-allocate-from-compute-95d048fbe9867c34.yaml b/releasenotes/notes/bfv-instances-no-longer-allocate-from-compute-95d048fbe9867c34.yaml
new file mode 100644
index 00000000000..c30679e53dd
--- /dev/null
+++ b/releasenotes/notes/bfv-instances-no-longer-allocate-from-compute-95d048fbe9867c34.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Booting volume-backed instances no longer includes an incorrect allocation
+    against the compute node for the root disk. Historically, this has been
+    quite broken behavior in Nova, where volume-backed instances would count
+    against available space on the compute node, even though their storage
+    was provided by the volume service. Now, newly-booted volume-backed
+    instances will not create allocations of ``DISK_GB`` against the compute
+    node for the ``root_gb`` quantity in the flavor. Note that if you are
+    still using a scheduler configured with the (now deprecated)
+    DiskFilter (including deployments using CachingScheduler), the
+    above change will not apply to you.
\ No newline at end of file