Skip to content

Commit

Permalink
Use COMPUTE_SAME_HOST_COLD_MIGRATE trait during migrate
Browse files Browse the repository at this point in the history
This uses the COMPUTE_SAME_HOST_COLD_MIGRATE trait in the API during a
cold migration to filter out hosts that cannot support same-host cold
migration, which is all of them except for the hosts using the vCenter
driver.

For any nodes that do not report the trait, we won't know if they don't
because they don't support it or if they are not new enough to report
it, so the API has a service version check and will fallback to old
behavior using the config if the node is old. That compat code can be
removed in the next release.

As a result of this the FakeDriver capabilities are updated so the
FakeDriver no longer supports same-host cold migration and a new fake
driver is added to support that scenario for any tests that need it.

Change-Id: I7a4b951f3ab324c666ab924e6003d24cc8e539f5
Closes-Bug: #1748697
Related-Bug: #1811235
  • Loading branch information
mriedem authored and stephenfin committed Jan 29, 2020
1 parent 9fa3600 commit 4921e82
Show file tree
Hide file tree
Showing 10 changed files with 216 additions and 8 deletions.
2 changes: 1 addition & 1 deletion lower-constraints.txt
Expand Up @@ -66,7 +66,7 @@ os-brick==2.6.2
os-client-config==1.29.0
os-resource-classes==0.4.0
os-service-types==1.7.0
os-traits==2.0.0
os-traits==2.1.0
os-vif==1.14.0
os-win==3.0.0
os-xenapi==0.3.3
Expand Down
56 changes: 54 additions & 2 deletions nova/compute/api.py
Expand Up @@ -25,6 +25,7 @@
import string

from castellan import key_manager
import os_traits
from oslo_log import log as logging
from oslo_messaging import exceptions as oslo_exceptions
from oslo_serialization import base64 as base64utils
Expand Down Expand Up @@ -103,6 +104,7 @@

MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED = 38
MIN_COMPUTE_CROSS_CELL_RESIZE = 47
MIN_COMPUTE_SAME_HOST_COLD_MIGRATE = 48

# FIXME(danms): Keep a global cache of the cells we find the
# first time we look. This needs to be refreshed on a timer or
Expand Down Expand Up @@ -3881,8 +3883,7 @@ def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
validate_pci=True)

filter_properties = {'ignore_hosts': []}

if not CONF.allow_resize_to_same_host:
if not self._allow_resize_to_same_host(same_instance_type, instance):
filter_properties['ignore_hosts'].append(instance.host)

request_spec = objects.RequestSpec.get_by_instance_uuid(
Expand Down Expand Up @@ -3931,6 +3932,57 @@ def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
request_spec=request_spec,
do_cast=True)

def _allow_resize_to_same_host(self, cold_migrate, instance):
"""Contains logic for excluding the instance.host on resize/migrate.
If performing a cold migration and the compute node resource provider
reports the COMPUTE_SAME_HOST_COLD_MIGRATE trait then same-host cold
migration is allowed otherwise it is not and the current instance.host
should be excluded as a scheduling candidate.
:param cold_migrate: true if performing a cold migration, false
for resize
:param instance: Instance object being resized or cold migrated
:returns: True if same-host resize/cold migrate is allowed, False
otherwise
"""
if cold_migrate:
# Check to see if the compute node resource provider on which the
# instance is running has the COMPUTE_SAME_HOST_COLD_MIGRATE
# trait.
# Note that we check this here in the API since we cannot
# pre-filter allocation candidates in the scheduler using this
# trait as it would not work. For example, libvirt nodes will not
# report the trait but using it as a forbidden trait filter when
# getting allocation candidates would still return libvirt nodes
# which means we could attempt to cold migrate to the same libvirt
# node, which would fail.
ctxt = instance._context
cn = objects.ComputeNode.get_by_host_and_nodename(
ctxt, instance.host, instance.node)
traits = self.placementclient.get_provider_traits(
ctxt, cn.uuid).traits
# If the provider has the trait it is (1) new enough to report that
# trait and (2) supports cold migration on the same host.
if os_traits.COMPUTE_SAME_HOST_COLD_MIGRATE in traits:
allow_same_host = True
else:
# TODO(mriedem): Remove this compatibility code after one
# release. If the compute is old we will not know if it
# supports same-host cold migration so we fallback to config.
service = objects.Service.get_by_compute_host(ctxt, cn.host)
if service.version >= MIN_COMPUTE_SAME_HOST_COLD_MIGRATE:
# The compute is new enough to report the trait but does
# not so same-host cold migration is not allowed.
allow_same_host = False
else:
# The compute is not new enough to report the trait so we
# fallback to config.
allow_same_host = CONF.allow_resize_to_same_host
else:
allow_same_host = CONF.allow_resize_to_same_host
return allow_same_host

@check_instance_lock
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
vm_states.PAUSED, vm_states.SUSPENDED])
Expand Down
4 changes: 3 additions & 1 deletion nova/objects/service.py
Expand Up @@ -31,7 +31,7 @@


# NOTE(danms): This is the global service version counter
SERVICE_VERSION = 47
SERVICE_VERSION = 48


# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
Expand Down Expand Up @@ -176,6 +176,8 @@
# Version 47: Compute RPC v5.10:
# finish_revert_snapshot_based_resize_at_source
{'compute_rpc': '5.10'},
# Version 48: Drivers report COMPUTE_SAME_HOST_COLD_MIGRATE trait.
{'compute_rpc': '5.10'},
)


Expand Down
122 changes: 122 additions & 0 deletions nova/tests/functional/test_cold_migrate.py
@@ -0,0 +1,122 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import mock

from nova.compute import api as compute_api
from nova import context as nova_context
from nova import objects
from nova import test
from nova.tests.functional import integrated_helpers
from nova.tests.unit import fake_notifier


class ColdMigrationDisallowSameHost(
integrated_helpers.ProviderUsageBaseTestCase):
"""Tests cold migrate where the source host does not have the
COMPUTE_SAME_HOST_COLD_MIGRATE trait.
"""
compute_driver = 'fake.MediumFakeDriver'

def setUp(self):
super(ColdMigrationDisallowSameHost, self).setUp()
# Start one compute service which will use the fake virt driver
# which disallows cold migration to the same host.
self._start_compute('host1')

def _wait_for_migrate_no_valid_host(self, error='NoValidHost'):
event = fake_notifier.wait_for_versioned_notifications(
'compute_task.migrate_server.error')[0]
self.assertEqual(error,
event['payload']['nova_object.data']['reason'][
'nova_object.data']['exception'])

def test_cold_migrate_same_host_not_supported(self):
"""Simple test to show that you cannot cold-migrate to the same host
when the resource provider does not expose the
COMPUTE_SAME_HOST_COLD_MIGRATE trait.
"""
server = self._create_server(networks='none')
# The fake driver does not report COMPUTE_SAME_HOST_COLD_MIGRATE
# so cold migration should fail since we only have one host.
self.api.post_server_action(server['id'], {'migrate': None})
self._wait_for_migrate_no_valid_host()

def test_cold_migrate_same_host_old_compute_disallow(self):
"""Upgrade compat test where the resource provider does not report
the COMPUTE_SAME_HOST_COLD_MIGRATE trait but the compute service is
old so the API falls back to the allow_resize_to_same_host config which
defaults to False.
"""
server = self._create_server(networks='none')
# Stub the compute service version check to make the compute service
# appear old.
fake_service = objects.Service()
fake_service.version = (
compute_api.MIN_COMPUTE_SAME_HOST_COLD_MIGRATE - 1)
with mock.patch('nova.objects.Service.get_by_compute_host',
return_value=fake_service) as mock_get_service:
self.api.post_server_action(server['id'], {'migrate': None})
mock_get_service.assert_called_once_with(
test.MatchType(nova_context.RequestContext), 'host1')
# Since allow_resize_to_same_host defaults to False scheduling failed
# since there are no other hosts.
self._wait_for_migrate_no_valid_host()

def test_cold_migrate_same_host_old_compute_allow(self):
"""Upgrade compat test where the resource provider does not report
the COMPUTE_SAME_HOST_COLD_MIGRATE trait but the compute service is
old so the API falls back to the allow_resize_to_same_host config which
in this test is set to True.
"""
self.flags(allow_resize_to_same_host=True)
server = self._create_server(networks='none')
# Stub the compute service version check to make the compute service
# appear old.
fake_service = objects.Service()
fake_service.version = (
compute_api.MIN_COMPUTE_SAME_HOST_COLD_MIGRATE - 1)
with mock.patch('nova.objects.Service.get_by_compute_host',
return_value=fake_service) as mock_get_service:
self.api.post_server_action(server['id'], {'migrate': None})
mock_get_service.assert_called_once_with(
test.MatchType(nova_context.RequestContext), 'host1')
# In this case the compute is old so the API falls back to checking
# allow_resize_to_same_host which says same-host cold migrate is
# allowed so the scheduler sends the request to the only compute
# available but the virt driver says same-host cold migrate is not
# supported and raises UnableToMigrateToSelf. A reschedule is sent
# to conductor which results in MaxRetriesExceeded since there are no
# alternative hosts.
self._wait_for_migrate_no_valid_host(error='MaxRetriesExceeded')


class ColdMigrationAllowSameHost(
integrated_helpers.ProviderUsageBaseTestCase):
"""Tests cold migrate where the source host has the
COMPUTE_SAME_HOST_COLD_MIGRATE trait.
"""
compute_driver = 'fake.SameHostColdMigrateDriver'

def setUp(self):
super(ColdMigrationAllowSameHost, self).setUp()
self._start_compute('host1')

def test_cold_migrate_same_host_supported(self):
"""Simple test to show that you can cold-migrate to the same host
when the resource provider exposes the COMPUTE_SAME_HOST_COLD_MIGRATE
trait.
"""
server = self._create_server(networks='none')
self.api.post_server_action(server['id'], {'migrate': None})
server = self._wait_for_state_change(server, 'VERIFY_RESIZE')
self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host'])
4 changes: 4 additions & 0 deletions nova/tests/unit/compute/test_compute_api.py
Expand Up @@ -1741,6 +1741,7 @@ def test_revert_resize_concurrent_fail(

@mock.patch('nova.compute.api.API.get_instance_host_status',
new=mock.Mock(return_value=fields_obj.HostStatus.UP))
@mock.patch('nova.compute.api.API._allow_resize_to_same_host')
@mock.patch('nova.compute.utils.is_volume_backed_instance',
return_value=False)
@mock.patch('nova.compute.api.API._validate_flavor_image_nostatus')
Expand All @@ -1757,6 +1758,7 @@ def _test_resize(self, mock_get_all_by_host,
mock_get_by_instance_uuid, mock_get_flavor, mock_upsize,
mock_inst_save, mock_count, mock_limit, mock_record,
mock_migration, mock_validate, mock_is_vol_backed,
mock_allow_resize_to_same_host,
flavor_id_passed=True,
same_host=False, allow_same_host=False,
project_id=None,
Expand All @@ -1768,6 +1770,7 @@ def _test_resize(self, mock_get_all_by_host,
allow_cross_cell_resize=False):

self.flags(allow_resize_to_same_host=allow_same_host)
mock_allow_resize_to_same_host.return_value = allow_same_host

params = {}
if project_id is not None:
Expand Down Expand Up @@ -1871,6 +1874,7 @@ def _check_state(expected_task_state=None):
self.assertEqual(
allow_cross_cell_resize,
fake_spec.requested_destination.allow_cross_cell_move)
mock_allow_resize_to_same_host.assert_called_once()

if host_name:
mock_get_all_by_host.assert_called_once_with(
Expand Down
5 changes: 3 additions & 2 deletions nova/tests/unit/compute/test_compute_mgr.py
Expand Up @@ -7705,6 +7705,7 @@ def setUp(self):
super(ComputeManagerMigrationTestCase, self).setUp()
fake_notifier.stub_notifier(self)
self.addCleanup(fake_notifier.reset)
self.flags(compute_driver='fake.SameHostColdMigrateDriver')
self.compute = manager.ComputeManager()
self.context = context.RequestContext(fakes.FAKE_USER_ID,
fakes.FAKE_PROJECT_ID)
Expand Down Expand Up @@ -9658,7 +9659,7 @@ def _get_migration(self, migration_id, status, migration_type):

@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
@mock.patch.object(objects.Migration, 'get_by_id')
@mock.patch.object(nova.virt.fake.SmallFakeDriver, 'live_migration_abort')
@mock.patch.object(nova.virt.fake.FakeDriver, 'live_migration_abort')
@mock.patch('nova.compute.utils.notify_about_instance_action')
def test_live_migration_abort(self, mock_notify_action, mock_driver,
mock_get_migration, mock_notify):
Expand Down Expand Up @@ -9711,7 +9712,7 @@ def test_live_migration_abort_queued(self, mock_notify_action,
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
@mock.patch.object(objects.Migration, 'get_by_id')
@mock.patch.object(nova.virt.fake.SmallFakeDriver, 'live_migration_abort')
@mock.patch.object(nova.virt.fake.FakeDriver, 'live_migration_abort')
@mock.patch('nova.compute.utils.notify_about_instance_action')
def test_live_migration_abort_not_supported(self, mock_notify_action,
mock_driver,
Expand Down
3 changes: 3 additions & 0 deletions nova/virt/driver.py
Expand Up @@ -124,6 +124,9 @@ def block_device_info_get_mapping(block_device_info):
"supports_image_type_vmdk": os_traits.COMPUTE_IMAGE_TYPE_VMDK,
# Added in os-traits 2.0.0
"supports_image_type_ploop": os_traits.COMPUTE_IMAGE_TYPE_PLOOP,

# Added in os-traits 2.1.0.
"supports_migrate_to_same_host": os_traits.COMPUTE_SAME_HOST_COLD_MIGRATE,
}


Expand Down
8 changes: 7 additions & 1 deletion nova/virt/fake.py
Expand Up @@ -104,7 +104,7 @@ class FakeDriver(driver.ComputeDriver):
capabilities = {
"has_imagecache": True,
"supports_evacuate": True,
"supports_migrate_to_same_host": True,
"supports_migrate_to_same_host": False,
"supports_attach_interface": True,
"supports_device_tagging": True,
"supports_tagged_attach_interface": True,
Expand Down Expand Up @@ -702,6 +702,12 @@ class MediumFakeDriver(FakeDriver):
local_gb = 1028


class SameHostColdMigrateDriver(MediumFakeDriver):
"""MediumFakeDriver variant that supports same-host cold migrate."""
capabilities = dict(FakeDriver.capabilities,
supports_migrate_to_same_host=True)


class PowerUpdateFakeDriver(SmallFakeDriver):
# A specific fake driver for the power-update external event testing.

Expand Down
@@ -0,0 +1,18 @@
---
fixes:
- |
This release contains a fix for `bug 1748697`_ which distinguishes between
resize and cold migrate such that the ``allow_resize_to_same_host`` config
option is no longer used to determine if a server can be cold migrated to
the same compute service host. Now when requesting a cold migration the
API will check if the source compute node resource provider has the
``COMPUTE_SAME_HOST_COLD_MIGRATE`` trait and if so the scheduler can select
the source host. Note that the only in-tree compute driver that supports
cold migration to the same host is ``VMwareVCDriver``.
To support rolling upgrades with N-1 computes if a node does not report the
trait and is old the API will fallback to the ``allow_resize_to_same_host``
config option value. That compatibility will be removed in a subsequent
release.
.. _bug 1748697: https://launchpad.net/bugs/1748697
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -55,7 +55,7 @@ psutil>=3.2.2 # BSD
oslo.versionedobjects>=1.35.0 # Apache-2.0
os-brick>=2.6.2 # Apache-2.0
os-resource-classes>=0.4.0 # Apache-2.0
os-traits>=2.0.0 # Apache-2.0
os-traits>=2.1.0 # Apache-2.0
os-vif>=1.14.0 # Apache-2.0
os-win>=3.0.0 # Apache-2.0
castellan>=0.16.0 # Apache-2.0
Expand Down

0 comments on commit 4921e82

Please sign in to comment.