Skip to content

Commit

Permalink
Merge "Add retry logic for detaching device using LibVirt"
Browse files Browse the repository at this point in the history
  • Loading branch information
Jenkins authored and openstack-gerrit committed Dec 19, 2015
2 parents e3c0cc9 + 3a3fb3c commit 47e5199
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 7 deletions.
8 changes: 8 additions & 0 deletions nova/exception.py
Expand Up @@ -639,6 +639,14 @@ class VolumeBDMPathNotFound(VolumeBDMNotFound):
msg_fmt = _("No volume Block Device Mapping at path: %(path)s")


class DeviceDetachFailed(NovaException):
msg_fmt = _("Device detach failed for %(device)s: %(reason)s)")


class DeviceNotFound(NotFound):
msg_fmt = _("Device '%(device)s' not found.")


class SnapshotNotFound(NotFound):
ec2_code = 'InvalidSnapshot.NotFound'
msg_fmt = _("Snapshot %(snapshot_id)s could not be found.")
Expand Down
36 changes: 33 additions & 3 deletions nova/tests/unit/virt/libvirt/test_driver.py
Expand Up @@ -5108,16 +5108,25 @@ def test_detach_volume_with_vir_domain_affect_live_flag(self,
mock_get_domain):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
instance = objects.Instance(**self.test_instance)
mock_xml = """<domain>
mock_xml_with_disk = """<domain>
<devices>
<disk type='file'>
<source file='/path/to/fake-volume'/>
<target dev='vdc' bus='virtio'/>
</disk>
</devices>
</domain>"""
mock_xml_without_disk = """<domain>
<devices>
</devices>
</domain>"""
mock_dom = mock.MagicMock()
mock_dom.XMLDesc.return_value = mock_xml

# Second time don't return anything about disk vdc so it looks removed
return_list = [mock_xml_with_disk, mock_xml_without_disk]
# Doubling the size of return list because we test with two guest power
# states
mock_dom.XMLDesc.side_effect = return_list + return_list

connection_info = {"driver_volume_type": "fake",
"data": {"device_path": "/fake",
Expand All @@ -5130,7 +5139,6 @@ def test_detach_volume_with_vir_domain_affect_live_flag(self,
for state in (power_state.RUNNING, power_state.PAUSED):
mock_dom.info.return_value = [state, 512, 512, 2, 1234, 5678]
mock_get_domain.return_value = mock_dom

drvr.detach_volume(connection_info, instance, '/dev/vdc')

mock_get_domain.assert_called_with(instance)
Expand All @@ -5142,6 +5150,28 @@ def test_detach_volume_with_vir_domain_affect_live_flag(self,
mock_disconnect_volume.assert_called_with(
connection_info, 'vdc')

@mock.patch('nova.virt.libvirt.host.Host.get_domain')
def test_detach_volume_disk_not_found(self, mock_get_domain):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
instance = objects.Instance(**self.test_instance)
mock_xml_without_disk = """<domain>
<devices>
</devices>
</domain>"""
mock_dom = mock.MagicMock(return_value=mock_xml_without_disk)

connection_info = {"driver_volume_type": "fake",
"data": {"device_path": "/fake",
"access_mode": "rw"}}

mock_dom.info.return_value = [power_state.RUNNING, 512, 512, 2, 1234,
5678]
mock_get_domain.return_value = mock_dom
self.assertRaises(exception.DiskNotFound, drvr.detach_volume,
connection_info, instance, '/dev/vdc')

mock_get_domain.assert_called_once_with(instance)

def test_multi_nic(self):
network_info = _fake_network_info(self.stubs, 2)
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
Expand Down
50 changes: 50 additions & 0 deletions nova/tests/unit/virt/libvirt/test_guest.py
Expand Up @@ -218,6 +218,56 @@ def test_detach_device_persistent_live(self):
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))

def test_detach_device_with_retry_detach_success(self):
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
conf.to_xml.return_value = "</xml>"
get_config = mock.Mock()
# Force multiple retries of detach
get_config.side_effect = [conf, conf, conf, None]
dev_path = "/dev/vdb"

retry_detach = self.guest.detach_device_with_retry(
get_config, dev_path, persistent=True, live=True,
inc_sleep_time=.01)
# Ensure we've only done the initial detach call
self.domain.detachDeviceFlags.assert_called_once_with(
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))

get_config.assert_called_with(dev_path)

# Some time later, we can do the wait/retry to ensure detach succeeds
self.domain.detachDeviceFlags.reset_mock()
retry_detach()
# Should have two retries before we pretend device is detached
self.assertEqual(2, self.domain.detachDeviceFlags.call_count)

def test_detach_device_with_retry_detach_failure(self):
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
conf.to_xml.return_value = "</xml>"
# Continue to return some value for the disk config
get_config = mock.Mock(return_value=conf)

retry_detach = self.guest.detach_device_with_retry(
get_config, "/dev/vdb", persistent=True, live=True,
inc_sleep_time=.01, max_retry_count=3)
# Ensure we've only done the initial detach call
self.domain.detachDeviceFlags.assert_called_once_with(
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))

# Some time later, we can do the wait/retry to ensure detach
self.domain.detachDeviceFlags.reset_mock()
# Should hit max # of retries
self.assertRaises(exception.DeviceDetachFailed, retry_detach)
self.assertEqual(4, self.domain.detachDeviceFlags.call_count)

def test_detach_device_with_retry_device_not_found(self):
get_config = mock.Mock(return_value=None)
self.assertRaises(
exception.DeviceNotFound, self.guest.detach_device_with_retry,
get_config, "/dev/vdb", persistent=True, live=True)

def test_get_xml_desc(self):
self.guest.get_xml_desc()
self.domain.XMLDesc.assert_called_once_with(flags=0)
Expand Down
20 changes: 20 additions & 0 deletions nova/tests/unit/virt/test_virt_drivers.py
Expand Up @@ -139,6 +139,22 @@ def fake_get_instance_disk_info(_self, instance, xml=None,
def fake_delete_instance_files(_self, _instance):
pass

def fake_wait():
pass

def fake_detach_device_with_retry(_self, get_device_conf_func, device,
persistent, live,
max_retry_count=7,
inc_sleep_time=2,
max_sleep_time=30):
# Still calling detach, but instead of returning function
# that actually checks if device is gone from XML, just continue
# because XML never gets updated in these tests
_self.detach_device(get_device_conf_func(device),
persistent=persistent,
live=live)
return fake_wait

self.stubs.Set(nova.virt.libvirt.driver.LibvirtDriver,
'_get_instance_disk_info',
fake_get_instance_disk_info)
Expand All @@ -150,6 +166,10 @@ def fake_delete_instance_files(_self, _instance):
'delete_instance_files',
fake_delete_instance_files)

self.stubs.Set(nova.virt.libvirt.guest.Guest,
'detach_device_with_retry',
fake_detach_device_with_retry)

# Like the existing fakelibvirt.migrateToURI, do nothing,
# but don't fail for these tests.
self.stubs.Set(nova.virt.libvirt.driver.libvirt.Domain,
Expand Down
13 changes: 9 additions & 4 deletions nova/virt/libvirt/driver.py
Expand Up @@ -1229,13 +1229,14 @@ def detach_volume(self, connection_info, instance, mountpoint,
disk_dev = mountpoint.rpartition("/")[2]
try:
guest = self._host.get_guest(instance)
conf = guest.get_disk(disk_dev)
if not conf:
raise exception.DiskNotFound(location=disk_dev)

state = guest.get_power_state(self._host)
live = state in (power_state.RUNNING, power_state.PAUSED)
guest.detach_device(conf, persistent=True, live=live)

wait_for_detach = guest.detach_device_with_retry(guest.get_disk,
disk_dev,
persistent=True,
live=live)

if encryption:
# The volume must be detached from the VM before
Expand All @@ -1244,12 +1245,16 @@ def detach_volume(self, connection_info, instance, mountpoint,
encryptor = self._get_volume_encryptor(connection_info,
encryption)
encryptor.detach_volume(**encryption)

wait_for_detach()
except exception.InstanceNotFound:
# NOTE(zhaoqin): If the instance does not exist, _lookup_by_name()
# will throw InstanceNotFound exception. Need to
# disconnect volume under this circumstance.
LOG.warn(_LW("During detach_volume, instance disappeared."),
instance=instance)
except exception.DeviceNotFound:
raise exception.DiskNotFound(location=disk_dev)
except libvirt.libvirtError as ex:
# NOTE(vish): This is called to cleanup volumes after live
# migration, so we should still disconnect even if
Expand Down
50 changes: 50 additions & 0 deletions nova/virt/libvirt/guest.py
Expand Up @@ -29,6 +29,7 @@

from lxml import etree
from oslo_log import log as logging
from oslo_service import loopingcall
from oslo_utils import encodeutils
from oslo_utils import excutils
from oslo_utils import importutils
Expand Down Expand Up @@ -277,6 +278,55 @@ def get_all_devices(self, devtype=None):
devs.append(dev)
return devs

def detach_device_with_retry(self, get_device_conf_func, device,
persistent, live, max_retry_count=7,
inc_sleep_time=2,
max_sleep_time=30):
"""Detaches a device from the guest. After the initial detach request,
a function is returned which can be used to ensure the device is
successfully removed from the guest domain (retrying the removal as
necessary).
:param get_device_conf_func: function which takes device as a parameter
and returns the configuration for device
:param device: device to detach
:param persistent: bool to indicate whether the change is
persistent or not
:param live: bool to indicate whether it affects the guest in running
state
:param max_retry_count: number of times the returned function will
retry a detach before failing
:param inc_sleep_time: incremental time to sleep in seconds between
detach retries
:param max_sleep_time: max sleep time in seconds beyond which the sleep
time will not be incremented using param
inc_sleep_time. On reaching this threshold,
max_sleep_time will be used as the sleep time.
"""

conf = get_device_conf_func(device)
if conf is None:
raise exception.DeviceNotFound(device=device)

self.detach_device(conf, persistent, live)

@loopingcall.RetryDecorator(max_retry_count=max_retry_count,
inc_sleep_time=inc_sleep_time,
max_sleep_time=max_sleep_time,
exceptions=exception.DeviceDetachFailed)
def _do_wait_and_retry_detach():
config = get_device_conf_func(device)
if config is not None:
# Device is already detached from persistent domain
# and only transient domain needs update
self.detach_device(config, persistent=False, live=live)
# Raise error since the device still existed on the guest
reason = _("Unable to detach from guest transient domain.")
raise exception.DeviceDetachFailed(device=device,
reason=reason)

return _do_wait_and_retry_detach

def detach_device(self, conf, persistent=False, live=False):
"""Detaches device to the guest.
Expand Down

0 comments on commit 47e5199

Please sign in to comment.