Skip to content

Commit

Permalink
snapshot: introduce non-memory timeout
Browse files Browse the repository at this point in the history
Although the abort thread is irrelevant for the non-memory snapshot
flow, we do have limitations. If the pivot did happen, we shouldn't
abort it. But, if it happens before, it's most likely to get stuck on
freeze operation. We can't abort the freeze and we get response only
when the file system is frozen. For Windows VMs there is a limitation
of 10 minutes file system freeze. With that, having a snapshot that
took more than 10 minutes to be frozen, will make the data inconsistent.
10 minutes will work fine if it's only one file system, so we can be
sure we are frozen, but we go on the safe side, which is 8 minutes to
get 2 minutes margin to execute the pivot and to have enough time to let
the whole file system to be frozen. The API will respect other given
value for the new timeout if override is desired.

Change-Id: Ia97b218e8566c8ca297355216a53739ae07683dc
Bug-Url: https://bugzilla.redhat.com/1985973
Signed-off-by: Liran Rotenberg <lrotenbe@redhat.com>
  • Loading branch information
liranr23 authored and OST committed Oct 6, 2021
1 parent cd5eafa commit f6a1cf8
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 25 deletions.
10 changes: 6 additions & 4 deletions lib/vdsm/API.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,26 +708,28 @@ def dump_checkpoint(self, checkpoint_id):
@api.logged(on="api.virt")
@api.method
def snapshot(self, snapDrives, snapMemory=None,
frozen=False, jobUUID=None, timeout=30):
frozen=False, jobUUID=None, timeout=30, freeze_timeout=8):
# for backward compatibility reasons, we need to
# do the instance check before to run the hooks.
vm = self.vm

if timeout <= 0:
if timeout <= 0 or freeze_timeout <= 0:
# It makes no sense to run a snapshot with timeout <= 0.
# It can happen due to an Engine bug running a synchronous
# instead of an asynchronous snapshot (bz#1950209).
self.log.error("Zero snapshot timeout requested")
param = 'timeout' if timeout <= 0 else 'freeze_timeout'
raise exception.InvalidParameter(action='VM.snapshot',
parameter='timeout', value=0)
parameter=param, value=0)

memoryParams = {}
if snapMemory:
memoryParams['dst'], memoryParams['dstparams'] = \
self._getHibernationPathsFromString(snapMemory)

return vm.snapshot(
snapDrives, memoryParams, frozen, jobUUID, False, timeout
snapDrives, memoryParams, frozen, jobUUID, False, timeout,
freeze_timeout
)

@api.logged(on="api.virt")
Expand Down
6 changes: 6 additions & 0 deletions lib/vdsm/api/vdsm-api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11544,6 +11544,12 @@ VM.snapshot:
type: uint
added: '4.4'

- defaultvalue: 8
description: Timeout in minutes for non-memory snapshot operation
name: freeze_timeout
type: uint
added: '4.5'

VM.setBalloonTarget:
added: '3.1'
description: Dynamically change the target amount of physical memory
Expand Down
60 changes: 41 additions & 19 deletions lib/vdsm/virt/jobs/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,14 @@ def set_completed(vm, snapshot_data, completed, abort, lock):
snapshot_data["completed"] = completed.is_set()


def _running_time(start_time):
return monotonic_time() - start_time


class Job(vdsm.virt.jobs.Job):

def __init__(self, vm, snap_drives, memory_params,
frozen, job_uuid, recovery=False, timeout=30):
def __init__(self, vm, snap_drives, memory_params, frozen,
job_uuid, recovery=False, timeout=30, freeze_timeout=8):
super(Job, self).__init__(job_uuid, 'snapshot_vm')
self._vm = vm
self._snap_drives = snap_drives
Expand All @@ -100,6 +104,7 @@ def __init__(self, vm, snap_drives, memory_params,
self._completed = threading.Event()
self._lock = threading.Lock()
self._snapshot_job = {}
self._freeze_timeout = freeze_timeout * 60
if recovery:
self._snapshot_job = read_snapshot_md(self._vm, self._lock)
self._load_metadata()
Expand All @@ -121,7 +126,8 @@ def _run(self):
self._memory_params, self._frozen,
self._job_uuid, self._abort, self._completed,
self._start_time, self._timeout,
self._snapshot_job, self._lock)
self._snapshot_job, self._lock,
self._freeze_timeout)
snap.snapshot()
except:
# Setting the abort in cases where the snapshot job failed before
Expand All @@ -144,6 +150,7 @@ def _load_metadata(self):
if self._snapshot_job:
self._start_time = float(self._snapshot_job['startTime'])
self._timeout = int(self._snapshot_job['timeout'])
self._freeze_timeout = int(self._snapshot_job['freezeTimeout'])
if self._snapshot_job['abort']:
self._abort.set()
if self._snapshot_job['completed']:
Expand All @@ -157,7 +164,8 @@ class Snapshot(properties.Owner):
_job_uuid = properties.UUID(required=True)

def __init__(self, vm, snap_drives, memory_params, frozen, job_uuid,
abort, completed, start_time, timeout, snapshot_job, lock):
abort, completed, start_time, timeout, snapshot_job, lock,
freeze_timeout):
self._vm = vm
self._snap_drives = snap_drives
self._memory_params = memory_params
Expand All @@ -169,6 +177,7 @@ def __init__(self, vm, snap_drives, memory_params, frozen, job_uuid,
self._should_freeze = not (self._memory_params or self._frozen)
self._start_time = start_time
self._timeout = timeout
self._freeze_timeout = freeze_timeout
self._snapshot_job = snapshot_job
self._lock = lock
self._init_snapshot_metadata()
Expand All @@ -182,13 +191,16 @@ def _init_snapshot_metadata(self):
# for the snapshot job and the abort thread.
# Initializing the job parameters; 'abort' and 'completed'
# will be changed once the job status changes.
self._snapshot_job.update({'startTime': str(self._start_time),
'timeout': str(self._timeout),
'abort': False,
'completed': False,
"jobUUID": self._job_uuid,
"frozen": self._frozen,
"memoryParams": self._memory_params})
self._snapshot_job.update({
'startTime': str(self._start_time),
'timeout': str(self._timeout),
'freezeTimeout': str(self._freeze_timeout),
'abort': False,
'completed': False,
'jobUUID': self._job_uuid,
'frozen': self._frozen,
'memoryParams': self._memory_params,
})
write_snapshot_md(self._vm, self._snapshot_job, self._lock)

def _thaw_vm(self):
Expand All @@ -200,8 +212,7 @@ def _thaw_vm(self):

def finalize_vm(self, memory_vol):
try:
if self._abort.is_set():
self._thaw_vm()
self._thaw_vm()
self._vm.drive_monitor.enable()
if self._memory_params:
self._vm.cif.teardownVolumePath(memory_vol)
Expand Down Expand Up @@ -463,6 +474,16 @@ def vm_conf_for_memory_snapshot():
try:
if self._should_freeze:
self._vm.freeze()
if not self._memory_params:
run_time = _running_time(self._start_time)
if run_time > self._freeze_timeout:
self._vm.log.error(
"Non-memory snapshot timeout %s passed after %s "
"seconds",
self._freeze_timeout, run_time
)
raise exception.SnapshotFailed()

self._vm.log.info("Taking a live snapshot (drives=%s,"
"memory=%s)", ', '
.join(drive["name"] for drive in
Expand Down Expand Up @@ -589,9 +610,12 @@ def run(self):
except KeyError:
self._vm.log.error("Missing data on the snapshot job "
"metadata.. calling teardown")
# We only use the Snapshot class to perform the teardown. Therefore,
# some of the passed values are not important.
snap = Snapshot(
self._vm, None, self._memory_params, self._frozen, self._job_uuid,
self._abort, self._completed, 0, 0, self._snapshot_job, self._lock
self._abort, self._completed, 0, 0, self._snapshot_job, self._lock,
0
)
if self._abort.is_set():
snap.finalize_vm(memory_vol)
Expand Down Expand Up @@ -635,7 +659,8 @@ def run(self):
while self._timeout_not_reached() and self._job_running() and not \
self._completed.is_set():
self._vm.log.info("Time passed: %s, out of %s",
self._running_time(), self._timeout)
_running_time(self._start_time),
self._timeout)
time.sleep(monitoring_interval)
if not self._job_completed():
self._abort_job()
Expand Down Expand Up @@ -669,10 +694,7 @@ def _job_completed(self):
return False

def _timeout_not_reached(self):
return self._running_time() < self._timeout

def _running_time(self):
return monotonic_time() - self._start_time
return _running_time(self._start_time) < self._timeout

def _job_running(self):
try:
Expand Down
5 changes: 3 additions & 2 deletions lib/vdsm/virt/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4361,10 +4361,11 @@ def dump_checkpoint(self, checkpoint_id):

@api.guard(_not_migrating)
def snapshot(self, snap_drives, memory_params, frozen,
job_uuid, recovery=False, timeout=30):
job_uuid, recovery=False, timeout=30, freeze_timeout=8):
job_id = job_uuid or str(uuid.uuid4())
job = snapshot.Job(self, snap_drives, memory_params,
frozen, job_id, recovery, timeout)
frozen, job_id, recovery, timeout,
freeze_timeout)
jobs.add(job)
vdsm.virt.jobs.schedule(job)
return {'status': doneCode}
Expand Down

0 comments on commit f6a1cf8

Please sign in to comment.