Support multiple processes on Cinder Backup
Cinder Backup has always run as a single process, and this works fine
when the number of backups is small, but there are deployments where a
large number of backups run simultaneously. In those cases CPU-intensive
operations, like compression and SHA calculations, become a bottleneck,
since a single process can only use a single CPU core.

This patch adds support for defining the number of processes to run for
the backup service, using a new "backup_workers" configuration option.

When running multiple processes, they all run as children of a parent
process; when running a single process, it runs on its own.

To prevent race conditions, and to keep the backup service from becoming
available before it should, only the first process does the initial
cleanup; all the others wait on a lock until the first process releases
it.

Change-Id: Ib43095024754a6219eb51cc0663913fac10bb642
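
A rough standalone sketch of the "first process cleans up, the rest wait"
coordination described above, using oslo.concurrency's lockutils directly
rather than cinder's utils.synchronized wrapper (worker_number,
do_initial_cleanup and the explicit lock_path are made-up details for
illustration only):

    import os

    from oslo_concurrency import lockutils


    # The lock name is keyed on the process group id, which all forked
    # workers of one service share; lock_path is passed explicitly here
    # only so the sketch runs without an oslo.config setup.
    @lockutils.synchronized('backup-pgid-%s' % os.getpgrp(),
                            external=True, delay=0.1, lock_path='/tmp')
    def startup_cleanup(worker_number):
        if worker_number != 1:
            # Later workers only get here once the first worker has
            # released the lock, i.e. after the cleanup is finished.
            return
        do_initial_cleanup()


    def do_initial_cleanup():
        print('cleaning up incomplete backup operations')


    startup_cleanup(worker_number=1)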
Akrog committed Mar 14, 2018
1 parent 9d894b5 commit 373b524
Showing 7 changed files with 111 additions and 8 deletions.
11 changes: 11 additions & 0 deletions cinder/backup/manager.py
@@ -111,6 +111,7 @@ def __init__(self, *args, **kwargs):
        super(BackupManager, self).__init__(*args, **kwargs)
        self.is_initialized = False
        self._set_tpool_size(CONF.backup_native_threads_pool_size)
        self._process_number = kwargs.get('process_number', 1)

    @property
    def driver_name(self):
@@ -175,7 +176,17 @@ def reset(self):
        self.backup_rpcapi = backup_rpcapi.BackupAPI()
        self.volume_rpcapi = volume_rpcapi.VolumeAPI()

    @utils.synchronized('backup-pgid-%s' % os.getpgrp(),
                        external=True, delay=0.1)
    def _cleanup_incomplete_backup_operations(self, ctxt):
        # Only the first launched process should do the cleanup, the others
        # have waited on the lock for the first one to finish the cleanup and
        # can now continue with the start process.
        if self._process_number != 1:
            LOG.debug("Process #%s (pgid=%s) skips cleanup.",
                      self._process_number, os.getpgrp())
            return

        LOG.info("Cleaning up incomplete backup operations.")

        # TODO(smulcahy) implement full resume of backup and restore
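The lock name used above is keyed on os.getpgrp(). Forked workers inherit
the parent's process group, so every worker of one cinder-backup service
serializes on the same external lock. A minimal, POSIX-only sketch of that
inheritance (illustrative only, not part of the patch):

    import os

    parent_pgid = os.getpgrp()

    pid = os.fork()
    if pid == 0:
        # Child process: the process group id is inherited from the parent,
        # so a lock named after os.getpgrp() is shared across workers.
        assert os.getpgrp() == parent_pgid
        os._exit(0)

    os.waitpid(pid, 0)
    print('parent and child share pgid %d' % parent_pgid)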
49 changes: 45 additions & 4 deletions cinder/cmd/backup.py
@@ -27,6 +27,7 @@
import eventlet
eventlet.monkey_patch()

from oslo_concurrency import processutils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_privsep import priv_context
@@ -39,6 +40,7 @@

# Need to register global_opts
from cinder.common import config # noqa
from cinder.db import api as session
from cinder import objects
from cinder import service
from cinder import utils
@@ -47,13 +49,38 @@

CONF = cfg.CONF

backup_workers_opt = cfg.IntOpt(
    'backup_workers',
    default=1, min=1, max=processutils.get_worker_count(),
    help='Number of backup processes to launch. Improves performance with '
         'concurrent backups.')
CONF.register_opt(backup_workers_opt)

LOG = None

# NOTE(mriedem): The default backup driver uses swift and performs read/write
# operations in a thread. swiftclient will log requests and responses at DEBUG
# level, which can cause a thread switch and break the backup operation. So we
# set a default log level of WARN for swiftclient to try and avoid this issue.
_EXTRA_DEFAULT_LOG_LEVELS = ['swiftclient=WARN']


def _launch_backup_process(launcher, num_process):
    try:
        server = service.Service.create(binary='cinder-backup',
                                        coordination=True,
                                        process_number=num_process + 1)
    except Exception:
        LOG.exception('Backup service %s failed to start.', CONF.host)
        sys.exit(1)
    else:
        # Dispose of the whole DB connection pool here before
        # starting another process. Otherwise we run into cases where
        # child processes share DB connections which results in errors.
        session.dispose_engine()
        launcher.launch_service(server)


def main():
    objects.register_all()
    gmr_opts.set_defaults(CONF)
@@ -67,7 +94,21 @@ def main():
    priv_context.init(root_helper=shlex.split(utils.get_root_helper()))
    utils.monkey_patch()
    gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)
    server = service.Service.create(binary='cinder-backup',
                                    coordination=True)
    service.serve(server)
    service.wait()
    global LOG
    LOG = logging.getLogger(__name__)

    if CONF.backup_workers > 1:
        LOG.info('Backup running with %s processes.', CONF.backup_workers)
        launcher = service.get_launcher()

        for i in range(CONF.backup_workers):
            _launch_backup_process(launcher, i)

        launcher.wait()
    else:
        LOG.info('Backup running in single process mode.')
        server = service.Service.create(binary='cinder-backup',
                                        coordination=True,
                                        process_number=1)
        service.serve(server)
        service.wait()
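
cinder's service.get_launcher() is built on oslo.service's process
launcher. A minimal standalone sketch of the same launch pattern follows
(assumptions: a no-op NoopService stands in for the real backup service,
two workers are hard-coded, and the program runs until stopped with
Ctrl-C or SIGTERM):

    from oslo_config import cfg
    from oslo_service import service


    class NoopService(service.Service):
        """Placeholder service; cinder would launch its backup Service here."""


    def main():
        conf = cfg.CONF
        conf([], project='example')
        launcher = service.ProcessLauncher(conf)
        # One launch_service() call per worker mirrors the backup_workers > 1
        # branch above: each call forks a child that runs its own service.
        for _ in range(2):
            launcher.launch_service(NoopService(), workers=1)
        launcher.wait()


    if __name__ == '__main__':
        main()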
2 changes: 2 additions & 0 deletions cinder/opts.py
@@ -42,6 +42,7 @@
from cinder.backup.drivers import swift as cinder_backup_drivers_swift
from cinder.backup.drivers import tsm as cinder_backup_drivers_tsm
from cinder.backup import manager as cinder_backup_manager
from cinder.cmd import backup as cinder_cmd_backup
from cinder.cmd import volume as cinder_cmd_volume
from cinder.common import config as cinder_common_config
import cinder.compute
@@ -221,6 +222,7 @@ def list_opts():
                cinder_backup_drivers_swift.swiftbackup_service_opts,
                cinder_backup_drivers_tsm.tsm_opts,
                cinder_backup_manager.backup_manager_opts,
                [cinder_cmd_backup.backup_workers_opt],
                [cinder_cmd_volume.cluster_opt],
                cinder_common_config.core_opts,
                cinder_common_config.global_opts,
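The opts.py change only makes the new option visible to the sample-config
machinery. Roughly, the list_opts() hook that oslo-config-generator
consumes returns (group, options) pairs, as in this simplified sketch
(not cinder's real opts.py):

    from oslo_config import cfg

    backup_workers_opt = cfg.IntOpt(
        'backup_workers', default=1, min=1,
        help='Number of backup processes to launch.')


    def list_opts():
        # oslo-config-generator calls this hook and renders every option
        # it returns into the sample configuration file.
        return [('DEFAULT', [backup_workers_opt])]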
4 changes: 2 additions & 2 deletions cinder/service.py
@@ -366,7 +366,7 @@ def __getattr__(self, key):
    def create(cls, host=None, binary=None, topic=None, manager=None,
               report_interval=None, periodic_interval=None,
               periodic_fuzzy_delay=None, service_name=None,
               coordination=False, cluster=None):
               coordination=False, cluster=None, **kwargs):
        """Instantiates class and passes back application object.

        :param host: defaults to CONF.host
@@ -400,7 +400,7 @@ def create(cls, host=None, binary=None, topic=None, manager=None,
                          periodic_fuzzy_delay=periodic_fuzzy_delay,
                          service_name=service_name,
                          coordination=coordination,
                          cluster=cluster)
                          cluster=cluster, **kwargs)

        return service_obj

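The service.py change simply forwards unknown keyword arguments, which is
how process_number travels from cmd/backup.py through Service.create()
down to BackupManager.__init__(), where kwargs.get('process_number', 1)
picks it up. A toy illustration of that pass-through, with made-up
FakeService and FakeManager classes:

    class FakeManager(object):
        def __init__(self, *args, **kwargs):
            # Mirrors BackupManager: default to 1 when nothing is passed.
            self._process_number = kwargs.get('process_number', 1)


    class FakeService(object):
        def __init__(self, manager_cls, **kwargs):
            self.manager = manager_cls(**kwargs)

        @classmethod
        def create(cls, manager_cls, **kwargs):
            # Like Service.create(): unknown kwargs are passed straight through.
            return cls(manager_cls, **kwargs)


    svc = FakeService.create(FakeManager, process_number=3)
    assert svc.manager._process_number == 3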
15 changes: 15 additions & 0 deletions cinder/tests/unit/backup/test_backup.py
@@ -379,6 +379,21 @@ def test_cleanup_incomplete_backup_operations_with_exceptions(self):
        self.assertEqual(len(fake_backup_list), mock_backup_cleanup.call_count)
        self.assertEqual(len(fake_backup_list), mock_temp_cleanup.call_count)

    @mock.patch('cinder.objects.BackupList')
    @mock.patch.object(manager.BackupManager, '_cleanup_one_backup')
    @mock.patch.object(manager.BackupManager,
                       '_cleanup_temp_volumes_snapshots_for_one_backup')
    def test_cleanup_non_primary_process(self, temp_cleanup_mock,
                                         backup_cleanup_mock, backup_ovo_mock):
        """Test cleanup doesn't run on non primary processes."""
        self.backup_mgr._process_number = 2

        self.backup_mgr._cleanup_incomplete_backup_operations(self.ctxt)

        backup_ovo_mock.get_all_by_host.assert_not_called()
        backup_cleanup_mock.assert_not_called()
        temp_cleanup_mock.assert_not_called()

    def test_cleanup_one_backing_up_volume(self):
        """Test cleanup_one_volume for volume status 'backing-up'."""

31 changes: 29 additions & 2 deletions cinder/tests/unit/test_cmd.py
@@ -93,13 +93,14 @@ def setUp(self):
        super(TestCinderBackupCmd, self).setUp()
        sys.argv = ['cinder-backup']

    @mock.patch('cinder.cmd.backup._launch_backup_process')
    @mock.patch('cinder.service.wait')
    @mock.patch('cinder.service.serve')
    @mock.patch('cinder.service.Service.create')
    @mock.patch('cinder.utils.monkey_patch')
    @mock.patch('oslo_log.log.setup')
    def test_main(self, log_setup, monkey_patch, service_create, service_serve,
service_wait):
service_wait, launch_mock):
server = service_create.return_value

cinder_backup.main()
@@ -109,9 +110,35 @@ def test_main(self, log_setup, monkey_patch, service_create, service_serve,
        log_setup.assert_called_once_with(CONF, "cinder")
        monkey_patch.assert_called_once_with()
        service_create.assert_called_once_with(binary='cinder-backup',
                                               coordination=True)
                                               coordination=True,
                                               process_number=1)
        service_serve.assert_called_once_with(server)
        service_wait.assert_called_once_with()
        launch_mock.assert_not_called()

    @mock.patch('cinder.service.get_launcher')
    @mock.patch('cinder.service.Service.create')
    @mock.patch('cinder.utils.monkey_patch')
    @mock.patch('oslo_log.log.setup')
    def test_main_multiprocess(self, log_setup, monkey_patch, service_create,
                               get_launcher):
        CONF.set_override('backup_workers', 2)
        cinder_backup.main()

        self.assertEqual('cinder', CONF.project)
        self.assertEqual(CONF.version, version.version_string())

        c1 = mock.call(binary=constants.BACKUP_BINARY,
                       coordination=True,
                       process_number=1)
        c2 = mock.call(binary=constants.BACKUP_BINARY,
                       coordination=True,
                       process_number=2)
        service_create.assert_has_calls([c1, c2])

        launcher = get_launcher.return_value
        self.assertEqual(2, launcher.launch_service.call_count)
        launcher.wait.assert_called_once_with()


class TestCinderSchedulerCmd(test.TestCase):
@@ -0,0 +1,7 @@
---
features:
  - |
    Cinder backup now supports running multiple processes to make the most of
    the available CPU cores. Performance gains will be significant when
    running multiple concurrent backups/restores with compression. The number
    of processes is set with the `backup_workers` configuration option.
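
For example, running four backup processes would be configured in the
[DEFAULT] section of cinder.conf (the value is illustrative; the option
defaults to 1 and its maximum is the host's detected worker count):

    [DEFAULT]
    backup_workers = 4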
