Skip to content
This repository has been archived by the owner on Oct 7, 2020. It is now read-only.

Commit

Permalink
Retry with sleep in case of backup or snapshot deletion error
Browse files Browse the repository at this point in the history
When it hits an error like 'The maximum per volume CreateSnapshot request rate
has been exceeded', Backup Monkey sleeps for a while and then retries.
  • Loading branch information
EricDHS committed Nov 7, 2016
1 parent c1a82ad commit abc07e3
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 34 deletions.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,4 +144,4 @@ law or agreed to in writing, software distributed under the License is
distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License.

17 changes: 9 additions & 8 deletions backup_monkey/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,19 @@ def _fail(message="Unknown failure", code=1):

def run():
parser = argparse.ArgumentParser(description='Loops through all EBS volumes, and snapshots them, then loops through all snapshots, and removes the oldest ones.')
parser.add_argument('--region', metavar='REGION',
parser.add_argument('--region', metavar='REGION',
help='the region to loop through and snapshot (default is current region of EC2 instance this is running on). E.g. us-east-1')
parser.add_argument('--max-snapshots-per-volume', metavar='SNAPSHOTS', default=14, type=int,
help='the maximum number of snapshots to keep per EBS volume. The oldest snapshots will be deleted. Default: 3')
parser.add_argument('--snapshot-only', action='store_true', default=False,
help='Only snapshot EBS volumes, do not remove old snapshots')
parser.add_argument('--remove-only', action='store_true', default=False,
help='Only remove old snapshots, do not create new snapshots')
parser.add_argument('--verbose', '-v', action='count',
parser.add_argument('--verbose', '-v', action='count',
help='enable verbose output (-vvv for more)')
parser.add_argument('--version', action='version', version='%(prog)s ' + __version__,
help='display version number and exit')
parser.add_argument('--tags', nargs="+",
parser.add_argument('--tags', nargs="+",
help='Only snapshot instances that match passed in tags. E.g. --tag Name:foo will snapshot all instances with a tag `Name` and value is `foo`')
parser.add_argument('--reverse-tags', action='store_true', default=False,
help='Do a reverse match on the passed in tags. E.g. --tag Name:foo --reverse-tags will snapshot all instances that do not have a `Name` tag with the value `foo`')
Expand Down Expand Up @@ -77,20 +77,21 @@ def run():
log.debug('Instance meta-data: %s', instance_metadata)
if not instance_metadata:
_fail('Could not determine region. This script is either not running on an EC2 instance (in which case you should use the --region option), or the meta-data service is down')

region = instance_metadata['placement']['availability-zone'][:-1]
log.debug("Running in region: %s", region)

try:
monkey = BackupMonkey(region, args.max_snapshots_per_volume, args.tags, args.reverse_tags, args.cross_account_number, args.cross_account_role, args.verbose)

monkey = BackupMonkey(region, args.max_snapshots_per_volume, args.tags, args.reverse_tags, args.cross_account_number,
args.cross_account_role, args.verbose)

if not args.remove_only:
monkey.snapshot_volumes()
if not args.snapshot_only:
monkey.remove_old_snapshots()

except BackupMonkeyException as e:
_fail(str(e))

log.info('Backup Monkey completed successfully!')
sys.exit(0)
89 changes: 65 additions & 24 deletions backup_monkey/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging, sys, os, re
import logging, sys, os, re, time

from boto.exception import NoAuthHandlerFound, BotoServerError
from boto import ec2
Expand All @@ -21,7 +21,7 @@
__all__ = ('BackupMonkey', 'Logging')
log = logging.getLogger(__name__)

from splunk_logging import SplunkLogging
from splunk_logging import SplunkLogging
from status import BackupMonkeyStatus as _status

class BackupMonkey(object):
Expand All @@ -39,8 +39,8 @@ def __init__(self, region, max_snapshots_per_volume, tags, reverse_tags, cross_a

def _info(self, **kwargs):
    '''Log an event through the module logger and forward it to Splunk.

    The logged message is ``"<subject>: <body>"`` when both keys are given,
    just the subject when only ``subject`` is given, and ``None`` otherwise.
    Before the event is handed to ``SplunkLogging.write`` it is completed
    with a default ``severity`` ('informational'), a default ``type``
    ('event') and the region this instance works on (``src_region``).
    '''
    has_subject = 'subject' in kwargs
    if has_subject and 'body' in kwargs:
        message = '%s: %s' % (kwargs['subject'], kwargs['body'])
    elif has_subject:
        message = kwargs['subject']
    else:
        message = None
    log.info(message)

    # Only fill in the fields the caller did not set explicitly.
    kwargs.setdefault('severity', 'informational')
    kwargs.setdefault('type', 'event')
    # The region is always taken from this instance, overriding any caller value.
    kwargs['src_region'] = self._region
    SplunkLogging.write(**kwargs)

Expand All @@ -49,7 +49,7 @@ def get_connection(self):
if self._cross_account_number and self._cross_account_role:
self._info(
subject=_status.parse_status('cross_account_connect', (self._cross_account_number, self._cross_account_role, self._region)),
src_account=self._cross_account_number,
src_account=self._cross_account_number,
src_role=self._cross_account_role,
category='connection')
from boto.sts import STSConnection
Expand All @@ -60,8 +60,8 @@ def get_connection(self):
assumed_role = sts.assume_role(role_arn=role_arn, role_session_name='AssumeRoleSession')
ret = ec2.connect_to_region(
self._region,
aws_access_key_id=assumed_role.credentials.access_key,
aws_secret_access_key=assumed_role.credentials.secret_key,
aws_access_key_id=assumed_role.credentials.access_key,
aws_secret_access_key=assumed_role.credentials.secret_key,
security_token=assumed_role.credentials.session_token
)
except BotoServerError, e:
Expand All @@ -73,7 +73,7 @@ def get_connection(self):
category='connection')
else:
self._info(
subject=_status.parse_status('region_connect', self._region),
subject=_status.parse_status('region_connect', self._region),
category='connection')
try:
ret = ec2.connect_to_region(self._region)
Expand Down Expand Up @@ -123,7 +123,7 @@ def get_volumes_to_snapshot(self):
self._info(
subject=_status.parse_status('volumes_fetch', self._region),
category='volumes')
volumes = []
volumes = []
if self._reverse_tags:
filters = self.get_filters()
black_list = []
Expand All @@ -134,40 +134,45 @@ def get_volumes_to_snapshot(self):
black_list.append((f, filters[f]))
for v in self.get_all_volumes():
if len(set(v.tags.items()) - set(black_list)) == len(set(v.tags.items())):
volumes.append(v)
volumes.append(v)
return volumes
else:
if self._tags:
return self.get_all_volumes(filters=self.get_filters())
else:
return self.get_all_volumes()

def remove_reserved_tags(self, tags):
    '''Return a new dict holding every tag whose key does not start with ``aws:``.

    :param tags: mapping of tag name to tag value (tag names are expected to
                 be strings, since ``startswith`` is called on each of them)
    :returns: a new ``dict``; ``tags`` itself is left untouched
    '''
    # NOTE(review): presumably needed because AWS reserves the ``aws:`` prefix
    # and refuses to set such tags on a snapshot -- confirm against the EC2 docs.
    # ``items()`` (rather than ``iteritems()``) works on both Python 2 and 3.
    return dict((key, value) for key, value in tags.items() if not key.startswith('aws:'))

def snapshot_volumes(self):
''' Loops through all EBS volumes and creates snapshots of them '''

log.info('Getting list of EBS volumes')
volumes = self.get_volumes_to_snapshot()
log.info('Found %d volumes', len(volumes))
for volume in volumes:
for volume in volumes:
description_parts = [self._prefix]
description_parts.append(volume.id)
if volume.attach_data.instance_id:
description_parts.append(volume.attach_data.instance_id)
if volume.attach_data.device:
description_parts.append(volume.attach_data.device)
description = ' '.join(description_parts)
self._info(subject=_status.parse_status('snapshot_create', (volume.id, description)),
self._info(subject=_status.parse_status('snapshot_create', (volume.id, description)),
src_volume=volume.id,
src_tags=' '.join([':'.join(i) for i in volume.tags.items()]),
category='snapshots')
try:
snapshot = volume.create_snapshot(description)
snapshot = self._retryInCaseOfException(
volume.create_snapshot, description,
src_volume=volume.id,
category='snapshots',
type='alert',
severity='high')
if volume.tags:
snapshot.add_tags(self.remove_reserved_tags(volume.tags))
self._info(subject=_status.parse_status('snapshot_create_success', (snapshot.id, volume.id)),
self._info(subject=_status.parse_status('snapshot_create_success', (snapshot.id, volume.id)),
src_volume=volume.id,
src_snapshot=snapshot.id,
src_tags=' '.join([':'.join(i) for i in snapshot.tags.items()]),
Expand Down Expand Up @@ -215,10 +220,10 @@ def remove_old_snapshots(self):
if not snapshot.status == 'completed':
log.debug('Skipping %s as it is not a complete snapshot', snapshot.id)
continue

log.debug('Found %s: %s', snapshot.id, snapshot.description)
vol_snap_map.setdefault(snapshot.volume_id, []).append(snapshot)

for volume_id, most_recent_snapshots in vol_snap_map.iteritems():
most_recent_snapshots.sort(key=lambda s: s.start_time, reverse=True)
num_snapshots = len(most_recent_snapshots)
Expand All @@ -228,13 +233,18 @@ def remove_old_snapshots(self):
snapshot = most_recent_snapshots[i]
snapshot_id = snapshot.id
snapshot_description = snapshot.description
self._info(subject=_status.parse_status('snapshot_delete', (snapshot_id, snapshot_description)),
self._info(subject=_status.parse_status('snapshot_delete', (snapshot_id, snapshot_description)),
src_snapshot=snapshot_id,
src_volume=volume_id,
category='snapshots')
try:
snapshot.delete()
self._info(subject=_status.parse_status('snapshot_delete_success', (snapshot_id, snapshot_description)),
self._retryInCaseOfException(
snapshot.delete,
src_snapshot=snapshot_id,
category='snapshots',
type='alert',
severity='high')
self._info(subject=_status.parse_status('snapshot_delete_success', (snapshot_id, snapshot_description)),
src_snapshot=snapshot_id,
category='snapshots')
except BotoServerError, e:
Expand All @@ -247,7 +257,39 @@ def remove_old_snapshots(self):
type='alarm',
severity='critical')
return True


def _retryInCaseOfException(self, func, *args, **kwargs):
    '''Call ``func(*args)``, retrying with a growing sleep on ``BotoServerError``.

    Up to five attempts are made. After a failed attempt that will be
    retried, the error is logged, a ``retry_after_sleep`` event is written to
    Splunk, and the method sleeps ``attempt + 5`` seconds. When the last
    attempt fails, a ``retry_all_fail`` event is written and the error is
    re-raised immediately (no pointless sleep before giving up).

    :param func: callable to invoke
    :param args: positional arguments passed to ``func``
    :param kwargs: NOT passed to ``func``; they are merged into the Splunk
                   events (and shown in the log lines) to describe the call
    :returns: whatever ``func(*args)`` returns
    :raises BotoServerError: when every attempt failed
    :raises Exception: any other error from ``func`` is logged and re-raised
                       at once, without retrying
    '''
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            return func(*args)
        except BotoServerError as e:
            if attempt == max_attempts:
                log.error("Encountered Error %s on %s, %d retries failed, continuing", e.message, str(kwargs), attempt)
                splunk_kwargs = {
                    'subject': _status.parse_status('retry_all_fail', str(attempt)),
                    'body': e.message
                }
                splunk_kwargs.update(kwargs)
                SplunkLogging.write(**splunk_kwargs)
                # Bare raise keeps the original traceback for the caller.
                raise
            sleep_time = attempt + 5
            log.error("Encountered Error %s on %s, waiting %d seconds then retrying", e.message, str(kwargs), sleep_time)
            splunk_kwargs = {
                'subject': _status.parse_status('retry_after_sleep', (str(attempt), str(sleep_time))),
                'body': e.message
            }
            # Caller-supplied fields win over the defaults built above.
            splunk_kwargs.update(kwargs)
            SplunkLogging.write(**splunk_kwargs)
            time.sleep(sleep_time)
        except Exception as e:
            # Not a boto server error: retrying is not expected to help.
            log.error("Encountered Error %s on %s", e, str(kwargs))
            raise

class ErrorFilter(object):
def filter(self, record):
return record.levelno >= logging.ERROR
Expand All @@ -268,7 +310,7 @@ def getHandler(self, stream, format_, handler_filter):
if handler_filter:
_handler.addFilter(handler_filter)
return _handler

def clearLoggingHandlers(self, logger):
while len(logger.handlers) > 0:
logger.removeHandler(logger.handlers[0])
Expand Down Expand Up @@ -298,4 +340,3 @@ def configure(self, verbosity = None, module = __name__):
logging.getLogger('boto').setLevel(logging.INFO)
else:
logging.getLogger('boto').setLevel(logging.CRITICAL)

4 changes: 3 additions & 1 deletion backup_monkey/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ class BackupMonkeyStatus(object):
'snapshot_create_error': 'Cannot create snapshot of volume `%s`',
'snapshot_delete': 'Deleting snapshot `%s` with a description of `%s`',
'snapshot_delete_success': 'Successfully deleted snapshot `%s` with a description of `%s`',
'snapshot_delete_error': 'Cannot delete snapshot `%s` with a description of `%s`'
'snapshot_delete_error': 'Cannot delete snapshot `%s` with a description of `%s`',
'retry_after_sleep': '`%s` attmpts fails and waiting `%s` seconds then retry',
'retry_all_fail': 'Total `%s` retries fail and give up'
}

@staticmethod
Expand Down

0 comments on commit abc07e3

Please sign in to comment.