From b66dcd8ff34dbfe168677cb8e228dfd2f153f44d Mon Sep 17 00:00:00 2001 From: Niall Creech Date: Wed, 24 Feb 2016 12:53:39 +0000 Subject: [PATCH] Modified instance-cycling - Added a delay to wait the ASG::HealthCheckGracePeriod - Add missing VPC class import to fab_tasks - Removed a `pass` statement no longer needed - Added a check on number of instances during the instance cycling loop --- bootstrap_cfn/autoscale.py | 155 ++++++++++++++++++++++++++++++++++++- bootstrap_cfn/errors.py | 12 +++ bootstrap_cfn/fab_tasks.py | 41 ++++++++++ 3 files changed, 207 insertions(+), 1 deletion(-) diff --git a/bootstrap_cfn/autoscale.py b/bootstrap_cfn/autoscale.py index 4586683..2f9591e 100644 --- a/bootstrap_cfn/autoscale.py +++ b/bootstrap_cfn/autoscale.py @@ -1,7 +1,15 @@ +import logging + +import time + import boto.ec2.autoscale +import boto3 + from bootstrap_cfn import utils +from bootstrap_cfn.errors import AutoscalingGroupNotFound, AutoscalingInstanceCountError + class Autoscale: @@ -27,7 +35,8 @@ def set_tag(self, key, value): resource_id=self.group.name) self.conn_asg.create_or_update_tags([tag]) - print "Created ASG Tag: Tag({0}, {1})".format(key, value) + logging.getLogger("bootstrap-cfn").info("Created ASG Tag: Tag({0}, {1})" + .format(key, value)) return True def get_all_autoscaling_groups(self): @@ -46,3 +55,147 @@ def get_all_autoscaling_groups(self): response = self.conn_asg.get_all_groups(next_token=response.next_token) all_asgs += response return all_asgs + + def cycle_instances(self, + termination_delay=None): + """ + Cycle all the instances in an autoscaling group, waiting for the + specified delay before terminating each instance that was replaced + + Args: + termination_delay(int): The delay in seconds between the new instance becoming + healthy and in-service, and the termination of the old one its replacing. + """ + client = boto3.client('autoscaling') + + # Use the type of health check the ASG is using to determine a sensible default for termination + # delay. The ELB check is more nuanced and should know what a healthy service really looks like. + # EC2 checks are basic and generally, the instance will be up and 'healthy' long before the service + # is available. In that case we want to delay termination for a long enough time to hope the service + # sets itself up. + if not termination_delay and self.group.health_check_type != "ELB": + termination_delay = 360 + + # Get a list of the current instances + current_instance_ids = [instance.get('InstanceId') for instance in self.get_healthy_instances()] + logging.getLogger("bootstrap-cfn").info("cycle_instances: Found {} instance ids, {}" + .format(len(current_instance_ids), current_instance_ids)) + + # save the number of instances before starting the upgrade + num_instances = len(current_instance_ids) + + # get the ASG HealthCheckGracePeriod + health_check_grace_period = self.group.health_check_period + logging.getLogger("bootstrap-cfn").info("ASG HealthCheckGracePeriod: %s" % health_check_grace_period) + + # Iterate through the current instances, replacing current instances with new ones + for current_instance_id in current_instance_ids: + logging.getLogger("bootstrap-cfn").info("current instance: %s" % current_instance_id) + # Set the desired instances +1 and wait for it to be created + + logging.getLogger("bootstrap-cfn").info("cycle_instances: Creating new instance...") + self.set_autoscaling_desired_capacity(len(current_instance_ids) + 1) + self.wait_for_instances(len(current_instance_ids) + 1) + logging.getLogger("bootstrap-cfn").info("cycle_instances: Terminating recycled instance {} after {} seconds..." + .format(current_instance_id, termination_delay)) + + # wait for the same time as the "HealthCheckGracePeriod" in the ASG + logging.getLogger("bootstrap-cfn").info("Waiting %ss - HealthCheckGracePeriod" % health_check_grace_period) + time.sleep(health_check_grace_period) + logging.getLogger("bootstrap-cfn").info("End of waiting period") + + # check if the number of healthy instances is = to the number of expected instances, where + # expected instances is num_instances + 1 + new_curr_inst_ids = [instance.get('InstanceId') for instance in self.get_healthy_instances()] + logging.getLogger("bootstrap-cfn").info("new instance list %r" % new_curr_inst_ids) + if len(new_curr_inst_ids) != num_instances + 1: + logging.getLogger("bootstrap-cfn").error("Expected %s instances, found %s." % ( + num_instances + 1, len(new_curr_inst_ids)) + ) + raise AutoscalingInstanceCountError(self.group.name, num_instances + 1, new_curr_inst_ids) + else: + logging.getLogger("bootstrap-cfn").info("Expected %s instances, found %s." % ( + num_instances + 1, len(new_curr_inst_ids)) + ) + + # If we have a delay before termination defined, delay before terminating the current instance + if termination_delay: + logging.getLogger("bootstrap-cfn").info("Waiting %ss - termination_delay" % termination_delay) + time.sleep(termination_delay) + logging.getLogger("bootstrap-cfn").info("End of waiting period") + client.terminate_instance_in_auto_scaling_group( + InstanceId=current_instance_id, + ShouldDecrementDesiredCapacity=True + ) + new_instance_ids = [instance.get('InstanceId') for instance in self.get_healthy_instances()] + logging.getLogger("bootstrap-cfn").info("cycle_instances: {} instances recycled, {}" + .format(len(current_instance_ids), current_instance_ids)) + logging.getLogger("bootstrap-cfn").info("cycle_instances: {} instances created, {}" + .format(len(new_instance_ids), new_instance_ids)) + + def set_autoscaling_desired_capacity(self, + capacity): + """ + Set the desired instances count on an autosaling group + + Args: + capacity(int): The target size of the instances in the + autoscaling group. + """ + client = boto3.client('autoscaling') + logging.getLogger("bootstrap-cfn").info("set_autoscaling_desired_capacity: Setting capacity to {}".format(capacity)) + client.set_desired_capacity( + AutoScalingGroupName=self.group.name, + DesiredCapacity=capacity, + HonorCooldown=False + ) + + def wait_for_instances(self, + expected_instance_count, + retry_delay=30, + retry_max=10): + """ + Wait for the autoscaling group to register a specified number of healthy, + in-service instances. + + Args: + expected_instance_count(int): The target size of the instances in the + autoscaling group. + retry_delay(int): The time in seconds between checks on the number of + instances. + retry_max(int): The maximum number of retries on checking the instance + count before failing. + Exceptions: + AutoscalingInstanceCountError: On target instance count not reached in + retry_delay * retry_count time. + """ + instances = self.get_healthy_instances() + count = 0 + while (len(instances) != expected_instance_count and count < retry_max): + count += 1 + logging.getLogger("bootstrap-cfn").info("cycle_instances: Waiting {} seconds for instances (attempt {}/{})..." + .format(retry_delay, count, retry_max)) + if count == retry_max: + raise AutoscalingInstanceCountError(self.group.name, expected_instance_count, instances) + time.sleep(retry_delay) + instances = self.get_healthy_instances() + logging.getLogger("bootstrap-cfn").info("wait_for_instances: Found {} instances, {}" + .format(len(instances), [instance.get('InstanceId') for instance in instances])) + + def get_healthy_instances(self): + instances = [instance for instance in self.get_instances() + if instance.get('LifecycleState') == 'InService' and + instance.get('HealthStatus') == 'Healthy'] + return instances + + def get_instances(self): + """ + Get all instances in an autoscaling group + """ + client = boto3.client('autoscaling') + groups = client.describe_auto_scaling_groups(AutoScalingGroupNames=[self.group.name]).get('AutoScalingGroups') + if not len(groups) > 0: + logging.getLogger("bootstrap-cfn").critical("cycle_instances: Could not describe autoscaling group") + raise AutoscalingGroupNotFound + instances = [instance for instance in groups[0].get('Instances')] + return instances diff --git a/bootstrap_cfn/errors.py b/bootstrap_cfn/errors.py index cacad55..ed89e0b 100644 --- a/bootstrap_cfn/errors.py +++ b/bootstrap_cfn/errors.py @@ -65,3 +65,15 @@ def __init__(self, type, available_types): msg = ("The os type '{}' is not recognised, should be one of {}. " .format(type, available_types)) super(OSTypeNotFoundError, self).__init__(msg) + + +class AutoscalingGroupNotFound(BootstrapCfnError): + pass + + +class AutoscalingInstanceCountError(BootstrapCfnError): + def __init__(self, autoscaling_group, expected_instance_count, instances): + super(AutoscalingInstanceCountError, self).__init__( + "Could not find {} instances in autoscaling group {}. Actual state is {} instances, {}" + .format(expected_instance_count, autoscaling_group, len(instances), instances) + ) diff --git a/bootstrap_cfn/fab_tasks.py b/bootstrap_cfn/fab_tasks.py index 030b076..d18ffae 100755 --- a/bootstrap_cfn/fab_tasks.py +++ b/bootstrap_cfn/fab_tasks.py @@ -12,6 +12,7 @@ from fabric.colors import green, red from fabric.utils import abort +from bootstrap_cfn.autoscale import Autoscale from bootstrap_cfn.cloudformation import Cloudformation from bootstrap_cfn.config import ConfigParser, ProjectConfig from bootstrap_cfn.elb import ELB @@ -19,6 +20,7 @@ from bootstrap_cfn.iam import IAM from bootstrap_cfn.r53 import R53 from bootstrap_cfn.utils import tail +from bootstrap_cfn.vpc import VPC # Default fab config. Set via the tasks below or --set @@ -42,6 +44,7 @@ # Set up the logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger("bootstrap-cfn") +logging.getLogger("requests").setLevel(logging.WARNING) @task @@ -641,3 +644,41 @@ def disable_vpc_peering(): if vpc_cfg: vpc_obj = VPC(cfg.data, get_stack_name()) vpc_obj.disable_peering() + + +@task +def set_autoscaling_desired_capacity(capacity, block=True): + """ + Set the desired capacity the autoscaling group + + Args: + capacity(int): Number of instances desired in + the autoscaling group. + block(bool): Wait for instances to become healthy + and in-service. + """ + asg = get_connection(Autoscale) + if not asg.group: + asg.set_autoscaling_group(get_stack_name()) + asg.set_autoscaling_desired_capacity(capacity=int(capacity)) + if block: + asg.wait_for_instances(int(capacity)) + + +@task +def cycle_instances(delay=None): + """ + Cycle the instances in the autoscaling group + + Args: + delay(int): Number of seconds between new instance + becoming healthy and killing the old one. + """ + asg = get_connection(Autoscale) + if not asg.group: + asg.set_autoscaling_group(get_stack_name()) + if delay: + termination_delay = int(delay) + else: + termination_delay = None + asg.cycle_instances(termination_delay=termination_delay)