Skip to content
This repository has been archived by the owner on Jan 19, 2022. It is now read-only.

Commit

Permalink
Modified instance-cycling
Browse files Browse the repository at this point in the history
  - Added a delay to wait for the ASG::HealthCheckGracePeriod
  - Add missing VPC class import to fab_tasks
  - Removed a `pass` statement no longer needed
  - Added a check on number of instances during the instance cycling loop
  • Loading branch information
Niall Creech authored and Benedetto Lo Giudice committed Feb 25, 2016
1 parent 2d8263c commit b66dcd8
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 1 deletion.
155 changes: 154 additions & 1 deletion bootstrap_cfn/autoscale.py
@@ -1,7 +1,15 @@
import logging

import time

import boto.ec2.autoscale

import boto3

from bootstrap_cfn import utils

from bootstrap_cfn.errors import AutoscalingGroupNotFound, AutoscalingInstanceCountError


class Autoscale:

Expand All @@ -27,7 +35,8 @@ def set_tag(self, key, value):
resource_id=self.group.name)

self.conn_asg.create_or_update_tags([tag])
print "Created ASG Tag: Tag({0}, {1})".format(key, value)
logging.getLogger("bootstrap-cfn").info("Created ASG Tag: Tag({0}, {1})"
.format(key, value))
return True

def get_all_autoscaling_groups(self):
Expand All @@ -46,3 +55,147 @@ def get_all_autoscaling_groups(self):
response = self.conn_asg.get_all_groups(next_token=response.next_token)
all_asgs += response
return all_asgs

def cycle_instances(self, termination_delay=None):
    """
    Cycle all the instances in an autoscaling group, waiting for the
    specified delay before terminating each instance that was replaced.

    For every healthy, in-service instance found at the start, the desired
    capacity is raised by one, the extra instance is awaited, the group's
    HealthCheckGracePeriod is slept through, the healthy instance count is
    verified, and only then is the old instance terminated (decrementing
    the desired capacity again).

    Args:
        termination_delay(int): The delay in seconds between the new instance
            becoming healthy and in-service, and the termination of the old
            one it is replacing. When None, groups whose health check type
            is not "ELB" use 360 seconds. An explicit 0 means no delay.

    Exceptions:
        AutoscalingInstanceCountError: If the expected number of healthy
            instances is not present while waiting for the new instance or
            after the grace period has elapsed.
    """
    logger = logging.getLogger("bootstrap-cfn")
    client = boto3.client('autoscaling')

    # Use the type of health check the ASG is using to determine a sensible default for termination
    # delay. The ELB check is more nuanced and should know what a healthy service really looks like.
    # EC2 checks are basic and generally, the instance will be up and 'healthy' long before the service
    # is available. In that case we want to delay termination for a long enough time to hope the service
    # sets itself up.
    # Only an omitted delay (None) gets the default, so callers can pass 0 to skip the wait.
    if termination_delay is None and self.group.health_check_type != "ELB":
        termination_delay = 360

    # Get a list of the current instances
    current_instance_ids = [instance.get('InstanceId') for instance in self.get_healthy_instances()]
    logger.info("cycle_instances: Found {} instance ids, {}"
                .format(len(current_instance_ids), current_instance_ids))

    # While one instance is being replaced the group temporarily holds the
    # original number of instances plus the replacement.
    expected_instances = len(current_instance_ids) + 1

    # Get the ASG HealthCheckGracePeriod; treat an unset period as "no wait"
    # because time.sleep() does not accept None.
    health_check_grace_period = self.group.health_check_period or 0
    logger.info("ASG HealthCheckGracePeriod: %s", health_check_grace_period)

    # Iterate through the current instances, replacing current instances with new ones
    for current_instance_id in current_instance_ids:
        logger.info("current instance: %s", current_instance_id)

        # Set the desired instances +1 and wait for it to be created
        logger.info("cycle_instances: Creating new instance...")
        self.set_autoscaling_desired_capacity(expected_instances)
        self.wait_for_instances(expected_instances)
        logger.info("cycle_instances: Terminating recycled instance {} after {} seconds..."
                    .format(current_instance_id, termination_delay))

        # Wait for the same time as the "HealthCheckGracePeriod" in the ASG
        logger.info("Waiting %ss - HealthCheckGracePeriod", health_check_grace_period)
        time.sleep(health_check_grace_period)
        logger.info("End of waiting period")

        # Check that the number of healthy instances still equals the
        # expected count once the grace period is over.
        new_curr_inst_ids = [instance.get('InstanceId') for instance in self.get_healthy_instances()]
        logger.info("new instance list %r", new_curr_inst_ids)
        if len(new_curr_inst_ids) != expected_instances:
            logger.error("Expected %s instances, found %s.", expected_instances, len(new_curr_inst_ids))
            raise AutoscalingInstanceCountError(self.group.name, expected_instances, new_curr_inst_ids)
        logger.info("Expected %s instances, found %s.", expected_instances, len(new_curr_inst_ids))

        # If we have a delay before termination defined, delay before terminating the current instance
        if termination_delay:
            logger.info("Waiting %ss - termination_delay", termination_delay)
            time.sleep(termination_delay)
            logger.info("End of waiting period")

        # Decrementing the desired capacity returns the group to its original size
        client.terminate_instance_in_auto_scaling_group(
            InstanceId=current_instance_id,
            ShouldDecrementDesiredCapacity=True
        )

    new_instance_ids = [instance.get('InstanceId') for instance in self.get_healthy_instances()]
    logger.info("cycle_instances: {} instances recycled, {}"
                .format(len(current_instance_ids), current_instance_ids))
    logger.info("cycle_instances: {} instances created, {}"
                .format(len(new_instance_ids), new_instance_ids))

def set_autoscaling_desired_capacity(self, capacity):
    """
    Set the desired instance count on the autoscaling group.

    The request is sent with HonorCooldown=False.

    Args:
        capacity(int): The target number of instances in the
            autoscaling group.
    """
    autoscaling = boto3.client('autoscaling')
    log = logging.getLogger("bootstrap-cfn")
    log.info("set_autoscaling_desired_capacity: Setting capacity to {}".format(capacity))
    request = {
        'AutoScalingGroupName': self.group.name,
        'DesiredCapacity': capacity,
        'HonorCooldown': False,
    }
    autoscaling.set_desired_capacity(**request)

def wait_for_instances(self, expected_instance_count, retry_delay=30, retry_max=10):
    """
    Wait for the autoscaling group to register a specified number of healthy,
    in-service instances.

    The instance count is checked once immediately and then re-checked after
    each of up to ``retry_max`` waits of ``retry_delay`` seconds.

    Args:
        expected_instance_count(int): The target size of the instances in the
            autoscaling group.
        retry_delay(int): The time in seconds between checks on the number of
            instances.
        retry_max(int): The maximum number of retries on checking the instance
            count before failing.

    Exceptions:
        AutoscalingInstanceCountError: On target instance count not reached in
            retry_delay * retry_max time.
    """
    logger = logging.getLogger("bootstrap-cfn")
    instances = self.get_healthy_instances()
    attempts = 0
    while len(instances) != expected_instance_count:
        # Fail only after every retry has actually been waited for and
        # re-checked; this also reports a mismatch when retry_max is 0.
        if attempts >= retry_max:
            raise AutoscalingInstanceCountError(self.group.name, expected_instance_count, instances)
        attempts += 1
        logger.info("wait_for_instances: Waiting {} seconds for instances (attempt {}/{})..."
                    .format(retry_delay, attempts, retry_max))
        time.sleep(retry_delay)
        instances = self.get_healthy_instances()
    logger.info("wait_for_instances: Found {} instances, {}"
                .format(len(instances), [instance.get('InstanceId') for instance in instances]))

def get_healthy_instances(self):
    """
    Get the instances of the autoscaling group that are in service and healthy.

    Returns:
        list: The entries from get_instances() whose 'LifecycleState' is
            'InService' and whose 'HealthStatus' is 'Healthy'.
    """
    healthy = []
    for candidate in self.get_instances():
        if candidate.get('LifecycleState') != 'InService':
            continue
        if candidate.get('HealthStatus') != 'Healthy':
            continue
        healthy.append(candidate)
    return healthy

def get_instances(self):
    """
    Get all instances in an autoscaling group

    Returns:
        list: The 'Instances' entries of the first group returned when
            describing this autoscaling group by name; empty when the group
            reports no instances.

    Exceptions:
        AutoscalingGroupNotFound: If describing the group returns no groups.
    """
    client = boto3.client('autoscaling')
    response = client.describe_auto_scaling_groups(AutoScalingGroupNames=[self.group.name])
    groups = response.get('AutoScalingGroups')
    # A missing key and an empty list both mean the group could not be found
    if not groups:
        logging.getLogger("bootstrap-cfn").critical("get_instances: Could not describe autoscaling group")
        raise AutoscalingGroupNotFound
    # Copy into a new list; fall back to empty when 'Instances' is absent
    return list(groups[0].get('Instances') or [])
12 changes: 12 additions & 0 deletions bootstrap_cfn/errors.py
Expand Up @@ -65,3 +65,15 @@ def __init__(self, type, available_types):
msg = ("The os type '{}' is not recognised, should be one of {}. "
.format(type, available_types))
super(OSTypeNotFoundError, self).__init__(msg)


class AutoscalingGroupNotFound(BootstrapCfnError):
    """Error signalling that an autoscaling group could not be found."""


class AutoscalingInstanceCountError(BootstrapCfnError):
    """
    Error signalling that an autoscaling group does not hold the expected
    number of instances.

    Args:
        autoscaling_group: Identifier of the group, used in the message.
        expected_instance_count: The number of instances that was expected.
        instances: The instances actually found; both its length and its
            contents are included in the message.
    """

    def __init__(self, autoscaling_group, expected_instance_count, instances):
        template = "Could not find {} instances in autoscaling group {}. Actual state is {} instances, {}"
        actual_count = len(instances)
        message = template.format(expected_instance_count, autoscaling_group, actual_count, instances)
        super(AutoscalingInstanceCountError, self).__init__(message)
41 changes: 41 additions & 0 deletions bootstrap_cfn/fab_tasks.py
Expand Up @@ -12,13 +12,15 @@
from fabric.colors import green, red
from fabric.utils import abort

from bootstrap_cfn.autoscale import Autoscale
from bootstrap_cfn.cloudformation import Cloudformation
from bootstrap_cfn.config import ConfigParser, ProjectConfig
from bootstrap_cfn.elb import ELB
from bootstrap_cfn.errors import BootstrapCfnError, CfnConfigError, CloudResourceNotFoundError, DNSRecordNotFoundError, ZoneIDNotFoundError
from bootstrap_cfn.iam import IAM
from bootstrap_cfn.r53 import R53
from bootstrap_cfn.utils import tail
from bootstrap_cfn.vpc import VPC


# Default fab config. Set via the tasks below or --set
Expand All @@ -42,6 +44,7 @@
# Set up the logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("bootstrap-cfn")
logging.getLogger("requests").setLevel(logging.WARNING)


@task
Expand Down Expand Up @@ -641,3 +644,41 @@ def disable_vpc_peering():
if vpc_cfg:
vpc_obj = VPC(cfg.data, get_stack_name())
vpc_obj.disable_peering()


@task
def set_autoscaling_desired_capacity(capacity, block=True):
    """
    Set the desired capacity of the autoscaling group
    Args:
        capacity(int): Number of instances desired in
            the autoscaling group.
        block(bool): Wait for instances to become healthy
            and in-service. String values such as "false",
            "no", "0" or "off" (any case) are treated as False.
    """
    # Convert once, up front, so a bad value fails before any AWS call is made
    capacity = int(capacity)
    # Task arguments given on the fab command line arrive as strings, so
    # "block=False" would otherwise be the truthy string "False".
    if isinstance(block, str):
        block = block.strip().lower() not in ("false", "no", "0", "off", "")
    asg = get_connection(Autoscale)
    if not asg.group:
        asg.set_autoscaling_group(get_stack_name())
    asg.set_autoscaling_desired_capacity(capacity=capacity)
    if block:
        asg.wait_for_instances(capacity)


@task
def cycle_instances(delay=None):
    """
    Cycle the instances in the autoscaling group
    Args:
        delay(int): Number of seconds between new instance
            becoming healthy and killing the old one. A falsy
            value is passed on as None.
    """
    autoscale = get_connection(Autoscale)
    if not autoscale.group:
        autoscale.set_autoscaling_group(get_stack_name())
    termination_delay = int(delay) if delay else None
    autoscale.cycle_instances(termination_delay=termination_delay)

0 comments on commit b66dcd8

Please sign in to comment.