Skip to content


Subversion checkout URL

You can clone with
Download ZIP
Fetching contributors…
Cannot retrieve contributors at this time
263 lines (212 sloc) 9.05 KB
#!/usr/bin/env python
"""Watches running EC2 instances and shuts them down when idle"""
import logging
import re
import time

try:
    import simplejson as json
except ImportError:
    import json

import boto.ec2
import requests
from paramiko import SSHClient
log = logging.getLogger()
def get_buildbot_instances(conn):
    """Return the running, ready instances whose name looks like an EC2 slave.

    Args:
        conn: an EC2 connection object exposing ``get_all_instances(filters=...)``
            whose reservations carry an ``instances`` list; each instance
            exposes a ``tags`` mapping.

    Returns:
        A list of the instances whose ``Name`` tag matches ``.*-ec2-\\d+``,
        in the order they were returned by the connection.
    """
    # Look for instances with moz-state=ready and hostname *-ec2-000
    reservations = conn.get_all_instances(filters={
        'tag:moz-state': 'ready',
        'instance-state-name': 'running',
    })

    name_pattern = re.compile(r".*-ec2-\d+")
    # An instance without a Name tag cannot match the pattern, so treat it as
    # "not a buildbot slave" instead of failing the whole scan with KeyError.
    return [
        i
        for r in reservations
        for i in r.instances
        if name_pattern.match(i.tags.get('Name', ''))
    ]
class IgnorePolicy:
    """Host-key policy object that accepts any unknown host key.

    Exposes the ``missing_host_key(client, hostname, key)`` hook; presumably
    meant to be installed on a paramiko ``SSHClient`` so that connecting to a
    freshly launched instance does not fail on an unknown key.

    NOTE(review): accepting every host key disables protection against
    man-in-the-middle attacks; only acceptable on a trusted private network.
    """

    def missing_host_key(self, client, hostname, key):
        """Accept the key by doing nothing (neither rejecting nor recording it)."""
        return None
def get_ssh_client(name, ip, passwords):
    """Open an SSH session to *ip* as ``cltbld``, trying each password in turn.

    Args:
        name: instance name, used only in log messages.
        ip: address to connect to.
        passwords: iterable of candidate passwords, tried in order.

    Returns:
        A connected ``SSHClient``, or ``None`` if no password worked (or the
        host could not be reached at all).
    """
    client = SSHClient()
    # Freshly launched instances have unknown host keys; accept them.
    client.set_missing_host_key_policy(IgnorePolicy())
    for p in passwords:
        try:
            client.connect(hostname=ip, username='cltbld', password=p)
        except Exception as e:
            # Best effort: a failed attempt (bad password, network error) must
            # not stop us from trying the remaining passwords. Never log the
            # password itself.
            log.debug("%s - login attempt at %s failed: %s", name, ip, e)
            continue
        return client

    client.close()
    log.warning("Couldn't log into {name} at {ip} with any known passwords".format(name=name, ip=ip))
    return None
def get_last_activity(name, client):
    """Work out how long the slave behind *client* has been idle.

    Runs ``date``, ``cat /proc/uptime`` and a ``tail`` of the twistd logs on
    the remote host and interprets the most recent timestamped log lines.
    As a side effect, issues ``sudo reboot`` on the host when the last logged
    event is more than 30 minutes old and predates the current boot, or when
    there is no log activity at all after 15 minutes of uptime.

    Args:
        name: instance name, used only in log messages.
        client: object exposing ``exec_command(cmd)`` returning a
            ``(stdin, stdout, stderr)`` triple (e.g. a connected SSHClient).

    Returns:
        ``"booting"`` if the host has been up for less than 3 minutes or the
        last shutdown line predates the current boot; ``"stopped"`` if the
        last timestamped line reports a shut down; ``0`` if a command is
        currently running; otherwise the number of seconds between the
        slave's clock and the last timestamped log line. ``None`` if no
        timestamped log line was found.
    """
    stdin, stdout, stderr = client.exec_command("date +%Y%m%d%H%M%S")
    slave_time = stdout.read().strip()
    if isinstance(slave_time, bytes):
        # time.strptime needs text, not bytes
        slave_time = slave_time.decode("ascii")
    slave_time = time.mktime(time.strptime(slave_time, "%Y%m%d%H%M%S"))

    stdin, stdout, stderr = client.exec_command("cat /proc/uptime")
    # First field of /proc/uptime is seconds since boot
    uptime = float(stdout.read().split()[0])

    if uptime < 3*60:
        # Assume we're still booting
        log.debug("%s - uptime is %.2f; assuming we're still booting up", name, uptime)
        return "booting"

    stdin, stdout, stderr = client.exec_command("tail -n 100 /builds/slave/twistd.log.1 /builds/slave/twistd.log")

    timestamp_re = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")
    last_activity = None
    running_command = False
    t = time.time()
    # Kept after the loop so the final log messages can quote the last line
    line = ""
    for line in stdout:
        m = timestamp_re.search(line)
        if m:
            t = time.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
            t = time.mktime(t)
        else:
            # Not sure what to do with this line...
            continue

        # uncomment to dump out ALL the lines
        #log.debug("%s - %s", name, line.strip())

        if "RunProcess._startCommand" in line or "using PTY: " in line:
            log.debug("%s - started command - %s", name, line.strip())
            running_command = True
        elif "commandComplete" in line or "stopCommand" in line:
            log.debug("%s - done command - %s", name, line.strip())
            running_command = False

        if "Shut Down" in line:
            # Check if this happened before we booted, i.e. we're still booting up
            if (slave_time - t) > uptime:
                log.debug("%s - shutdown line is older than uptime; assuming we're still booting %s", name, line.strip())
                last_activity = "booting"
            else:
                last_activity = "stopped"
        elif running_command:
            # We're in the middle of running something, so say that our last
            # activity is now (0 seconds ago)
            last_activity = 0
        else:
            last_activity = slave_time - t

    # If this was over 10 minutes ago
    if (slave_time - t) > 10*60 and (slave_time - t) > uptime:
        log.warning("%s - shut down happened %ss ago, but we've been up for %ss - %s", name, slave_time-t, uptime, line.strip())
        # If longer than 30 minutes, try rebooting
        if (slave_time - t) > 30*60:
            log.warning("%s - rebooting", name)
            stdin, stdout, stderr = client.exec_command("sudo reboot")

    # If there's *no* activity (e.g. no twistd.log files), and we've been up a while, then reboot
    if last_activity is None and uptime > 15*60:
        log.warning("%s - no activity; rebooting", name)
        stdin, stdout, stderr = client.exec_command("sudo reboot")

    log.debug("%s - %s - %s", name, last_activity, line.strip())
    return last_activity
def get_tacfile(client):
    """Return the text of ``/builds/slave/buildbot.tac`` on the remote host.

    Args:
        client: object exposing ``exec_command(cmd)`` returning a
            ``(stdin, stdout, stderr)`` triple (e.g. a connected SSHClient).

    Returns:
        The file contents as text.
    """
    stdin, stdout, stderr = client.exec_command("cat /builds/slave/buildbot.tac")
    data = stdout.read()
    if isinstance(data, bytes):
        # Callers run str regexes over the result, so hand back text
        data = data.decode("utf-8", "replace")
    return data
def get_buildbot_master(client):
    """Read the master host and port out of the slave's buildbot.tac.

    Args:
        client: object accepted by ``get_tacfile``.

    Returns:
        A ``(host, port)`` tuple: ``host`` is the quoted value of the
        ``buildmaster_host = '...'`` line, ``port`` is the integer from the
        ``port = NNNN`` line.

    Raises:
        AssertionError: if either line is missing from the tac file.
    """
    tacfile = get_tacfile(client)
    host = re.search(r"^buildmaster_host = '(.*?)'$", tacfile, re.M)
    port = re.search(r"^port = (\d+)", tacfile, re.M)
    assert host and port, "could not find buildmaster_host/port in buildbot.tac"
    host = host.group(1)
    port = int(port.group(1))
    return host, port
def graceful_shutdown(name, ip, client):
    """Ask the slave's buildbot master to shut the slave down gracefully.

    Looks up the master in the slave's buildbot.tac and POSTs to the
    master's ``/buildslaves/<name>/shutdown`` URL.

    Args:
        name: slave name as known to the master; also used in log messages.
        ip: address of the slave (not used here; kept for the caller's
            signature).
        client: object accepted by ``get_buildbot_master``.
    """
    # Find out which master we're attached to by looking at buildbot.tac
    log.debug("%s - looking up which master we're attached to", name)
    host, port = get_buildbot_master(client)

    # http port is pb port -1000
    port -= 1000

    url = "http://{host}:{port}/buildslaves/{name}/shutdown".format(host=host, port=port, name=name)
    log.debug("%s - POSTing to %s", name, url)
    requests.post(url, allow_redirects=False)
def aws_stop_idle(secrets, passwords, regions, dryrun=False):
    """Stop idle buildbot slave instances in the given EC2 regions.

    For every ready/running slave instance: log in over SSH, work out its
    last activity, and
      * stop the instance if buildbot has already shut down;
      * leave it alone if it is still booting or was recently busy;
      * request a graceful shutdown from its master if it has been idle for
        more than 300 seconds, then stop it if buildbot exited right away.

    Args:
        secrets: keyword arguments passed through to ``boto.ec2`` (credentials).
        passwords: candidate SSH passwords for the slaves.
        regions: list of region names; if empty, all regions are examined.
        dryrun: when true, only log what would have been done.
    """
    if not regions:
        # Look at all regions
        log.debug("loading all regions")
        regions = [r.name for r in boto.ec2.regions(**secrets)]

    min_running_by_type = 0

    for r in regions:
        log.debug("looking at region %s", r)
        conn = boto.ec2.connect_to_region(r, **secrets)

        instances = get_buildbot_instances(conn)
        instances_by_type = {}
        for i in instances:
            # TODO: Check if launch_time is too old, and terminate the instance
            # if it is
            # NB can't turn this on until aws_create_instance is working properly (with ssh keys)
            instances_by_type.setdefault(i.tags['moz-type'], []).append(i)

        # Make sure min_running_by_type are kept running
        for t in instances_by_type:
            to_remove = instances_by_type[t][:min_running_by_type]
            for i in to_remove:
                log.debug("%s - keep running (min %i instances of type %s)", i.tags['Name'], min_running_by_type, i.tags['moz-type'])
                # Protected instances are no longer candidates for stopping
                instances.remove(i)

        for i in instances:
            name = i.tags['Name']
            # TODO: Check with slavealloc

            ip = i.private_ip_address
            ssh_client = get_ssh_client(name, ip, passwords)
            if not ssh_client:
                continue

            last_activity = get_last_activity(name, ssh_client)
            if last_activity == "stopped":
                # TODO: could be that the machine is just starting up....
                if not dryrun:
                    log.info("%s - stopping instance (launched %s)", name, i.launch_time)
                    i.stop()
                else:
                    log.info("%s - would have stopped", name)
                continue

            if last_activity == "booting":
                # Wait harder
                continue

            if last_activity is None:
                # No timestamped log activity was found, so there is no idle
                # time to compare; formatting None with %i or comparing it to
                # a number below would fail.
                log.debug("%s - no activity information; not stopping", name)
                continue

            log.debug("%s - last activity %is ago", name, last_activity)
            # Determine if the machine has been idle for more than 300 seconds (5 minutes)
            if last_activity > 300:
                if not dryrun:
                    # Hit graceful shutdown on the master
                    log.info("%s - starting graceful shutdown", name)
                    graceful_shutdown(name, ip, ssh_client)

                    # Check if we've exited right away
                    if get_last_activity(name, ssh_client) == "stopped":
                        log.info("%s - stopping instance", name)
                        i.stop()
                else:
                    log.info("%s - would have started graceful shutdown", name)
            else:
                log.debug("%s - not stopping", name)
if __name__ == '__main__':
    # Command-line entry point: parse options, load the JSON secrets and
    # password files, then run one idle-check pass over the chosen regions.
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-r", "--region", action="append", dest="regions")
    parser.add_option("-k", "--secrets", dest="secrets")
    parser.add_option("-s", "--key-name", dest="key_name")
    parser.add_option("-v", "--verbose", action="store_const", dest="loglevel", const=logging.DEBUG)
    parser.add_option("-p", "--passwords", dest="passwords")
    parser.add_option("--dry-run", action="store_true", dest="dryrun")

    options, args = parser.parse_args()

    logging.basicConfig(level=options.loglevel, format="%(asctime)s - %(message)s")

    # parser.error() prints the message and exits
    if not options.regions:
        parser.error("at least one region is required")

    if not options.secrets:
        parser.error("secrets are required")

    if not options.passwords:
        parser.error("passwords are required")

    # Use context managers so the credential files are closed promptly
    with open(options.secrets) as secrets_file:
        secrets = json.load(secrets_file)
    with open(options.passwords) as passwords_file:
        passwords = json.load(passwords_file)

    aws_stop_idle(secrets, passwords, options.regions, dryrun=options.dryrun)
Jump to Line
Something went wrong with that request. Please try again.