Permalink
Browse files

Take reserved slaves into account when starting slaves.

Add region priorities
  • Loading branch information...
1 parent c162886 commit bdf4d20d96ef547b1f40c94633dda74aff633808 @catlee catlee committed Nov 16, 2012
Showing with 174 additions and 94 deletions.
  1. +164 −89 aws/aws_watch_pending.py
  2. +0 −2 aws/configs/bld-linux64
  3. +0 −2 aws/configs/try-linux64
  4. +10 −1 aws/watch_pending.cfg.template
@@ -14,15 +14,14 @@
import boto.ec2
import sqlalchemy as sa
-from aws_create_instance import make_instances
-
import logging
log = logging.getLogger()
def find_pending(db):
engine = sa.create_engine(db)
- result = engine.execute(sa.text("""
+ result = engine.execute(
+ sa.text("""
SELECT buildername, count(*) FROM
buildrequests WHERE
complete=0 AND
@@ -38,105 +37,173 @@ def find_pending(db):
return retval
-def aws_resume_instances(instance_type, count, regions, secrets):
- "resume up to `count` stopped instances of the given type in the given regions"
- instance_config = json.load(open("configs/%s" % instance_type))
- max_running = instance_config.get('max_running')
- if max_running is not None:
- running = aws_count_running(instance_type, regions, secrets)
- if running + count > max_running:
- count = max_running - running
- if count <= 0:
- log.info("max_running limit hit (%s - %i)", instance_type, max_running)
- return 0
-
- started = 0
- for region in regions:
- conn = boto.ec2.connect_to_region(region, **secrets)
- reservations = conn.get_all_instances(filters={
- 'tag:moz-state': 'ready',
- 'tag:moz-type': instance_type,
- 'instance-state-name': 'stopped',
- })
- stopped_instances = []
- for r in reservations:
- for i in r.instances:
- if not i.tags.get('moz-type') == instance_type:
- log.debug("skipping %s; wrong type (%s)", i, i.tags.get('moz-type'))
- continue
- if i.state != 'stopped':
- log.debug("skipping %s; wrong state (%s)", i, i.state)
- continue
- stopped_instances.append(i)
-
- # Sort by launch_time so we can start most recently stopped instances first
- stopped_instances.sort(key=lambda i: i.launch_time)
- stopped_instances.reverse()
- for i in stopped_instances:
- log.info("%s - %s - starting instance", region, i.tags['Name'])
- i.start()
- started += 1
-
- if started == count:
- return started
+# Used by aws_connect_to_region to cache connection objects per region
+_aws_cached_connections = {}
- return started
def aws_connect_to_region(region, secrets):
    """Return an EC2 connection for `region`, reusing a cached one if any.

    The first call for a region opens a connection with
    boto.ec2.connect_to_region(region, **secrets) and stores the result in
    the module-level _aws_cached_connections dict. Later calls for the same
    region return the stored object; `secrets` is not consulted again.
    """
    try:
        # Fast path: we already connected to this region
        return _aws_cached_connections[region]
    except KeyError:
        pass

    # Cache miss: connect once and remember the result for later callers
    _aws_cached_connections[region] = boto.ec2.connect_to_region(
        region, **secrets)
    return _aws_cached_connections[region]
-def aws_count_running(instance_type, regions, secrets):
- num = 0
+def aws_get_all_instances(regions, secrets):
+ """
+ Returns a list of all instances in the given regions
+ """
+ log.debug("fetching all instances for %s", regions)
+ retval = []
for region in regions:
- conn = boto.ec2.connect_to_region(region, **secrets)
- reservations = conn.get_all_instances(filters={
- 'tag:moz-type': instance_type,
- 'instance-state-name': 'running',
- })
+ conn = aws_connect_to_region(region, secrets)
+ reservations = conn.get_all_instances()
for r in reservations:
- num += len(r.instances)
- return num
+ retval.extend(r.instances)
+ return retval
+
def aws_filter_instances(all_instances, state=None, tags=None):
    """
    Returns the subset of all_instances matching the given state and tags

    all_instances: iterable of objects with a `state` attribute and a
                   dict-like `tags` attribute
    state:         if truthy, only instances whose `state` equals it are kept
    tags:          if truthy, a mapping of tag name -> required value; an
                   instance is kept only if every listed tag matches. A tag
                   missing from the instance compares as None.

    Returns a new list in the same order as all_instances; the input is not
    modified.
    """
    wanted_tags = tags or {}
    retval = []
    for i in all_instances:
        if state and i.state != state:
            continue
        # all() stops at the first mismatching tag; with no tags requested it
        # is trivially true and i.tags is never touched
        if all(i.tags.get(k) == v for k, v in wanted_tags.items()):
            retval.append(i)
    return retval
-def aws_create_instances(instance_type, count, regions, secrets, key_name, instance_data):
- instance_config = json.load(open("configs/%s" % instance_type))
- max_count = instance_config['max_instances']
- # Count how many we have in all regions
- num = 0
- instances = []
- names = []
+def aws_get_reservations(regions, secrets):
+ """
+ Return a mapping of (availability zone, ec2 instance type) -> count
+ """
+ log.debug("getting reservations for %s", regions)
+ retval = {}
for region in regions:
- conn = boto.ec2.connect_to_region(region, **secrets)
- reservations = conn.get_all_instances(filters={'tag:moz-type': instance_type})
+ conn = aws_connect_to_region(region, secrets)
+ reservations = conn.get_all_reserved_instances(filters={
+ 'state': 'active',
+ })
for r in reservations:
- for i in r.instances:
- if i.tags.get('moz-type') == instance_type and i.state != "terminated":
- instances.append(i)
- names.append(i.tags['Name'])
- num += 1
-
- num_to_create = min(max_count - num, count)
- log.info("%s - we have %i instances across all regions; we will create %i more (max is %i)", instance_type, num, num_to_create, max_count)
+ az = r.availability_zone
+ ec2_instance_type = r.instance_type
+ if (az, ec2_instance_type) not in retval:
+ retval[az, ec2_instance_type] = 0
+ retval[az, ec2_instance_type] += r.instance_count
+ return retval
- i = 1
- to_create = []
- while len(to_create) < num_to_create:
- # Figure out its names
- name = instance_config['hostname'] % i
- if name not in names and name not in to_create:
- to_create.append(name)
- i += 1
- log.info("%s - creating %s", instance_type, to_create)
def aws_filter_reservations(reservations, running_instances):
    """
    Filters reservations by reducing the count for reservations by the number
    of running instances of the appropriate type. Removes entries for
    reservations that are fully used.

    reservations:      dict mapping (availability zone, ec2 instance type)
                       to the number of reserved instances
    running_instances: iterable of objects with `placement` and
                       `instance_type` attributes

    Modifies reservations in place; returns None
    """
    # Subtract running instances from our reservations
    for i in running_instances:
        k = (i.placement, i.instance_type)
        if k in reservations:
            reservations[k] -= 1
    log.debug("available reservations: %s", reservations)

    # Remove reservations that are used up. Iterate over a snapshot of the
    # items: deleting keys while iterating the live dict view raises
    # RuntimeError on Python 3.
    for k, count in list(reservations.items()):
        if count <= 0:
            log.debug("all reservations for %s are used; removing", k)
            del reservations[k]
+
+
+def aws_resume_instances(moz_instance_type, start_count, regions, secrets, region_priorities, dryrun):
+ "Resume up to `start_count` stopped instances of the given type in the given regions"
+ # Fetch all our instance information
+ all_instances = aws_get_all_instances(regions, secrets)
+ # We'll filter by these tags in general
+ tags = {'moz-state': 'ready', 'moz-type': moz_instance_type}
+
+ # If our instance config specifies a maximum number of running instances,
+ # apply that now. This may mean that we reduce start_count, or return early
+ # if we're already running >= max_running
+ instance_config = json.load(open("configs/%s" % moz_instance_type))
+ max_running = instance_config.get('max_running')
+ if max_running is not None:
+ running = len(aws_filter_instances(all_instances, state='running', tags=tags))
+ if running + start_count > max_running:
+ start_count = max_running - running
+ if start_count <= 0:
+ log.info("max_running limit hit (%s - %i)", moz_instance_type, max_running)
+ return 0
- # TODO do multi-region
- if to_create:
- make_instances(to_create, instance_config[regions[0]], regions[0], secrets, key_name, instance_data, create_ami=False)
+ # Get our list of stopped instances, sorted by region priority, then launch_time
+ # Higher region priorities mean we'll prefer to start those instances first
+ def _instance_sort_key(i):
+ # Region is (usually?) the placement with the last character dropped
+ r = i.placement[:-1]
+ if r not in region_priorities:
+ log.warning("No region priority for %s; az=%s; region_priorities=%s",
+ r, i.placement, region_priorities)
+ p = region_priorities.get(r, 0)
+ return (p, i.launch_time)
+ stopped_instances = list(reversed(sorted(
+ aws_filter_instances(all_instances, state='stopped', tags=tags),
+ key=_instance_sort_key)))
+ log.debug("stopped_instances: %s", stopped_instances)
+
+ # Get our current reservations
+ reservations = aws_get_reservations(regions, secrets)
+ log.debug("current reservations: %s", reservations)
+
+ # Get our currently running instances
+ running_instances = aws_filter_instances(all_instances, state='running')
+
+ # Filter the reservations
+ aws_filter_reservations(reservations, running_instances)
+ log.debug("filtered reservations: %s", reservations)
+
+ # List of (instance, is_reserved) tuples
+ to_start = []
+
+ # While we still have reservations, start instances that can use those
+ # reservations first
+ for i in stopped_instances[:]:
+ k = (i.placement, i.instance_type)
+ if k not in reservations:
+ continue
+ stopped_instances.remove(i)
+ to_start.append((i, True))
+ reservations[k] -= 1
+ if reservations[k] <= 0:
+ del reservations[k]
+
+ # Add the rest of the stopped instances
+ to_start.extend((i, False) for i in stopped_instances)
+
+ # Limit ourselves to start only start_count instances
+ log.debug("starting up to %i instances", start_count)
+ to_start = to_start[:start_count]
+
+ log.debug("to_start: %s", to_start)
+
+ for i, is_reserved in to_start:
+ r = "reserved instance" if is_reserved else "instance"
+ if not dryrun:
+ log.info("%s - %s - starting %s", i.placement, i.tags['Name'], r)
+ i.start()
+ else:
+ log.info("%s - %s - would start %s", i.placement, i.tags['Name'], r)
- return len(to_create)
+ return len(to_start)
-def aws_watch_pending(db, regions, secrets, key_name, builder_map):
+def aws_watch_pending(db, regions, secrets, key_name, builder_map, region_priorities, dryrun):
# First find pending jobs in the db
pending = find_pending(db)
@@ -154,10 +221,10 @@ def aws_watch_pending(db, regions, secrets, key_name, builder_map):
log.debug("%s has %i pending jobs, but no instance types defined", pending_buildername, count)
for instance_type, count in to_create.items():
- log.debug("Need %i %s", count, instance_type)
+ log.debug("need %i %s", count, instance_type)
# Check for stopped instances in the given regions and start them if there are any
- started = aws_resume_instances(instance_type, count, regions, secrets)
+ started = aws_resume_instances(instance_type, count, regions, secrets, region_priorities, dryrun)
count -= started
log.info("%s - started %i instances; need %i", instance_type, started, count)
@@ -170,13 +237,15 @@ def aws_watch_pending(db, regions, secrets, key_name, builder_map):
loglevel=logging.INFO,
key_name=None,
config=None,
+ dryrun=False,
)
parser.add_option("-r", "--region", action="append", dest="regions")
parser.add_option("-k", "--secrets", dest="secrets")
parser.add_option("-s", "--key-name", dest="key_name")
parser.add_option("-v", "--verbose", action="store_const", dest="loglevel", const=logging.DEBUG)
parser.add_option("-c", "--config", dest="config")
+ parser.add_option("-n", "--dryrun", dest="dryrun", action="store_true", help="don't actually do anything")
options, args = parser.parse_args()
@@ -194,7 +263,13 @@ def aws_watch_pending(db, regions, secrets, key_name, builder_map):
config = json.load(open(options.config))
secrets = json.load(open(options.secrets))
+
aws_watch_pending(
- config['db'], options.regions, secrets,
- options.key_name, config['buildermap'],
+ config['db'],
+ options.regions,
+ secrets,
+ options.key_name,
+ config['buildermap'],
+ config['region_priorities'],
+ options.dryrun,
)
@@ -1,6 +1,4 @@
{
- "max_instances": 50,
- "min_running_instances": 2,
"hostname": "bld-linux64-ec2-%03d",
"us-west-1": {
"type": "bld-linux64",
@@ -1,6 +1,4 @@
{
- "max_instances": 50,
- "min_running_instances": 2,
"hostname": "try-linux64-ec2-%03d",
"us-west-1": {
"type": "try-linux64",
@@ -1,5 +1,10 @@
{
"db": "mysql://user@host/db",
+"region_priorities": {
+ "us-west-2": 5,
+ "us-east-1": 5,
+ "us-west-1": 0
+ },
"buildermap": {
"B2G ics_armv7a_gecko(-debug)? (?!try)\\S+ build": "bld-linux64",
"B2G ics_armv7a_gecko(-debug)? try build": "try-linux64",
@@ -9,5 +14,9 @@
"^Linux x86-64 (?!try)\\S+ (pgo-)?build": "bld-linux64",
"^Linux.* nightly": "bld-linux64",
"^Linux.* valgrind": "bld-linux64",
- "^Linux.* try.*build": "try-linux64"
+ "^Linux.* try.*build": "try-linux64",
+ "^b2g_(?!try)\\S+_(unagi|panda)_(dep|nightly)": "bld-linux64",
+ "^b2g_try_(unagi|panda)_(dep|nightly)": "try-linux64",
+ "^b2g_(?!try)\\S+_ics_armv7a_gecko(-debug)?": "bld-linux64",
+ "^b2g_try_ics_armv7a_gecko(-debug)?": "try-linux64"
}}

0 comments on commit bdf4d20

Please sign in to comment.