Skip to content
This repository has been archived by the owner on Oct 24, 2020. It is now read-only.

Commit

Permalink
feat: use cloudwatch logs for container output
Browse files Browse the repository at this point in the history
Uses a CloudWatch LogGroup for ardere test plan runs. Also includes
a refactor of the handler into a step_function file and a separate
handler.py for Lambda as it requires some quirky path manipulation
for AWS to see ardere as a top-level Python package.

Closes #27
  • Loading branch information
bbangert committed Mar 17, 2017
1 parent 4dd67db commit 8bafa09
Show file tree
Hide file tree
Showing 7 changed files with 287 additions and 206 deletions.
45 changes: 32 additions & 13 deletions ardere/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,31 @@ class ECSManager(object):
}

def __init__(self, plan):
# type: (str) -> None
# type: (Dict[str, Any]) -> None
"""Create and return a ECSManager for a cluster of the given name."""
self._ecs_client = self.boto.client('ecs')
self._ec2_client = self.boto.client('ec2')
self._ecs_name = plan["ecs_name"]
self._plan = plan

# Pull out the env vars
self.s3_ready_bucket = os.environ["s3_ready_bucket"]
self.container_log_group = os.environ["container_log_group"]
self.ecs_profile = os.environ["ecs_profile"]

if "plan_run_uuid" not in plan:
plan["plan_run_uuid"] = str(uuid.uuid4())

self._plan_uuid = plan["plan_run_uuid"]

    @property
    def plan_uuid(self):
        # type: () -> str
        """Unique identifier shared by every step of this test plan run."""
        return self._plan_uuid

@property
def s3_ready_file(self):
return "https://s3.amazonaws.com/{bucket}/{key}".format(
bucket=os.environ["s3_ready_bucket"],
bucket=self.s3_ready_bucket,
key="{}.ready".format(self._plan_uuid)
)

Expand Down Expand Up @@ -112,7 +121,7 @@ def request_instances(self, instances):
InstanceType=instance_type,
UserData="#!/bin/bash \necho ECS_CLUSTER='" + self._ecs_name +
"' >> /etc/ecs/ecs.config",
IamInstanceProfile={"Arn": os.environ["ecs_profile"]}
IamInstanceProfile={"Arn": self.ecs_profile}
)

# Track returned instances for tagging step
Expand All @@ -133,17 +142,17 @@ def create_service(self, step):
logger.info("CreateService called with: {}".format(step))

# Prep the shell command
shell_command = [
'sh', '-c', '"$WAITFORCLUSTER"',
'waitforcluster.sh', self.s3_ready_file,
str(step.get("run_delay", 0))
]
shell_command2 = ' '.join(shell_command) + ' && ' + step[
"additional_command_args"]
shell_command3 = ['sh', '-c', '{}'.format(shell_command2)]
wfc_var = '__ARDERE_WAITFORCLUSTER_SH__'
wfc_cmd = 'sh -c "${}" waitforcluster.sh {} {}'.format(
wfc_var,
self.s3_ready_file,
step.get("run_delay", 0)
)
service_cmd = step["additional_command_args"]
cmd = ['sh', '-c', '{} && {}'.format(wfc_cmd, service_cmd)]

# Prep the env vars
env_vars = [{"name": "WAITFORCLUSTER", "value": shell_script}]
env_vars = [{"name": wfc_var, "value": shell_script}]
for env_var in step.get("environment_data", []):
name, value = env_var.split("=", 1)
env_vars.append({"name": name, "value": value})
Expand All @@ -160,7 +169,17 @@ def create_service(self, step):
# using only memoryReservation sets no hard limit
"memoryReservation": 256,
"environment": env_vars,
"entryPoint": shell_command3
"entryPoint": cmd,
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": self.container_log_group,
"awslogs-region": "us-east-1",
"awslogs-stream-prefix": "ardere-{}".format(
self.plan_uuid
)
}
}
}
],
placementConstraints=[
Expand Down
129 changes: 0 additions & 129 deletions ardere/handler.py

This file was deleted.

159 changes: 159 additions & 0 deletions ardere/step_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import os
import logging
import time
from collections import defaultdict

import boto3
import botocore
from typing import Any, Dict, List # noqa

from ardere.aws import ECSManager
from ardere.exceptions import (
ServicesStartingException,
ShutdownPlanException
)

logger = logging.getLogger()
logger.setLevel(logging.INFO)


class AsynchronousPlanRunner(object):
    """Asynchronous Test Plan Runner

    This step function based runner handles running a test plan in an
    asynchronous manner, where each step will wait for its run_delay if
    present before running.

    Each public ``Step N`` method is a Lambda entry point wired to one state
    of the step function; each receives the plan ``event`` dict and returns
    it (possibly enriched) so it flows to the next state.
    """
    # For testing purposes: tests may swap in a stubbed boto3 module.
    boto = boto3

    def __init__(self, event, context):
        # type: (Dict[str, Any], Any) -> None
        """Capture the Lambda event/context and build the ECS manager.

        ECSManager mutates ``event`` in place, adding ``plan_run_uuid`` on
        the first step so later steps reuse the same run id.
        """
        logger.info("Called with {}".format(event))
        logger.info("Environ: {}".format(os.environ))
        self.ecs = ECSManager(plan=event)
        self.event = event
        self.context = context

    def _build_instance_map(self):
        # type: () -> Dict[str, int]
        """Given a JSON test-plan, build and return a dict of instance types
        and how many should exist for each type."""
        instances = defaultdict(int)
        for step in self.event["steps"]:
            instances[step["instance_type"]] += step["instance_count"]
        return instances

    def _find_test_plan_duration(self):
        # type: () -> int
        """Locates and calculates the longest test plan duration from its
        delay through its duration of the plan."""
        return max(
            [x.get("run_delay", 0) + x["run_max_time"] for x in
             self.event["steps"]]
        )

    def populate_missing_instances(self):
        # type: () -> Dict[str, Any]
        """Populate any missing EC2 instances needed for the test plan in the
        cluster

        Step 1
        """
        needed = self._build_instance_map()
        logger.info("Plan instances needed: {}".format(needed))
        current_instances = self.ecs.query_active_instances()
        missing_instances = self.ecs.calculate_missing_instances(
            desired=needed, current=current_instances
        )
        if missing_instances:
            logger.info("Requesting instances: {}".format(missing_instances))
            self.ecs.request_instances(missing_instances)
        return self.event

    def create_ecs_services(self):
        # type: () -> Dict[str, Any]
        """Create all the ECS services needed

        Step 2
        """
        self.ecs.create_services(self.event["steps"])
        return self.event

    def wait_for_cluster_ready(self):
        # type: () -> Dict[str, Any]
        """Check all the ECS services to see if they're ready

        Step 3

        Raises ServicesStartingException to make the step function retry
        until every service reports ready.
        """
        if not self.ecs.all_services_ready(self.event["steps"]):
            raise ServicesStartingException()
        return self.event

    def signal_cluster_start(self):
        # type: () -> Dict[str, Any]
        """Drop a ready file in S3 to trigger the test plan to begin

        Step 4

        The file body is the epoch start time; later steps read it back to
        compute how long the plan has been running.
        """
        s3_client = self.boto.client('s3')
        s3_client.put_object(
            ACL="public-read",
            # NOTE: b'{}'.format(...) is a Python 3 AttributeError (bytes
            # has no .format); build the str first, then encode.
            Body=str(int(time.time())).encode('utf-8'),
            Bucket=os.environ["s3_ready_bucket"],
            Key="{}.ready".format(self.ecs.plan_uuid),
            Metadata={
                "ECSCluster": self.event["ecs_name"]
            }
        )
        return self.event

    def check_for_cluster_done(self):
        # type: () -> Dict[str, Any]
        """Check all the ECS services to see if they've run for their
        specified duration

        Step 5

        Raises ShutdownPlanException when the plan has finished or the
        ready file is gone, which routes the step function to cleanup.
        """
        # Check to see if the S3 file is still around. s3.Object() is lazy
        # and never hits the network, so the fetch itself must be inside
        # the try for a missing/inaccessible file to abort the run.
        s3 = self.boto.resource('s3')
        try:
            ready_file = s3.Object(
                os.environ["s3_ready_bucket"],
                "{}.ready".format(self.ecs.plan_uuid)
            )
            file_contents = ready_file.get()['Body'].read().decode('utf-8')
        except botocore.exceptions.ClientError:
            # Error getting to the bucket/key, abort test run
            raise ShutdownPlanException("Error accessing ready file")

        start_time = int(file_contents)

        # Update to running count 0 any services that should halt by now
        self.ecs.stop_finished_services(start_time, self.event["steps"])

        # If we're totally done, exit.
        now = time.time()
        plan_duration = self._find_test_plan_duration()
        if now > (start_time + plan_duration):
            raise ShutdownPlanException("Test Plan has completed")
        return self.event

    def cleanup_cluster(self):
        # type: () -> Dict[str, Any]
        """Shutdown all ECS services and deregister all task definitions

        Step 6
        """
        self.ecs.shutdown_plan(self.event["steps"])

        # Attempt to remove the S3 object; best-effort, a missing file
        # just means the run already cleaned up.
        s3 = self.boto.resource('s3')
        try:
            ready_file = s3.Object(
                os.environ["s3_ready_bucket"],
                "{}.ready".format(self.ecs.plan_uuid)
            )
            ready_file.delete()
        except botocore.exceptions.ClientError:
            pass
        return self.event
Loading

0 comments on commit 8bafa09

Please sign in to comment.