Skip to content
This repository has been archived by the owner on Oct 24, 2020. It is now read-only.

Commit

Permalink
feat: secure metrics/load-test nodes from outside access
Browse files Browse the repository at this point in the history
Locks down the metrics node to the load-test nodes for metrics and
restricts outside Grafana access to the Grafana security group (SG).

Closes #54
  • Loading branch information
bbangert committed Apr 18, 2017
1 parent 067bedc commit bebe38a
Show file tree
Hide file tree
Showing 13 changed files with 514 additions and 214 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@ dist: precise
matrix:
include:
- python: 2.7
env: CODECOV=true

install:
- pip install -r test-requirements.txt
- pip install ${CODECOV:+codecov}
script:
- nosetests -d tests -- ${CODECOV:+--with-coverage --cover-xml --cover-package=ardere}
after_success:
- ${CODECOV:+codecov}
- codecov
notifications:
slack:
secure: vT9sWtUuxk28g6xYKAsQmiPZllErOYVfx5lcL+/jo1eRFrmbpYnyndT6s+FxGI1547oizZ0IqZbHVvB7BUoSJixXJyQJYXW2MchwN1UeHrey8mYpF1GNEaJT7FMfqSkxUU9gvAZ3IU7zstNeTLbfG1GkLuzybp0WAiHl/ocUTz8=
180 changes: 135 additions & 45 deletions ardere/aws.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""AWS Helper Classes"""
import json
import logging
import os
import time
Expand All @@ -8,7 +9,13 @@
import boto3
import botocore
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional # noqa
from typing import (
Any,
Dict,
List,
Optional,
Tuple
) # noqa

logger = logging.getLogger()
logger.setLevel(logging.INFO)
Expand All @@ -20,6 +27,8 @@
"waitforcluster.sh")
telegraf_script_path = os.path.join(parent_dir_path, "src", "shell",
"telegraf.toml")
metric_create_script = os.path.join(parent_dir_path, "ardere", "scripts",
"metric_creator.py")

# EC2 userdata to setup values on load
# Settings for net.ipv4 settings based on:
Expand Down Expand Up @@ -88,6 +97,7 @@


def cpu_units_for_instance_type(instance_type):
# type: (str) -> int
"""Calculate how many CPU units to allocate for an instance_type
We calculate cpu_units as 1024 * vcpu's for each instance to allocate
Expand Down Expand Up @@ -115,9 +125,11 @@ class ECSManager(object):
influxdb_container = "influxdb:1.2-alpine"
telegraf_container = "telegraf:1.2-alpine"
grafana_container = "grafana/grafana:4.1.2"
python_container = "jfloff/alpine-python:2.7-slim"

_wait_script = None
_telegraf_script = None
_metric_create_script = None

def __init__(self, plan):
# type: (Dict[str, Any]) -> None
Expand Down Expand Up @@ -151,6 +163,13 @@ def telegraf_script(self):
self._telegraf_script = f.read()
return self._telegraf_script

@property
def metric_create_script(self):
if not self._metric_create_script:
with open(metric_create_script, 'r') as f:
self._metric_create_script = f.read()
return self._metric_create_script

@property
def plan_uuid(self):
    # type: () -> str
    """UUID string identifying this plan run (used in names/prefixes)."""
    return self._plan_uuid
Expand All @@ -162,6 +181,17 @@ def s3_ready_file(self):
key="{}.ready".format(self._plan_uuid)
)

@property
def log_config(self):
return {
"logDriver": "awslogs",
"options": {"awslogs-group": self.container_log_group,
"awslogs-region": "us-east-1",
"awslogs-stream-prefix":
"ardere-{}".format(self.plan_uuid)
}
}

@property
def influx_db_name(self):
return "run-{}".format(self.plan_uuid)
Expand All @@ -175,12 +205,20 @@ def grafana_admin_password(self):
return self._plan["metrics_options"]["dashboard"]["admin_password"]

def family_name(self, step):
# type: (Dict[str, Any]) -> str
"""Generate a consistent family name for a given step"""
return step["name"] + "-" + self._plan_uuid

def metrics_family_name(self):
# type: () -> str
"""Generate a consistent metrics family name"""
return "{}-metrics".format(self._ecs_name)

def metrics_setup_family_name(self):
# type: () -> str
"""Generate a consistent metric setup family name"""
return "{}-metrics-setup".format(self._ecs_name)

def query_active_instances(self, additional_tags=None):
# type: (Optional[Dict[str, str]]) -> Dict[str, int]
"""Query EC2 for all the instances owned by ardere for this cluster."""
Expand Down Expand Up @@ -225,6 +263,25 @@ def has_metrics_node(self, instance_type):
)
return instance_type in instances

def has_started_metric_creation(self):
    # type: () -> bool
    """Return whether the metric creation container was started

    Tasks launched for this plan are tagged via ``startedBy`` with the
    plan UUID, so a non-empty listing means the task was started.
    """
    listing = self._ecs_client.list_tasks(
        cluster=self._ecs_name,
        startedBy=self.plan_uuid
    )
    return len(listing["taskArns"]) > 0

def has_finished_metric_creation(self):
    # type: () -> bool
    """Return whether the metric creation container has finished

    Filters the plan's tasks by ``desiredStatus="STOPPED"``; a non-empty
    listing means the task has run to completion (or was stopped).
    """
    response = self._ecs_client.list_tasks(
        cluster=self._ecs_name,
        startedBy=self.plan_uuid,
        desiredStatus="STOPPED"
    )
    return bool(response["taskArns"])

def request_instances(self, instances, security_group_ids,
additional_tags=None):
# type: (Dict[str, int], List[str], Optional[Dict[str, str]]) -> None
Expand Down Expand Up @@ -255,25 +312,32 @@ def request_instances(self, instances, security_group_ids,
)

def locate_metrics_container_ip(self):
    # type: () -> Tuple[Optional[str], Optional[str]]
    """Locate the metrics container's private IP and container instance arn

    Returns a tuple of (private_ip, container_arn); both values are None
    when no container instance is running the metrics service.

    Fixes: removed a stale duplicate docstring left as a dead string
    literal, and corrected the docstring, which said ``public_ip`` while
    the code returns ``private_ip_address``.
    """
    response = self._ecs_client.list_container_instances(
        cluster=self._ecs_name,
        filter="task:group == service:metrics"
    )
    if not response["containerInstanceArns"]:
        return None, None

    container_arn = response["containerInstanceArns"][0]
    response = self._ecs_client.describe_container_instances(
        cluster=self._ecs_name,
        containerInstances=[container_arn]
    )

    container_instance = response["containerInstances"][0]
    ec2_instance_id = container_instance["ec2InstanceId"]
    instance = self.boto.resource("ec2").Instance(ec2_instance_id)
    return instance.private_ip_address, container_arn

def locate_metrics_service(self):
# type: () -> Optional[str]
"""Locate and return the metrics service arn if any"""
response = self._ecs_client.describe_services(
cluster=self._ecs_name,
Expand Down Expand Up @@ -309,6 +373,30 @@ def create_metrics_service(self, options):
"http://admin:admin@localhost:3000/api/datasources"
}

# Setup the task definition for setting up influxdb/grafana instances
# per run
mc_cmd = """\
pip install influxdb requests boto3 && \
echo "${__ARDERE_PYTHON_SCRIPT__}" > setup_db.py && \
python setup_db.py
"""
mc_cmd = ['sh', '-c', '{}'.format(mc_cmd)]
self._ecs_client.register_task_definition(
family=self.metrics_setup_family_name(),
containerDefinitions=[
{
"name": "metricsetup",
"image": self.python_container,
"cpu": 128,
"entryPoint": mc_cmd,
"memoryReservation": 256,
"privileged": True,
"logConfiguration": self.log_config
}
],
networkMode="host"
)

task_response = self._ecs_client.register_task_definition(
family=self.metrics_family_name(),
containerDefinitions=[
Expand All @@ -323,20 +411,12 @@ def create_metrics_service(self, options):
{"containerPort": 8086},
{"containerPort": 8088}
],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": self.container_log_group,
"awslogs-region": "us-east-1",
"awslogs-stream-prefix":
"ardere-{}".format(self.plan_uuid)
}
}
"logConfiguration": self.log_config
},
{
"name": "grafana",
"image": self.grafana_container,
"cpu": 512,
"cpu": 256,
"memoryReservation": 256,
"entryPoint": cmd,
"portMappings": [
Expand All @@ -347,15 +427,7 @@ def create_metrics_service(self, options):
{"name": key, "value": value} for key, value in
gf_env.items()
],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": self.container_log_group,
"awslogs-region": "us-east-1",
"awslogs-stream-prefix":
"ardere-{}".format(self.plan_uuid)
}
}
"logConfiguration": self.log_config
}
],
# use host network mode for optimal performance
Expand Down Expand Up @@ -389,6 +461,40 @@ def create_metrics_service(self, options):
service_arn = service_result["service"]["serviceArn"]
return dict(task_arn=task_arn, service_arn=service_arn)

def run_metric_creation_task(self, container_instance, grafana_auth,
                             dashboard=None,
                             dashboard_name=None):
    # type: (str, Tuple[str, str], Optional[str], Optional[str]) -> None
    """Starts the metric creation task

    :param container_instance: container instance arn to run the task on
    :param grafana_auth: (user, password) pair for Grafana
    :param dashboard: optional dashboard definition to install
    :param dashboard_name: name for the optional dashboard
    """
    environ = {
        "__ARDERE_GRAFANA_USER__": grafana_auth[0],
        "__ARDERE_GRAFANA_PASS__": grafana_auth[1],
        "__ARDERE_PYTHON_SCRIPT__": self.metric_create_script,
        "__ARDERE_INFLUXDB_NAME__": self.influx_db_name,
    }
    if dashboard:
        environ["__ARDERE_DASHBOARD__"] = dashboard
        environ["__ARDERE_DASHBOARD_NAME__"] = dashboard_name

    # ECS expects the environment as a list of name/value mappings.
    env_list = [{"name": k, "value": v} for k, v in environ.items()]
    override = {"name": "metricsetup", "environment": env_list}
    self._ecs_client.start_task(
        cluster=self._ecs_name,
        taskDefinition=self.metrics_setup_family_name(),
        overrides={'containerOverrides': [override]},
        containerInstances=[container_instance],
        startedBy=self.plan_uuid
    )

def create_service(self, step):
# type: (Dict[str, Any]) -> Dict[str, Any]
"""Creates an ECS service for a step and returns its info"""
Expand Down Expand Up @@ -432,16 +538,7 @@ def create_service(self, step):
"ulimits": [
dict(name="nofile", softLimit=1000000, hardLimit=1000000)
],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": self.container_log_group,
"awslogs-region": "us-east-1",
"awslogs-stream-prefix": "ardere-{}".format(
self.plan_uuid
)
}
}
"logConfiguration": self.log_config
}
if "port_mapping" in step:
ports = [{"containerPort": port} for port in step["port_mapping"]]
Expand Down Expand Up @@ -470,22 +567,13 @@ def create_service(self, step):
{"name": "__ARDERE_TELEGRAF_STEP__",
"value": step["name"]},
{"name": "__ARDERE_INFLUX_ADDR__",
"value": "{}:8086".format(self._plan["influxdb_public_ip"])},
"value": "{}:8086".format(self._plan["influxdb_private_ip"])},
{"name": "__ARDERE_INFLUX_DB__",
"value": self.influx_db_name},
{"name": "__ARDERE_TELEGRAF_TYPE__",
"value": step["docker_series"]}
],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": self.container_log_group,
"awslogs-region": "us-east-1",
"awslogs-stream-prefix": "ardere-{}".format(
self.plan_uuid
)
}
}
"logConfiguration": self.log_config
}

task_response = self._ecs_client.register_task_definition(
Expand Down Expand Up @@ -582,6 +670,7 @@ def stop_finished_services(self, start_time, steps):
self.stop_finished_service(start_time, step)

def shutdown_plan(self, steps):
# type: (List[Dict[str, Any]]) -> None
"""Terminate the entire plan, ensure all services and task
definitions are completely cleaned up and removed"""
# Locate all the services for the ECS Cluster
Expand Down Expand Up @@ -627,6 +716,7 @@ def shutdown_plan(self, steps):
# Add in the metrics family name if we need to tear_down
if self._plan["metrics_options"]["tear_down"]:
step_family_names.append(self.metrics_family_name())
step_family_names.append(self.metrics_setup_family_name())

for family_name in step_family_names:
try:
Expand Down
4 changes: 4 additions & 0 deletions ardere/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ class ValidationException(Exception):

# NOTE(review): presumably raised to delay cluster teardown until all
# instances have drained -- confirm at the raise sites.
class UndrainedInstancesException(Exception):
    """There are still ACTIVE or DRAINING instances in the cluster"""


# NOTE(review): presumably caught by polling/retry logic that waits for
# the metric creation task to finish -- confirm at the raise sites.
class CreatingMetricSourceException(Exception):
    """Metric creation task hasn't completed yet"""
2 changes: 2 additions & 0 deletions ardere/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#

Loading

0 comments on commit bebe38a

Please sign in to comment.