From 843fc255976f0efa807da62f2c8c56a28b10ac37 Mon Sep 17 00:00:00 2001
From: Justin Walgran
Date: Tue, 14 May 2019 15:43:08 -0700
Subject: [PATCH] Report batch job failures to Rollbar via a Lambda function

We want to be alerted when there is a job failure. To accomplish this we
set up a CloudWatch event to detect failures and connect that event to a
Lambda function that posts the details of the failure to Rollbar.

This strategy has been used successfully on another project.
---
 .gitignore                                |  4 ++
 deployment/terraform/iam.tf               | 26 +++++++++
 .../alert_batch_failures/Makefile         | 11 ++++
 .../alert_batch_failures/requirements.txt |  1 +
 .../src/alert_batch_failures.py           | 52 ++++++++++++++++++
 deployment/terraform/lambda.tf            | 55 +++++++++++++++++++
 deployment/terraform/variables.tf         |  4 ++
 scripts/infra                             |  3 +
 8 files changed, 156 insertions(+)
 create mode 100644 deployment/terraform/lambda-functions/alert_batch_failures/Makefile
 create mode 100644 deployment/terraform/lambda-functions/alert_batch_failures/requirements.txt
 create mode 100644 deployment/terraform/lambda-functions/alert_batch_failures/src/alert_batch_failures.py
 create mode 100644 deployment/terraform/lambda.tf

diff --git a/.gitignore b/.gitignore
index 27cf3ccf8..468d7d04a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,7 @@ research/dedupe/gazetteer/gazetteer_messy_pass_2.csv
 research/dedupe/gazetteer/gazetteer_output.csv
 research/dedupe/gazetteer/gazetteer_pass_2_output.csv
 research/dedupe/gazetteer/gazetteer_training.json
+deployment/terraform/lambda-functions/**/*.zip
+deployment/terraform/lambda-functions/**/*.out
+deployment/terraform/lambda-functions/**/src/*
+!deployment/terraform/lambda-functions/alert_batch_failures/src/alert_batch_failures.py
diff --git a/deployment/terraform/iam.tf b/deployment/terraform/iam.tf
index b1e16274b..00af9cfc3 100644
--- a/deployment/terraform/iam.tf
+++ b/deployment/terraform/iam.tf
@@ -150,3 +150,29 @@ resource "aws_iam_role_policy_attachment" "spot_fleet_policy" {
   role       = "${aws_iam_role.container_instance_spot_fleet.name}"
   policy_arn = "${var.spot_fleet_service_role_policy_arn}"
 }
+
+#
+# Lambda resources
+#
+data "aws_iam_policy_document" "alert_batch_failures_assume_role" {
+  statement {
+    effect = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["lambda.amazonaws.com"]
+    }
+
+    actions = ["sts:AssumeRole"]
+  }
+}
+
+resource "aws_iam_role" "alert_batch_failures" {
+  name               = "lambda${var.environment}AlertBatchFailures"
+  assume_role_policy = "${data.aws_iam_policy_document.alert_batch_failures_assume_role.json}"
+}
+
+resource "aws_iam_role_policy_attachment" "alert_batch_failures_lambda_policy" {
+  role       = "${aws_iam_role.alert_batch_failures.name}"
+  policy_arn = "${var.aws_lambda_service_role_policy_arn}"
+}
diff --git a/deployment/terraform/lambda-functions/alert_batch_failures/Makefile b/deployment/terraform/lambda-functions/alert_batch_failures/Makefile
new file mode 100644
index 000000000..539b92627
--- /dev/null
+++ b/deployment/terraform/lambda-functions/alert_batch_failures/Makefile
@@ -0,0 +1,11 @@
+all: requirements
+	rm -rf alert_batch_failures.zip
+	(cd src && zip -qXr ../alert_batch_failures.zip .)
+
+requirements: requirements.txt.out
+
+requirements.txt.out: requirements.txt
+	git clean -qfdx src
+	pip install -q -t src -r requirements.txt | tee requirements.txt.out
+
+.PHONY: all requirements
diff --git a/deployment/terraform/lambda-functions/alert_batch_failures/requirements.txt b/deployment/terraform/lambda-functions/alert_batch_failures/requirements.txt
new file mode 100644
index 000000000..f0b54caee
--- /dev/null
+++ b/deployment/terraform/lambda-functions/alert_batch_failures/requirements.txt
@@ -0,0 +1 @@
+rollbar==0.14.6
diff --git a/deployment/terraform/lambda-functions/alert_batch_failures/src/alert_batch_failures.py b/deployment/terraform/lambda-functions/alert_batch_failures/src/alert_batch_failures.py
new file mode 100644
index 000000000..e3bdfc947
--- /dev/null
+++ b/deployment/terraform/lambda-functions/alert_batch_failures/src/alert_batch_failures.py
@@ -0,0 +1,52 @@
+"""Lambda function that reports failed AWS Batch jobs to Rollbar."""
+import os
+
+import rollbar
+
+environment = os.getenv('ENVIRONMENT', 'Staging')
+rollbar_token = os.getenv('ROLLBAR_SERVER_SIDE_ACCESS_TOKEN')
+rollbar.init(rollbar_token, environment)
+
+# Only used when the event does not carry its own "region" field
+DEFAULT_REGION = "eu-west-1"
+CLOUDWATCH_LOGS_URL = "https://console.aws.amazon.com/cloudwatch/home?region={region}#logEventViewer:group=/aws/batch/job;stream={logstream}"  # NOQA
+
+
+@rollbar.lambda_function
+def handler(event, context):
+    """Post the details of a failed Batch job to Rollbar.
+
+    `event` is the CloudWatch event for the job state change and `context`
+    is unused. Attempts without a log stream are skipped and a missing
+    status reason is reported as "unknown" rather than raising KeyError.
+    """
+    detail = event["detail"]
+    region = event.get("region", DEFAULT_REGION)
+    log_streams = [attempt.get("container", {}).get("logStreamName")
+                   for attempt in detail.get("attempts", [])]
+    cloudwatch_urls = [
+        CLOUDWATCH_LOGS_URL.format(region=region, logstream=stream)
+        for stream in log_streams if stream]
+
+    msg = """
+{jobname} (JobID {jid}) entered state {state}. Reason: {reason}.
+
+CloudWatch URLs:
+
+
+- {cloudwatchurls}
+    """.format(jobname=detail["jobName"].split("-")[0],
+               jid=detail["jobId"],
+               state=detail["status"],
+               reason=detail.get("statusReason", "unknown"),
+               # The template supplies the first "- ", so the separator
+               # repeats it in full to keep every bullet formatted alike
+               cloudwatchurls=("\n\n- ".join(cloudwatch_urls)
+                               or "none available"))
+
+    rollbar.report_message(
+        msg,
+        level='error',
+        # Give each CloudWatch URL its own column in the occurrences tab
+        extra_data={"cloudwatch_url_{}".format(index): url
+                    for index, url in enumerate(cloudwatch_urls)})
diff --git a/deployment/terraform/lambda.tf b/deployment/terraform/lambda.tf
new file mode 100644
index 000000000..6d6645852
--- /dev/null
+++ b/deployment/terraform/lambda.tf
@@ -0,0 +1,55 @@
+resource "aws_lambda_function" "alert_batch_failures" {
+  filename         = "${path.module}/lambda-functions/alert_batch_failures/alert_batch_failures.zip"
+  source_code_hash = "${base64sha256(file("${path.module}/lambda-functions/alert_batch_failures/alert_batch_failures.zip"))}"
+  function_name    = "func${var.environment}AlertBatchFailures"
+  description      = "Function to alert on AWS Batch Job Failures."
+  role             = "${aws_iam_role.alert_batch_failures.arn}"
+  handler          = "alert_batch_failures.handler"
+  runtime          = "python3.6"
+  timeout          = 10
+  memory_size      = 128
+
+  environment {
+    variables = {
+      ENVIRONMENT                      = "${var.environment}"
+      ROLLBAR_SERVER_SIDE_ACCESS_TOKEN = "${var.rollbar_server_side_access_token}"
+    }
+  }
+
+  tags {
+    Project     = "${var.project}"
+    Environment = "${var.environment}"
+  }
+}
+
+resource "aws_cloudwatch_event_rule" "alert_batch_failures" {
+  name        = "rule${var.environment}AlertBatchFailures"
+  description = "Rule to send alerts when batch jobs fail."
+
+  event_pattern = <