Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Commit

Permalink
Report batch job failures to Rollbar via a Lambda function
Browse files Browse the repository at this point in the history
We want to be alerted when there is a job failure. To accomplish this we set up
a CloudWatch event to detect failures and connect that event to a Lambda
function that posts the details of the failure to Rollbar. This strategy has
been used successfully on another project.
  • Loading branch information
jwalgran committed May 16, 2019
1 parent 20871af commit 843fc25
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Expand Up @@ -49,3 +49,7 @@ research/dedupe/gazetteer/gazetteer_messy_pass_2.csv
research/dedupe/gazetteer/gazetteer_output.csv
research/dedupe/gazetteer/gazetteer_pass_2_output.csv
research/dedupe/gazetteer/gazetteer_training.json
deployment/terraform/lambda-functions/**/*.zip
deployment/terraform/lambda-functions/**/*.out
deployment/terraform/lambda-functions/**/src/*
!deployment/terraform/lambda-functions/alert_batch_failures/src/alert_batch_failures.py
26 changes: 26 additions & 0 deletions deployment/terraform/iam.tf
Expand Up @@ -150,3 +150,29 @@ resource "aws_iam_role_policy_attachment" "spot_fleet_policy" {
role = "${aws_iam_role.container_instance_spot_fleet.name}"
policy_arn = "${var.spot_fleet_service_role_policy_arn}"
}

#
# Lambda resources
#
# Trust policy allowing the Lambda service to assume the execution role.
data "aws_iam_policy_document" "alert_batch_failures_assume_role" {
  statement {
    effect = "Allow"

    principals {
      type        = "Service"
      identifiers = ["lambda.amazonaws.com"]
    }

    actions = ["sts:AssumeRole"]
  }
}

# Execution role for the batch-failure alerting Lambda function.
resource "aws_iam_role" "alert_batch_failures" {
  name               = "lambda${var.environment}AlertBatchFailures"
  assume_role_policy = "${data.aws_iam_policy_document.alert_batch_failures_assume_role.json}"
}

# Attach the managed Lambda service policy (defined in variables.tf) so the
# function can write its own CloudWatch logs.
resource "aws_iam_role_policy_attachment" "alert_batch_failures_lambda_policy" {
  role       = "${aws_iam_role.alert_batch_failures.name}"
  policy_arn = "${var.aws_lambda_service_role_policy_arn}"
}
@@ -0,0 +1,11 @@
# Build the deployment archive for the alert_batch_failures Lambda
# function: vendor the Python dependencies into src/, then zip src/.
all: requirements
	rm -rf alert_batch_failures.zip
	(cd src && zip -qXr ../alert_batch_failures.zip .)

requirements: requirements.txt.out

# Sentinel target: re-vendor dependencies only when requirements.txt
# changes. Redirect (not `| tee`) so a pip failure propagates its exit
# status to make instead of being masked by the pipeline.
requirements.txt.out: requirements.txt
	git clean -qfdx src
	pip install -q -t src -r requirements.txt > requirements.txt.out

.PHONY: all requirements
@@ -0,0 +1 @@
rollbar==0.14.6
@@ -0,0 +1,37 @@
"""Lambda function that reports AWS Batch job failures to Rollbar."""
import os
import rollbar

# Rollbar "environment" label for grouping occurrences; falls back to
# Staging when the Lambda environment variable is unset.
environment = os.getenv('ENVIRONMENT', 'Staging')
# Server-side access token, injected via the Lambda environment
# (see the aws_lambda_function resource in lambda.tf).
rollbar_token = os.getenv('ROLLBAR_SERVER_SIDE_ACCESS_TOKEN')
rollbar.init(rollbar_token, environment)

# Deep-link template to a Batch job attempt's CloudWatch log stream.
# NOTE(review): region is hard-coded to eu-west-1 — confirm it matches the
# deployment region.
CLOUDWATCH_LOGS_URL = "https://console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/batch/job;stream={logstream}" # NOQA


@rollbar.lambda_function
def handler(event, context):
    """Report an AWS Batch job failure to Rollbar.

    Invoked by a CloudWatch event rule that matches Batch "FAILED" state
    changes (see lambda.tf). Builds one CloudWatch Logs URL per job
    attempt and posts an error-level message to Rollbar.

    Args:
        event: CloudWatch event payload; ``event["detail"]`` carries the
            Batch job state-change fields (``jobName``, ``jobId``,
            ``status``, ``statusReason``, ``attempts``).
        context: Lambda context object (unused).
    """
    detail = event["detail"]

    # One log-stream deep link per job attempt.
    cloudwatch_urls = [
        CLOUDWATCH_LOGS_URL.format(
            logstream=attempt["container"]["logStreamName"])
        for attempt in detail["attempts"]
    ]

    # Join with "\n\n- " (note the trailing space) so every URL after the
    # first renders as a "- " bullet, matching the first one in the template.
    # Job names carry a suffix after "-"; keep only the base name.
    msg = """
{jobname} (JobID {jid}) entered state {state}. Reason: {reason}.
CloudWatch URLs:
- {cloudwatchurls}
""".format(jobname=detail["jobName"].split("-")[0],
           jid=detail["jobId"],
           state=detail["status"],
           reason=detail["statusReason"],
           cloudwatchurls="\n\n- ".join(cloudwatch_urls))

    rollbar.report_message(
        msg,
        level='error',
        # Give each CloudWatch URL its own column in the occurrences tab.
        extra_data={"cloudwatch_url_{}".format(i): url
                    for i, url in enumerate(cloudwatch_urls)})
55 changes: 55 additions & 0 deletions deployment/terraform/lambda.tf
@@ -0,0 +1,55 @@
# Lambda function that posts Batch job failure details to Rollbar.
# The zip is built by scripts/infra via the lambda-functions Makefile;
# source_code_hash forces a redeploy whenever the archive changes.
resource "aws_lambda_function" "alert_batch_failures" {
  filename         = "${path.module}/lambda-functions/alert_batch_failures/alert_batch_failures.zip"
  source_code_hash = "${base64sha256(file("${path.module}/lambda-functions/alert_batch_failures/alert_batch_failures.zip"))}"
  function_name    = "func${var.environment}AlertBatchFailures"
  description      = "Function to alert on AWS Batch Job Failures."
  role             = "${aws_iam_role.alert_batch_failures.arn}"
  handler          = "alert_batch_failures.handler"
  runtime          = "python3.6"
  timeout          = 10
  memory_size      = 128

  environment {
    variables = {
      ENVIRONMENT                      = "${var.environment}"
      ROLLBAR_SERVER_SIDE_ACCESS_TOKEN = "${var.rollbar_server_side_access_token}"
    }
  }

  tags {
    Project     = "${var.project}"
    Environment = "${var.environment}"
  }
}

# CloudWatch event rule matching FAILED state changes for jobs on the
# default Batch job queue.
resource "aws_cloudwatch_event_rule" "alert_batch_failures" {
  name        = "rule${var.environment}AlertBatchFailures"
  description = "Rule to send alerts when batch jobs fail."

  event_pattern = <<PATTERN
{
  "source": ["aws.batch"],
  "detail-type": ["Batch Job State Change"],
  "detail": {
    "status": ["FAILED"],
    "jobQueue": [
      "${aws_batch_job_queue.default.arn}"
    ]
  }
}
PATTERN
}

# Route matching events to the Lambda function.
resource "aws_cloudwatch_event_target" "alert_batch_failures" {
  rule      = "${aws_cloudwatch_event_rule.alert_batch_failures.name}"
  target_id = "target${var.environment}AlertBatchFailures"
  arn       = "${aws_lambda_function.alert_batch_failures.arn}"
}

# Grant CloudWatch Events permission to invoke the function, scoped to
# this specific rule via source_arn.
resource "aws_lambda_permission" "alert_batch_failures" {
  statement_id  = "perm${var.environment}AlertBatchFailures"
  action        = "lambda:InvokeFunction"
  function_name = "${aws_lambda_function.alert_batch_failures.function_name}"
  principal     = "events.amazonaws.com"
  source_arn    = "${aws_cloudwatch_event_rule.alert_batch_failures.arn}"
}
4 changes: 4 additions & 0 deletions deployment/terraform/variables.tf
Expand Up @@ -252,3 +252,7 @@ variable "batch_service_role_policy_arn" {
variable "spot_fleet_service_role_policy_arn" {
default = "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole"
}

variable "aws_lambda_service_role_policy_arn" {
default = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
}
3 changes: 3 additions & 0 deletions scripts/infra
Expand Up @@ -39,6 +39,9 @@ if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
plan)
DEFAULT_BATCH_CE_DESIRED_CPU=$(aws batch describe-compute-environments --output text --compute-environments "batch${OAR_DEPLOYMENT_ENVIRONMENT^}DefaultComputeEnvironment" --query "computeEnvironments[].computeResources.desiredvCpus")

# Build Lambda function archives
make -sC lambda-functions/alert_batch_failures

# Clear stale modules & remote state, then re-initialize
rm -rf .terraform terraform.tfstate*
terraform init \
Expand Down

0 comments on commit 843fc25

Please sign in to comment.