-
Notifications
You must be signed in to change notification settings - Fork 74
/
function.py
531 lines (425 loc) · 16.6 KB
/
function.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
"""
Copyright 2017 New Relic, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This Lambda function receives log entries from CloudWatch Logs
and pushes them to New Relic Infrastructure - Cloud integrations.
It expects to be invoked based on CloudWatch Logs streams.
New Relic's license key must be encrypted using KMS following these
instructions:
1. After creating te Lambda based on the Blueprint, select it and open the
Environment Variables section.
2. Check that the "LICENSE_KEY" environment variable if properly filled-in.
3. If you changed anything, go to the start of the page and press "Save".
Logs should start to be processed by the Lambda. To check if everything is
functioning properly you can check the Monitoring tab and CloudWatch Logs.
For more detailed documentation, check New Relic's documentation site:
https://docs.newrelic.com/
"""
import datetime
import gzip
import json
import os
import re
from base64 import b64decode
from enum import Enum
from urllib import request
import aiohttp
import asyncio
try:
import newrelic.agent
except ImportError:
pass
else:
# The agent shouldn't be running on this function. Ensure it is shutdown.
newrelic.agent.shutdown_agent()
# Retrying configuration.
# Increasing these numbers will make the function longer in case of
# communication failures and that will increase the cost.
# Decreasing these number could increase the probility of data loss.
# Maximum number of retries
MAX_RETRIES = 3
# Initial backoff (in seconds) between retries
INITIAL_BACKOFF = 1
# Multiplier factor for the backoff between retries
BACKOFF_MULTIPLIER = 2
# Max length in bytes of the payload
MAX_PAYLOAD_SIZE = 1000 * 1024
LAMBDA_LOG_GROUP_PREFIX = "/aws/lambda"
VPC_LOG_GROUP_PREFIX = "/aws/vpc/flow-logs"
LAMBDA_NR_MONITORING_PATTERN = re.compile(r'.*"NR_LAMBDA_MONITORING')
REPORT_PATTERN = re.compile("REPORT RequestId:")
TIMEOUT_PATTERN = re.compile(
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d+Z\s[\d\w-]+\sTask timed out after [\d.]+ seconds"
)
# Lines like this are generated by the Lambda service when it has to kill the function's runtime,
# e.g. for an out of memory error.
REQUEST_ID_PATTERN = re.compile(r"RequestId:\s([-a-zA-Z0-9]{36})\s(.*)", re.DOTALL)
class EntryType(Enum):
VPC = "vpc"
LAMBDA = "lambda"
OTHER = "other"
INGEST_SERVICE_VERSION = "v1"
US_LOGGING_ENDPOINT = "https://log-api.newrelic.com/log/v1"
EU_LOGGING_ENDPOINT = "https://log-api.eu.newrelic.com/log/v1"
US_INFRA_ENDPOINT = "https://cloud-collector.newrelic.com"
EU_INFRA_ENDPOINT = "https://cloud-collector.eu01.nr-data.net"
INFRA_INGEST_SERVICE_PATHS = {
EntryType.LAMBDA: "/aws/lambda",
EntryType.VPC: "/aws/vpc",
EntryType.OTHER: "/aws",
}
LAMBDA_REQUEST_ID_REGEX = re.compile(
r"RequestId:\s"
r"(?P<request_id>[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})"
)
LOGGING_LAMBDA_VERSION = "1.0.2"
LOGGING_PLUGIN_METADATA = {"type": "lambda", "version": LOGGING_LAMBDA_VERSION}
class MaxRetriesException(Exception):
pass
class BadRequestException(Exception):
pass
async def http_post(session, url, data, headers):
def _format_error(e, text):
return "{}. {}".format(e, text)
backoff = INITIAL_BACKOFF
retries = 0
while retries < MAX_RETRIES:
if retries > 0:
print("Retrying in {} seconds".format(backoff))
await asyncio.sleep(backoff)
backoff *= BACKOFF_MULTIPLIER
retries += 1
try:
resp = await session.post(url, data=data, headers=headers)
resp.raise_for_status()
return resp.status, resp.url
except aiohttp.ClientResponseError as e:
if e.status == 400:
raise BadRequestException(_format_error(e, "Unexpected payload"))
elif e.status == 403:
raise BadRequestException(_format_error(e, "Review your license key"))
elif e.status == 404:
raise BadRequestException(
_format_error(e, "Review the region endpoint")
)
elif e.status == 429:
print("There was a {} error. Reason: {}".format(e.status, e.message))
# Now retry the request
continue
elif 400 <= e.status < 500:
raise BadRequestException(e)
raise MaxRetriesException()
def _filter_log_lines(log_entry):
"""
The EntryType.LAMBDA check guarantees that we'll be left with at least one log after filtering
"""
final_log_events = []
for event in log_entry["logEvents"]:
message = event["message"]
if REPORT_PATTERN.match(message) or _is_lambda_message(message):
final_log_events.append(event)
ret = log_entry.copy()
ret["logEvents"] = final_log_events
return ret
async def _send_log_entry(log_entry, context):
"""
This function sends the log entry to New Relic Infrastructure's ingest
server. If it is necessary, entries will be split in different payloads
Log entry is sent along with the Lambda function's execution context
"""
entry_type = _get_entry_type(log_entry)
context = {
"function_name": context.function_name,
"invoked_function_arn": context.invoked_function_arn,
"log_group_name": context.log_group_name,
"log_stream_name": context.log_stream_name,
}
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=3)) as session:
# Both Infrastructure and Logging require a "LICENSE_KEY" environment variable.
# In order to send data to the Infrastructure Pipeline, the customer doesn't need
# to do anything. To disable it, they'll set "INFRA_ENABLED" to "false".
# To send data to the Logging Pipeline, an environment variable called "LOGGING_ENABLED"
# is required and needs to be set to "true". To disable it, they don't need to do anything,
# it is disabled by default
# Instruction for how to find these keys are in the README.md
requests = []
if _infra_enabled():
if entry_type == EntryType.LAMBDA:
# If this is one of our lambda entries, we should only send the log lines we
# actually care about
data = {
"context": context,
"entry": json.dumps(_filter_log_lines(log_entry)),
}
else:
# VPC logs are infra requests that aren't Lambda invocations
data = {"context": context, "entry": json.dumps(log_entry)}
for payload in _generate_payloads(data, _split_infra_payload):
requests.append(
_send_payload(
_get_infra_request_creator(entry_type, payload), session, True
)
)
if _logging_enabled():
data = {"context": context, "entry": json.dumps(log_entry)}
for payload in _generate_payloads(
_package_log_payload(data), _split_log_payload
):
requests.append(
_send_payload(_get_logging_request_creator(payload), session)
)
return await asyncio.gather(*requests)
async def _send_payload(request_creator, session, retry=False):
try:
req = request_creator()
status, url = await http_post(
session, req.get_full_url(), req.data, req.headers
)
except MaxRetriesException as e:
print("Retry limit reached. Failed to send log entry.")
if retry:
raise e
except BadRequestException as e:
print(e)
else:
print("Log entry sent. Response code: {}. url: {}".format(status, url))
return status
def _generate_payloads(data, split_function):
"""
Return a list of payloads to be sent to New Relic.
This method usually returns a list of one element, but can be bigger if the
payload size is too big
"""
payload = gzip.compress(json.dumps(data).encode())
if len(payload) < MAX_PAYLOAD_SIZE:
return [payload]
split_data = split_function(data)
return _generate_payloads(split_data[0], split_function) + _generate_payloads(
split_data[1], split_function
)
def _get_license_key(license_key=None):
"""
This functions gets New Relic's license key from env vars.
"""
if license_key:
return license_key
return os.getenv("LICENSE_KEY", "")
def _get_newrelic_tags(payload):
"""
This functions gets New Relic's tags from env vars and adds it to the payload
A tag is a key value pair. Multiple tags can be specified.
Key and value are colon delimited. Multiple key value pairs are semi-colon delimited.
e.g. env:prod;team:myTeam
"""
nr_tags_str = os.getenv("NR_TAGS", "")
if nr_tags_str:
nr_tags = dict(
item.split(":")
for item in nr_tags_str.split(";")
if not item.startswith(tuple(["aws:", "plugin:"]))
)
payload[0]["common"]["attributes"].update(nr_tags)
def _debug_logging_enabled():
"""
Determines whether or not debug logging should be enabled based on the env var.
Defaults to false.
"""
return os.getenv("DEBUG_LOGGING_ENABLED", "false").lower() == "true"
##############
# NR Infra #
##############
def _infra_enabled():
"""
This function returns whether to send info to New Relic Infrastructure.
Enabled by default.
"""
return os.getenv("INFRA_ENABLED", "true").lower() == "true"
def _get_infra_request_creator(entry_type, payload, ingest_host=None, license_key=None):
def create_request():
req = request.Request(_get_infra_url(entry_type, ingest_host), payload)
req.add_header("X-License-Key", _get_license_key(license_key))
req.add_header("Content-Encoding", "gzip")
return req
return create_request
def _get_infra_url(entry_type, ingest_host=None):
"""
Returns the ingest_service_url.
This is a concatenation of the HOST + PATH + VERSION
"""
if ingest_host is None:
ingest_host = _get_infra_endpoint()
path = INFRA_INGEST_SERVICE_PATHS[entry_type]
return ingest_host + path + "/" + INGEST_SERVICE_VERSION
def _is_lambda_message(message):
"""
Matches messages that are sufficient to report a Lambda invocation.
REPORT lines are not sufficient, just nice to have.
"""
return (
LAMBDA_NR_MONITORING_PATTERN.match(message)
or TIMEOUT_PATTERN.match(message)
or REQUEST_ID_PATTERN.match(message)
)
def _get_entry_type(log_entry):
"""
Returns the EntryType of the entry based on some text found in its value.
"""
log_group = log_entry["logGroup"]
if log_group.startswith(VPC_LOG_GROUP_PREFIX):
return EntryType.VPC
elif log_group.startswith(LAMBDA_LOG_GROUP_PREFIX) and any(
_is_lambda_message(event["message"]) for event in log_entry["logEvents"]
):
return EntryType.LAMBDA
return EntryType.OTHER
def _get_infra_endpoint():
"""
Service url is determined by the license key's region.
Any other URL could be passed by using the NR_INFRA_ENDPOINT env var.
"""
if "NR_INFRA_ENDPOINT" in os.environ:
return os.environ["NR_INFRA_ENDPOINT"]
return (
EU_INFRA_ENDPOINT if _get_license_key().startswith("eu") else US_INFRA_ENDPOINT
)
def _split_infra_payload(data):
"""
When data size is bigger than supported payload, it is divided in two
different requests
"""
context = data["context"]
entry = json.loads(data["entry"])
logEvents = entry["logEvents"]
half = len(logEvents) // 2
return [
_reconstruct_infra_data(context, entry, logEvents[:half]),
_reconstruct_infra_data(context, entry, logEvents[half:]),
]
def _reconstruct_infra_data(context, entry, logEvents):
entry["logEvents"] = logEvents
return {"context": context, "entry": json.dumps(entry)}
################
# NR Logging #
################
def _logging_enabled():
"""
This function returns whether to send info to New Relic Logging.
Disabled by default.
"""
return os.getenv("LOGGING_ENABLED", "false").lower() == "true"
def _get_logging_request_creator(payload, ingest_url=None, license_key=None):
def create_request():
req = request.Request(_get_logging_endpoint(ingest_url), payload)
req.add_header("X-License-Key", _get_license_key(license_key))
req.add_header("X-Event-Source", "logs")
req.add_header("Content-Encoding", "gzip")
return req
return create_request
def _get_logging_endpoint(ingest_url=None):
"""
Service url is determined by the lincese key's region.
Any other URL could be passed by using the NR_LOGGING_ENDPOINT env var.
"""
if ingest_url:
return ingest_url
if "NR_LOGGING_ENDPOINT" in os.environ:
return os.environ["NR_LOGGING_ENDPOINT"]
return (
EU_LOGGING_ENDPOINT
if _get_license_key().startswith("eu")
else US_LOGGING_ENDPOINT
)
def _package_log_payload(data):
"""
Packages up a MELT request for log messages
"""
entry = json.loads(data["entry"])
log_events = entry["logEvents"]
log_messages = []
lambda_request_id = None
for log_event in log_events:
log_message = {
"message": log_event["message"],
"timestamp": log_event["timestamp"],
"attributes": {"aws": {}},
}
for event_key in log_event:
if event_key not in ("id", "message", "timestamp"):
log_message["attributes"][event_key] = log_event[event_key]
if entry["logGroup"].startswith(LAMBDA_LOG_GROUP_PREFIX):
match = LAMBDA_REQUEST_ID_REGEX.search(log_event["message"])
if match and match.group("request_id"):
lambda_request_id = match.group("request_id")
if lambda_request_id:
log_message["attributes"]["aws"][
"lambda_request_id"
] = lambda_request_id
log_messages.append(log_message)
packaged_payload = [
{
"common": {
"attributes": {
"plugin": LOGGING_PLUGIN_METADATA,
"aws": {
"logStream": entry["logStream"],
"logGroup": entry["logGroup"],
},
}
},
"logs": log_messages,
}
]
_get_newrelic_tags(packaged_payload)
return packaged_payload
def _split_log_payload(payload):
"""
When data size is bigger than supported payload, it is divided in two
different requests
"""
common = payload[0]["common"]
logs = payload[0]["logs"]
half = len(logs) // 2
return [
_reconstruct_log_payload(common, logs[:half]),
_reconstruct_log_payload(common, logs[half:]),
]
def _reconstruct_log_payload(common, logs):
return [{"common": common, "logs": logs}]
####################
# Lambda handler #
####################
def lambda_handler(event, context):
"""
This is the Lambda handler, which is called when the function is invoked.
Changing the name of this function will require changes in Lambda
function's configuration.
"""
# CloudWatch Log entries are compressed and encoded in Base64
event_data = b64decode(event["awslogs"]["data"])
log_entry_str = gzip.decompress(event_data).decode("utf-8")
log_entry = json.loads(log_entry_str)
# output additional helpful info if debug logging is enabled
# not enabled by default since parsing into json might be slow
if _debug_logging_enabled():
# calling '[0]' without a safety check looks sketchy, but Cloudwatch is never going
# to send us a log without at least one event
print(
"logGroup: {}, logStream: {}, timestamp: {}".format(
log_entry["logGroup"],
log_entry["logStream"],
datetime.datetime.fromtimestamp(
log_entry["logEvents"][0]["timestamp"] / 1000.0
),
)
)
asyncio.run(_send_log_entry(log_entry, context))
# This makes it possible to chain this CW log consumer with others using a success destination
return event