-
Notifications
You must be signed in to change notification settings - Fork 1
/
oom.py
94 lines (82 loc) · 3.51 KB
/
oom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Loosely inspired by
# https://github.com/stripe/datadog-checks/blob/master/checks.d/oom.py but with
# journald instead of kern.log, and as a count rather than a system check (so we
# can investigate distribution of OOMs over time rather than just declare the
# machine broken).
import re
import json
from utils.subprocess_output import get_subprocess_output
from checks import AgentCheck
# Matches the kernel's global "Out of memory" kill message and captures the
# victim's pid, process name and score. A process killed for outgrowing a
# memory control group produces a slightly different message and is *not*
# matched by this pattern.
oomRE = re.compile(
    r'^Out of memory: Kill process (?P<pid>\d+) \((?P<pname>.*?)\) '
    r'score (?P<score>.*?) or sacrifice child')
class OOM(AgentCheck):
    """Count kernel out-of-memory kills reported in the systemd journal.

    Every run of ``check`` asks journalctl for the kernel, error-priority
    entries that appeared after the last cursor this check has seen, and
    increments the ``oom.killed`` counter (tagged with the killed process
    name) once for each entry whose message matches ``oomRE``.
    """

    def __init__(self, name, init_config, agentConfig):
        AgentCheck.__init__(self, name, init_config, agentConfig)
        # Remember where the journal currently ends so the first check only
        # looks at entries logged after this point. May be None, in which
        # case the first check runs without --after-cursor.
        self.cursor = self.cursor_for_end_of_journal()

    def cursor_for_end_of_journal(self):
        """Return the cursor of the newest matching journal entry, or None.

        None is returned when journalctl yields no entry (nothing logged
        yet, or journalctl_entries reported a failure), when it yields more
        than one entry, or when the entry carries no ``__CURSOR`` field.
        """
        # Note that for some reason it's important that this cursor be
        # specific to the filters we're using: if we get a cursor without the
        # _TRANSPORT=kernel PRIORITY=3 it somehow seems to sometimes skip the
        # next line when you pass it to --after-cursor.
        entries = self.journalctl_entries(['-n', '1'])
        if not entries:
            # The kernel hasn't had any errors yet. That's fine, we'll start
            # at the beginning next time.
            return None
        if len(entries) > 1:
            # Logging interpolates its arguments with %-formatting, so the
            # placeholder has to be %s (a str.format-style {0} placeholder
            # would make the log record fail to format).
            self.log.error(
                'Too many results (%s) for cursor_for_end_of_journal',
                len(entries))
            # NOTE(review): the metric is called "empty" although this is
            # the too-many-results branch; the name is kept unchanged so
            # existing dashboards/monitors keep working.
            self.increment('oom.errors.cfeoj.empty')
            return None
        entry = entries[0]
        if '__CURSOR' not in entry:
            self.log.error('Missing __CURSOR for cursor_for_end_of_journal')
            self.increment('oom.errors.cfeoj.nocursor')
            return None
        return entry['__CURSOR']

    def journalctl_entries(self, args):
        """Run journalctl with this check's filters and parse its output.

        ``args`` is a list of extra command-line arguments appended after
        the fixed filters. Returns a list with one decoded JSON value per
        non-blank output line. If journalctl exits with a non-zero code or
        a line is not valid JSON, the problem is logged, an error counter
        is incremented and an empty list is returned.
        """
        out, err, exitCode = get_subprocess_output(
            ['journalctl',
             # One JSON object per line per entry.
             '-o', 'json',
             # No reason to look at non-system logs.
             '--system',
             # Kernel logs.
             '_TRANSPORT=kernel',
             # At the "error" level.
             'PRIORITY=3'] + args, self.log)
        if exitCode != 0:
            self.log.error('journalctl failed, code {0}: {1}'.format(
                exitCode, err))
            self.increment('oom.errors.je.failure')
            return []
        try:
            # Blank lines carry no entry; skipping them keeps one stray
            # empty line from discarding the whole batch.
            return [json.loads(line) for line in out.splitlines()
                    if line.strip()]
        except ValueError:
            # json.loads signals malformed input with ValueError (or a
            # subclass); anything else is a programming error and should
            # not be hidden here.
            self.log.exception('json parsing failed')
            self.increment('oom.errors.je.jsonfail')
            return []

    def check(self, instance):
        """Count the OOM kills logged since the previous run.

        Advances ``self.cursor`` past every entry that carries a cursor,
        whether or not the entry is an OOM kill, so the next run resumes
        after the last entry seen here.
        """
        args = []
        if self.cursor is not None:
            args = ['--after-cursor', self.cursor]
        entries = self.journalctl_entries(args)
        # Nothing at all happened? Great.
        if not entries:
            self.log.debug('Got nothing!')
            return
        self.log.debug('Got entries: %s', entries)
        for entry in entries:
            # Start after this next time (whether or not it's an OOM). An
            # entry without a cursor leaves the previous position in place
            # instead of aborting the whole run.
            cursor = entry.get('__CURSOR')
            if cursor is not None:
                self.cursor = cursor
            try:
                match = oomRE.match(entry.get('MESSAGE'))
            except TypeError:
                # MESSAGE is missing or is not text (journalctl's JSON
                # output can encode a field as null or as an array of
                # byte values); such an entry cannot be an OOM line.
                continue
            if not match:
                continue
            groups = match.groupdict()
            self.log.info('Detected OOM! {0}'.format(groups))
            self.increment('oom.killed',
                           tags=['pname:{0}'.format(groups['pname'])])