/
parsers.py
519 lines (441 loc) · 19.7 KB
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
import json
import logging
import re
from html.parser import HTMLParser
import jsonschema
from django.conf import settings
logger = logging.getLogger(__name__)
class ParserBase:
"""
Base class for all parsers.
"""
def __init__(self, name):
"""Setup the artifact to hold the extracted data."""
self.name = name
self.clear()
def clear(self):
"""Reset this parser's values for another run."""
self.artifact = []
self.complete = False
def parse_line(self, line, lineno):
"""Parse a single line of the log"""
raise NotImplementedError # pragma no cover
def finish_parse(self, last_lineno_seen):
"""Clean-up/summary tasks run at the end of parsing."""
pass
def get_artifact(self):
"""By default, just return the artifact as-is."""
return self.artifact
class StepParser(ParserBase):
"""
Parse out individual job steps within a log.
Step format:
"steps": [
{
"errors": [],
"name": "set props: master", # the name of the process on start line
"started": "2013-06-05 12:39:57.838527",
"started_linenumber": 8,
"finished_linenumber": 10,
"finished": "2013-06-05 12:39:57.839226",
"result": 0
},
...
]
"""
# Matches the half-dozen 'key: value' header lines printed at the start of each
# Buildbot job log. The list of keys are taken from:
# https://hg.mozilla.org/build/buildbotcustom/file/644c3860300a/bin/log_uploader.py#l126
RE_HEADER_LINE = re.compile(r'(?:builder|slave|starttime|results|buildid|builduid|revision): .*')
# Step marker lines, eg:
# ========= Started foo (results: 0, elapsed: 0 secs) (at 2015-08-17 02:33:56.353866) =========
# ========= Finished foo (results: 0, elapsed: 0 secs) (at 2015-08-17 02:33:56.354301) =========
RE_STEP_MARKER = re.compile(r'={9} (?P<marker_type>Started|Finished) (?P<name>.*?) '
r'\(results: (?P<result_code>\d+), elapsed: .*?\) '
r'\(at (?P<timestamp>.*?)\)')
# Legacy result code to name mapping inherited from buildbot (duplicated in TextLogStep)
# TODO: Likely remove this and step handling entirely now that Taskcluster doesn't have steps.
RESULT_DICT = {
0: "success",
1: "testfailed",
2: "busted",
3: "skipped",
4: "exception",
5: "retry",
6: "usercancel",
7: "superseded",
}
STATES = {
# The initial state until we record the first step.
"awaiting_first_step": 0,
# We've started a step, but not yet seen the end of it.
"step_in_progress": 1,
# We've seen the end of the previous step.
"step_finished": 2,
}
# date format in a step started/finished header
DATE_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
def __init__(self):
"""Setup the artifact to hold the header lines."""
super().__init__("step_data")
self.stepnum = -1
self.artifact = {
"steps": [],
"errors_truncated": False
}
self.sub_parser = ErrorParser()
self.state = self.STATES['awaiting_first_step']
def parse_line(self, line, lineno):
"""Parse a single line of the log.
We have to handle both buildbot style logs as well as Taskcluster logs. The latter
attempt to emulate the buildbot logs, but don't accurately do so, partly due
to the way logs are generated in Taskcluster (ie: on the workers themselves).
Buildbot logs:
builder: ...
slave: ...
starttime: ...
results: ...
buildid: ...
builduid: ...
revision: ...
======= <step START marker> =======
<step log output>
======= <step FINISH marker> =======
======= <step START marker> =======
<step log output>
======= <step FINISH marker> =======
Taskcluster logs (a worst-case example):
<log output outside a step>
======= <step START marker> =======
<step log output>
======= <step FINISH marker> =======
<log output outside a step>
======= <step START marker> =======
<step log output with no following finish marker>
As can be seen above, Taskcluster logs can have (a) log output that falls between
step markers, and (b) content at the end of the log, that is not followed by a
final finish step marker. We handle this by creating generic placeholder steps to
hold the log output that is not enclosed by step markers, and then by cleaning up
the final step in finish_parse() once all lines have been parsed.
"""
if not line.strip():
# Skip whitespace-only lines, since they will never contain an error line,
# so are not of interest. This also avoids creating spurious unnamed steps
# (which occurs when we find content outside of step markers) for the
# newlines that separate the steps in Buildbot logs.
return
if self.state == self.STATES['awaiting_first_step'] and self.RE_HEADER_LINE.match(line):
# The "key: value" job metadata header lines that appear at the top of
# Buildbot logs would result in the creation of an unnamed step at the
# start of the job, unless we skip them. (Which is not desired, since
# the lines are metadata and not test/build output.)
return
step_marker_match = self.RE_STEP_MARKER.match(line)
if not step_marker_match:
# This is a normal log line, rather than a step marker. (The common case.)
if self.state != self.STATES['step_in_progress']:
# We don't have an in-progress step, so need to start one, even though this
# isn't a "step started" marker line. We therefore create a new generic step,
# since we have no way of finding out the step metadata. This case occurs
# for the Taskcluster logs where content can fall between step markers.
self.start_step(lineno)
# Parse the line for errors, which if found, will be associated with the current step.
self.sub_parser.parse_line(line, lineno)
return
# This is either a "step started" or "step finished" marker line, eg:
# ========= Started foo (results: 0, elapsed: 0 secs) (at 2015-08-17 02:33:56.353866) =========
# ========= Finished foo (results: 0, elapsed: 0 secs) (at 2015-08-17 02:33:56.354301) =========
if step_marker_match.group('marker_type') == 'Started':
if self.state == self.STATES['step_in_progress']:
# We're partway through a step (ie: haven't seen a "step finished" marker line),
# but have now reached the "step started" marker for the next step. Before we
# can start the new step, we have to clean up the previous one - albeit using
# generic step metadata, since there was no "step finished" marker. This occurs
# in Taskcluster's logs when content falls between the step marker lines.
self.end_step(lineno)
# Start a new step using the extracted step metadata.
self.start_step(lineno,
name=step_marker_match.group('name'),
timestamp=step_marker_match.group('timestamp'))
return
# This is a "step finished" marker line.
if self.state != self.STATES['step_in_progress']:
# We're not in the middle of a step, so can't finish one. Just ignore the marker line.
return
# Close out the current step using the extracted step metadata.
self.end_step(lineno,
timestamp=step_marker_match.group('timestamp'),
result_code=int(step_marker_match.group('result_code')))
def start_step(self, lineno, name="Unnamed step", timestamp=None):
"""Create a new step and update the state to reflect we're now in the middle of a step."""
self.state = self.STATES['step_in_progress']
self.stepnum += 1
self.steps.append({
"name": name,
"started": timestamp,
"started_linenumber": lineno,
"errors": [],
})
def end_step(self, lineno, timestamp=None, result_code=None):
"""Fill in the current step's summary and update the state to show the current step has ended."""
self.state = self.STATES['step_finished']
step_errors = self.sub_parser.get_artifact()
step_error_count = len(step_errors)
if step_error_count > settings.PARSER_MAX_STEP_ERROR_LINES:
step_errors = step_errors[:settings.PARSER_MAX_STEP_ERROR_LINES]
self.artifact["errors_truncated"] = True
self.current_step.update({
"finished": timestamp,
"finished_linenumber": lineno,
# Whilst the result code is present on both the start and end buildbot-style step
# markers, for Taskcluster logs the start marker line lies about the result, since
# the log output is unbuffered, so Taskcluster does not know the real result at
# that point. As such, we only set the result when ending a step.
"result": self.RESULT_DICT.get(result_code, "unknown"),
"errors": step_errors
})
# reset the sub_parser for the next step
self.sub_parser.clear()
def finish_parse(self, last_lineno_seen):
"""Clean-up/summary tasks run at the end of parsing."""
if self.state == self.STATES['step_in_progress']:
# We've reached the end of the log without seeing the final "step finish"
# marker, which would normally have triggered updating the step. As such we
# must manually close out the current step, so things like result, finish
# time are set for it. This ensures that the error summary for Taskcluster
# infra failures actually lists the error that occurs at the
# end of the log.
self.end_step(last_lineno_seen)
@property
def steps(self):
"""Return the list of steps in the artifact"""
return self.artifact["steps"]
@property
def current_step(self):
"""Return the current step in the artifact"""
return self.steps[self.stepnum]
class TinderboxPrintParser(ParserBase):
RE_TINDERBOXPRINT = re.compile(r'.*TinderboxPrint: ?(?P<line>.*)$')
RE_UPLOADED_TO = re.compile(
r"<a href=['\"](?P<url>http(s)?://.*)['\"]>(?P<value>.+)</a>: uploaded"
)
RE_LINK_HTML = re.compile(
(r"((?P<title>[A-Za-z/\.0-9\-_ ]+): )?"
r"<a .*href=['\"](?P<url>http(s)?://.+)['\"].*>(?P<value>.+)</a>")
)
RE_LINK_TEXT = re.compile(
r"((?P<title>[A-Za-z/\.0-9\-_ ]+): )?(?P<url>http(s)?://.*)"
)
TINDERBOX_REGEXP_TUPLE = (
{
're': RE_UPLOADED_TO,
'base_dict': {
"content_type": "link",
"title": "artifact uploaded"
},
'duplicates_fields': {}
},
{
're': RE_LINK_HTML,
'base_dict': {
"content_type": "link"
},
'duplicates_fields': {}
},
{
're': RE_LINK_TEXT,
'base_dict': {
"content_type": "link"
},
'duplicates_fields': {'value': 'url'}
}
)
def __init__(self):
"""Setup the artifact to hold the job details."""
super().__init__("job_details")
def parse_line(self, line, lineno):
"""Parse a single line of the log"""
match = self.RE_TINDERBOXPRINT.match(line) if line else None
if match:
line = match.group('line')
for regexp_item in self.TINDERBOX_REGEXP_TUPLE:
match = regexp_item['re'].match(line)
if match:
artifact = match.groupdict()
# handle duplicate fields
for to_field, from_field in regexp_item['duplicates_fields'].items():
# if to_field not present or None copy form from_field
if to_field not in artifact or artifact[to_field] is None:
artifact[to_field] = artifact[from_field]
artifact.update(regexp_item['base_dict'])
self.artifact.append(artifact)
return
# default case: consider it html content
# try to detect title/value splitting on <br/>
artifact = {"content_type": "raw_html", }
if "<br/>" in line:
title, value = line.split("<br/>", 1)
artifact["title"] = title
artifact["value"] = value
# or similar long lines if they contain a url
elif "href" in line and "title" in line:
def parse_url_line(line_data):
class TpLineParser(HTMLParser):
def handle_starttag(self, tag, attrs):
d = dict(attrs)
artifact["url"] = d['href']
artifact["title"] = d['title']
def handle_data(self, data):
artifact["value"] = data
p = TpLineParser()
p.feed(line_data)
p.close()
# strip ^M returns on windows lines otherwise
# handle_data will yield no data 'value'
parse_url_line(line.replace('\r', ''))
else:
artifact["value"] = line
self.artifact.append(artifact)
class ErrorParser(ParserBase):
"""A generic error detection sub-parser"""
IN_SEARCH_TERMS = (
"TEST-UNEXPECTED-",
"fatal error",
"FATAL ERROR",
"REFTEST ERROR",
"PROCESS-CRASH",
"Assertion failure:",
"Assertion failed:",
"###!!! ABORT:",
"E/GeckoLinker",
"SUMMARY: AddressSanitizer",
"SUMMARY: LeakSanitizer",
"SUMMARY: ThreadSanitizer",
"Automation Error:",
"command timed out:",
"wget: unable ",
"TEST-VALGRIND-ERROR",
"[ FAILED ] ",
"bash.exe: *** ",
"bash: fork: Resource temporarily unavailable",
)
RE_ERR_MATCH = re.compile((
r"^error: TEST FAILED"
r"|^g?make(?:\[\d+\])?: \*\*\*"
r"|^Remote Device Error:"
r"|^[A-Za-z.]+Error: "
r"|^[A-Za-z.]*Exception: "
r"|^remoteFailed:"
r"|^rm: cannot "
r"|^abort:"
r"|^Output exceeded \d+ bytes"
r"|^The web-page 'stop build' button was pressed"
r"|.*\.js: line \d+, col \d+, Error -"
r"|^\[taskcluster\] Error:"
r"|^\[[\w._-]+:(?:error|exception)\]"
))
RE_ERR_SEARCH = re.compile((
r" error\(\d*\):"
r"|:\d+: error:"
r"| error R?C\d*:"
r"|ERROR [45]\d\d:"
r"|mozmake\.(?:exe|EXE)(?:\[\d+\])?: \*\*\*"
))
RE_EXCLUDE_1_SEARCH = re.compile(r"TEST-(?:INFO|PASS) ")
RE_EXCLUDE_2_SEARCH = re.compile(
r"I[ /](Gecko|Robocop|TestRunner).*TEST-UNEXPECTED-"
r"|^TimeoutException: "
r"|^ImportError: No module named pygtk$"
)
RE_ERR_1_MATCH = re.compile(r"^\d+:\d+:\d+ +(?:ERROR|CRITICAL|FATAL) - ")
# Looks for a leading value inside square brackets containing a "YYYY-"
# year pattern but isn't a TaskCluster error indicator (like
# ``taskcluster:error``.
#
# This matches the following:
# [task 2016-08-18T17:50:56.955523Z]
# [2016- task]
#
# But not:
# [taskcluster:error]
# [taskcluster:something 2016-]
RE_TASKCLUSTER_NORMAL_PREFIX = re.compile(r"^\[(?!taskcluster:)[^\]]*20\d{2}-[^\]]+\]\s")
RE_MOZHARNESS_PREFIX = re.compile(r"^\d+:\d+:\d+ +(?:DEBUG|INFO|WARNING) - +")
def __init__(self):
"""A simple error detection sub-parser"""
super().__init__("errors")
self.is_taskcluster = False
def add(self, line, lineno):
self.artifact.append({
"linenumber": lineno,
"line": line.rstrip()
})
def parse_line(self, line, lineno):
"""Check a single line for an error. Keeps track of the linenumber"""
# TaskCluster logs are a bit wonky.
#
# TaskCluster logs begin with output coming from TaskCluster itself,
# before it has transitioned control of the task to the configured
# process. These "internal" logs look like the following:
#
# [taskcluster 2016-09-09 17:41:43.544Z] Worker Group: us-west-2b
#
# If an error occurs during this "setup" phase, TaskCluster may emit
# lines beginning with ``[taskcluster:error]``.
#
# Once control has transitioned from TaskCluster to the configured
# task process, lines can be whatever the configured process emits.
# The popular ``run-task`` wrapper prefixes output to emulate
# TaskCluster's "internal" logs. e.g.
#
# [vcs 2016-09-09T17:45:02.842230Z] adding changesets
#
# This prefixing can confuse error parsing. So, we strip it.
#
# Because regular expression matching and string manipulation can be
# expensive when performed on every line, we only strip the TaskCluster
# log prefix if we know we're in a TaskCluster log.
# First line of TaskCluster logs almost certainly has this.
if line.startswith('[taskcluster '):
self.is_taskcluster = True
# For performance reasons, only do this if we have identified as
# a TC task.
if self.is_taskcluster:
line = re.sub(self.RE_TASKCLUSTER_NORMAL_PREFIX, "", line)
if self.is_error_line(line):
self.add(line, lineno)
def is_error_line(self, line):
if self.RE_EXCLUDE_1_SEARCH.search(line):
return False
if self.RE_ERR_1_MATCH.match(line):
return True
# Remove mozharness prefixes prior to matching
trimline = re.sub(self.RE_MOZHARNESS_PREFIX, "", line).rstrip()
if self.RE_EXCLUDE_2_SEARCH.search(trimline):
return False
return bool(any(term for term in self.IN_SEARCH_TERMS if term in trimline) or
self.RE_ERR_MATCH.match(trimline) or self.RE_ERR_SEARCH.search(trimline))
with open('schemas/performance-artifact.json') as f:
PERF_SCHEMA = json.load(f)
class PerformanceParser(ParserBase):
"""a sub-parser to find generic performance data"""
# Using $ in the regex as an end of line bounds causes the
# regex to fail on windows logs. This is likely due to the
# ^M character representation of the windows end of line.
RE_PERFORMANCE = re.compile(r'.*?PERFHERDER_DATA:\s+({.*})')
def __init__(self):
super().__init__("performance_data")
def parse_line(self, line, lineno):
match = self.RE_PERFORMANCE.match(line)
if match:
try:
dict = json.loads(match.group(1))
jsonschema.validate(dict, PERF_SCHEMA)
self.artifact.append(dict)
except ValueError:
logger.warning("Unable to parse Perfherder data from line: %s",
line)
except jsonschema.ValidationError as e:
logger.warning("Perfherder line '%s' does not comply with "
"json schema: %s", line, e)
# Don't mark the parser as complete, in case there are multiple performance artifacts.