From 3fa859e8d42f039c3f830f4e5719e8425bad89ed Mon Sep 17 00:00:00 2001 From: Cameron Dawson Date: Thu, 23 Apr 2015 15:16:00 -0700 Subject: [PATCH] Bug 1080760 - generate bug suggestions for text_log_summary artifacts Whenever a ``text_log_summary`` with ``error_lines`` is submitted, generate an additional ``Bug suggestions`` artifact for it. --- treeherder/etl/mixins.py | 51 +------ treeherder/etl/th_publisher.py | 58 +++++++ treeherder/log_parser/utils.py | 192 ----------------------- treeherder/model/bug_suggestions.py | 227 ++++++++++++++++++++++++++++ treeherder/webapp/api/artifact.py | 18 ++- 5 files changed, 302 insertions(+), 244 deletions(-) create mode 100644 treeherder/etl/th_publisher.py create mode 100644 treeherder/model/bug_suggestions.py diff --git a/treeherder/etl/mixins.py b/treeherder/etl/mixins.py index c422c0629cb..300da97dc12 100644 --- a/treeherder/etl/mixins.py +++ b/treeherder/etl/mixins.py @@ -10,12 +10,10 @@ import simplejson as json -from thclient import TreeherderRequest - from django.core.urlresolvers import reverse from django.conf import settings -from django.utils.encoding import python_2_unicode_compatible -from treeherder.etl.oauth_utils import OAuthCredentials + +from treeherder.etl import th_publisher logger = logging.getLogger(__name__) @@ -121,50 +119,7 @@ def load(self, result_sets, project): logger.error("ResultSet loading failed: {0}".format(message['message'])) -@python_2_unicode_compatible -class CollectionNotLoadedException(Exception): - - def __init__(self, error_list, *args, **kwargs): - """ - error_list contains dictionaries, each containing - project, url and message - """ - super(CollectionNotLoadedException, self).__init__(args, kwargs) - self.error_list = error_list - - def __str__(self): - return "\n".join( - ["[{project}] Error posting data to {url}: {message}".format( - **error) for error in self.error_list] - ) - - class OAuthLoaderMixin(object): def load(self, th_collections): - errors = [] - for project in 
th_collections:
-
-            credentials = OAuthCredentials.get_credentials(project)
-
-            th_request = TreeherderRequest(
-                protocol=settings.TREEHERDER_REQUEST_PROTOCOL,
-                host=settings.TREEHERDER_REQUEST_HOST,
-                project=project,
-                oauth_key=credentials.get('consumer_key', None),
-                oauth_secret=credentials.get('consumer_secret', None)
-            )
-
-            logger.info(
-                "collection loading request: {0}".format(
-                    th_request.get_uri(th_collections[project].endpoint_base)))
-            response = th_request.post(th_collections[project])
-
-            if not response or response.status != 200:
-                errors.append({
-                    "project": project,
-                    "url": th_collections[project].endpoint_base,
-                    "message": response.read()
-                })
-        if errors:
-            raise CollectionNotLoadedException(errors)
+        th_publisher.post_treeherder_collections(th_collections)
diff --git a/treeherder/etl/th_publisher.py b/treeherder/etl/th_publisher.py
new file mode 100644
index 00000000000..416890ad3b3
--- /dev/null
+++ b/treeherder/etl/th_publisher.py
@@ -0,0 +1,58 @@
+import logging
+
+from django.utils.encoding import python_2_unicode_compatible
+from django.conf import settings
+
+from thclient import TreeherderRequest
+
+from treeherder.etl.oauth_utils import OAuthCredentials
+
+
+logger = logging.getLogger(__name__)
+
+
+def post_treeherder_collections(th_collections):
+    errors = []
+    for project in th_collections:
+
+        credentials = OAuthCredentials.get_credentials(project)
+
+        th_request = TreeherderRequest(
+            protocol=settings.TREEHERDER_REQUEST_PROTOCOL,
+            host=settings.TREEHERDER_REQUEST_HOST,
+            project=project,
+            oauth_key=credentials.get('consumer_key', None),
+            oauth_secret=credentials.get('consumer_secret', None)
+        )
+
+        logger.info(
+            "collection loading request: {0}".format(
+                th_request.get_uri(th_collections[project].endpoint_base)))
+        response = th_request.post(th_collections[project])
+
+        if not response or response.status != 200:
+            errors.append({
+                "project": project,
+                "url": th_collections[project].endpoint_base,
+                "message":
response.read() + }) + if errors: + raise CollectionNotLoadedException(errors) + + +@python_2_unicode_compatible +class CollectionNotLoadedException(Exception): + + def __init__(self, error_list, *args, **kwargs): + """ + error_list contains dictionaries, each containing + project, url and message + """ + super(CollectionNotLoadedException, self).__init__(args, kwargs) + self.error_list = error_list + + def __str__(self): + return "\n".join( + ["[{project}] Error posting data to {url}: {message}".format( + **error) for error in self.error_list] + ) diff --git a/treeherder/log_parser/utils.py b/treeherder/log_parser/utils.py index 4f32c8714dc..cedd3c8518a 100644 --- a/treeherder/log_parser/utils.py +++ b/treeherder/log_parser/utils.py @@ -2,15 +2,12 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, you can obtain one at http://mozilla.org/MPL/2.0/. -import re -import urllib import urllib2 import logging import time import simplejson as json from django.conf import settings -from django.core.urlresolvers import reverse from treeherder.log_parser.artifactbuildercollection import \ ArtifactBuilderCollection @@ -22,130 +19,6 @@ logger = logging.getLogger(__name__) -def is_helpful_search_term(search_term): - # Search terms that will match too many bug summaries - # and so not result in useful suggestions. 
- search_term = search_term.strip() - - blacklist = [ - 'automation.py', - 'remoteautomation.py', - 'Shutdown', - 'undefined', - 'Main app process exited normally', - 'Traceback (most recent call last):', - 'Return code: 0', - 'Return code: 1', - 'Return code: 2', - 'Return code: 9', - 'Return code: 10', - 'Exiting 1', - 'Exiting 9', - 'CrashingThread(void *)', - 'libSystem.B.dylib + 0xd7a', - 'linux-gate.so + 0x424', - 'TypeError: content is null', - 'leakcheck' - ] - - return len(search_term) > 4 and not (search_term in blacklist) - - -LEAK_RE = re.compile(r'\d+ bytes leaked \((.+)\)$') -CRASH_RE = re.compile(r'.+ application crashed \[@ (.+)\]$') - - -def get_error_search_term(error_line): - """ - retrieves bug suggestions from bugscache using search_term - in a full_text search. - """ - if not error_line: - return None - - # This is strongly inspired by - # https://hg.mozilla.org/webtools/tbpl/file/tip/php/inc/AnnotatedSummaryGenerator.php#l73 - - tokens = error_line.split(" | ") - search_term = None - - if len(tokens) >= 3: - # it's in the "FAILURE-TYPE | testNameOrFilePath | message" type format. - test_name_or_path = tokens[1] - message = tokens[2] - - # Leak failure messages are of the form: - # leakcheck | .*\d+ bytes leaked (Object-1, Object-2, Object-3, ...) - match = LEAK_RE.search(message) - if match: - search_term = match.group(1) - else: - for splitter in ("/", "\\"): - # if this is a path, we are interested in the last part - test_name_or_path = test_name_or_path.split(splitter)[-1] - search_term = test_name_or_path - - # If the failure line was not in the pipe symbol delimited format or the search term - # will likely return too many (or irrelevant) results (eg: too short or matches terms - # on the blacklist), then we fall back to searching for the entire failure line if - # it is suitable. 
- if not (search_term and is_helpful_search_term(search_term)): - if is_helpful_search_term(error_line): - search_term = error_line - else: - search_term = None - - # Searching for extremely long search terms is undesirable, since: - # a) Bugzilla's max summary length is 256 characters, and once "Intermittent " - # and platform/suite information is prefixed, there are even fewer characters - # left for us to use for the failure string against which we need to match. - # b) For long search terms, the additional length does little to prevent against - # false positives, but means we're more susceptible to false negatives due to - # run-to-run variances in the error messages (eg paths, process IDs). - if search_term: - search_term = search_term[:100] - - return search_term - - -def get_crash_signature(error_line): - """ - Detect if the error_line contains a crash signature - and return it if it's a helpful search term - """ - search_term = None - match = CRASH_RE.match(error_line) - if match and is_helpful_search_term(match.group(1)): - search_term = match.group(1) - return search_term - - -def get_bugs_for_search_term(search, base_uri): - """ - Fetch the base_uri endpoint filtering on search and status. - Status must be either 'open' or 'closed' - """ - from treeherder.etl.common import get_remote_content - - params = { - 'search': search - } - query_string = urllib.urlencode(params) - url = '{0}?{1}'.format( - base_uri, - query_string - ) - return get_remote_content(url) - -mozharness_pattern = re.compile( - r'^\d+:\d+:\d+[ ]+(?:DEBUG|INFO|WARNING|ERROR|CRITICAL|FATAL) - [ ]?' 
-) - - -def get_mozharness_substring(line): - return mozharness_pattern.sub('', line).strip() - - def is_parsed(job_log_url): # if parse_status is not available, consider it pending parse_status = job_log_url.get("parse_status", "pending") @@ -154,12 +27,6 @@ def is_parsed(job_log_url): def extract_text_log_artifacts(log_url, job_guid, check_errors): """Generate a summary artifact for the raw text log.""" - bug_suggestions = [] - bugscache_uri = '{0}{1}'.format( - settings.API_HOSTNAME, - reverse("bugscache-list") - ) - terms_requested = {} # parse a log given its url artifact_bc = ArtifactBuilderCollection(log_url, @@ -170,65 +37,6 @@ def extract_text_log_artifacts(log_url, job_guid, check_errors): for name, artifact in artifact_bc.artifacts.items(): artifact_list.append((job_guid, name, 'json', json.dumps(artifact))) - if check_errors: - all_errors = artifact_bc.artifacts\ - .get('text_log_summary', {})\ - .get('step_data', {})\ - .get('all_errors', []) - - for err in all_errors: - # remove the mozharness prefix - clean_line = get_mozharness_substring(err['line']) - search_terms = [] - # get a meaningful search term out of the error line - search_term = get_error_search_term(clean_line) - bugs = dict(open_recent=[], all_others=[]) - - # collect open recent and all other bugs suggestions - if search_term: - search_terms.append(search_term) - if search_term not in terms_requested: - # retrieve the list of suggestions from the api - bugs = get_bugs_for_search_term( - search_term, - bugscache_uri - ) - terms_requested[search_term] = bugs - else: - bugs = terms_requested[search_term] - - if not bugs or not (bugs['open_recent'] or - bugs['all_others']): - # no suggestions, try to use - # the crash signature as search term - crash_signature = get_crash_signature(clean_line) - if crash_signature: - search_terms.append(crash_signature) - if crash_signature not in terms_requested: - bugs = get_bugs_for_search_term( - crash_signature, - bugscache_uri - ) - 
terms_requested[crash_signature] = bugs - else: - bugs = terms_requested[crash_signature] - - # TODO: Rename 'search' to 'error_text' or similar, since that's - # closer to what it actually represents (bug 1091060). - bug_suggestions.append({ - "search": clean_line, - "search_terms": search_terms, - "bugs": bugs - }) - - artifact_list.append( - ( - job_guid, - 'Bug suggestions', - 'json', - json.dumps(bug_suggestions) - ) - ) return artifact_list diff --git a/treeherder/model/bug_suggestions.py b/treeherder/model/bug_suggestions.py new file mode 100644 index 00000000000..b287e907bcf --- /dev/null +++ b/treeherder/model/bug_suggestions.py @@ -0,0 +1,227 @@ +import logging +import re +import urllib +import json + +from django.core.urlresolvers import reverse +from django.conf import settings + + +logger = logging.getLogger(__name__) + + +def get_bug_suggestions_artifacts(artifact_list): + """ + Create bug suggestions artifact(s) for any text_log_summary artifacts. + + ``artifact_list`` here is a list of artifacts that may contain one or more + ``text_log_artifact`` objects. If it does, we extract the error lines + from it. If there ARE error lines, then we generate the + ``bug suggestions`` artifacts and return them. + """ + + bug_suggestion_artifacts = [] + + for artifact in artifact_list: + # this is the only artifact name eligible to trigger generation of bug + # suggestions. 
+ assert artifact['name'] == 'text_log_summary' + + all_errors = get_all_errors(artifact) + if all_errors: + bug_suggestion_artifacts.append({ + "job_guid": artifact['job_guid'], + "name": 'Bug suggestions', + "type": 'json', + "blob": json.dumps(get_bug_suggestions(all_errors)) + }) + + return bug_suggestion_artifacts + + +def get_all_errors(artifact): + """Extract the error lines from an artifact's blob field""" + + artifact_blob = json.loads(artifact['blob']) + if isinstance(artifact_blob, dict): + return artifact_blob.get('step_data', {}).get('all_errors', []) + + +def get_bug_suggestions(all_errors): + bug_suggestions = [] + bugscache_uri = '{0}{1}'.format( + settings.API_HOSTNAME, + reverse("bugscache-list") + ) + terms_requested = {} + + for err in all_errors: + # remove the mozharness prefix + clean_line = get_mozharness_substring(err['line']) + search_terms = [] + # get a meaningful search term out of the error line + search_term = get_error_search_term(clean_line) + bugs = dict(open_recent=[], all_others=[]) + + # collect open recent and all other bugs suggestions + if search_term: + search_terms.append(search_term) + if search_term not in terms_requested: + # retrieve the list of suggestions from the api + bugs = get_bugs_for_search_term( + search_term, + bugscache_uri + ) + terms_requested[search_term] = bugs + else: + bugs = terms_requested[search_term] + + if not bugs or not (bugs['open_recent'] or + bugs['all_others']): + # no suggestions, try to use + # the crash signature as search term + crash_signature = get_crash_signature(clean_line) + if crash_signature: + search_terms.append(crash_signature) + if crash_signature not in terms_requested: + bugs = get_bugs_for_search_term( + crash_signature, + bugscache_uri + ) + terms_requested[crash_signature] = bugs + else: + bugs = terms_requested[crash_signature] + + # TODO: Rename 'search' to 'error_text' or similar, since that's + # closer to what it actually represents (bug 1091060). 
+ bug_suggestions.append({ + "search": clean_line, + "search_terms": search_terms, + "bugs": bugs + }) + + return bug_suggestions + + +def get_bugs_for_search_term(search, base_uri): + """ + Fetch the base_uri endpoint filtering on search and status. + Status must be either 'open' or 'closed' + """ + from treeherder.etl.common import get_remote_content + + params = { + 'search': search + } + query_string = urllib.urlencode(params) + url = '{0}?{1}'.format( + base_uri, + query_string + ) + return get_remote_content(url) + +LEAK_RE = re.compile(r'\d+ bytes leaked \((.+)\)$') +CRASH_RE = re.compile(r'.+ application crashed \[@ (.+)\]$') + + +def get_error_search_term(error_line): + """ + retrieves bug suggestions from bugscache using search_term + in a full_text search. + """ + if not error_line: + return None + + # This is strongly inspired by + # https://hg.mozilla.org/webtools/tbpl/file/tip/php/inc/AnnotatedSummaryGenerator.php#l73 + + tokens = error_line.split(" | ") + search_term = None + + if len(tokens) >= 3: + # it's in the "FAILURE-TYPE | testNameOrFilePath | message" type format. + test_name_or_path = tokens[1] + message = tokens[2] + + # Leak failure messages are of the form: + # leakcheck | .*\d+ bytes leaked (Object-1, Object-2, Object-3, ...) + match = LEAK_RE.search(message) + if match: + search_term = match.group(1) + else: + for splitter in ("/", "\\"): + # if this is a path, we are interested in the last part + test_name_or_path = test_name_or_path.split(splitter)[-1] + search_term = test_name_or_path + + # If the failure line was not in the pipe symbol delimited format or the search term + # will likely return too many (or irrelevant) results (eg: too short or matches terms + # on the blacklist), then we fall back to searching for the entire failure line if + # it is suitable. 
+ if not (search_term and is_helpful_search_term(search_term)): + if is_helpful_search_term(error_line): + search_term = error_line + else: + search_term = None + + # Searching for extremely long search terms is undesirable, since: + # a) Bugzilla's max summary length is 256 characters, and once "Intermittent " + # and platform/suite information is prefixed, there are even fewer characters + # left for us to use for the failure string against which we need to match. + # b) For long search terms, the additional length does little to prevent against + # false positives, but means we're more susceptible to false negatives due to + # run-to-run variances in the error messages (eg paths, process IDs). + if search_term: + search_term = search_term[:100] + + return search_term + + +def get_crash_signature(error_line): + """ + Detect if the error_line contains a crash signature + and return it if it's a helpful search term + """ + search_term = None + match = CRASH_RE.match(error_line) + if match and is_helpful_search_term(match.group(1)): + search_term = match.group(1) + return search_term + + +mozharness_pattern = re.compile( + r'^\d+:\d+:\d+[ ]+(?:DEBUG|INFO|WARNING|ERROR|CRITICAL|FATAL) - [ ]?' +) + + +def get_mozharness_substring(line): + return mozharness_pattern.sub('', line).strip() + + +def is_helpful_search_term(search_term): + # Search terms that will match too many bug summaries + # and so not result in useful suggestions. 
+ search_term = search_term.strip() + + blacklist = [ + 'automation.py', + 'remoteautomation.py', + 'Shutdown', + 'undefined', + 'Main app process exited normally', + 'Traceback (most recent call last):', + 'Return code: 0', + 'Return code: 1', + 'Return code: 2', + 'Return code: 9', + 'Return code: 10', + 'Exiting 1', + 'Exiting 9', + 'CrashingThread(void *)', + 'libSystem.B.dylib + 0xd7a', + 'linux-gate.so + 0x424', + 'TypeError: content is null', + 'leakcheck' + ] + + return len(search_term) > 4 and not (search_term in blacklist) diff --git a/treeherder/webapp/api/artifact.py b/treeherder/webapp/api/artifact.py index 0079b277eaa..035370c4b65 100644 --- a/treeherder/webapp/api/artifact.py +++ b/treeherder/webapp/api/artifact.py @@ -6,6 +6,7 @@ from rest_framework.response import Response from treeherder.webapp.api.utils import UrlQueryFilter, oauth_required from treeherder.model.derived import JobsModel, ArtifactsModel +from treeherder.model.bug_suggestions import get_bug_suggestions_artifacts class ArtifactViewSet(viewsets.ViewSet): @@ -54,11 +55,20 @@ def list(self, request, project): @oauth_required def create(self, request, project): + artifacts = request.DATA - job_guids = [x['job_guid'] for x in request.DATA] - with JobsModel(project) as jobsModel, ArtifactsModel(project) as artifacts_model: + job_guids = [x['job_guid'] for x in artifacts] + with JobsModel(project) as jobs_model, ArtifactsModel(project) as artifacts_model: - job_id_lookup = jobsModel.get_job_ids_by_guid(job_guids) - artifacts_model.load_job_artifacts(request.DATA, job_id_lookup) + # create an accompanying ``Bug suggestions`` artifact for any + # eligible artifacts. 
+ tls_list = [x for x in artifacts if x['name'] == 'text_log_summary'] + bsa = get_bug_suggestions_artifacts(tls_list) + + if bsa: + artifacts.extend(bsa) + + job_id_lookup = jobs_model.get_job_ids_by_guid(job_guids) + artifacts_model.load_job_artifacts(artifacts, job_id_lookup) return Response({'message': 'Artifacts stored successfully'})