Skip to content

Commit

Permalink
Bug 1080760 - generate bug suggestions for text_log_summary artifacts
Browse files Browse the repository at this point in the history
Whenever a ``text_log_summary`` with ``error_lines`` is submitted, generate an
additional ``Bug suggestions`` artifact for it.
  • Loading branch information
Cameron Dawson committed Apr 28, 2015
1 parent 2d9629f commit 3fa859e
Show file tree
Hide file tree
Showing 5 changed files with 302 additions and 244 deletions.
51 changes: 3 additions & 48 deletions treeherder/etl/mixins.py
Expand Up @@ -10,12 +10,10 @@

import simplejson as json

from thclient import TreeherderRequest

from django.core.urlresolvers import reverse
from django.conf import settings
from django.utils.encoding import python_2_unicode_compatible
from treeherder.etl.oauth_utils import OAuthCredentials

from treeherder.etl import th_publisher


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -121,50 +119,7 @@ def load(self, result_sets, project):
logger.error("ResultSet loading failed: {0}".format(message['message']))


@python_2_unicode_compatible
class CollectionNotLoadedException(Exception):

    def __init__(self, error_list, *args, **kwargs):
        """
        Raised when posting one or more collections fails.

        ``error_list`` holds one dict per failed post, each with the
        keys ``project``, ``url`` and ``message``.
        """
        super(CollectionNotLoadedException, self).__init__(args, kwargs)
        self.error_list = error_list

    def __str__(self):
        # One line per failed post, joined into a single message.
        lines = []
        for error in self.error_list:
            lines.append(
                "[{project}] Error posting data to {url}: {message}".format(
                    **error))
        return "\n".join(lines)


class OAuthLoaderMixin(object):

    def load(self, th_collections):
        """
        Post every collection to its project endpoint, authenticating
        with that project's OAuth credentials.

        Raises CollectionNotLoadedException listing every failed post.
        """
        failures = []
        for project, collection in th_collections.items():

            credentials = OAuthCredentials.get_credentials(project)

            request = TreeherderRequest(
                protocol=settings.TREEHERDER_REQUEST_PROTOCOL,
                host=settings.TREEHERDER_REQUEST_HOST,
                project=project,
                oauth_key=credentials.get('consumer_key', None),
                oauth_secret=credentials.get('consumer_secret', None)
            )

            logger.info(
                "collection loading request: {0}".format(
                    request.get_uri(collection.endpoint_base)))
            response = request.post(collection)

            # Anything other than an HTTP 200 (or no response at all)
            # counts as a failure for this project.
            if not response or response.status != 200:
                failures.append({
                    "project": project,
                    "url": collection.endpoint_base,
                    "message": response.read()
                })
        if failures:
            raise CollectionNotLoadedException(failures)
th_publisher.post_treeherder_collections(th_collections)
58 changes: 58 additions & 0 deletions treeherder/etl/th_publisher.py
@@ -0,0 +1,58 @@
import logging

from django.utils.encoding import python_2_unicode_compatible
from django.conf import settings

from thclient import TreeherderRequest

from treeherder.etl.oauth_utils import OAuthCredentials


logger = logging.getLogger(__name__)


def post_treeherder_collections(th_collections):
    """
    Post each collection in ``th_collections`` (a mapping keyed by
    project name) to that project's endpoint, authenticating with the
    project's OAuth credentials.

    Raises:
        CollectionNotLoadedException: listing every post that failed.
    """
    errors = []
    for project in th_collections:

        credentials = OAuthCredentials.get_credentials(project)

        th_request = TreeherderRequest(
            protocol=settings.TREEHERDER_REQUEST_PROTOCOL,
            host=settings.TREEHERDER_REQUEST_HOST,
            project=project,
            oauth_key=credentials.get('consumer_key', None),
            oauth_secret=credentials.get('consumer_secret', None)
        )

        logger.info(
            "collection loading request: {0}".format(
                th_request.get_uri(th_collections[project].endpoint_base)))
        response = th_request.post(th_collections[project])

        # A missing response or any non-200 status is a failure.  The
        # previous ``== 200`` check had the condition inverted: it
        # recorded an error only on success and silently dropped real
        # failures (the sibling OAuthLoaderMixin uses ``!= 200``).
        if not response or response.status != 200:
            errors.append({
                "project": project,
                "url": th_collections[project].endpoint_base,
                # Guard response.read(): when there is no response
                # object at all, reading it would raise AttributeError.
                "message": response.read() if response else "no response"
            })
    if errors:
        raise CollectionNotLoadedException(errors)


@python_2_unicode_compatible
class CollectionNotLoadedException(Exception):
    """
    Raised when one or more collections could not be posted.

    error_list contains dictionaries, each containing
    project, url and message.
    """

    def __init__(self, error_list, *args, **kwargs):
        # Unpack the extra arguments when delegating to Exception; the
        # previous code passed the ``args``/``kwargs`` containers
        # themselves as two positional arguments, polluting
        # ``Exception.args`` with a tuple and a dict.
        super(CollectionNotLoadedException, self).__init__(*args, **kwargs)
        self.error_list = error_list

    def __str__(self):
        return "\n".join(
            ["[{project}] Error posting data to {url}: {message}".format(
                **error) for error in self.error_list]
        )
192 changes: 0 additions & 192 deletions treeherder/log_parser/utils.py
Expand Up @@ -2,15 +2,12 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, you can obtain one at http://mozilla.org/MPL/2.0/.

import re
import urllib
import urllib2
import logging
import time

import simplejson as json
from django.conf import settings
from django.core.urlresolvers import reverse

from treeherder.log_parser.artifactbuildercollection import \
ArtifactBuilderCollection
Expand All @@ -22,130 +19,6 @@
logger = logging.getLogger(__name__)


def is_helpful_search_term(search_term):
    """
    Return True when ``search_term`` is specific enough to yield useful
    bug suggestions.

    Very short terms (4 characters or fewer) and a fixed set of
    overly-generic strings would match too many bug summaries, so they
    are rejected.
    """
    search_term = search_term.strip()

    # Search terms that will match too many bug summaries
    # and so not result in useful suggestions.
    blacklist = [
        'automation.py',
        'remoteautomation.py',
        'Shutdown',
        'undefined',
        'Main app process exited normally',
        'Traceback (most recent call last):',
        'Return code: 0',
        'Return code: 1',
        'Return code: 2',
        'Return code: 9',
        'Return code: 10',
        'Exiting 1',
        'Exiting 9',
        'CrashingThread(void *)',
        'libSystem.B.dylib + 0xd7a',
        'linux-gate.so + 0x424',
        'TypeError: content is null',
        'leakcheck'
    ]

    if len(search_term) <= 4:
        return False
    return search_term not in blacklist


# Matches leak-check failure messages, capturing the parenthesised list
# of leaked object names, e.g. "1234 bytes leaked (Object-1, Object-2)".
LEAK_RE = re.compile(r'\d+ bytes leaked \((.+)\)$')
# Matches crash lines, capturing the crash signature inside "[@ ... ]".
CRASH_RE = re.compile(r'.+ application crashed \[@ (.+)\]$')


def get_error_search_term(error_line):
    """
    Derive a bug-search term from a failure line.

    Returns a string (truncated to 100 characters) suitable for a
    full-text search against bugscache, or None when no helpful term
    can be extracted from ``error_line``.
    """
    if not error_line:
        return None

    # This is strongly inspired by
    # https://hg.mozilla.org/webtools/tbpl/file/tip/php/inc/AnnotatedSummaryGenerator.php#l73

    tokens = error_line.split(" | ")
    search_term = None

    if len(tokens) >= 3:
        # it's in the "FAILURE-TYPE | testNameOrFilePath | message" type format.
        test_name_or_path = tokens[1]
        message = tokens[2]

        # Leak failure messages are of the form:
        # leakcheck | .*\d+ bytes leaked (Object-1, Object-2, Object-3, ...)
        match = LEAK_RE.search(message)
        if match:
            search_term = match.group(1)
        else:
            for splitter in ("/", "\\"):
                # if this is a path, we are interested in the last part
                test_name_or_path = test_name_or_path.split(splitter)[-1]
            search_term = test_name_or_path

    # If the failure line was not in the pipe symbol delimited format or the search term
    # will likely return too many (or irrelevant) results (eg: too short or matches terms
    # on the blacklist), then we fall back to searching for the entire failure line if
    # it is suitable.
    if not (search_term and is_helpful_search_term(search_term)):
        if is_helpful_search_term(error_line):
            search_term = error_line
        else:
            search_term = None

    # Searching for extremely long search terms is undesirable, since:
    # a) Bugzilla's max summary length is 256 characters, and once "Intermittent "
    # and platform/suite information is prefixed, there are even fewer characters
    # left for us to use for the failure string against which we need to match.
    # b) For long search terms, the additional length does little to prevent against
    # false positives, but means we're more susceptible to false negatives due to
    # run-to-run variances in the error messages (eg paths, process IDs).
    if search_term:
        search_term = search_term[:100]

    return search_term


def get_crash_signature(error_line):
    """
    Return the crash signature embedded in ``error_line`` when it is a
    helpful search term, otherwise None.
    """
    match = CRASH_RE.match(error_line)
    if match:
        signature = match.group(1)
        if is_helpful_search_term(signature):
            return signature
    return None


def get_bugs_for_search_term(search, base_uri):
    """
    Fetch bug suggestions from the ``base_uri`` endpoint, passing
    ``search`` as the full-text search term.

    Returns whatever get_remote_content yields for the built URL
    (presumably the decoded JSON response — confirm against
    treeherder.etl.common).
    """
    # Imported here rather than at module level, keeping etl.common a
    # call-time dependency only.
    from treeherder.etl.common import get_remote_content

    params = {
        'search': search
    }
    query_string = urllib.urlencode(params)
    url = '{0}?{1}'.format(
        base_uri,
        query_string
    )
    return get_remote_content(url)

# Matches the leading mozharness log prefix on a line, e.g.
# "12:34:56     INFO - ", so it can be stripped off.
mozharness_pattern = re.compile(
    r'^\d+:\d+:\d+[ ]+(?:DEBUG|INFO|WARNING|ERROR|CRITICAL|FATAL) - [ ]?'
)


def get_mozharness_substring(line):
    """Return ``line`` with any mozharness log prefix removed."""
    without_prefix = mozharness_pattern.sub('', line)
    return without_prefix.strip()


def is_parsed(job_log_url):
# if parse_status is not available, consider it pending
parse_status = job_log_url.get("parse_status", "pending")
Expand All @@ -154,12 +27,6 @@ def is_parsed(job_log_url):

def extract_text_log_artifacts(log_url, job_guid, check_errors):
"""Generate a summary artifact for the raw text log."""
bug_suggestions = []
bugscache_uri = '{0}{1}'.format(
settings.API_HOSTNAME,
reverse("bugscache-list")
)
terms_requested = {}

# parse a log given its url
artifact_bc = ArtifactBuilderCollection(log_url,
Expand All @@ -170,65 +37,6 @@ def extract_text_log_artifacts(log_url, job_guid, check_errors):
for name, artifact in artifact_bc.artifacts.items():
artifact_list.append((job_guid, name, 'json',
json.dumps(artifact)))
if check_errors:
all_errors = artifact_bc.artifacts\
.get('text_log_summary', {})\
.get('step_data', {})\
.get('all_errors', [])

for err in all_errors:
# remove the mozharness prefix
clean_line = get_mozharness_substring(err['line'])
search_terms = []
# get a meaningful search term out of the error line
search_term = get_error_search_term(clean_line)
bugs = dict(open_recent=[], all_others=[])

# collect open recent and all other bugs suggestions
if search_term:
search_terms.append(search_term)
if search_term not in terms_requested:
# retrieve the list of suggestions from the api
bugs = get_bugs_for_search_term(
search_term,
bugscache_uri
)
terms_requested[search_term] = bugs
else:
bugs = terms_requested[search_term]

if not bugs or not (bugs['open_recent'] or
bugs['all_others']):
# no suggestions, try to use
# the crash signature as search term
crash_signature = get_crash_signature(clean_line)
if crash_signature:
search_terms.append(crash_signature)
if crash_signature not in terms_requested:
bugs = get_bugs_for_search_term(
crash_signature,
bugscache_uri
)
terms_requested[crash_signature] = bugs
else:
bugs = terms_requested[crash_signature]

# TODO: Rename 'search' to 'error_text' or similar, since that's
# closer to what it actually represents (bug 1091060).
bug_suggestions.append({
"search": clean_line,
"search_terms": search_terms,
"bugs": bugs
})

artifact_list.append(
(
job_guid,
'Bug suggestions',
'json',
json.dumps(bug_suggestions)
)
)

return artifact_list

Expand Down

0 comments on commit 3fa859e

Please sign in to comment.