From 3fa859e8d42f039c3f830f4e5719e8425bad89ed Mon Sep 17 00:00:00 2001 From: Cameron Dawson Date: Thu, 23 Apr 2015 15:16:00 -0700 Subject: [PATCH] Bug 1080760 - generate bug suggestions for text_log_summary artifacts Whenever a ``text_log_summary`` with ``error_lines`` is submitted, generate an additional ``Bug suggestions`` artifact for it. --- treeherder/etl/mixins.py | 51 +------ treeherder/etl/th_publisher.py | 58 +++++++ treeherder/log_parser/utils.py | 192 ----------------------- treeherder/model/bug_suggestions.py | 227 ++++++++++++++++++++++++++++ treeherder/webapp/api/artifact.py | 18 ++- 5 files changed, 302 insertions(+), 244 deletions(-) create mode 100644 treeherder/etl/th_publisher.py create mode 100644 treeherder/model/bug_suggestions.py diff --git a/treeherder/etl/mixins.py b/treeherder/etl/mixins.py index c422c0629cb..300da97dc12 100644 --- a/treeherder/etl/mixins.py +++ b/treeherder/etl/mixins.py @@ -10,12 +10,10 @@ import simplejson as json -from thclient import TreeherderRequest - from django.core.urlresolvers import reverse from django.conf import settings -from django.utils.encoding import python_2_unicode_compatible -from treeherder.etl.oauth_utils import OAuthCredentials + +from treeherder.etl import th_publisher logger = logging.getLogger(__name__) @@ -121,50 +119,7 @@ def load(self, result_sets, project): logger.error("ResultSet loading failed: {0}".format(message['message'])) -@python_2_unicode_compatible -class CollectionNotLoadedException(Exception): - - def __init__(self, error_list, *args, **kwargs): - """ - error_list contains dictionaries, each containing - project, url and message - """ - super(CollectionNotLoadedException, self).__init__(args, kwargs) - self.error_list = error_list - - def __str__(self): - return "\n".join( - ["[{project}] Error posting data to {url}: {message}".format( - **error) for error in self.error_list] - ) - - class OAuthLoaderMixin(object): def load(self, th_collections): - errors = [] - for project in 
th_collections:
-
-            credentials = OAuthCredentials.get_credentials(project)
-
-            th_request = TreeherderRequest(
-                protocol=settings.TREEHERDER_REQUEST_PROTOCOL,
-                host=settings.TREEHERDER_REQUEST_HOST,
-                project=project,
-                oauth_key=credentials.get('consumer_key', None),
-                oauth_secret=credentials.get('consumer_secret', None)
-            )
-
-            logger.info(
-                "collection loading request: {0}".format(
-                    th_request.get_uri(th_collections[project].endpoint_base)))
-            response = th_request.post(th_collections[project])
-
-            if not response or response.status != 200:
-                errors.append({
-                    "project": project,
-                    "url": th_collections[project].endpoint_base,
-                    "message": response.read()
-                })
-        if errors:
-            raise CollectionNotLoadedException(errors)
+        th_publisher.post_treeherder_collections(th_collections)
diff --git a/treeherder/etl/th_publisher.py b/treeherder/etl/th_publisher.py
new file mode 100644
index 00000000000..416890ad3b3
--- /dev/null
+++ b/treeherder/etl/th_publisher.py
@@ -0,0 +1,58 @@
+import logging
+
+from django.utils.encoding import python_2_unicode_compatible
+from django.conf import settings
+
+from thclient import TreeherderRequest
+
+from treeherder.etl.oauth_utils import OAuthCredentials
+
+
+logger = logging.getLogger(__name__)
+
+
+def post_treeherder_collections(th_collections):
+    errors = []
+    for project in th_collections:
+
+        credentials = OAuthCredentials.get_credentials(project)
+
+        th_request = TreeherderRequest(
+            protocol=settings.TREEHERDER_REQUEST_PROTOCOL,
+            host=settings.TREEHERDER_REQUEST_HOST,
+            project=project,
+            oauth_key=credentials.get('consumer_key', None),
+            oauth_secret=credentials.get('consumer_secret', None)
+        )
+
+        logger.info(
+            "collection loading request: {0}".format(
+                th_request.get_uri(th_collections[project].endpoint_base)))
+        response = th_request.post(th_collections[project])
+
+        if not response or response.status != 200:
+            errors.append({
+                "project": project,
+                "url": th_collections[project].endpoint_base,
+                "message":
response.read() + }) + if errors: + raise CollectionNotLoadedException(errors) + + +@python_2_unicode_compatible +class CollectionNotLoadedException(Exception): + + def __init__(self, error_list, *args, **kwargs): + """ + error_list contains dictionaries, each containing + project, url and message + """ + super(CollectionNotLoadedException, self).__init__(args, kwargs) + self.error_list = error_list + + def __str__(self): + return "\n".join( + ["[{project}] Error posting data to {url}: {message}".format( + **error) for error in self.error_list] + ) diff --git a/treeherder/log_parser/utils.py b/treeherder/log_parser/utils.py index 4f32c8714dc..cedd3c8518a 100644 --- a/treeherder/log_parser/utils.py +++ b/treeherder/log_parser/utils.py @@ -2,15 +2,12 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, you can obtain one at http://mozilla.org/MPL/2.0/. -import re -import urllib import urllib2 import logging import time import simplejson as json from django.conf import settings -from django.core.urlresolvers import reverse from treeherder.log_parser.artifactbuildercollection import \ ArtifactBuilderCollection @@ -22,130 +19,6 @@ logger = logging.getLogger(__name__) -def is_helpful_search_term(search_term): - # Search terms that will match too many bug summaries - # and so not result in useful suggestions. 
- search_term = search_term.strip() - - blacklist = [ - 'automation.py', - 'remoteautomation.py', - 'Shutdown', - 'undefined', - 'Main app process exited normally', - 'Traceback (most recent call last):', - 'Return code: 0', - 'Return code: 1', - 'Return code: 2', - 'Return code: 9', - 'Return code: 10', - 'Exiting 1', - 'Exiting 9', - 'CrashingThread(void *)', - 'libSystem.B.dylib + 0xd7a', - 'linux-gate.so + 0x424', - 'TypeError: content is null', - 'leakcheck' - ] - - return len(search_term) > 4 and not (search_term in blacklist) - - -LEAK_RE = re.compile(r'\d+ bytes leaked \((.+)\)$') -CRASH_RE = re.compile(r'.+ application crashed \[@ (.+)\]$') - - -def get_error_search_term(error_line): - """ - retrieves bug suggestions from bugscache using search_term - in a full_text search. - """ - if not error_line: - return None - - # This is strongly inspired by - # https://hg.mozilla.org/webtools/tbpl/file/tip/php/inc/AnnotatedSummaryGenerator.php#l73 - - tokens = error_line.split(" | ") - search_term = None - - if len(tokens) >= 3: - # it's in the "FAILURE-TYPE | testNameOrFilePath | message" type format. - test_name_or_path = tokens[1] - message = tokens[2] - - # Leak failure messages are of the form: - # leakcheck | .*\d+ bytes leaked (Object-1, Object-2, Object-3, ...) - match = LEAK_RE.search(message) - if match: - search_term = match.group(1) - else: - for splitter in ("/", "\\"): - # if this is a path, we are interested in the last part - test_name_or_path = test_name_or_path.split(splitter)[-1] - search_term = test_name_or_path - - # If the failure line was not in the pipe symbol delimited format or the search term - # will likely return too many (or irrelevant) results (eg: too short or matches terms - # on the blacklist), then we fall back to searching for the entire failure line if - # it is suitable. 
- if not (search_term and is_helpful_search_term(search_term)): - if is_helpful_search_term(error_line): - search_term = error_line - else: - search_term = None - - # Searching for extremely long search terms is undesirable, since: - # a) Bugzilla's max summary length is 256 characters, and once "Intermittent " - # and platform/suite information is prefixed, there are even fewer characters - # left for us to use for the failure string against which we need to match. - # b) For long search terms, the additional length does little to prevent against - # false positives, but means we're more susceptible to false negatives due to - # run-to-run variances in the error messages (eg paths, process IDs). - if search_term: - search_term = search_term[:100] - - return search_term - - -def get_crash_signature(error_line): - """ - Detect if the error_line contains a crash signature - and return it if it's a helpful search term - """ - search_term = None - match = CRASH_RE.match(error_line) - if match and is_helpful_search_term(match.group(1)): - search_term = match.group(1) - return search_term - - -def get_bugs_for_search_term(search, base_uri): - """ - Fetch the base_uri endpoint filtering on search and status. - Status must be either 'open' or 'closed' - """ - from treeherder.etl.common import get_remote_content - - params = { - 'search': search - } - query_string = urllib.urlencode(params) - url = '{0}?{1}'.format( - base_uri, - query_string - ) - return get_remote_content(url) - -mozharness_pattern = re.compile( - r'^\d+:\d+:\d+[ ]+(?:DEBUG|INFO|WARNING|ERROR|CRITICAL|FATAL) - [ ]?' 
-) - - -def get_mozharness_substring(line): - return mozharness_pattern.sub('', line).strip() - - def is_parsed(job_log_url): # if parse_status is not available, consider it pending parse_status = job_log_url.get("parse_status", "pending") @@ -154,12 +27,6 @@ def is_parsed(job_log_url): def extract_text_log_artifacts(log_url, job_guid, check_errors): """Generate a summary artifact for the raw text log.""" - bug_suggestions = [] - bugscache_uri = '{0}{1}'.format( - settings.API_HOSTNAME, - reverse("bugscache-list") - ) - terms_requested = {} # parse a log given its url artifact_bc = ArtifactBuilderCollection(log_url, @@ -170,65 +37,6 @@ def extract_text_log_artifacts(log_url, job_guid, check_errors): for name, artifact in artifact_bc.artifacts.items(): artifact_list.append((job_guid, name, 'json', json.dumps(artifact))) - if check_errors: - all_errors = artifact_bc.artifacts\ - .get('text_log_summary', {})\ - .get('step_data', {})\ - .get('all_errors', []) - - for err in all_errors: - # remove the mozharness prefix - clean_line = get_mozharness_substring(err['line']) - search_terms = [] - # get a meaningful search term out of the error line - search_term = get_error_search_term(clean_line) - bugs = dict(open_recent=[], all_others=[]) - - # collect open recent and all other bugs suggestions - if search_term: - search_terms.append(search_term) - if search_term not in terms_requested: - # retrieve the list of suggestions from the api - bugs = get_bugs_for_search_term( - search_term, - bugscache_uri - ) - terms_requested[search_term] = bugs - else: - bugs = terms_requested[search_term] - - if not bugs or not (bugs['open_recent'] or - bugs['all_others']): - # no suggestions, try to use - # the crash signature as search term - crash_signature = get_crash_signature(clean_line) - if crash_signature: - search_terms.append(crash_signature) - if crash_signature not in terms_requested: - bugs = get_bugs_for_search_term( - crash_signature, - bugscache_uri - ) - 
terms_requested[crash_signature] = bugs - else: - bugs = terms_requested[crash_signature] - - # TODO: Rename 'search' to 'error_text' or similar, since that's - # closer to what it actually represents (bug 1091060). - bug_suggestions.append({ - "search": clean_line, - "search_terms": search_terms, - "bugs": bugs - }) - - artifact_list.append( - ( - job_guid, - 'Bug suggestions', - 'json', - json.dumps(bug_suggestions) - ) - ) return artifact_list diff --git a/treeherder/model/bug_suggestions.py b/treeherder/model/bug_suggestions.py new file mode 100644 index 00000000000..b287e907bcf --- /dev/null +++ b/treeherder/model/bug_suggestions.py @@ -0,0 +1,227 @@ +import logging +import re +import urllib +import json + +from django.core.urlresolvers import reverse +from django.conf import settings + + +logger = logging.getLogger(__name__) + + +def get_bug_suggestions_artifacts(artifact_list): + """ + Create bug suggestions artifact(s) for any text_log_summary artifacts. + + ``artifact_list`` here is a list of artifacts that may contain one or more + ``text_log_artifact`` objects. If it does, we extract the error lines + from it. If there ARE error lines, then we generate the + ``bug suggestions`` artifacts and return them. + """ + + bug_suggestion_artifacts = [] + + for artifact in artifact_list: + # this is the only artifact name eligible to trigger generation of bug + # suggestions. 
+ assert artifact['name'] == 'text_log_summary' + + all_errors = get_all_errors(artifact) + if all_errors: + bug_suggestion_artifacts.append({ + "job_guid": artifact['job_guid'], + "name": 'Bug suggestions', + "type": 'json', + "blob": json.dumps(get_bug_suggestions(all_errors)) + }) + + return bug_suggestion_artifacts + + +def get_all_errors(artifact): + """Extract the error lines from an artifact's blob field""" + + artifact_blob = json.loads(artifact['blob']) + if isinstance(artifact_blob, dict): + return artifact_blob.get('step_data', {}).get('all_errors', []) + + +def get_bug_suggestions(all_errors): + bug_suggestions = [] + bugscache_uri = '{0}{1}'.format( + settings.API_HOSTNAME, + reverse("bugscache-list") + ) + terms_requested = {} + + for err in all_errors: + # remove the mozharness prefix + clean_line = get_mozharness_substring(err['line']) + search_terms = [] + # get a meaningful search term out of the error line + search_term = get_error_search_term(clean_line) + bugs = dict(open_recent=[], all_others=[]) + + # collect open recent and all other bugs suggestions + if search_term: + search_terms.append(search_term) + if search_term not in terms_requested: + # retrieve the list of suggestions from the api + bugs = get_bugs_for_search_term( + search_term, + bugscache_uri + ) + terms_requested[search_term] = bugs + else: + bugs = terms_requested[search_term] + + if not bugs or not (bugs['open_recent'] or + bugs['all_others']): + # no suggestions, try to use + # the crash signature as search term + crash_signature = get_crash_signature(clean_line) + if crash_signature: + search_terms.append(crash_signature) + if crash_signature not in terms_requested: + bugs = get_bugs_for_search_term( + crash_signature, + bugscache_uri + ) + terms_requested[crash_signature] = bugs + else: + bugs = terms_requested[crash_signature] + + # TODO: Rename 'search' to 'error_text' or similar, since that's + # closer to what it actually represents (bug 1091060). 
+ bug_suggestions.append({ + "search": clean_line, + "search_terms": search_terms, + "bugs": bugs + }) + + return bug_suggestions + + +def get_bugs_for_search_term(search, base_uri): + """ + Fetch the base_uri endpoint filtering on search and status. + Status must be either 'open' or 'closed' + """ + from treeherder.etl.common import get_remote_content + + params = { + 'search': search + } + query_string = urllib.urlencode(params) + url = '{0}?{1}'.format( + base_uri, + query_string + ) + return get_remote_content(url) + +LEAK_RE = re.compile(r'\d+ bytes leaked \((.+)\)$') +CRASH_RE = re.compile(r'.+ application crashed \[@ (.+)\]$') + + +def get_error_search_term(error_line): + """ + retrieves bug suggestions from bugscache using search_term + in a full_text search. + """ + if not error_line: + return None + + # This is strongly inspired by + # https://hg.mozilla.org/webtools/tbpl/file/tip/php/inc/AnnotatedSummaryGenerator.php#l73 + + tokens = error_line.split(" | ") + search_term = None + + if len(tokens) >= 3: + # it's in the "FAILURE-TYPE | testNameOrFilePath | message" type format. + test_name_or_path = tokens[1] + message = tokens[2] + + # Leak failure messages are of the form: + # leakcheck | .*\d+ bytes leaked (Object-1, Object-2, Object-3, ...) + match = LEAK_RE.search(message) + if match: + search_term = match.group(1) + else: + for splitter in ("/", "\\"): + # if this is a path, we are interested in the last part + test_name_or_path = test_name_or_path.split(splitter)[-1] + search_term = test_name_or_path + + # If the failure line was not in the pipe symbol delimited format or the search term + # will likely return too many (or irrelevant) results (eg: too short or matches terms + # on the blacklist), then we fall back to searching for the entire failure line if + # it is suitable. 
+ if not (search_term and is_helpful_search_term(search_term)): + if is_helpful_search_term(error_line): + search_term = error_line + else: + search_term = None + + # Searching for extremely long search terms is undesirable, since: + # a) Bugzilla's max summary length is 256 characters, and once "Intermittent " + # and platform/suite information is prefixed, there are even fewer characters + # left for us to use for the failure string against which we need to match. + # b) For long search terms, the additional length does little to prevent against + # false positives, but means we're more susceptible to false negatives due to + # run-to-run variances in the error messages (eg paths, process IDs). + if search_term: + search_term = search_term[:100] + + return search_term + + +def get_crash_signature(error_line): + """ + Detect if the error_line contains a crash signature + and return it if it's a helpful search term + """ + search_term = None + match = CRASH_RE.match(error_line) + if match and is_helpful_search_term(match.group(1)): + search_term = match.group(1) + return search_term + + +mozharness_pattern = re.compile( + r'^\d+:\d+:\d+[ ]+(?:DEBUG|INFO|WARNING|ERROR|CRITICAL|FATAL) - [ ]?' +) + + +def get_mozharness_substring(line): + return mozharness_pattern.sub('', line).strip() + + +def is_helpful_search_term(search_term): + # Search terms that will match too many bug summaries + # and so not result in useful suggestions. 
+ search_term = search_term.strip() + + blacklist = [ + 'automation.py', + 'remoteautomation.py', + 'Shutdown', + 'undefined', + 'Main app process exited normally', + 'Traceback (most recent call last):', + 'Return code: 0', + 'Return code: 1', + 'Return code: 2', + 'Return code: 9', + 'Return code: 10', + 'Exiting 1', + 'Exiting 9', + 'CrashingThread(void *)', + 'libSystem.B.dylib + 0xd7a', + 'linux-gate.so + 0x424', + 'TypeError: content is null', + 'leakcheck' + ] + + return len(search_term) > 4 and not (search_term in blacklist) diff --git a/treeherder/webapp/api/artifact.py b/treeherder/webapp/api/artifact.py index 0079b277eaa..035370c4b65 100644 --- a/treeherder/webapp/api/artifact.py +++ b/treeherder/webapp/api/artifact.py @@ -6,6 +6,7 @@ from rest_framework.response import Response from treeherder.webapp.api.utils import UrlQueryFilter, oauth_required from treeherder.model.derived import JobsModel, ArtifactsModel +from treeherder.model.bug_suggestions import get_bug_suggestions_artifacts class ArtifactViewSet(viewsets.ViewSet): @@ -54,11 +55,20 @@ def list(self, request, project): @oauth_required def create(self, request, project): + artifacts = request.DATA - job_guids = [x['job_guid'] for x in request.DATA] - with JobsModel(project) as jobsModel, ArtifactsModel(project) as artifacts_model: + job_guids = [x['job_guid'] for x in artifacts] + with JobsModel(project) as jobs_model, ArtifactsModel(project) as artifacts_model: - job_id_lookup = jobsModel.get_job_ids_by_guid(job_guids) - artifacts_model.load_job_artifacts(request.DATA, job_id_lookup) + # create an accompanying ``Bug suggestions`` artifact for any + # eligible artifacts. 
+ tls_list = [x for x in artifacts if x['name'] == 'text_log_summary'] + bsa = get_bug_suggestions_artifacts(tls_list) + + if bsa: + artifacts.extend(bsa) + + job_id_lookup = jobs_model.get_job_ids_by_guid(job_guids) + artifacts_model.load_job_artifacts(artifacts, job_id_lookup) return Response({'message': 'Artifacts stored successfully'})