Add full data dump framework and implement for roundup #47

Merged 32 commits on Apr 16, 2014

Commits

9aa2af6
Added file loading for roundup
602p Apr 15, 2014
3530bc4
Add file scraping test
602p Apr 15, 2014
c60b826
Added message scraping functionality (to roundup)
602p Apr 15, 2014
c13f8bb
Add --extended-scrape option to scrape messages, keywords, etc..
602p Apr 15, 2014
085bb7b
Add raw_data dumping option to bug parser (for use with converse)
602p Apr 15, 2014
481ea3b
Add additional YAML example
602p Apr 15, 2014
bae7c2c
Fixed, added tests
602p Apr 15, 2014
e3fbb89
Remove a debug command
602p Apr 16, 2014
710fb0d
Added file loading for roundup
602p Apr 15, 2014
92432df
Add file scraping test
602p Apr 15, 2014
69fbc7b
Added message scraping functionality (to roundup)
602p Apr 15, 2014
d87877f
Add --extended-scrape option to scrape messages, keywords, etc..
602p Apr 15, 2014
59906d1
Add raw_data dumping option to bug parser (for use with converse)
602p Apr 15, 2014
0167fc1
Add additional YAML example
602p Apr 15, 2014
1b266c1
Fixed, added tests
602p Apr 15, 2014
ad8081f
Remove a debug command
602p Apr 16, 2014
41def37
Merge roundup.py
602p Apr 16, 2014
689ed2b
Move rawdata test JSON blob to its own file
602p Apr 16, 2014
ffa932e
Made JSON loading cleaner
602p Apr 16, 2014
bfb71be
Cleaned up JSON blob
602p Apr 16, 2014
cbee1f3
Added a note about raw_data testing
602p Apr 16, 2014
6f2c164
Im bad at spelling
602p Apr 16, 2014
be36d67
Added Asheesh's frackalackadingdong comma
602p Apr 16, 2014
53999bf
Removed an offending try-catch and added some whitespace
602p Apr 16, 2014
a62fb01
Cleaned up extended_scrape passing
602p Apr 16, 2014
03ae989
PEP-8 ifiying
602p Apr 16, 2014
0b228ae
PEP-8 ifiying further
602p Apr 16, 2014
fe81194
PEP-8 ifiying even further
602p Apr 16, 2014
2e06ceb
PEP-8 ifiying EVEN further
602p Apr 16, 2014
f1b2cba
PEP-8 ifiying comments
602p Apr 16, 2014
271e569
PEP-8 changes (AKA Guido is very opinionated)
602p Apr 16, 2014
bd2df90
PEP-8
602p Apr 16, 2014

Files changed

4 changes: 3 additions & 1 deletion bugimporters/base.py
@@ -91,11 +91,13 @@ def push_urls_onto_reactor(self, result=None):
# Importer functions that may require overloading #
###################################################
def __init__(self, tracker_model, reactor_manager=None, bug_parser=None,
data_transits=None):
data_transits=None, extended_scrape=False):
# Store the tracker model
self.tm = tracker_model
# Store the reactor manager
self.rm = reactor_manager
# Store whether or not to scrape messages, keywords, etc
self.extended_scrape = extended_scrape
# Create a dictionary that maps URLs to a callback/errback pair. This
# dictionary is used to store URLs that have been found and require
# downloading, along with the callback and errback that handle the
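
The hunk above only stores the new flag; below is a minimal sketch of how a caller might thread it through, assuming the class in base.py is bugimporters.base.BugImporter and that a bare stand-in object is acceptable as the tracker model:

from bugimporters.base import BugImporter

class FakeTrackerModel(object):
    # Stand-in tracker model for illustration only; real callers pass the
    # tracker object loaded from the input YAML.
    tracker_name = 'Roundup'

importer = BugImporter(FakeTrackerModel(),
                       reactor_manager=None,
                       bug_parser=None,
                       data_transits=None,
                       extended_scrape=True)
assert importer.extended_scrape  # stored verbatim; defaults to False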
1 change: 1 addition & 0 deletions bugimporters/items.py
@@ -24,3 +24,4 @@ class ParsedBug(scrapy.item.Item):
as_appears_in_distribution = scrapy.item.Field()
good_for_newcomers = scrapy.item.Field()
concerns_just_documentation = scrapy.item.Field()
raw_data = scrapy.item.Field()
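
For reference, a quick sketch of how the new field behaves: like any other declared scrapy Item field, it accepts dict-style assignment.

from bugimporters.items import ParsedBug

bug = ParsedBug()
bug['raw_data'] = {'Keywords': 'easy, patch'}  # arbitrary scraped metadata
print(bug['raw_data']['Keywords'])             # -> easy, patch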
10 changes: 8 additions & 2 deletions bugimporters/main.py
@@ -27,8 +27,10 @@ def main(raw_arguments):

parser.add_argument('-i', action="store", dest="input")
parser.add_argument('-o', action="store", dest="output")
parser.add_argument('--extended-scrape', action="store_const", dest="extended_scrape", const=1, default=0)
args = parser.parse_args(raw_arguments)


args_for_scrapy = ['scrapy',
'runspider',
'bugimporters/main.py',
@@ -44,6 +46,7 @@ def main(raw_arguments):
'-s', 'DEPTH_PRIORITY=1',
'-s', 'SCHEDULER_DISK_QUEUE=scrapy.squeue.PickleFifoDiskQueue',
'-s', 'SCHEDULER_MEMORY_QUEUE=scrapy.squeue.FifoMemoryQueue',
'-a', 'extended_scrape=%s' % (args.extended_scrape)
]
return scrapy.cmdline.execute(args_for_scrapy)

@@ -104,7 +107,8 @@ def get_bugimporters(self):
bug_importer = bug_import_class(
obj, reactor_manager=None,
bug_parser=bug_parser_class,
data_transits=None)
data_transits=None,
extended_scrape=self.extended_scrape)
yield (obj, bug_importer)

def start_requests(self):
@@ -130,7 +134,9 @@ def start_requests(self):
logging.error("FYI, this bug importer does not support "
"process_bugs(). Fix it.")

def __init__(self, input_filename=None):
def __init__(self, input_filename=None, extended_scrape=0):
self.extended_scrape = int(extended_scrape)

if input_filename is None:
return

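
A small sketch of what the new option produces, using the same argparse calls as above. Note that store_const yields the integers 0/1 rather than booleans, and the value reaches the spider as a string through scrapy's -a argument, which is why the spider's __init__ coerces it with int():

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', action="store", dest="input")
parser.add_argument('-o', action="store", dest="output")
parser.add_argument('--extended-scrape', action="store_const",
                    dest="extended_scrape", const=1, default=0)

args = parser.parse_args(['-i', 'in.yaml', '-o', 'out.json',
                          '--extended-scrape'])
assert args.extended_scrape == 1
# Passed on as '-a extended_scrape=1'; scrapy delivers it to the spider's
# __init__ as the string "1", hence int(extended_scrape) there.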
52 changes: 48 additions & 4 deletions bugimporters/roundup.py
@@ -19,6 +19,7 @@
import datetime
import logging
import lxml.html
import lxml.etree
import re
import urlparse
import scrapy.http
@@ -40,7 +41,7 @@ def __init__(self, *args, **kwargs):
# Call the parent __init__.

if self.bug_parser is None:
self.bug_parser = RoundupBugParser
self.bug_parser = RoundupBugParser

def process_queries(self, queries):
# Add all the queries to the waiting list
@@ -81,7 +82,7 @@ def process_bugs(self, bug_list):

def handle_bug_html_response(self, response):
# Create a RoundupBugParser instance to store the bug data
rbp = RoundupBugParser(response.request.url)
rbp = RoundupBugParser(response.request.url, self.extended_scrape)
return self.handle_bug_html(response.body, rbp)

def handle_bug_html(self, bug_html, rbp):
Expand All @@ -99,9 +100,11 @@ def handle_bug_html(self, bug_html, rbp):


class RoundupBugParser(object):
def __init__(self, bug_url):
def __init__(self, bug_url, extended_scrape=False):
self.bug_html = None
self.bug_url = bug_url
self.submitter_realname_map = {}
self.extended_scrape = extended_scrape

@cached_property
def bug_html_url(self):
@@ -132,6 +135,37 @@ def roundup_tree2metadata_dict(tree):
value = td.text_content().strip()
ret[key] = value

ret["files"] = []
files = tree.find_class("files") # Grab files table by classname
if files: # if I find an actual table (doesn't exist if no files)
files = files[0] # grab table, then tbody
files = files[2:] # Strip off the two header TRs
for file_entry in files:
ret["files"].append({
"url": file_entry[0][0].attrib['href'],
"author": file_entry[1][0].text
})

ret["messages"] = []
messages = tree.find_class("messages")[0]
if messages:
if "tbody" in lxml.html.tostring(messages):
messages = messages[0]
messages = messages[1:]
count = 0
author = ""
while count != len(messages):
if count % 2 == 0:
author = messages[count][1].text.replace("Author: ",'')
else:
content = lxml.etree.tostring(messages[count][0][0],
pretty_print=True)
ret["messages"].append({
"author": author,
"message": content
})
count += 1

return ret

def get_all_submitter_realname_pairs(self, tree):
@@ -148,7 +182,9 @@ def get_all_submitter_realname_pairs(self, tree):

def get_submitter_realname(self, tree, submitter_username):
try:
return self.get_all_submitter_realname_pairs(tree)[submitter_username]
if self.submitter_realname_map == {}:
self.submitter_realname_map = self.get_all_submitter_realname_pairs(tree)
return self.submitter_realname_map[submitter_username]
except KeyError:
return None

@@ -196,6 +232,10 @@ def get_parsed_data_dict(self, tm):
for status_name in tm.closed_status.split(','):
closed_status_set.add(status_name.strip().lower())

# NOTE: If you add more values to metadata_dict (or to raw_data in general) you need to rebuild
# tests/sample-data/closed-mercurial-bug-rawdata.json using json.dumps so the test will not fail
# because you have a different raw_data

ret = bugimporters.items.ParsedBug()
ret.update({'title': metadata_dict['Title'],
'description': description,
@@ -212,7 +252,11 @@
'canonical_bug_link': self.bug_url,
'last_polled': datetime.datetime.utcnow().isoformat(),
'_project_name': tm.tracker_name,
'raw_data':{},
})
if self.extended_scrape:
logging.info("Adding Extended Scrape Values")
ret['raw_data'] = metadata_dict

# Update status for trackers that set it differently
self.update_bug_status(ret, metadata_dict)
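
The files/messages scraping added to roundup_tree2metadata_dict above leans on lxml's positional indexing. A minimal sketch of that style on synthetic markup (not real Roundup HTML), showing how find_class() plus child indexing yields the url/author pairs:

import lxml.html

snippet = """
<table class="files">
  <tr><th colspan="2">Files</th></tr>
  <tr><th>File name</th><th>Uploaded</th></tr>
  <tr>
    <td><a href="file8863/pydocbug.patch">pydocbug.patch</a></td>
    <td><a href="user123">benjhayden</a></td>
  </tr>
</table>
"""
tree = lxml.html.document_fromstring(snippet)

files_table = tree.find_class("files")[0]   # <table class="files">
for row in files_table[2:]:                 # skip the two header rows
    print(row[0][0].attrib['href'])         # -> file8863/pydocbug.patch
    print(row[1][0].text)                   # -> benjhayden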
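
And a rough sketch of the rebuild workflow the NOTE in get_parsed_data_dict asks for, assuming roundup_tree2metadata_dict can be called with just a parsed tree (its signature takes only tree) and that raw_data is exactly that dict:

import json
import lxml.html

from bugimporters.roundup import RoundupBugParser

html = open('bugimporters/tests/sample-data/closed-mercurial-bug.html').read()
tree = lxml.html.document_fromstring(html)
metadata = RoundupBugParser.roundup_tree2metadata_dict(tree)

# Overwrite the fixture so test_raw_data_dump_with_mercurial compares
# against the current shape of the scraped metadata.
out_path = 'bugimporters/tests/sample-data/closed-mercurial-bug-rawdata.json'
with open(out_path, 'w') as out:
    out.write(json.dumps(metadata, indent=4))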
44 changes: 44 additions & 0 deletions bugimporters/tests/sample-data/closed-mercurial-bug-rawdata.json
@@ -0,0 +1,44 @@
{
"files": [
{
"url": "file8863/pydocbug.patch",
"author": "benjhayden"
}
],
"Title": "help('modules') broken by several 3rd party libraries (svn patch attached)",
"msg59852 - (view)": "Author: Ka-Ping Yee (ping)",
"Priority": "normal",
"Superseder": "",
"Status": "resolved",
"Description": "Edit",
"Type": "behavior",
"Author": "Date: 2008-01-13 11:32",
"Date: 2007-12-03 16": "",
"Dependencies": "",
"User": "Action",
"Date": "User",
"Date: 2008-01-13 11": "",
"Stage": "",
"Assigned To": "ping",
"Uploaded": "Description",
"Nosy List": "benjhayden, ping\n (2)",
"Versions": "Python 3.0, Python 2.6, Python 2.5",
"Edit": "Remove",
"File name": "Uploaded",
"Components": "Demos and Tools, Library (Lib)",
"Action": "Args",
"msg58131 - (view)": "Author: Ben Hayden (benjhayden)",
"Resolution": "accepted",
"Topics": "bitesized",
"messages": [
{
"message": "<pre>Instead of listing installed modules, help('modules') prints a \"please\nwait\" message, then a traceback noting that a module raised an exception\nduring import, then nothing else.\nThis happens in 2.5 and 2.6a0, but not in 2.4, which apparently doesn't\n__import__() EVERY module.\nTested only on Gentoo Linux 2.6.19, but same behavior is probable on\nother platforms because pydoc and pkgutil are written in cross-platform\nPython.\n\nProminent 3rd party libraries that break help('modules') include Django,\nPyglet, wxPython, SymPy, and Pypy. Arguably, the bug is in those\nlibraries, but they have good reasons for their behavior. Also, the Unix\nphilosophy of forgiving input is a good one. Also, __import__()ing every\nmodule takes a significant run-time hit, especially if libraries compute\neg. configuration.\n\nThe patch utilizes a pre-existing hook in pkgutil to simply quietly add\nthe module to the output. (Long live lambda.)</pre>\n \n",
"author": "Ben Hayden (benjhayden)"
},
{
"message": "<pre>Committed the patch in <a href=\"http://svn.python.org/view?rev=59939&amp;view=rev\">revision 59939</a>.\n\nI'm not clear how it was determined that importing every module was\nnecessary in order to list the modules or scan their synopsis lines\n(this seems to have happened in <a href=\"http://svn.python.org/view?rev=45510&amp;view=rev\">revision 45510</a>). This can probably\nbe made more efficient in the future.</pre>\n \n",
"author": "Ka-Ping Yee (ping)"
}
],
"Keywords": "easy, patch"
}
14 changes: 14 additions & 0 deletions bugimporters/tests/test_roundup.py
@@ -1,5 +1,6 @@
import datetime
import os
import json

import bugimporters.roundup
from bugimporters.tests import ObjectFromDict
@@ -129,6 +130,19 @@ def test_new_mercurial_bug_import(self):
assert bug['looks_closed']
return bug

def test_raw_data_dump_with_mercurial(self):
self.setup_class()
# Check the number of Bugs present.
rbp = bugimporters.roundup.RoundupBugParser(
bug_url='http://mercurial.selenic.com/bts/issue1550',
extended_scrape=True)
# Parse HTML document as if we got it from the web
bug = self.im.handle_bug_html(open(os.path.join(
HERE, 'sample-data',
'closed-mercurial-bug.html')).read(), rbp )
self.assertEqual(bug['raw_data'],
json.load(open('bugimporters/tests/sample-data/closed-mercurial-bug-rawdata.json')))

def test_reimport_same_bug_works(self):
self.setup_class()
bug1 = self.test_new_mercurial_bug_import()
12 changes: 12 additions & 0 deletions examples/roundup.yaml
@@ -0,0 +1,12 @@
- as_appears_in_distribution: !!python/unicode ''
base_url: !!python/unicode 'http://openhatch.org/bugs/'
bitesized_field: !!python/unicode ''
bitesized_text: !!python/unicode ''
bugimporter: !!python/unicode 'roundup'
closed_status: !!python/unicode 'closed'
custom_parser: !!python/unicode ''
documentation_field: !!python/unicode 'Component'
documentation_text: !!python/unicode 'Documentation'
existing_bug_urls: ['http://openhatch.org/bugs/issue955']
queries: []
tracker_name: !!python/unicode 'Roundup'
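
A hedged usage sketch for this example file: the entry point and output name below are illustrative, mirroring main.py's argparse setup shown earlier; main() then hands everything off to scrapy runspider.

from bugimporters.main import main

main(['-i', 'examples/roundup.yaml',
      '-o', 'roundup-bugs.jsonlines',   # illustrative output path
      '--extended-scrape'])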