Add full data dump framework and implement for roundup #47

Merged 32 commits on Apr 16, 2014

Commits

9aa2af6
Added file loading for roundup
602p Apr 15, 2014
3530bc4
Add file scraping test
602p Apr 15, 2014
c60b826
Added message scraping functionality (to roundup)
602p Apr 15, 2014
c13f8bb
Add --extended-scrape option to scrape messages, keywords, etc..
602p Apr 15, 2014
085bb7b
Add raw_data dumping option to bug parser (for use with converse)
602p Apr 15, 2014
481ea3b
Add additional YAML example
602p Apr 15, 2014
bae7c2c
Fixed, added tests
602p Apr 15, 2014
e3fbb89
Remove a debug command
602p Apr 16, 2014
710fb0d
Added file loading for roundup
602p Apr 15, 2014
92432df
Add file scraping test
602p Apr 15, 2014
69fbc7b
Added message scraping functionality (to roundup)
602p Apr 15, 2014
d87877f
Add --extended-scrape option to scrape messages, keywords, etc..
602p Apr 15, 2014
59906d1
Add raw_data dumping option to bug parser (for use with converse)
602p Apr 15, 2014
0167fc1
Add additional YAML example
602p Apr 15, 2014
1b266c1
Fixed, added tests
602p Apr 15, 2014
ad8081f
Remove a debug command
602p Apr 16, 2014
41def37
Merge roundup.py
602p Apr 16, 2014
689ed2b
Move rawdata test JSON blob to its own file
602p Apr 16, 2014
ffa932e
Made JSON loading cleaner
602p Apr 16, 2014
bfb71be
Cleaned up JSON blob
602p Apr 16, 2014
cbee1f3
Added a note about raw_data testing
602p Apr 16, 2014
6f2c164
Im bad at spelling
602p Apr 16, 2014
be36d67
Added Asheesh's frackalackadingdong comma
602p Apr 16, 2014
53999bf
Removed an offending try-catch and added some whitespace
602p Apr 16, 2014
a62fb01
Cleaned up extended_scrape passing
602p Apr 16, 2014
03ae989
PEP-8 ifiying
602p Apr 16, 2014
0b228ae
PEP-8 ifiying further
602p Apr 16, 2014
fe81194
PEP-8 ifiying even further
602p Apr 16, 2014
2e06ceb
PEP-8 ifiying EVEN further
602p Apr 16, 2014
f1b2cba
PEP-8 ifiying comments
602p Apr 16, 2014
271e569
PEP-8 changes (AKA Guido is very opinionated)
602p Apr 16, 2014
bd2df90
PEP-8
602p Apr 16, 2014

Files changed

4 changes: 3 additions & 1 deletion bugimporters/base.py
@@ -91,11 +91,13 @@ def push_urls_onto_reactor(self, result=None):
# Importer functions that may require overloading #
###################################################
def __init__(self, tracker_model, reactor_manager=None, bug_parser=None,
data_transits=None):
data_transits=None, extended_scrape=False):
# Store the tracker model
self.tm = tracker_model
# Store the reactor manager
self.rm = reactor_manager
# Store whether or not to scrape messages, keywords, etc
self.extended_scrape = extended_scrape
# Create a dictionary that maps URLs to a callback/errback pair. This
# dictionary is used to store URLs that have been found and require
# downloading, along with the callback and errback that handle the
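
The hunk above only stores the new flag; below is a minimal sketch of how a caller might thread it through, assuming the class in base.py is bugimporters.base.BugImporter and that a bare stand-in object is acceptable as the tracker model:

from bugimporters.base import BugImporter

class FakeTrackerModel(object):
    # Stand-in tracker model for illustration only; real callers pass the
    # tracker object loaded from the input YAML.
    tracker_name = 'Roundup'

importer = BugImporter(FakeTrackerModel(),
                       reactor_manager=None,
                       bug_parser=None,
                       data_transits=None,
                       extended_scrape=True)
assert importer.extended_scrape  # stored verbatim; defaults to False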
1 change: 1 addition & 0 deletions bugimporters/items.py
@@ -24,3 +24,4 @@ class ParsedBug(scrapy.item.Item):
as_appears_in_distribution = scrapy.item.Field()
good_for_newcomers = scrapy.item.Field()
concerns_just_documentation = scrapy.item.Field()
raw_data = scrapy.item.Field()
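
For reference, a quick sketch of how the new field behaves: like any other declared scrapy Item field, it accepts dict-style assignment.

from bugimporters.items import ParsedBug

bug = ParsedBug()
bug['raw_data'] = {'Keywords': 'easy, patch'}  # arbitrary scraped metadata
print(bug['raw_data']['Keywords'])             # -> easy, patch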
10 changes: 8 additions & 2 deletions bugimporters/main.py
@@ -27,8 +27,10 @@ def main(raw_arguments):

parser.add_argument('-i', action="store", dest="input")
parser.add_argument('-o', action="store", dest="output")
parser.add_argument('--extended-scrape', action="store_const", dest="extended_scrape", const=1, default=0)
args = parser.parse_args(raw_arguments)


args_for_scrapy = ['scrapy',
'runspider',
'bugimporters/main.py',
@@ -44,6 +46,7 @@ def main(raw_arguments):
'-s', 'DEPTH_PRIORITY=1',
'-s', 'SCHEDULER_DISK_QUEUE=scrapy.squeue.PickleFifoDiskQueue',
'-s', 'SCHEDULER_MEMORY_QUEUE=scrapy.squeue.FifoMemoryQueue',
'-a', 'extended_scrape=%s' % (args.extended_scrape)
]
return scrapy.cmdline.execute(args_for_scrapy)

@@ -104,7 +107,8 @@ def get_bugimporters(self):
bug_importer = bug_import_class(
obj, reactor_manager=None,
bug_parser=bug_parser_class,
data_transits=None)
data_transits=None,
extended_scrape=self.extended_scrape)
yield (obj, bug_importer)

def start_requests(self):
@@ -130,7 +134,9 @@ def start_requests(self):
logging.error("FYI, this bug importer does not support "
"process_bugs(). Fix it.")

def __init__(self, input_filename=None):
def __init__(self, input_filename=None, extended_scrape=0):
self.extended_scrape = int(extended_scrape)

if input_filename is None:
return

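
A small sketch of what the new option produces, using the same argparse calls as above. Note that store_const yields the integers 0/1 rather than booleans, and the value reaches the spider as a string through scrapy's -a argument, which is why the spider's __init__ coerces it with int():

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', action="store", dest="input")
parser.add_argument('-o', action="store", dest="output")
parser.add_argument('--extended-scrape', action="store_const",
                    dest="extended_scrape", const=1, default=0)

args = parser.parse_args(['-i', 'in.yaml', '-o', 'out.json',
                          '--extended-scrape'])
assert args.extended_scrape == 1
# Passed on as '-a extended_scrape=1'; scrapy delivers it to the spider's
# __init__ as the string "1", hence int(extended_scrape) there.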
52 changes: 48 additions & 4 deletions bugimporters/roundup.py
@@ -19,6 +19,7 @@
import datetime
import logging
import lxml.html
import lxml.etree
import re
import urlparse
import scrapy.http
@@ -40,7 +41,7 @@ def __init__(self, *args, **kwargs):
# Call the parent __init__.

if self.bug_parser is None:
self.bug_parser = RoundupBugParser
self.bug_parser = RoundupBugParser

def process_queries(self, queries):
# Add all the queries to the waiting list
@@ -81,7 +82,7 @@ def process_bugs(self, bug_list):

def handle_bug_html_response(self, response):
# Create a RoundupBugParser instance to store the bug data
rbp = RoundupBugParser(response.request.url)
rbp = RoundupBugParser(response.request.url, self.extended_scrape)
return self.handle_bug_html(response.body, rbp)

def handle_bug_html(self, bug_html, rbp):
Expand All @@ -99,9 +100,11 @@ def handle_bug_html(self, bug_html, rbp):


class RoundupBugParser(object):
def __init__(self, bug_url):
def __init__(self, bug_url, extended_scrape=False):
self.bug_html = None
self.bug_url = bug_url
self.submitter_realname_map = {}
self.extended_scrape = extended_scrape

@cached_property
def bug_html_url(self):
@@ -132,6 +135,37 @@ def roundup_tree2metadata_dict(tree):
value = td.text_content().strip()
ret[key] = value

ret["files"] = []
files = tree.find_class("files") # Grab files table by classname
if files: # if I find an actual table (doesn't exist if no files)
files = files[0] # grab table, then tbody
files = files[2:] # Strip off the two header TRs
for file_entry in files:
ret["files"].append({
"url": file_entry[0][0].attrib['href'],
"author": file_entry[1][0].text
})

ret["messages"] = []
messages = tree.find_class("messages")[0]
if messages:
if "tbody" in lxml.html.tostring(messages):
messages = messages[0]
messages = messages[1:]
count = 0
author = ""
while count != len(messages):
if count % 2 == 0:
author = messages[count][1].text.replace("Author: ",'')
else:
content = lxml.etree.tostring(messages[count][0][0],
pretty_print=True)
ret["messages"].append({
"author": author,
"message": content
})
count += 1

return ret

def get_all_submitter_realname_pairs(self, tree):
@@ -148,7 +182,9 @@ def get_all_submitter_realname_pairs(self, tree):

def get_submitter_realname(self, tree, submitter_username):
try:
return self.get_all_submitter_realname_pairs(tree)[submitter_username]
if self.submitter_realname_map == {}:
self.submitter_realname_map = self.get_all_submitter_realname_pairs(tree)
return self.submitter_realname_map[submitter_username]
except KeyError:
return None

@@ -196,6 +232,10 @@ def get_parsed_data_dict(self, tm):
for status_name in tm.closed_status.split(','):
closed_status_set.add(status_name.strip().lower())

# NOTE: If you add more values to metadata_dict (or to raw_data in general) you need to rebuild
# tests/sample-data/closed-mercurial-bug-rawdata.json using json.dumps so the test will not fail
# because you have a different raw_data

ret = bugimporters.items.ParsedBug()
ret.update({'title': metadata_dict['Title'],
'description': description,
@@ -212,7 +252,11 @@
'canonical_bug_link': self.bug_url,
'last_polled': datetime.datetime.utcnow().isoformat(),
'_project_name': tm.tracker_name,
'raw_data':{},
})
if self.extended_scrape:
logging.info("Adding Extended Scrape Values")
ret['raw_data'] = metadata_dict

# Update status for trackers that set it differently
self.update_bug_status(ret, metadata_dict)
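
The files/messages scraping added to roundup_tree2metadata_dict above leans on lxml's positional indexing. A minimal sketch of that style on synthetic markup (not real Roundup HTML), showing how find_class() plus child indexing yields the url/author pairs:

import lxml.html

snippet = """
<table class="files">
  <tr><th colspan="2">Files</th></tr>
  <tr><th>File name</th><th>Uploaded</th></tr>
  <tr>
    <td><a href="file8863/pydocbug.patch">pydocbug.patch</a></td>
    <td><a href="user123">benjhayden</a></td>
  </tr>
</table>
"""
tree = lxml.html.document_fromstring(snippet)

files_table = tree.find_class("files")[0]   # <table class="files">
for row in files_table[2:]:                 # skip the two header rows
    print(row[0][0].attrib['href'])         # -> file8863/pydocbug.patch
    print(row[1][0].text)                   # -> benjhayden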
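
And a rough sketch of the rebuild workflow the NOTE in get_parsed_data_dict asks for, assuming roundup_tree2metadata_dict can be called with just a parsed tree (its signature takes only tree) and that raw_data is exactly that dict:

import json
import lxml.html

from bugimporters.roundup import RoundupBugParser

html = open('bugimporters/tests/sample-data/closed-mercurial-bug.html').read()
tree = lxml.html.document_fromstring(html)
metadata = RoundupBugParser.roundup_tree2metadata_dict(tree)

# Overwrite the fixture so test_raw_data_dump_with_mercurial compares
# against the current shape of the scraped metadata.
out_path = 'bugimporters/tests/sample-data/closed-mercurial-bug-rawdata.json'
with open(out_path, 'w') as out:
    out.write(json.dumps(metadata, indent=4))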
44 changes: 44 additions & 0 deletions bugimporters/tests/sample-data/closed-mercurial-bug-rawdata.json
@@ -0,0 +1,44 @@
{
"files": [
{
"url": "file8863/pydocbug.patch",
"author": "benjhayden"
}
],
"Title": "help('modules') broken by several 3rd party libraries (svn patch attached)",
"msg59852 - (view)": "Author: Ka-Ping Yee (ping)",
"Priority": "normal",
"Superseder": "",
"Status": "resolved",
"Description": "Edit",
"Type": "behavior",
"Author": "Date: 2008-01-13 11:32",
"Date: 2007-12-03 16": "",
"Dependencies": "",
"User": "Action",
"Date": "User",
"Date: 2008-01-13 11": "",
"Stage": "",
"Assigned To": "ping",
"Uploaded": "Description",
"Nosy List": "benjhayden, ping\n (2)",
"Versions": "Python 3.0, Python 2.6, Python 2.5",
"Edit": "Remove",
"File name": "Uploaded",
"Components": "Demos and Tools, Library (Lib)",
"Action": "Args",
"msg58131 - (view)": "Author: Ben Hayden (benjhayden)",
"Resolution": "accepted",
"Topics": "bitesized",
"messages": [
{
"message": "<pre>Instead of listing installed modules, help('modules') prints a \"please\nwait\" message, then a traceback noting that a module raised an exception\nduring import, then nothing else.\nThis happens in 2.5 and 2.6a0, but not in 2.4, which apparently doesn't\n__import__() EVERY module.\nTested only on Gentoo Linux 2.6.19, but same behavior is probable on\nother platforms because pydoc and pkgutil are written in cross-platform\nPython.\n\nProminent 3rd party libraries that break help('modules') include Django,\nPyglet, wxPython, SymPy, and Pypy. Arguably, the bug is in those\nlibraries, but they have good reasons for their behavior. Also, the Unix\nphilosophy of forgiving input is a good one. Also, __import__()ing every\nmodule takes a significant run-time hit, especially if libraries compute\neg. configuration.\n\nThe patch utilizes a pre-existing hook in pkgutil to simply quietly add\nthe module to the output. (Long live lambda.)</pre>\n \n",
"author": "Ben Hayden (benjhayden)"
},
{
"message": "<pre>Committed the patch in <a href=\"http://svn.python.org/view?rev=59939&amp;view=rev\">revision 59939</a>.\n\nI'm not clear how it was determined that importing every module was\nnecessary in order to list the modules or scan their synopsis lines\n(this seems to have happened in <a href=\"http://svn.python.org/view?rev=45510&amp;view=rev\">revision 45510</a>). This can probably\nbe made more efficient in the future.</pre>\n \n",
"author": "Ka-Ping Yee (ping)"
}
],
"Keywords": "easy, patch"
}
14 changes: 14 additions & 0 deletions bugimporters/tests/test_roundup.py
@@ -1,5 +1,6 @@
import datetime
import os
import json

import bugimporters.roundup
from bugimporters.tests import ObjectFromDict
@@ -129,6 +130,19 @@ def test_new_mercurial_bug_import(self):
assert bug['looks_closed']
return bug

def test_raw_data_dump_with_mercurial(self):
self.setup_class()
# Check the number of Bugs present.
rbp = bugimporters.roundup.RoundupBugParser(
bug_url='http://mercurial.selenic.com/bts/issue1550',
extended_scrape=True)
# Parse HTML document as if we got it from the web
bug = self.im.handle_bug_html(open(os.path.join(
HERE, 'sample-data',
'closed-mercurial-bug.html')).read(), rbp )
self.assertEqual(bug['raw_data'],
json.load(open('bugimporters/tests/sample-data/closed-mercurial-bug-rawdata.json')))

def test_reimport_same_bug_works(self):
self.setup_class()
bug1 = self.test_new_mercurial_bug_import()
12 changes: 12 additions & 0 deletions examples/roundup.yaml
@@ -0,0 +1,12 @@
- as_appears_in_distribution: !!python/unicode ''
base_url: !!python/unicode 'http://openhatch.org/bugs/'
bitesized_field: !!python/unicode ''
bitesized_text: !!python/unicode ''
bugimporter: !!python/unicode 'roundup'
closed_status: !!python/unicode 'closed'
custom_parser: !!python/unicode ''
documentation_field: !!python/unicode 'Component'
documentation_text: !!python/unicode 'Documentation'
existing_bug_urls: ['http://openhatch.org/bugs/issue955']
queries: []
tracker_name: !!python/unicode 'Roundup'
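
A hedged usage sketch for this example file: the entry point and output name below are illustrative, mirroring main.py's argparse setup shown earlier; main() then hands everything off to scrapy runspider.

from bugimporters.main import main

main(['-i', 'examples/roundup.yaml',
      '-o', 'roundup-bugs.jsonlines',   # illustrative output path
      '--extended-scrape'])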