Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 224 lines (183 sloc) 7.46 KB
#!/usr/bin/env python
##########################################################################
# #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
##########################################################################
from datetime import date
## PROJECT LIBS
from sonet.mediawiki import HistoryPageProcessor, explode_dump_filename, \
get_translations, get_tags, getUsersGroup
from sonet import lib
from sonet.timr import Timr
from sonet.models import get_events_table
from base64 import b64encode
from zlib import compress
from wbin import serialize
from datetime import datetime as dt
class HistoryEventsPageProcessor(HistoryPageProcessor):
    """Count the revisions of each page per day and store them as events.

    For every page the processor builds a counter mapping a day offset
    (days elapsed since ``self.s_date``) to a three-element list::

        [total revisions, revisions by bots, revisions by anonymous users]

    Finished pages are queued by save() and written to the events table
    in bulk by flush().

    NOTE(review): ``lang``, ``s_date``, ``count``, ``counter_pages``,
    ``_title``, ``_type``, ``_desired``, ``_counter``, ``_editors``,
    ``_skip``, ``_date`` and ``get_number_of_editors()`` are not defined
    here; presumably they come from HistoryPageProcessor -- confirm there.
    """
    # Pages saved since the last flush(); each item is the dict built by save()
    queue = None
    # Connection returned by get_events_table(), used to run the insert
    connection = None
    # Insert construct of the events table (``events.insert()``)
    insert = None
    # Names of the users considered bots; replaced by a frozenset in set_bots()
    bots = []
    # Optional bounds: revisions dated outside them are not counted
    start_date = None
    end_date = None
    # True while the revision being parsed falls outside the date bounds
    _skip_revision = None

    def __init__(self, **kwargs):
        """Initialise the parent processor and open the events table.

        All keyword arguments are forwarded unchanged to the parent class.
        """
        super(HistoryEventsPageProcessor, self).__init__(**kwargs)
        self.queue = []
        events, self.connection = get_events_table()
        self.insert = events.insert()

    def flush(self):
        """Write every queued page to the events table and empty the queue.

        The per-day counter is serialized, compressed and base64-encoded
        before being stored in the ``data`` column.
        """
        if not self.queue:
            ## Nothing to store: avoid executing the insert statement
            ## with an empty parameter list
            return
        data = [{'title': page['title'],
                 'lang': self.lang,
                 'talk': (page['type'] == 'talk'),
                 'data': b64encode(compress(serialize(page['counter']))),
                 'desired': page['desired'],
                 'total_editors': page['total_editors'],
                 'bot_editors': page['bot_editors'],
                 'anonymous_editors': page['anon_editors']
                 } for page in self.queue]
        self.connection.execute(self.insert, data)
        self.queue = []

    def save(self):
        """Queue the statistics of the current page for the next flush().

        Pages without a title are ignored.
        """
        if not self._title:
            return
        data = {
            'title': self._title,
            'type': self._type,
            'desired': self._desired,
            'counter': self._counter,
            'total_editors': self.get_number_of_editors(),
            'bot_editors': self.get_number_of_editors('bot'),
            'anon_editors': self.get_number_of_editors('anonymous')
        }
        self.queue.append(data)
        self.counter_pages += 1

    def set_bots(self):
        """Load the bot user names of ``self.lang`` into a frozenset."""
        self.bots = frozenset(getUsersGroup(lang=self.lang, edits_only=True))

    def process_timestamp(self, elem):
        """Count the revision whose <timestamp> element is ``elem``.

        Revisions dated outside [start_date, end_date] only raise the
        ``_skip_revision`` flag, so that their contributor is ignored too.
        Every 500000 counted revisions the queue is flushed and a progress
        line is printed.
        """
        if self._skip:
            return

        ## the slices below expect a text starting with YYYY-MM-DD
        timestamp = elem.text
        year = int(timestamp[:4])
        month = int(timestamp[5:7])
        day = int(timestamp[8:10])
        revision_time = date(year, month, day)

        ## dt.date(x) is the unbound datetime.date() method, hence the
        ## bounds are presumably datetime objects -- TODO confirm
        if (self.start_date and revision_time < dt.date(self.start_date)):
            self._skip_revision = True
            return
        if (self.end_date and revision_time > dt.date(self.end_date)):
            self._skip_revision = True
            return

        self._date = (revision_time - self.s_date).days
        ## default value for self._date is a list where
        ## first element is for total revisions, the second
        ## for revisions made by bot and the last one for
        ## anonymous' revisions
        t = self._counter.get(self._date, [0, 0, 0])
        t[0] += 1  # increment total revisions
        self._counter[self._date] = t

        self.count += 1
        if not self.count % 500000:
            self.flush()
            ## single formatted string: same output as the former print
            ## statement, and it also parses on Python 3
            print('PAGES: %s REVS: %s' % (self.counter_pages, self.count))

    def process_username(self, elem):
        """Register a named contributor and count the edit if it is a bot.

        Elements without text (``elem.text`` is None) are ignored.
        """
        if self._skip_revision:
            return
        try:
            u = elem.text.encode('utf-8')
        except AttributeError:
            ## elem.text is None: no username to account for
            return
        ## whether user is a bot or not
        role = 'bot' if u in self.bots else None
        if u not in self._editors:
            self._editors[u] = role
        if role:  # in case of a bot's contribution increment bot's edits
            self._counter[self._date][1] += 1

    def process_ip(self, elem):
        """Register an anonymous contributor (identified by IP) and count it."""
        if self._skip_revision:
            return
        if elem.text not in self._editors:
            self._editors[elem.text] = 'anonymous'
        ## Contributor is anonymous, thus increments anonymous' contribution
        self._counter[self._date][2] += 1

    def process_redirect(self, elem):
        """Mark the current page as a redirect, which is never saved.

        Raises:
            ValueError: if the redirect is one of the desired pages.
        """
        self._skip = True
        if self._desired is True:
            raise ValueError(
                "The page %s is a redirect. " % self._title +
                "Pages in the desired list must not be redirects."
            )

    def process_revision(self, _):
        """Reset the per-revision skip flag once a revision is handled.

        Without this reset a single out-of-range revision would hide the
        contributors of all the following revisions of the same page.
        """
        self._skip_revision = False

    def process_page(self, _):
        """Save the page just parsed (unless skipped) and reset the flags."""
        if not self._skip:
            self.save()
        self._skip = False
        self._skip_revision = False
def main():
    """Command line entry point: parse a history dump into the events table.

    Expects three positional arguments: the dump file, the CSV file with
    the list of desired pages and the acceptance ratio (a number).
    Exits through the option parser's error() on wrong arguments.
    """
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio",
        option_class=SonetOption
    )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-E', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-d', '--delimiter', action="store", dest="delimiter",
                 default=",", help="CSV delimiter")
    p.add_option('-s', '--start', action="store", dest='start',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store", dest='end', type="yyyymmdd",
                 metavar="YYYYMMDD", default=None,
                 help="Look for revisions until this date")
    opts, files = p.parse_args()

    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml, desired_pages_fn, ratio = files
    try:
        threshold = float(ratio)
    except ValueError:
        ## report a usage error instead of dying with a traceback
        p.error("acceptance_ratio must be a number, got %r" % ratio)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)
    try:
        translation = get_translations(src)
        tag = get_tags(
            src,
            tags='page,title,revision,minor,timestamp,redirect,ip,username'
        )
    finally:
        ## the header source is released even if its parsing fails
        src.close()

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.start_date = opts.start
    processor.end_date = opts.end
    processor.set_desired_from_csv(desired_pages_fn,
                                   encoding=opts.encoding,
                                   delimiter=opts.delimiter)
    with Timr('Retrieving bots'):
        processor.set_bots()

    ## the dump is opened again because the first source was consumed (and
    ## possibly limited to the first lines) while reading the header
    src = deflate(xml)
    try:
        print("BEGIN PARSING")
        with Timr('Parsing'):
            processor.start(src)
            ## store the pages queued since the last periodic flush
            processor.flush()
    finally:
        src.close()
## Run the command line interface only when this file is executed as a
## script; importing it as a module has no side effects.
if __name__ == "__main__":
    main()