"""Scrape recent changes from a MediaWiki site into the local database."""

# From [config.py](config.html).
import config
# From [models.py](models.html).
from models import User, Page, Change
# External imports.
from wikitools import wiki, api, APIError
from peewee import DoesNotExist
from isodate.isodatetime import parse_datetime as parse_iso_dt
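
# `config.WikiConfig` is assumed to expose the wiki's API endpoint and the
# login keyword arguments used by `request` below. A minimal sketch of what
# config.py might contain (hypothetical values):
#
#     class WikiConfig(object):
#         site = 'http://example.org/w/api.php'
#         login = {'username': 'ScraperBot', 'password': 'secret'}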


def request(params):
    """Requests data from the MediaWiki API and returns the API response."""
    site_url = config.WikiConfig.site
    site = wiki.Wiki(site_url)
    # `while True` makes this act like a do...while loop so the request is
    # retried after a successful login.
    while True:
        # Perform the request if possible.
        try:
            return api.APIRequest(site, params).query()
        # An `APIError` whose first argument is `readapidenied` indicates
        # the user needs to be logged in first. Attempt to log in once,
        # then redo the request. On failure, re-raise the exception.
        except APIError as e:
            # We can't handle non-`readapidenied` errors.
            if e.args[0] != 'readapidenied':
                raise
            username = config.WikiConfig.login['username']
            print 'Logging in as %s...' % username,
            if site.login(**config.WikiConfig.login):
                print 'done.'
            else:
                print 'failed.'
                raise
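
# A quick way to sanity-check `request` is a standard MediaWiki
# `meta=siteinfo` query (shown as a sketch rather than executed on import):
#
#     resp = request({'action': 'query', 'meta': 'siteinfo'})
#     print resp['query']['general']['sitename']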


def get_raw_page_by_id(page_id):
    """
    Request a page from the MediaWiki API using a page ID and
    return the raw API response.
    """
    action = {'action': 'query',
              'pageids': page_id}
    data = request(action)['query']['pages']
    if str(page_id) not in data:
        raise KeyError('Page with ID %s not found' % page_id)
    return data[str(page_id)]
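
# For reference, the raw page object returned above looks roughly like this
# (illustrative values):
#
#     {'pageid': 42, 'ns': 0, 'title': 'Main Page'}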


def get_raw_user_by_username(username):
    """
    Request a user from the MediaWiki API using a username and
    return the raw API response.
    """
    action = {'action': 'query',
              'list': 'users',
              'ususers': username}
    data = request(action)['query']['users']
    for result in data:
        if result['name'] == username:
            if 'missing' in result:
                raise KeyError('User with username %s not found' % username)
            return result
    raise KeyError('User with username %s not found' % username)
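
# The raw user object returned above looks roughly like this (illustrative
# values; `userid` is what `get_user_object` below relies on):
#
#     {'name': 'Alice', 'userid': 7}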


def get_page_object(page_id):
    """
    Retrieves a Page for a given page ID, getting page data from
    the API and saving the Page if it doesn't already exist in the DB.
    """
    try:
        return Page.get(Page.page_id == page_id)
    except DoesNotExist:
        data = get_raw_page_by_id(page_id)
        return Page.create(page_id=data['pageid'],
                           page_title=data['title'])


def get_user_object(username):
    """
    Retrieves a User for a given username, getting user data from
    the API and saving the User if it doesn't already exist in the DB.
    """
    try:
        return User.get(User.username == username)
    except DoesNotExist:
        data = get_raw_user_by_username(username)
        return User.create(username=data['name'],
                           user_id=data['userid'])


def get_change_object(change):
    """
    Retrieves a Change for a given recent-change API object, creating and
    saving the Change (along with its User and Page) if it doesn't already
    exist in the DB.
    """
    try:
        return Change.get(Change.change_id == change['rcid'])
    except DoesNotExist:
        size_diff = change['newlen'] - change['oldlen']
        timestamp_dt = parse_iso_dt(change['timestamp'])
        return Change.create(change_id=change['rcid'],
                             change_type=change['type'],
                             user=get_user_object(change['user']),
                             timestamp=timestamp_dt,
                             page=get_page_object(change['pageid']),
                             comment=change['comment'],
                             size_diff=size_diff)
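
# `change` here is a single item from a `list=recentchanges` response; with
# the `rcprop` values requested in `scrape_mediawiki` below, it looks
# roughly like this (illustrative values):
#
#     {'rcid': 1001, 'type': 'edit', 'pageid': 42, 'title': 'Main Page',
#      'user': 'Alice', 'comment': 'fix typo', 'oldlen': 100,
#      'newlen': 120, 'timestamp': '2013-01-01T12:00:00Z'}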


def scrape_mediawiki():
    """Scrape MediaWiki for new changes and save them to the database."""
    # Create DB tables if they don't already exist.
    User.create_table(fail_silently=True)
    Page.create_table(fail_silently=True)
    Change.create_table(fail_silently=True)
    # Parse changes from the MediaWiki API.
    print 'Parsing recent changes from MediaWiki API.'
    # Properties to request from the MediaWiki API.
    recent_changes_props = ['user', 'ids', 'title', 'comment', 'sizes',
                            'timestamp']
    # `rctype` specifies that we only want new-page and edit events.
    recent_changes_action = {'action': 'query',
                             'list': 'recentchanges',
                             'rctype': 'new|edit',
                             'rcprop': '|'.join(recent_changes_props)}
    new_changes = []
    req_num = 1
    while True:
        print 'Request %s' % req_num
        # Make the API request.
        resp = request(recent_changes_action)
        # Parse the response data.
        resp_data = resp['query']['recentchanges']
        # Append the response data to the existing data.
        new_changes += resp_data
        req_num += 1
        # `changes_exist` is a flag that lets us break out of the double
        # loop.
        changes_exist = False
        for resp_item in resp_data:
            change_id = resp_item['rcid']
            try:
                # `Change.get` either returns a `Change` object if one
                # exists for `change_id` or raises a `DoesNotExist`
                # exception if it does not.
                Change.get(Change.change_id == change_id)
                # If we've gotten this far, no `DoesNotExist` exception was
                # raised, so a `Change` object exists for `change_id`. Set
                # the breakout flag and break out of the inner loop.
                changes_exist = True
                break
            # We're expecting the exception if a `Change` object does not
            # already exist in the DB for the given change. Ignore it.
            except DoesNotExist:
                pass
        # Two conditions break the continuing `recentchanges` request loop:
        #
        # * Reaching changes that already exist in the DB
        # * Reaching the end of the data, indicated by a response that
        #   doesn't contain the `query-continue` key
        if changes_exist:
            print 'Reached changes that already exist in DB.'
            break
        if 'query-continue' not in resp:
            print 'No more query-continue; reached end of data.'
            break
        # The request loop hasn't been broken, and the data contains a
        # `query-continue` key. Use it to request the next page of data.
        next_start_point = resp['query-continue']['recentchanges']['rcstart']
        recent_changes_action['rcstart'] = next_start_point
        print ' query-continue: %s' % recent_changes_action['rcstart']
    # Done!
    print 'Done scraping.'
    # Add all User objects to the DB if they don't already exist.
    usernames = {change['user'] for change in new_changes}
    print 'Verifying %s users...' % len(usernames),
    for username in usernames:
        get_user_object(username)
    print 'done.'
    # Add all Page objects to the DB if they don't already exist.
    page_ids = {change['pageid'] for change in new_changes}
    print 'Verifying %s pages...' % len(page_ids),
    for page_id in page_ids:
        get_page_object(page_id)
    print 'done.'
    # Add all new Change objects to the DB.
    print 'Verifying %s changes...' % len(new_changes),
    for change in new_changes:
        get_change_object(change)
    print 'done.'
    print 'Finished!'


if __name__ == '__main__':
    scrape_mediawiki()