This repository has been archived by the owner on Aug 26, 2022. It is now read-only.
/
base.py
247 lines (212 loc) · 8.65 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
"""Shared interface for data sources."""
from __future__ import absolute_import, unicode_literals
import re
from django.utils.six import (binary_type,
python_2_unicode_compatible,
text_type)
from django.utils.six.moves.urllib.parse import unquote
class Source(object):
"""
An MDN data source.
This represents data that requires a request to a running MDN server.
Derived classes should set some class variables to customize behaviour:
PARAM_NAME - The attribute name of the "key" parameter for the data source
OPTIONS - A dictionary of option names to (type, default value) pairs.
"""
# Scraping states
STATES = (
'Initializing',
'Gathering Requirements',
'Done',
'Error',
)
STATE_INIT, STATE_PREREQ, STATE_DONE, STATE_ERROR = STATES
# Freshness of scraped data
FRESHNESS = (
'Unknown',
'Fresh',
'Existing',
)
FRESH_UNKNOWN, FRESH_YES, FRESH_NO = FRESHNESS
# Friendly name of the source's key parameter
PARAM_NAME = 'param'
# Types of source options. Used when merging options, to determine if
# the new or existing option value should win, possible resetting scraping.
OPTION_TYPES = set((
'bool', # True > False
'int', # 2 > 1 > 0
'int_all', # 'all' > 2 > 0
'text', # any new value > old value > ''
))
# The scrape options for this source, defaulting to no valid settings
# Format is name -> (option_type, default value)
OPTIONS = {}
@python_2_unicode_compatible
class SourceError(Exception):
"""An error raised during gathering."""
def __init__(self, *args, **kwargs):
self.format = args[0]
self.format_args = args[1:]
super(Source.SourceError, self).__init__(*args, **kwargs)
def __str__(self):
return self.format % self.format_args
def __init__(self, param, **options):
"""Initialize an MDN data source."""
setattr(self, self.PARAM_NAME, param)
self.state = self.STATE_INIT
self.freshness = self.FRESH_UNKNOWN
for name, params in self.OPTIONS.items():
option_type, default = params
assert option_type in self.OPTION_TYPES
self.assert_option_value_allowed(option_type, default)
setattr(self, name, default)
self.merge_options(**options)
def assert_option_value_allowed(self, option_type, value):
"""Assert that an option value is valid."""
if option_type == 'bool':
valid = (value is True or value is False)
elif option_type == 'int':
valid = (value == int(value))
elif option_type == 'int_all':
valid = (value == 'all' or value == int(value))
else:
assert option_type == 'text'
valid = isinstance(value, text_type)
if not valid:
raise ValueError('invalid value "%s" for type "%s"' %
(repr(value), option_type))
def merge_options(self, **options):
"""
Merge new options with current options, returning changed options.
The new option wins if it is "higher" than the existing option.
"""
changed = {}
for name, value in options.items():
option_type = self.OPTIONS[name][0]
self.assert_option_value_allowed(option_type, value)
current = getattr(self, name)
if option_type == 'bool':
if value and not current:
changed[name] = value
setattr(self, name, True)
elif option_type == 'int':
value = int(value)
if value > current:
changed[name] = value
setattr(self, name, value)
elif option_type == 'int_all':
if text_type(value).lower() == 'all':
if current != 'all':
changed[name] = 'all'
setattr(self, name, 'all')
else:
value = int(value)
if value > current:
changed[name] = value
setattr(self, name, value)
else:
assert option_type == 'text'
if value and value != current:
changed[name] = value
setattr(self, name, value)
if changed:
self.state = self.STATE_INIT
return changed
def current_options(self):
"""Return the current non-default options."""
current = {}
for name, spec in self.OPTIONS.items():
opt_type, default = spec
value = getattr(self, name, default)
if value != default:
current[name] = value
return current
def decode_href(self, href):
"""Convert URL-escaped href attributes to unicode."""
decoded = unquote(binary_type(href))
assert isinstance(decoded, binary_type)
decoded = decoded.decode('utf8')
return decoded
def gather(self, requester, storage):
"""
Gather prerequisites and store data for a source.
Return is a list of source specifications, such as prerequisites
needed to load the source, or follow-on sources identified by the
source.
"""
if self.state == self.STATE_INIT:
# If possible, load and validate existing data for the source
try:
load_return = self.load_and_validate_existing(storage)
except self.SourceError as error:
self.error = error
self.state = self.STATE_ERROR
else:
has_existing, next_sources = load_return
if has_existing:
self.state = self.STATE_DONE
self.freshness = self.FRESH_NO
return next_sources
else:
self.state = self.STATE_PREREQ
if self.state == self.STATE_PREREQ:
try:
has_prereqs, data = self.load_prereqs(requester, storage)
except self.SourceError as error:
self.error = error
self.state = self.STATE_ERROR
else:
if has_prereqs:
self.freshness = self.FRESH_YES
# Save the data and load follow-on sources
try:
next_sources = self.save_data(storage, data)
except self.SourceError as error:
self.error = error
self.state = self.STATE_ERROR
else:
self.state = self.STATE_DONE
return next_sources
else:
# Return the additional prerequisite sources
return data['needs']
# Load no more sources in a "done" state
assert self.state in [self.STATE_ERROR, self.STATE_DONE]
return []
def load_and_validate_existing(self, storage):
"""Default: Can not load existing data from storage."""
return False, []
class DocumentBaseSource(Source):
"""Shared functionality for MDN Document sources."""
PARAM_NAME = 'path'
re_path = re.compile(r"/(?P<locale>[^/]+)/docs/(?P<slug>.*)")
# Standard options for Document-based sources
STANDARD_DOC_OPTIONS = {
'force': ('bool', False), # Update existing Document records
'depth': ('int_all', 0), # Scrape the topic tree to this depth
'revisions': ('int', 1), # Scrape this many past revisions
'translations': ('bool', False), # Scrape the alternate translations
}
def __init__(self, path, **options):
super(DocumentBaseSource, self).__init__(path, **options)
if path != unquote(path):
raise ValueError('URL-encoded path "%s"' % path)
try:
self.locale, self.slug = self.locale_and_slug(path)
except ValueError:
self.locale, self.slug = None, None
def locale_and_slug(self, path):
"""Extract a document locale and slug from a path."""
match = self.re_path.match(path)
if match:
return match.groups()
else:
raise ValueError('Not a valid document path "%s"' % path)
@property
def parent_slug(self):
if self.slug and '/' in self.slug:
return '/'.join(self.slug.split('/')[:-1])
@property
def parent_path(self):
if self.parent_slug:
return '/%s/docs/%s' % (self.locale, self.parent_slug)