ftpscraper.py
import datetime
import sys
import re
import os
import json
import urlparse
import fnmatch
import mock
import lxml.html
import requests
from requests.adapters import HTTPAdapter
from configman import Namespace
from configman.converters import class_converter, str_to_list
from crontabber.base import BaseCronApp
from crontabber.mixins import (
    as_backfill_cron_app,
    with_postgres_transactions
)
from socorro.cron import buildutil
from socorrolib.app.socorro_app import App, main
from socorrolib.lib.datetimeutil import string_to_datetime


class ScrapersMixin(object):
    """
    Mixin that requires the host class to provide `self.download(some_url)`
    and `self.skip_json_file(json_url)`.
    """

    def get_links(self, url, starts_with=None, ends_with=None):
        results = []
        content = self.download(url)
        if not content:
            return []
        if not (starts_with or ends_with):
            raise NotImplementedError(
                'get_links requires either `starts_with` or `ends_with`'
            )
        html = lxml.html.document_fromstring(content)
        path = urlparse.urlparse(url).path

        def url_match(link):
            # The link might be something like "/pub/mobile/nightly/"
            # but we're looking for a path that starts with "nightly".
            # So first we need to remove what's part of the base URL
            # to make a fair comparison.
            if starts_with is not None:
                # If the current URL is http://example.com/some/dir/
                # and the link is /some/dir/mypage/ and the thing
                # we're looking for is "myp" then this should be true
                if link.startswith(path):
                    link = link.replace(path, '')
                return link.startswith(starts_with)
            elif ends_with:
                return link.endswith(ends_with)
            return False

        for _, _, link, _ in html.iterlinks():
            if url_match(link):
                results.append(urlparse.urljoin(url, link))
        return results
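
    # Illustrative sketch (hypothetical URLs, not taken from the original
    # source): with the default base_url, a call such as
    #
    #   self.get_links('https://archive.mozilla.org/pub/firefox/',
    #                  starts_with='candidates')
    #
    # would be expected to yield absolute links like
    # 'https://archive.mozilla.org/pub/firefox/candidates/'.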

    def parse_build_json_file(self, url, nightly=False):
        content = self.download(url)
        if content:
            try:
                kvpairs = json.loads(content)
                kvpairs['repository'] = kvpairs.get('moz_source_repo')
                if kvpairs['repository']:
                    kvpairs['repository'] = kvpairs['repository'].split(
                        '/', -1
                    )[-1]
                kvpairs['build_type'] = kvpairs.get('moz_update_channel')
                kvpairs['buildID'] = kvpairs.get('buildid')

                # bug 1065071 - ignore JSON files that have keys with
                # missing values.
                if None in kvpairs.values():
                    self.config.logger.warning(
                        'warning, unsupported JSON file: %s', url
                    )

                return kvpairs

            # bug 963431 - it is valid to have an empty file
            # due to a quirk in our build system
            except ValueError:
                self.config.logger.warning(
                    'Unable to JSON parse content %r',
                    content,
                    exc_info=True
                )

    def parse_info_file(self, url, nightly=False):
        self.config.logger.debug('Opening %s', url)
        content = self.download(url)
        results = {}
        bad_lines = []
        if not content:
            return results, bad_lines
        contents = content.splitlines()
        if nightly:
            results = {'buildID': contents[0], 'rev': contents[1]}
            if len(contents) > 2:
                results['altrev'] = contents[2]
        elif contents:
            results = {}
            for line in contents:
                if line == '':
                    continue
                try:
                    key, value = line.split('=')
                    results[key] = value
                except ValueError:
                    bad_lines.append(line)
        return results, bad_lines
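
    # Illustrative sketch (hypothetical content): for a non-nightly
    # *_info.txt containing the single line 'buildID=20151029151421',
    # parse_info_file() would return ({'buildID': '20151029151421'}, []);
    # a line without an '=' would land in the bad_lines list instead.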

    def parse_b2g_file(self, url):
        """
        Parse the B2G manifest JSON file (via parse_b2g_file).
        Example: {"buildid": "20130125070201", "update_channel":
                  "nightly", "version": "18.0"}
        TODO handle exception if file does not exist
        """
        content = self.download(url)
        if not content:
            return
        results = json.loads(content)

        # bug 869564: Return None if update_channel is 'default'
        if results['update_channel'] == 'default':
            self.config.logger.warning(
                "Found default update_channel for buildid: %s. Skipping.",
                results['buildid']
            )
            return

        # Default 'null' channels to nightly
        results['build_type'] = results['update_channel'] or 'nightly'

        # Default beta_number to 1 for beta releases
        if results['update_channel'] == 'beta':
            results['beta_number'] = results.get('beta_number', 1)
        return results

    def get_json_release(self, candidate_url, dirname):
        version = dirname.split('-candidates')[0]

        builds = self.get_links(candidate_url, starts_with='build')
        if not builds:
            return

        latest_build = builds.pop()
        version_build = os.path.basename(os.path.normpath(latest_build))

        possible_platforms = (
            'linux', 'mac', 'win', 'debug',  # for Firefox
            'android-api-15', 'android-x86',  # for mobile
        )

        for platform in possible_platforms:
            platform_urls = self.get_links(latest_build, starts_with=platform)
            for platform_url in platform_urls:
                platform_local_url = urlparse.urljoin(platform_url, 'en-US/')
                json_files = self.get_links(
                    platform_local_url,
                    ends_with='.json'
                )
                for json_url in json_files:
                    if self.skip_json_file(json_url):
                        continue
                    kvpairs = self.parse_build_json_file(json_url)
                    if not kvpairs:
                        continue
                    kvpairs['version_build'] = version_build
                    yield (platform, version, kvpairs)

    def get_json_nightly(self, nightly_url, dirname):
        json_files = self.get_links(nightly_url, ends_with='.json')
        for url in json_files:
            if self.skip_json_file(url):
                continue
            basename = os.path.basename(url)
            if '.en-US.' in url:
                pv, platform = re.sub(r'\.json$', '', basename).split('.en-US.')
            elif '.multi.' in url:
                pv, platform = re.sub(r'\.json$', '', basename).split('.multi.')
            else:
                continue
            version = pv.split('-')[-1]
            repository = []

            for field in dirname.split('-'):
                # Skip until something is not a digit and once we've
                # appended at least one, keep adding.
                if not field.isdigit() or repository:
                    repository.append(field)
            repository = '-'.join(repository).strip('/')

            kvpairs = self.parse_build_json_file(url, nightly=True)
            yield (platform, repository, version, kvpairs)
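
    # Illustrative sketch (hypothetical file): a nightly JSON named
    # 'firefox-45.0a1.en-US.linux-x86_64.json' would be split into
    # pv='firefox-45.0a1', platform='linux-x86_64' and version='45.0a1',
    # and a dirname such as '2015-10-01-03-02-04-mozilla-central' would
    # reduce to repository='mozilla-central'.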

    def get_release(self, candidate_url):
        builds = self.get_links(candidate_url, starts_with='build')
        if not builds:
            self.config.logger.info('No build dirs in %s', candidate_url)
            return

        latest_build = builds.pop()
        version_build = os.path.basename(os.path.normpath(latest_build))

        info_files = self.get_links(latest_build, ends_with='_info.txt')
        for info_url in info_files:
            kvpairs, bad_lines = self.parse_info_file(info_url)
            # os.path.basename works on URL-looking things too,
            # not just file paths
            platform = os.path.basename(info_url).split('_info.txt')[0]

            # suppose the `info_url` is something like
            # "https://archive.moz.../40.0.3-candidates/..11_info.txt"
            # then look for the "40.0.3-candidates" part and remove
            # the "-candidates" part.
            version, = [
                x.split('-candidates')[0]
                for x in urlparse.urlparse(info_url).path.split('/')
                if x.endswith('-candidates')
            ]

            kvpairs['version_build'] = version_build

            yield (platform, version, kvpairs, bad_lines)
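
    # Illustrative sketch (hypothetical URL): for an info_url like
    # .../firefox/candidates/40.0.3-candidates/build1/win32_info.txt,
    # get_release() would derive platform='win32' and version='40.0.3'.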

    def get_b2g(self, url, backfill_date=None):
        """
        Last mile of B2G scraping, calls parse_b2g_file on each .json file.
        Files look like: socorro_unagi-stable_2013-01-25-07.json
        """
        info_files = self.get_links(url, ends_with='.json')
        platform = None
        version = None
        repository = 'b2g-release'
        for url in info_files:
            # Pull platform out of the filename
            jsonfilename = os.path.basename(url).split('_')

            # We only want to consider .json files that look like this:
            #   socorro_something_YYYY-MM-DD.json
            # So, basically it needs to be at least 3 parts split by _
            # and the first part must be 'socorro'.
            # Skip if this file isn't for socorro!
            if jsonfilename[0] != 'socorro' or len(jsonfilename) < 3:
                continue
            platform = jsonfilename[1]
            kvpairs = self.parse_b2g_file(url)

            # parse_b2g_file() returns None when a file can't be
            # parsed or when we ignore the file
            if kvpairs is None:
                continue
            version = kvpairs['version']
            yield (platform, repository, version, kvpairs)


#==============================================================================
@with_postgres_transactions()
@as_backfill_cron_app
class FTPScraperCronApp(BaseCronApp, ScrapersMixin):
    app_name = 'ftpscraper'
    app_description = 'FTP Scraper'
    app_version = '0.1'

    required_config = Namespace()
    required_config.add_option(
        'products',
        default='firefox,mobile,thunderbird,seamonkey',
        from_string_converter=lambda line: tuple(
            [x.strip() for x in line.split(',') if x.strip()]
        ),
        doc='a comma-delimited list of products to scrape')
    required_config.add_option(
        'base_url',
        default='https://archive.mozilla.org/pub/',
        doc='The base url to use for fetching builds')
    required_config.add_option(
        'dry_run',
        default=False,
        doc='Print instead of storing builds')
    required_config.add_option(
        'retries',
        default=5,
        doc='Number of times the requests session should retry')
    required_config.add_option(
        'read_timeout',
        default=10,  # seconds
        doc='Number of seconds to wait for a full read')
    required_config.add_option(
        'connect_timeout',
        default=3.5,  # seconds, ideally something slightly larger than 3
        doc='Number of seconds to wait for a connection')
    required_config.add_option(
        'json_files_to_ignore',
        default='*.mozinfo.json, *test_packages.json',
        doc='Comma-separated glob patterns for JSON files to skip',
        from_string_converter=str_to_list
    )

    def __init__(self, *args, **kwargs):
        super(FTPScraperCronApp, self).__init__(*args, **kwargs)
        self.session = requests.Session()
        if urlparse.urlparse(self.config.base_url).scheme == 'https':
            mount = 'https://'
        else:
            mount = 'http://'
        self.session.mount(
            mount,
            HTTPAdapter(max_retries=self.config.retries)
        )

    def download(self, url):
        response = self.session.get(
            url,
            timeout=(self.config.connect_timeout, self.config.read_timeout)
        )
        if response.status_code == 404:
            # Legacy. Return None on any 404 error.
            return
        assert response.status_code == 200, response.status_code
        return response.content

    def skip_json_file(self, json_url):
        basename = os.path.basename(json_url)
        for file_pattern in self.config.json_files_to_ignore:
            if fnmatch.fnmatch(basename, file_pattern):
                return True
        return False
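
    # Illustrative sketch (hypothetical filenames): with the default
    # json_files_to_ignore patterns, a URL ending in
    # 'firefox-45.0.en-US.mac.mozinfo.json' would be skipped, while
    # 'firefox-45.0.en-US.mac.json' would not.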

    def run(self, date):
        # record_associations
        for product_name in self.config.products:
            self.config.logger.debug(
                'scraping %s releases for date %s',
                product_name,
                date
            )
            if product_name == 'b2g':
                self.database_transaction_executor(
                    self.scrape_b2g,
                    product_name,
                    date
                )
            else:
                self.database_transaction_executor(
                    self._scrape_json_releases_and_nightlies,
                    product_name,
                    date
                )

    def _scrape_json_releases_and_nightlies(
        self,
        connection,
        product_name,
        date
    ):
        self.scrape_json_releases(connection, product_name)
        self.scrape_json_nightlies(connection, product_name, date)

    def _insert_build(self, cursor, *args, **kwargs):
        if self.config.dry_run:
            print "INSERT BUILD"
            print args
            print kwargs
        else:
            buildutil.insert_build(cursor, *args, **kwargs)

    def _is_final_beta(self, version):
        # If this is a XX.0 version in the release channel,
        # return True, otherwise False.
        # Make a special exception for the out-of-cycle 38.0.5
        return version.endswith('.0') or version == '38.0.5'
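
    # Illustrative examples (derived from the check above):
    # '45.0' -> True, '38.0.5' -> True (special case),
    # '45.0.1' -> False, '45.0b3' -> False.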

    def scrape_json_releases(self, connection, product_name):
        prod_url = urlparse.urljoin(self.config.base_url, product_name + '/')
        logger = self.config.logger
        cursor = connection.cursor()

        for directory in ('nightly', 'candidates'):
            try:
                url, = self.get_links(prod_url, starts_with=directory)
            except (IndexError, ValueError):
                # an empty result from get_links() raises ValueError
                # when unpacked
                logger.debug('Dir %s not found for %s',
                             directory, product_name)
                continue

            releases = self.get_links(url, ends_with='-candidates/')
            for release in releases:
                dirname = release.replace(url, '')
                if dirname.endswith('/'):
                    dirname = dirname[:-1]
                for info in self.get_json_release(release, dirname):
                    platform, version, kvpairs = info
                    build_type = 'release'
                    beta_number = None
                    repository = kvpairs['repository']
                    if 'b' in version:
                        build_type = 'beta'
                        version, beta_number = version.split('b')
                    if kvpairs.get('buildID'):
                        build_id = kvpairs['buildID']
                        version_build = kvpairs['version_build']
                        self._insert_build(
                            cursor,
                            product_name,
                            version,
                            platform,
                            build_id,
                            build_type,
                            beta_number,
                            repository,
                            version_build,
                            ignore_duplicates=True
                        )
                    if (
                        self._is_final_beta(version) and
                        build_type == 'release' and
                        version > '26.0' and
                        kvpairs.get('buildID')
                    ):
                        logger.debug('is final beta version %s', version)
                        repository = 'mozilla-beta'
                        build_id = kvpairs['buildID']
                        build_type = 'beta'
                        version_build = kvpairs['version_build']
                        # just force this to 99 until
                        # we deal with version_build properly
                        beta_number = 99
                        self._insert_build(
                            cursor,
                            product_name,
                            version,
                            platform,
                            build_id,
                            build_type,
                            beta_number,
                            repository,
                            version_build,
                            ignore_duplicates=True
                        )

    def scrape_json_nightlies(self, connection, product_name, date):
        directories = (
            product_name,
            'nightly',
            date.strftime('%Y'),
            date.strftime('%m'),
        )
        nightly_url = self.config.base_url
        for part in directories:
            nightly_url = urlparse.urljoin(
                nightly_url, part + '/'
            )
        cursor = connection.cursor()
        dir_prefix = date.strftime('%Y-%m-%d')
        nightlies = self.get_links(nightly_url, starts_with=dir_prefix)
        for nightly in nightlies:
            dirname = nightly.replace(nightly_url, '')
            if dirname.endswith('/'):
                dirname = dirname[:-1]
            for info in self.get_json_nightly(nightly, dirname):
                platform, repository, version, kvpairs = info

                build_type = 'nightly'
                if version.endswith('a2'):
                    build_type = 'aurora'

                if kvpairs.get('buildID'):
                    build_id = kvpairs['buildID']
                    self._insert_build(
                        cursor,
                        product_name,
                        version,
                        platform,
                        build_id,
                        build_type,
                        kvpairs.get('beta_number', None),
                        repository,
                        ignore_duplicates=True
                    )
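
    # Illustrative sketch (hypothetical date): with the default base_url,
    # product_name='firefox' and a date of 2015-10-23, nightly_url above
    # would resolve to
    # https://archive.mozilla.org/pub/firefox/nightly/2015/10/
    # and dir_prefix would be '2015-10-23'.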

    def scrape_b2g(self, connection, product_name, date):
        if product_name != 'b2g':
            return
        directories = (
            product_name,
            'manifests',
            'nightly',
        )
        b2g_manifests = self.config.base_url
        for part in directories:
            b2g_manifests = urlparse.urljoin(b2g_manifests, part + '/')

        dir_prefix = date.strftime('%Y-%m-%d')
        cursor = connection.cursor()
        version_dirs = self.get_links(b2g_manifests, ends_with='/')
        for version_dir in version_dirs:
            prod_url = urlparse.urljoin(
                version_dir, date.strftime('%Y/%m/')
            )
            nightlies = self.get_links(prod_url, starts_with=dir_prefix)
            for nightly in nightlies:
                b2gs = self.get_b2g(
                    nightly,
                    backfill_date=None,
                )
                for info in b2gs:
                    platform, repository, version, kvpairs = info
                    build_id = kvpairs['buildid']
                    build_type = kvpairs['build_type']
                    self._insert_build(
                        cursor,
                        product_name,
                        version,
                        platform,
                        build_id,
                        build_type,
                        kvpairs.get('beta_number', None),
                        repository,
                        ignore_duplicates=True
                    )


class FTPScraperCronAppDryRunner(App):  # pragma: no cover
    """This is a utility class that makes it easy to run the scraping
    and ALWAYS do so in a "dry run" fashion, such that nothing is ever
    stored in the database; found releases are just printed to stdout.

    To run it, simply execute this file:

        $ python socorro/cron/jobs/ftpscraper.py

    If you want to override what date to run it for (by default it's
    "now") you simply use this format:

        $ python socorro/cron/jobs/ftpscraper.py --date=2015-10-23

    By default it runs for every default-configured product
    (see the configuration set up in the FTPScraperCronApp above). You
    can override that like this:

        $ python socorro/cron/jobs/ftpscraper.py --product=mobile,b2g

    """

    required_config = Namespace()
    required_config.add_option(
        'date',
        default=datetime.datetime.utcnow().date(),
        doc='Date to run for',
        from_string_converter=string_to_datetime
    )
    required_config.add_option(
        'crontabber_job_class',
        default='socorro.cron.jobs.ftpscraper.FTPScraperCronApp',
        doc='The crontabber job class to run in dry-run mode',
        from_string_converter=class_converter,
    )

    @staticmethod
    def get_application_defaults():
        return {
            'database.database_class': mock.MagicMock()
        }

    def __init__(self, config):
        self.config = config
        self.config.dry_run = True
        self.ftpscraper = config.crontabber_job_class(config, {})

    def main(self):
        assert self.config.dry_run
        self.ftpscraper.run(self.config.date)


if __name__ == '__main__':  # pragma: no cover
    sys.exit(main(FTPScraperCronAppDryRunner))