This repository has been archived by the owner on Aug 26, 2022. It is now read-only.
/
links.py
101 lines (77 loc) · 3.11 KB
/
links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""LinksSource gathers wiki document links from a rendered page."""
from __future__ import absolute_import, unicode_literals
import logging
from django.conf import settings
from django.utils.six.moves.urllib.parse import urlparse
from pyquery import PyQuery as pq
from .base import Source
# Module-level logger registered under the 'kuma.scraper' name.
logger = logging.getLogger('kuma.scraper')
class LinksSource(Source):
    """Collect wiki document links from a rendered page for scraping.

    Every ``<a>`` element in the requested page is inspected. Each href that
    looks like a wiki document in this source's locale is queued as a
    'document' requirement. The page itself is not queued; request it with a
    DocumentSource if applicable.
    """

    OPTIONS = {
        'depth': ('int_all', 0),  # Scrape the topic tree to this depth
        'revisions': ('int', 1),  # Scrape this many past revisions
        'translations': ('bool', False),  # Scrape alternate translations
    }
    PARAM_NAME = 'path'

    # Slug prefixes (the path after the locale) that are not wiki documents.
    ignored_slugs = {
        'dashboards',
        'profiles',
        'search',
        'users/signin',
        'docs/tag/',
    }

    def __init__(self, path=None, **options):
        """Normalize the starting path and record its locale.

        An empty path or '/' falls back to the English homepage. Only the
        path component of ``path`` is kept. The first path segment is stored
        as ``self.locale`` and must be one of ``settings.ENABLED_LOCALES``.

        Raises AssertionError if the path does not start with '/' or the
        locale is not enabled.
        """
        # Default to the English homepage when no usable path is given
        start = '/en-US/' if (not path or path == '/') else path
        url_path = urlparse(start).path
        assert url_path.startswith('/')
        self.locale = url_path.split('/', 2)[1]
        assert self.locale in settings.ENABLED_LOCALES
        super(LinksSource, self).__init__(url_path, **options)

    def load_prereqs(self, requester, storage):
        """Fetch the page and turn its links into document requirements.

        Returns a ``(True, requirements)`` tuple, where each requirement is
        ``('document', doc_path, options)``. Each document path appears once,
        in the order it is first seen on the page.
        """
        response = requester.request(self.path)
        page = pq(response.content)
        options = self.current_options()

        queued = set()
        requirements = []
        for anchor in page('a'):
            href = anchor.attrib.get('href', '')
            doc_path = self.doc_path_for_href(href)
            if not doc_path or doc_path in queued:
                continue
            queued.add(doc_path)
            requirements.append(('document', doc_path, options))
        return True, requirements

    def doc_path_for_href(self, href):
        """
        Return a Document path for the given <a href="url">.

        Return None when the href does not look like a wiki document in
        this source's locale.
        """
        path = urlparse(self.decode_href(href)).path

        # Drop a single trailing slash
        if path.endswith('/'):
            path = path[:-1]

        # Anchors and relative links are skipped; the URLAbsolutionFilter
        # should already have converted links to absolute ones
        if not path.startswith('/'):
            return None

        # API endpoints contain a '$'
        if '$' in path:
            return None

        # Other locales, non-translated pages, and the homepage are skipped
        locale_prefix = '/' + self.locale + '/'
        if not path.startswith(locale_prefix):
            return None

        # Known non-wiki sections are skipped by slug prefix
        slug = path[len(locale_prefix):]
        if slug.startswith(tuple(self.ignored_slugs)):
            return None

        return path

    def save_data(self, storage, data):
        """Hand the gathered links back unchanged as post-sources.

        Nothing is written to ``storage``.
        """
        return data