Commit 0c0a538

Seamlessly follow redirections from the archive, whether declared or within the HTML + rewrite all links found + store archive date as meta + skip internal archive links (WIP #372)
boogheta committed May 26, 2021
1 parent def2b0b commit 0c0a538
Showing 2 changed files with 55 additions and 14 deletions.
65 changes: 53 additions & 12 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
@@ -21,6 +21,7 @@
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import WebDriverException, TimeoutException as SeleniumTimeout

+from ural import normalize_url
from ural.lru import LRUTrie

from hcicrawler.linkextractor import RegexpLinkExtractor
@@ -33,6 +34,15 @@
def timeout_alarm(*args):
raise SeleniumTimeout

+RE_ARCHIVE_REDIRECT = re.compile(r'function go\(\) \{.*document.location.href = "(%s[^"]*)".*<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*<p class="code">Redirecting to...</p>' % ARCHIVES["URL_PREFIX"], re.I|re.S)
+
+def normalize(url):
+    return normalize_url(
+        url,
+        strip_index=False,
+        strip_irrelevant_subdomains=False
+    )

class PagesCrawler(Spider):

name = 'pages'
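
For context, not part of the commit: the new normalize() helper funnels URL comparisons through ural's normalize_url so that cosmetic variants of an address compare equal, while the two disabled options keep index files and subdomains significant. A rough sketch of the intended behaviour, assuming ural 0.30's defaults (exact normalization may vary between versions):

# Illustration only (not from the diff): compare two spellings of a page.
from ural import normalize_url

def normalize(url):
    return normalize_url(url, strip_index=False, strip_irrelevant_subdomains=False)

a = "https://example.org/page/"
b = "http://example.org/page"
# With ural's defaults, scheme and trailing-slash differences are typically
# ignored, so both spellings should normalize to the same string:
print(normalize(a) == normalize(b))  # expected: True
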
@@ -73,7 +83,9 @@ def __init__(self, **kwargs):

if ARCHIVES["ENABLED"]:
self.archivedate = re.sub(r"\D", "", str(ARCHIVES["DATE"])) + "120000"
self.archiveprefix = "%s/%s/" % (ARCHIVES["URL_PREFIX"].rstrip('/'), self.archivedate)
archiveprefix = ARCHIVES["URL_PREFIX"].rstrip('/')
self.archiveprefix = "%s/%s/" % (archiveprefix, self.archivedate)
self.archiveregexp = re.compile(r"^%s/(\d{14})/" % archiveprefix, re.I)

self.cookies = None
if 'cookies' in args and args["cookies"]:
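
To make the new attributes concrete, here is a rough sketch with a made-up ARCHIVES config (not from the diff; "https://archive.example/web" stands in for the real URL_PREFIX):

# Illustrative values only.
import re

ARCHIVES = {"DATE": "2021-05-26", "URL_PREFIX": "https://archive.example/web"}

archivedate = re.sub(r"\D", "", str(ARCHIVES["DATE"])) + "120000"
# -> "20210526120000": the requested snapshot, at noon of the configured day

archiveprefix = ARCHIVES["URL_PREFIX"].rstrip('/')
full_prefix = "%s/%s/" % (archiveprefix, archivedate)
# -> "https://archive.example/web/20210526120000/"

archiveregexp = re.compile(r"^%s/(\d{14})/" % archiveprefix, re.I)
# Matches any snapshot prefix and captures its 14-digit timestamp:
m = archiveregexp.match("https://archive.example/web/20201101093000/http://example.org/")
print(m.group(1))  # "20201101093000"
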
@@ -192,7 +204,21 @@ def handle_response(self, response):
except:
pass

-        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
+        if ARCHIVES["ENABLED"]:
+            if response.status == 302:
+                redir_url = response.headers['Location']
+                real_url = self.archiveregexp.sub("", redir_url)
+                orig_url = self.archiveregexp.sub("", response.url)
+                if self.archiveregexp.match(redir_url) and normalize(real_url) == normalize(orig_url):
+                    if "depth" in response.meta:
+                        response.meta['depth'] -= 1
+                    else:
+                        response.meta['depth'] = -1
+                    return self._request(redir_url)
+            if response.status >= 400:
+                return self._make_raw_page(response)
+
+        if 300 <= response.status < 400 or isinstance(response, HtmlResponse):
return self.parse_html(response)
else:
return self._make_raw_page(response)
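
The intent of the new 302 branch, sketched below with hypothetical archive URLs: when the archive redirects a snapshot to another timestamp of the same page, the crawler follows it silently instead of treating it as a real redirection, and the depth decrement compensates the +1 the follow-up request will add.

# Hypothetical illustration; "archive.example" is a stand-in archive host.
import re

archiveregexp = re.compile(r"^https://archive\.example/web/(\d{14})/", re.I)

# The archive answers 302: the page is only stored under another timestamp.
response_url = "https://archive.example/web/20210526120000/http://example.org/page"
location = "https://archive.example/web/20201101093000/http://example.org/page"

# Stripping the snapshot prefixes leaves the same original URL on both sides:
real_url = archiveregexp.sub("", location)      # "http://example.org/page"
orig_url = archiveregexp.sub("", response_url)  # same URL
assert real_url == orig_url  # so the spider follows the hop for free
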
@@ -214,20 +240,23 @@ def handle_error(self, failure, response=None):
def parse_html(self, response):
orig_url = response.url
if ARCHIVES["ENABLED"]:
-            orig_url = orig_url.replace(self.archiveprefix, "")
+            orig_url = self.archiveregexp.sub("", orig_url)
lru = url_to_lru_clean(orig_url, TLDS_TREE)
lrulinks = []

# handle redirects
realdepth = response.meta['depth']
-        if 300 < response.status < 400:
+        if ARCHIVES["ENABLED"]:
+            redir_url = RE_ARCHIVE_REDIRECT.search(response.body)
+            if redir_url:
+                response.headers['Location'] = redir_url.group(1)
+                response.status = int(redir_url.group(2))
+
+        if 300 <= response.status < 400:
            redir_url = response.headers['Location']
-            # TODO !
-            # + handle skipping redirection to same page
-            if ARCHIVES["ENABLED"]:
-                pass
-                # rewrite redir_url
-                # p['archive_date_obtained'] = "TODO"
+            if ARCHIVES["ENABLED"] and self.archiveregexp.match(redir_url):
+                redir_url = self.archiveregexp.sub("", redir_url)

if redir_url.startswith('/'):
redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
@@ -239,6 +268,7 @@ def parse_html(self, response):
redir_url = "%s/%s" % (lru_to_url(lrustart+'|'), redir_url)
elif redir_url.startswith('./') or not redir_url.startswith('http'):
redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])

links = [{'url': redir_url}]
response.meta['depth'] -= 1
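
This parse_html change covers archives that answer not with a real 3xx but with a 200 interstitial page whose JavaScript performs the jump. A reduced, hypothetical example of such a body that RE_ARCHIVE_REDIRECT would turn back into an ordinary redirect (the real markup depends on the archive software; the prefix is a stand-in for ARCHIVES["URL_PREFIX"]):

# Illustration only, not from the commit.
import re

RE_ARCHIVE_REDIRECT = re.compile(
    r'function go\(\) \{.*document.location.href = "(%s[^"]*)".*'
    r'<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*'
    r'<p class="code">Redirecting to...</p>' % "https://archive.example/web",
    re.I | re.S)

body = '''<script>function go() {
  document.location.href = "https://archive.example/web/20210526120000/http://example.org/moved";
}</script>
<p class="code shift red">Got an HTTP 301 response at crawl time</p>
<p class="code">Redirecting to...</p>'''

m = RE_ARCHIVE_REDIRECT.search(body)
print(m.group(1))  # the archived target URL, fed back into the Location header
print(m.group(2))  # "301", the status observed at crawl time
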

@@ -254,6 +284,10 @@ def parse_html(self, response):
url = link.url
except AttributeError:
url = link['url']
if ARCHIVES["ENABLED"]:
url = self.archiveregexp.sub("", url)
if url.startswith(ARCHIVES["URL_PREFIX"]):
continue
try:
lrulink = url_to_lru_clean(url, TLDS_TREE)
except (ValueError, IndexError) as e:
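
The loop change applies the same unwrapping to every extracted link, as sketched below: snapshot links are rewritten back to their original URL so the corpus never stores archive addresses, and remaining archive-internal links (those carrying the prefix but no 14-digit snapshot, e.g. the archive's own navigation pages) are dropped.

# Hypothetical link list from an archived page; same stand-in prefix.
import re

URL_PREFIX = "https://archive.example/web"   # assumed ARCHIVES["URL_PREFIX"]
archiveregexp = re.compile(r"^%s/(\d{14})/" % URL_PREFIX, re.I)

links = [
    "https://archive.example/web/20210526120000/http://example.org/contact",
    "https://archive.example/web/somewhere-else",   # archive-internal, no snapshot
    "http://other-site.example/page",
]
for url in links:
    url = archiveregexp.sub("", url)          # unwrap snapshot links
    if url.startswith(URL_PREFIX):            # still an archive-internal link:
        continue                              # skip it entirely
    print(url)
# -> http://example.org/contact
# -> http://other-site.example/page
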
@@ -277,9 +311,11 @@ def _make_raw_page(self, response):
p = Page()
p['url'] = response.url
if ARCHIVES["ENABLED"]:
-            p['url'] = p['url'].replace(self.archiveprefix, "")
+            p['url'] = self.archiveregexp.sub("", response.url)
p['archive_url'] = response.url
p['archive_date_requested'] = self.archivedate
+            if 'archive_timestamp' in response.meta:
+                p['archive_date_obtained'] = response.meta['archive_timestamp']
p['lru'] = url_to_lru_clean(p['url'], TLDS_TREE)
p['depth'] = 0
p['timestamp'] = int(time.time()*1000)
@@ -308,7 +344,12 @@ def _request(self, url, noproxy=False, **kw):
if self.phantom:
kw['method'] = 'HEAD'
if ARCHIVES["ENABLED"]:
-            return Request(self.archiveprefix + url, **kw)
+            if url.startswith(ARCHIVES["URL_PREFIX"]):
+                kw["meta"]["archive_timestamp"] = self.archiveregexp.search(url).group(1)
+                return Request(url, **kw)
+            else:
+                kw["meta"]["archive_timestamp"] = self.archivedate
+                return Request(self.archiveprefix + url, **kw)
return Request(url, **kw)
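
The effect of the reworked _request, in short: every archive-mode request carries an archive_timestamp in its meta, either parsed from an explicit snapshot URL (e.g. a redirection being followed) or defaulting to the configured date, and only non-archive URLs get the snapshot prefix prepended. _make_raw_page later surfaces this meta value as p['archive_date_obtained']. A sketch with the same stand-in config as above:

# Hypothetical walk-through of the two branches; illustration only.
import re

URL_PREFIX = "https://archive.example/web"
archivedate = "20210526120000"
archiveprefix = "%s/%s/" % (URL_PREFIX, archivedate)
archiveregexp = re.compile(r"^%s/(\d{14})/" % URL_PREFIX, re.I)

def fake_request(url, meta):
    if url.startswith(URL_PREFIX):
        # Already an archive URL: keep it, remember its actual snapshot.
        meta["archive_timestamp"] = archiveregexp.search(url).group(1)
        return url
    # A regular web URL: route it through the requested snapshot.
    meta["archive_timestamp"] = archivedate
    return archiveprefix + url

meta = {}
print(fake_request("http://example.org/page", meta), meta)
# -> https://archive.example/web/20210526120000/http://example.org/page
#    {'archive_timestamp': '20210526120000'}
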


4 changes: 2 additions & 2 deletions hyphe_backend/crawler/requirements-scrapyd.txt
@@ -8,5 +8,5 @@ selenium==2.42.1
pymongo==3.8
queuelib==1.4.2
txmongo==19.2.0
-ural==0.20.0
-tld==0.11.10
+ural==0.30.0
+tld==0.12.1
