From 922d7cb00c3bc3337bd6cc95b567ff35fdd5c3dd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 30 Nov 2015 10:35:59 +0000 Subject: [PATCH] pywb: cleanup, remove unused code (urlrewriter), check for rewritten urls in Location or Content-Location headers and unrewrite --- pywb/Dockerfile | 2 -- pywb/archivereplayview.py | 69 ++++++++------------------------------- pywb/config.yaml | 2 +- 3 files changed, 15 insertions(+), 58 deletions(-) diff --git a/pywb/Dockerfile b/pywb/Dockerfile index ec23cd5..63b8479 100644 --- a/pywb/Dockerfile +++ b/pywb/Dockerfile @@ -11,8 +11,6 @@ ENV PYWB_VERSION git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.11. RUN pip install -U $PYWB_VERSION -RUN pip install -U certauth - COPY config.yaml /webarchive/ COPY . /webarchive/ diff --git a/pywb/archivereplayview.py b/pywb/archivereplayview.py index 7e2bacc..942d6fd 100644 --- a/pywb/archivereplayview.py +++ b/pywb/archivereplayview.py @@ -19,7 +19,7 @@ #============================================================================= -WBURL_RX = re.compile('(.*/)([0-9]{1,14})(\w{2}_)?(/https?://.*)') +WBURL_RX = re.compile('(.*/)([0-9]{1,14})(\w{2}_)?/(https?://.*)') EXTRACT_ORIG_LINK = re.compile(r'<([^>]+)>;\s*rel=\"original\"') NO_GZIP_UAS = ['NCSA_Mosaic'] @@ -75,6 +75,15 @@ def __init__(self, config): # init redis here only redisclient.init_redis(config) + def unrewrite_header(self, response, name): + value = response.headers.get(name) + + # extract orig url from redirect, if any + if value: + m = WBURL_RX.match(value) + if m: + response.headers[name] = m.group(4) + def _do_req(self, urls, host, env, skip_hosts): response = None @@ -135,6 +144,10 @@ def __call__(self, cdx, skip_hosts, cdx_loader, wbrequest): print(skip_hosts) raise CaptureException('Content Could Not Be Loaded') + if response.status_code >= 300 and response.status_code < 400: + self.unrewrite_header(response, 'Location') + self.unrewrite_header(response, 'Content-Location') + remote = wbrequest.env.get('REMOTE_ADDR') req_ts = wbrequest.wb_url.timestamp base_key = remote + ':' + req_ts @@ -289,57 +302,3 @@ def _get_urls_to_try(self, cdx, skip_hosts, wbrequest): wbrequest.urlrewriter.rewrite_opts['archive_info'] = info return try_urls, src_url, name - -#============================================================================= -class ReUrlRewriter(UrlRewriter): - def __init__(self, *args, **kwargs): - self.session = None - super(ReUrlRewriter, self).__init__(*args, **kwargs) - - def rewrite(self, url, mod=None): - info = self.rewrite_opts.get('archive_info') - - # if archive info exists, and unrewrtten api exists, - # or archive is not rewritten, use as is - # (but add regex check for rewritten urls just in case, as they - # may pop up in Location headers) - if info and (info.get('unrewritten_url') or not info.get('rewritten')): - m = WBURL_RX.match(url) - if m: - if not mod: - mod = self.wburl.mod - return self.prefix + m.group(2) + mod + m.group(4) - else: - return super(ReUrlRewriter, self).rewrite(url, mod) - - # Use HEAD request to get original url - else: - # don't rewrite certain urls at all - if not url.startswith(self.NO_REWRITE_URI_PREFIX): - url = self.urljoin(self.rewrite_opts.get('orig_src_url'), url) - url = self.head_memento_orig(url) - - return super(ReUrlRewriter, self).rewrite(url, mod) - - def head_memento_orig(self, url): - try: - if not self.session: - self.session = requests.Session() - - logging.debug('Loading HEAD Memento Headers from ' + url) - r = self.session.head(url) - link = r.headers.get('Link') - if link: - m = EXTRACT_ORIG_LINK.search(link) - if m: - url = m.group(1) - logging.debug('Found Original: ' + url) - - except Exception as e: - logging.debug(e) - - finally: - return url - - def _create_rebased_rewriter(self, new_wburl, prefix): - return ReUrlRewriter(new_wburl, prefix) diff --git a/pywb/config.yaml b/pywb/config.yaml index b11a730..c9ff79b 100644 --- a/pywb/config.yaml +++ b/pywb/config.yaml @@ -38,7 +38,7 @@ memento_archive_json: '/archives.json' reverse_proxy_prefix: http://netcapsule_nginx_1:1210/ buffer_response: false -urlrewriter_class: !!python/name:archivereplayview.ReUrlRewriter +#urlrewriter_class: !!python/name:archivereplayview.ReUrlRewriter enable_cdx_api: true framed_replay: false