Skip to content
This repository has been archived by the owner on Dec 23, 2020. It is now read-only.

Commit

Permalink
pywb: cleanup, remove unused code (urlrewriter), check for rewritten …
Browse files Browse the repository at this point in the history
…urls in Location or Content-Location headers and unrewrite
  • Loading branch information
ikreymer committed Nov 30, 2015
1 parent d280fca commit 922d7cb
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 58 deletions.
2 changes: 0 additions & 2 deletions pywb/Dockerfile
Expand Up @@ -11,8 +11,6 @@ ENV PYWB_VERSION git+https://github.com/ikreymer/pywb.git@develop#egg=pywb-0.11.

RUN pip install -U $PYWB_VERSION

RUN pip install -U certauth

COPY config.yaml /webarchive/

COPY . /webarchive/
Expand Down
69 changes: 14 additions & 55 deletions pywb/archivereplayview.py
Expand Up @@ -19,7 +19,7 @@


#=============================================================================
# Earlier form of the replay-URL pattern, in which group 4 includes the '/'
# that precedes the original URL. The assignment directly below rebinds
# WBURL_RX, so this form is never the effective pattern.
WBURL_RX = re.compile(r'(.*/)([0-9]{1,14})(\w{2}_)?(/https?://.*)')

# Effective replay-URL pattern. Groups:
#   1: everything up to and including the '/' before the timestamp
#   2: the 1-14 digit timestamp
#   3: optional two-character modifier followed by '_' (e.g. 'id_')
#   4: the original http(s) URL, *without* the separating '/'
# Written as a raw string so that '\w' reaches the regex engine instead of
# being parsed as an (invalid) string escape sequence.
WBURL_RX = re.compile(r'(.*/)([0-9]{1,14})(\w{2}_)?/(https?://.*)')

# Captures the target URL of the rel="original" entry in a Link header value.
EXTRACT_ORIG_LINK = re.compile(r'<([^>]+)>;\s*rel=\"original\"')

# NOTE(review): presumably user agents that must not be served gzip content;
# the code that consumes this list is not visible here -- confirm.
NO_GZIP_UAS = ['NCSA_Mosaic']
Expand Down Expand Up @@ -75,6 +75,15 @@ def __init__(self, config):
# init redis here only
redisclient.init_redis(config)

def unrewrite_header(self, response, name):
    """Replace a rewritten replay URL in a response header with its original.

    Reads header ``name`` from ``response.headers``. If the header is
    present, non-empty and matches ``WBURL_RX``, the header is overwritten
    in place with the pattern's fourth group; otherwise the response is
    left untouched.

    :param response: object exposing a mutable, dict-like ``headers``
    :param name: header name to inspect, e.g. ``'Location'``
    :return: None
    """
    rewritten = response.headers.get(name)
    if not rewritten:
        # header absent or empty: nothing to unrewrite
        return

    found = WBURL_RX.match(rewritten)
    if found is None:
        # value is not a rewritten replay url, keep it as is
        return

    response.headers[name] = found.group(4)

def _do_req(self, urls, host, env, skip_hosts):
response = None

Expand Down Expand Up @@ -135,6 +144,10 @@ def __call__(self, cdx, skip_hosts, cdx_loader, wbrequest):
print(skip_hosts)
raise CaptureException('Content Could Not Be Loaded')

if response.status_code >= 300 and response.status_code < 400:
self.unrewrite_header(response, 'Location')
self.unrewrite_header(response, 'Content-Location')

remote = wbrequest.env.get('REMOTE_ADDR')
req_ts = wbrequest.wb_url.timestamp
base_key = remote + ':' + req_ts
Expand Down Expand Up @@ -289,57 +302,3 @@ def _get_urls_to_try(self, cdx, skip_hosts, wbrequest):
wbrequest.urlrewriter.rewrite_opts['archive_info'] = info
return try_urls, src_url, name


#=============================================================================
class ReUrlRewriter(UrlRewriter):
    """UrlRewriter that takes the source archive's own rewriting into account.

    The ``'archive_info'`` entry of ``rewrite_opts`` decides the strategy:

    * if the archive offers an unrewritten url, or does not rewrite its
      content, urls are passed to the base rewriter as-is -- except urls
      that already look like replay urls (``WBURL_RX``), which are re-based
      onto this rewriter's prefix;
    * otherwise the url is resolved against ``'orig_src_url'`` and a HEAD
      request is used to look up the original url from the ``Link`` header
      before delegating to the base rewriter.
    """

    def __init__(self, *args, **kwargs):
        # HTTP session for HEAD lookups, created lazily on first use.
        self.session = None
        super(ReUrlRewriter, self).__init__(*args, **kwargs)

    def rewrite(self, url, mod=None):
        """Return the rewritten form of ``url``.

        :param url: url found in the replayed content
        :param mod: optional replay modifier; when falsy and the url is
            already a replay url, ``self.wburl.mod`` is used instead
        :return: the rewritten url
        """
        info = self.rewrite_opts.get('archive_info')

        # if archive info exists, and unrewritten api exists,
        # or archive is not rewritten, use as is
        # (but add regex check for rewritten urls just in case, as they
        # may pop up in Location headers)
        if info and (info.get('unrewritten_url') or not info.get('rewritten')):
            m = WBURL_RX.match(url)
            if not m:
                return super(ReUrlRewriter, self).rewrite(url, mod)

            if not mod:
                mod = self.wburl.mod

            # Insert the '/' separator explicitly instead of relying on
            # group 4 to carry it: the effective WBURL_RX keeps the slash
            # outside the group, which would otherwise glue the modifier
            # directly onto the original url.
            orig_url = m.group(4).lstrip('/')
            return self.prefix + m.group(2) + mod + '/' + orig_url

        # Use HEAD request to get original url
        # don't rewrite certain urls at all
        if not url.startswith(self.NO_REWRITE_URI_PREFIX):
            url = self.urljoin(self.rewrite_opts.get('orig_src_url'), url)
            url = self.head_memento_orig(url)

        return super(ReUrlRewriter, self).rewrite(url, mod)

    def head_memento_orig(self, url):
        """Best-effort lookup of the original url behind ``url``.

        Issues a HEAD request and, if the response carries a ``Link``
        header with a ``rel="original"`` entry, returns that entry's
        target. On any ordinary error, or when no such entry exists, the
        input ``url`` is returned unchanged.

        :param url: url to probe
        :return: the original url if one was found, else ``url``
        """
        try:
            if not self.session:
                self.session = requests.Session()

            logging.debug('Loading HEAD Memento Headers from %s', url)
            r = self.session.head(url)
            link = r.headers.get('Link')
            if link:
                m = EXTRACT_ORIG_LINK.search(link)
                if m:
                    url = m.group(1)
                    logging.debug('Found Original: %s', url)

        except Exception as e:
            # Deliberately best-effort: a failed lookup must not break
            # rewriting, so fall through and return the url we have.
            logging.debug(e)

        # Returned after the try block rather than from a ``finally``
        # clause: returning inside ``finally`` would also discard
        # KeyboardInterrupt / SystemExit raised during the request.
        return url

    def _create_rebased_rewriter(self, new_wburl, prefix):
        """Return a new ReUrlRewriter for ``new_wburl`` and ``prefix``."""
        return ReUrlRewriter(new_wburl, prefix)
2 changes: 1 addition & 1 deletion pywb/config.yaml
Expand Up @@ -38,7 +38,7 @@ memento_archive_json: '/archives.json'
reverse_proxy_prefix: http://netcapsule_nginx_1:1210/

buffer_response: false
urlrewriter_class: !!python/name:archivereplayview.ReUrlRewriter
#urlrewriter_class: !!python/name:archivereplayview.ReUrlRewriter

enable_cdx_api: true
framed_replay: false
Expand Down

0 comments on commit 922d7cb

Please sign in to comment.