This repository has been archived by the owner on Dec 23, 2020. It is now read-only.

Commit

memgator: use memgator aggregator, support loading archive list from archives.json definition as well as xml definition
ikreymer committed Nov 5, 2015
1 parent 317fffd commit 920c0fa
Showing 4 changed files with 77 additions and 22 deletions.
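The new code path expects the archive list (e.g. the collinfo.json referenced below) to be a JSON array of archive definitions with id, name, and timegate fields. A minimal sketch of such a file, parsed the same way the new loader does (the Internet Archive entry is illustrative, not part of this commit):

    import json

    # Minimal archives.json sketch; the field names ('id', 'name', 'timegate')
    # match load_archive_info_json below, but this entry is hypothetical.
    ARCHIVES_JSON = '''
    [
        {
            "id": "ia",
            "name": "Internet Archive",
            "timegate": "http://web.archive.org/web/"
        }
    ]
    '''

    archives = json.loads(ARCHIVES_JSON)
    print(archives[0]['timegate'])  # http://web.archive.org/web/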
19 changes: 18 additions & 1 deletion docker-compose.yml
@@ -3,6 +3,24 @@ pywb:
     restart: always
     links:
       - redis
+      - memgator
+
+    environment:
+      - ARCHIVE_JSON=http://webenact.rhizome.org/collinfo.json
+    #volumes:
+    #  - ./archives.json:/archives.json
+
+memgator:
+    # image: ibnesayeed/memgator
+    image: ikreymer/memgator
+    restart: always
+
+    command: --arcs=http://webenact.rhizome.org/collinfo.json server
+    ports:
+      - 1208:1208
+
+    #volumes:
+    #  - ./archives.json:/archives.json
 
 redis:
     image: redis:latest
@@ -22,7 +40,6 @@ app:
 
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock
-      - ./app/browser:/browser
 
     environment:
       - SCREEN_WIDTH=1200
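The ARCHIVE_JSON variable set above feeds the $ARCHIVE_JSON reference in pywb/config.yaml further down; the new loader resolves it with os.path.expandvars. A quick sketch of that expansion, using the URL from this compose file:

    import os

    # docker-compose sets this in the pywb container's environment
    os.environ['ARCHIVE_JSON'] = 'http://webenact.rhizome.org/collinfo.json'

    # config.yaml carries the literal string '$ARCHIVE_JSON';
    # load_archive_info_json expands it at startup
    url = os.path.expandvars('$ARCHIVE_JSON')
    print(url)  # http://webenact.rhizome.org/collinfo.json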
60 changes: 47 additions & 13 deletions pywb/archivereplayview.py
@@ -14,6 +14,8 @@
 
 import xml.etree.ElementTree as ElementTree
 import urlparse
+import json
+import os
 
 
 #=============================================================================
@@ -163,10 +165,40 @@ def _get_urls_to_try(self, cdx, skip_hosts, wbrequest):
 class MementoUpstreamArchiveLoader(UpstreamArchiveLoader):
     def __init__(self, config):
         super(MementoUpstreamArchiveLoader, self).__init__(config)
-        self.load_archive_info_xml(config.get('memento_archive_xml'))
+        if config.get('memento_archive_json'):
+            self.load_archive_info_json(config.get('memento_archive_json'))
+        else:
+            self.load_archive_info_xml(config.get('memento_archive_xml'))
 
+    def load_archive_info_json(self, url):
+        self.archive_infos = {}
+        url = os.path.expandvars(url)
+        logging.debug('Loading JSON from {0}'.format(url))
+        if not url:
+            return
+
+        try:
+            stream = BlockLoader().load(url)
+        except Exception as e:
+            logging.debug(e)
+            logging.debug('Proceeding without json archive info')
+            return
+
+        archives = json.loads(stream.read())
+        for arc in archives:
+            id_ = arc['id']
+            name = arc['name']
+            uri = arc['timegate']
+            unrewritten_url = uri + '{timestamp}id_/{url}'
+
+            self.archive_infos[id_] = {'uri': uri,
+                                       'name': name,
+                                       'rewritten': True,
+                                       'unrewritten_url': unrewritten_url}
+
     def load_archive_info_xml(self, url):
         self.archive_infos = {}
+        url = os.path.expandvars(url)
         logging.debug('Loading XML from {0}'.format(url))
         if not url:
             return
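Each archive's timegate URI becomes a template for fetching the raw, unrewritten capture. A sketch of how _get_urls_to_try (below) fills in the template built here, assuming an illustrative web.archive.org timegate:

    # the 'timegate' value is a hypothetical example; the template suffix
    # comes from load_archive_info_json above
    uri = 'http://web.archive.org/web/'
    unrewritten_url = uri + '{timestamp}id_/{url}'

    # filled in with a capture's timestamp and original URL, this yields
    # the archive's unmodified ('id_') view of that capture
    orig_url = unrewritten_url.format(timestamp='20150105120000',
                                      url='http://example.com/')
    print(orig_url)
    # http://web.archive.org/web/20150105120000id_/http://example.com/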
@@ -199,39 +231,41 @@ def load_archive_info_xml(self, url):
                 'name': longname
                 }
 
-    def find_archive_info(self, host):
-        host = host.split(':')[0]
+    def find_archive_info(self, uri):
+        #uri = uri.split('://', 1)[-1]
         for name, info in self.archive_infos.iteritems():
-            if host in info['uri']:
+            if info['uri'] in uri:
                 return info
         return None
 
 
     def _get_urls_to_try(self, cdx, skip_hosts, wbrequest):
         src_url = cdx['src_url']
-        parts = urlparse.urlsplit(src_url)
-        archive_host = parts.netloc
 
-        if archive_host in skip_hosts:
-            raise CaptureException('Skipping already failed: ' + archive_host)
+        if src_url in skip_hosts:
+            raise CaptureException('Skipping already failed: ' + src_url)
 
-        info = self.find_archive_info(archive_host)
+        info = self.find_archive_info(src_url)
+        print(src_url)
+        print(info)
 
-        if info and info['unrewritten_url']:
+        if info and info.get('unrewritten_url'):
             orig_url = info['unrewritten_url'].format(timestamp=cdx['timestamp'],
                                                       url=cdx['url'])
             try_urls = [orig_url]
+            print(try_urls)
         else:
             try_urls = [src_url]
 
         if info:
-            name = info.get('name', archive_host)
+            name = info.get('name', src_url)
         else:
-            name = archive_host
+            name = src_url
 
         wbrequest.urlrewriter.rewrite_opts['orig_src_url'] = cdx['src_url']
         wbrequest.urlrewriter.rewrite_opts['archive_info'] = info
-        return try_urls, archive_host, name
+        return try_urls, src_url, name
 
 
 #=============================================================================
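Note the reversed containment test in find_archive_info above: instead of matching a parsed hostname against each archive's URI, it now checks whether a known archive URI appears inside the full memento src_url. A standalone sketch with a hypothetical archive entry:

    archive_infos = {
        'ia': {'uri': 'http://web.archive.org/web/',
               'name': 'Internet Archive'},  # hypothetical entry
    }

    def find_archive_info(uri):
        # an archive matches if its base URI occurs within the memento URL
        for name, info in archive_infos.items():
            if info['uri'] in uri:
                return info
        return None

    src_url = 'http://web.archive.org/web/20150105120000/http://example.com/'
    print(find_archive_info(src_url)['name'])  # Internet Archive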
@@ -247,7 +281,7 @@ def rewrite(self, url, mod=None):
         # or archive is not rewritten, use as is
         # (but add regex check for rewritten urls just in case, as they
         # may pop up in Location headers)
-        if info and (info['unrewritten_url'] or not info['rewritten']):
+        if info and (info.get('unrewritten_url') or not info.get('rewritten')):
             m = WBURL_RX.match(url)
             if m:
                 if not mod:
13 changes: 8 additions & 5 deletions pywb/config.yaml
@@ -13,21 +13,24 @@ collections:
     # Memento Aggregator Collection: Specify paths to Timegate, Timemap
     memento_reconstruct:
         index_paths:
-            - http://timetravel.mementoweb.org/api/json/
-            - http://labs.mementoweb.org/timemap/json/
+            #- http://timetravel.mementoweb.org/api/json/
+            #- http://labs.mementoweb.org/timemap/json/
+            - http://netcapsule_memgator_1:1208/timenav/json/
+            - http://netcapsule_memgator_1:1208/timemap/json/
 
         server_cls: !!python/name:mementoquery.MementoIndexServer
 
         wb_handler_class: !!python/name:archivereplayview.MementoHandler
 
-        fallback: live
+        # fallback: live
 
     live:
         index_paths: $liveweb
 
 # Specify memento archivelist XML
-memento_archive_xml: 'http://labs.mementoweb.org/aggregator_config/archivelist.xml'
-
+#memento_archive_xml: 'http://labs.mementoweb.org/aggregator_config/archivelist.xml'
+#memento_archive_json: '/archives.json'
+memento_archive_json: $ARCHIVE_JSON
 
 buffer_response: false
 urlrewriter_class: !!python/name:archivereplayview.ReUrlRewriter
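The two index_paths entries point at the memgator service on the compose network. Assuming MemGator's /timemap/{format}/{URI-R} route layout, a TimeMap lookup against that endpoint would look roughly like this (the requests usage is illustrative; pywb itself goes through MementoIndexServer):

    import requests

    # host and port come from the memgator service in docker-compose.yml;
    # the /timemap/json/ route layout is assumed from MemGator's conventions
    base = 'http://netcapsule_memgator_1:1208/timemap/json/'
    r = requests.get(base + 'http://example.com/')
    print(r.status_code, r.headers.get('content-type'))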
7 changes: 4 additions & 3 deletions pywb/mementoquery.py
@@ -45,9 +45,10 @@ def timegate_query(self, timestamp, url):
             r = self.session.get(full)
             result = r.json()
         except Exception as e:
-            #if r.status_code == 404:
-            #    return {}
             logging.debug(e)
+            if r.status_code != 404:
+                import traceback
+                traceback.print_exc()
 
             msg = 'No Mementos Found'
             raise NotFoundException(msg, url=url)
