This repository has been archived by the owner on Dec 23, 2020. It is now read-only.

Commit

memgator: use memgator aggregator, support loading archive list from archives.json definition as well as xml definition
ikreymer committed Nov 5, 2015
1 parent 317fffd commit 920c0fa
Showing 4 changed files with 77 additions and 22 deletions.
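The new code path expects the archive list (e.g. the collinfo.json referenced below) to be a JSON array of archive definitions with id, name, and timegate fields. A minimal sketch of such a file, parsed the same way the new loader does (the Internet Archive entry is illustrative, not part of this commit):

    import json

    # Minimal archives.json sketch; the field names ('id', 'name', 'timegate')
    # match load_archive_info_json below, but this entry is hypothetical.
    ARCHIVES_JSON = '''
    [
        {
            "id": "ia",
            "name": "Internet Archive",
            "timegate": "http://web.archive.org/web/"
        }
    ]
    '''

    archives = json.loads(ARCHIVES_JSON)
    print(archives[0]['timegate'])  # http://web.archive.org/web/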
19 changes: 18 additions & 1 deletion docker-compose.yml
@@ -3,6 +3,24 @@ pywb:
     restart: always
     links:
       - redis
+      - memgator
+
+    environment:
+      - ARCHIVE_JSON=http://webenact.rhizome.org/collinfo.json
+    #volumes:
+    #  - ./archives.json:/archives.json
+
+memgator:
+    # image: ibnesayeed/memgator
+    image: ikreymer/memgator
+    restart: always
+
+    command: --arcs=http://webenact.rhizome.org/collinfo.json server
+    ports:
+      - 1208:1208
+
+    #volumes:
+    #  - ./archives.json:/archives.json
 
 redis:
     image: redis:latest
@@ -22,7 +40,6 @@ app:
 
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock
-      - ./app/browser:/browser
 
     environment:
       - SCREEN_WIDTH=1200
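The ARCHIVE_JSON variable set above feeds the $ARCHIVE_JSON reference in pywb/config.yaml further down; the new loader resolves it with os.path.expandvars. A quick sketch of that expansion, using the URL from this compose file:

    import os

    # docker-compose sets this in the pywb container's environment
    os.environ['ARCHIVE_JSON'] = 'http://webenact.rhizome.org/collinfo.json'

    # config.yaml carries the literal string '$ARCHIVE_JSON';
    # load_archive_info_json expands it at startup
    url = os.path.expandvars('$ARCHIVE_JSON')
    print(url)  # http://webenact.rhizome.org/collinfo.json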
60 changes: 47 additions & 13 deletions pywb/archivereplayview.py
@@ -14,6 +14,8 @@
 
 import xml.etree.ElementTree as ElementTree
 import urlparse
+import json
+import os
 
 
 #=============================================================================
@@ -163,10 +165,40 @@ def _get_urls_to_try(self, cdx, skip_hosts, wbrequest):
 class MementoUpstreamArchiveLoader(UpstreamArchiveLoader):
     def __init__(self, config):
         super(MementoUpstreamArchiveLoader, self).__init__(config)
-        self.load_archive_info_xml(config.get('memento_archive_xml'))
+        if config.get('memento_archive_json'):
+            self.load_archive_info_json(config.get('memento_archive_json'))
+        else:
+            self.load_archive_info_xml(config.get('memento_archive_xml'))
 
+    def load_archive_info_json(self, url):
+        self.archive_infos = {}
+        url = os.path.expandvars(url)
+        logging.debug('Loading JSON from {0}'.format(url))
+        if not url:
+            return
+
+        try:
+            stream = BlockLoader().load(url)
+        except Exception as e:
+            logging.debug(e)
+            logging.debug('Proceeding without json archive info')
+            return
+
+        archives = json.loads(stream.read())
+        for arc in archives:
+            id_ = arc['id']
+            name = arc['name']
+            uri = arc['timegate']
+            unrewritten_url = uri + '{timestamp}id_/{url}'
+
+            self.archive_infos[id_] = {'uri': uri,
+                                       'name': name,
+                                       'rewritten': True,
+                                       'unrewritten_url': unrewritten_url}
+
     def load_archive_info_xml(self, url):
         self.archive_infos = {}
+        url = os.path.expandvars(url)
         logging.debug('Loading XML from {0}'.format(url))
         if not url:
             return
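Each archive's timegate URI becomes a template for fetching the raw, unrewritten capture. A sketch of how _get_urls_to_try (below) fills in the template built here, assuming an illustrative web.archive.org timegate:

    # the 'timegate' value is a hypothetical example; the template suffix
    # comes from load_archive_info_json above
    uri = 'http://web.archive.org/web/'
    unrewritten_url = uri + '{timestamp}id_/{url}'

    # filled in with a capture's timestamp and original URL, this yields
    # the archive's unmodified ('id_') view of that capture
    orig_url = unrewritten_url.format(timestamp='20150105120000',
                                      url='http://example.com/')
    print(orig_url)
    # http://web.archive.org/web/20150105120000id_/http://example.com/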
@@ -199,39 +231,41 @@ def load_archive_info_xml(self, url):
                 'name': longname
                 }
 
-    def find_archive_info(self, host):
-        host = host.split(':')[0]
+    def find_archive_info(self, uri):
+        #uri = uri.split('://', 1)[-1]
         for name, info in self.archive_infos.iteritems():
-            if host in info['uri']:
+            if info['uri'] in uri:
                 return info
         return None
 
 
     def _get_urls_to_try(self, cdx, skip_hosts, wbrequest):
         src_url = cdx['src_url']
-        parts = urlparse.urlsplit(src_url)
-        archive_host = parts.netloc
 
-        if archive_host in skip_hosts:
-            raise CaptureException('Skipping already failed: ' + archive_host)
+        if src_url in skip_hosts:
+            raise CaptureException('Skipping already failed: ' + src_url)
 
-        info = self.find_archive_info(archive_host)
+        info = self.find_archive_info(src_url)
+        print(src_url)
+        print(info)
 
-        if info and info['unrewritten_url']:
+        if info and info.get('unrewritten_url'):
             orig_url = info['unrewritten_url'].format(timestamp=cdx['timestamp'],
                                                       url=cdx['url'])
             try_urls = [orig_url]
+            print(try_urls)
         else:
             try_urls = [src_url]
 
         if info:
-            name = info.get('name', archive_host)
+            name = info.get('name', src_url)
         else:
-            name = archive_host
+            name = src_url
 
         wbrequest.urlrewriter.rewrite_opts['orig_src_url'] = cdx['src_url']
         wbrequest.urlrewriter.rewrite_opts['archive_info'] = info
-        return try_urls, archive_host, name
+        return try_urls, src_url, name
 
 
 #=============================================================================
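Note the reversed containment test in find_archive_info above: instead of matching a parsed hostname against each archive's URI, it now checks whether a known archive URI appears inside the full memento src_url. A standalone sketch with a hypothetical archive entry:

    archive_infos = {
        'ia': {'uri': 'http://web.archive.org/web/',
               'name': 'Internet Archive'},  # hypothetical entry
    }

    def find_archive_info(uri):
        # an archive matches if its base URI occurs within the memento URL
        for name, info in archive_infos.items():
            if info['uri'] in uri:
                return info
        return None

    src_url = 'http://web.archive.org/web/20150105120000/http://example.com/'
    print(find_archive_info(src_url)['name'])  # Internet Archive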
@@ -247,7 +281,7 @@ def rewrite(self, url, mod=None):
         # or archive is not rewritten, use as is
         # (but add regex check for rewritten urls just in case, as they
         # may pop up in Location headers)
-        if info and (info['unrewritten_url'] or not info['rewritten']):
+        if info and (info.get('unrewritten_url') or not info.get('rewritten')):
             m = WBURL_RX.match(url)
             if m:
                 if not mod:
13 changes: 8 additions & 5 deletions pywb/config.yaml
@@ -13,21 +13,24 @@ collections:
     # Memento Aggregator Collection: Specify paths to Timegate, Timemap
     memento_reconstruct:
         index_paths:
-            - http://timetravel.mementoweb.org/api/json/
-            - http://labs.mementoweb.org/timemap/json/
+            #- http://timetravel.mementoweb.org/api/json/
+            #- http://labs.mementoweb.org/timemap/json/
+            - http://netcapsule_memgator_1:1208/timenav/json/
+            - http://netcapsule_memgator_1:1208/timemap/json/
 
         server_cls: !!python/name:mementoquery.MementoIndexServer
 
         wb_handler_class: !!python/name:archivereplayview.MementoHandler
 
-        fallback: live
+        # fallback: live
 
     live:
         index_paths: $liveweb
 
 # Specify memento archivelist XML
-memento_archive_xml: 'http://labs.mementoweb.org/aggregator_config/archivelist.xml'
-
+#memento_archive_xml: 'http://labs.mementoweb.org/aggregator_config/archivelist.xml'
+#memento_archive_json: '/archives.json'
+memento_archive_json: $ARCHIVE_JSON
 
 buffer_response: false
 urlrewriter_class: !!python/name:archivereplayview.ReUrlRewriter
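The two index_paths entries point at the memgator service on the compose network. Assuming MemGator's /timemap/{format}/{URI-R} route layout, a TimeMap lookup against that endpoint would look roughly like this (the requests usage is illustrative; pywb itself goes through MementoIndexServer):

    import requests

    # host and port come from the memgator service in docker-compose.yml;
    # the /timemap/json/ route layout is assumed from MemGator's conventions
    base = 'http://netcapsule_memgator_1:1208/timemap/json/'
    r = requests.get(base + 'http://example.com/')
    print(r.status_code, r.headers.get('content-type'))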
7 changes: 4 additions & 3 deletions pywb/mementoquery.py
@@ -45,9 +45,10 @@ def timegate_query(self, timestamp, url):
             r = self.session.get(full)
             result = r.json()
         except Exception as e:
-            #if r.status_code == 404:
-            #    return {}
             logging.debug(e)
+            if r.status_code != 404:
+                import traceback
+                traceback.print_exc()
 
             msg = 'No Mementos Found'
             raise NotFoundException(msg, url=url)
