Add an item_lookup tool for showing what was extracted for a given item.

mihaip committed Jun 30, 2013
1 parent 0802a04 commit ed16d86
Showing 4 changed files with 166 additions and 2 deletions.
36 changes: 36 additions & 0 deletions base/api.py
@@ -1,6 +1,7 @@
 import collections
 import json
 import logging
+import re
 import urllib
 import urllib2
 
@@ -411,6 +412,10 @@ class Comment(collections.namedtuple(
   def to_json(self):
     return self._asdict()
 
+  @staticmethod
+  def from_json(comment_json):
+    return Comment(**comment_json)
+
 UserInfo = collections.namedtuple('UserInfo', ['user_id', 'email'])
 
 class ItemRef(collections.namedtuple('ItemRef', ['item_id', 'timestamp_usec'])):
@@ -430,6 +435,18 @@ def to_json(self):
       },
     }
 
+  @staticmethod
+  def from_json(stream_json):
+    return Stream(
+        stream_id=stream_json['stream_id'],
+        item_refs=[
+            ItemRef(
+                item_id=ItemId.from_json(item_id_json),
+                timestamp_usec=timestamp_usec
+            ) for item_id_json, timestamp_usec
+            in stream_json['item_refs'].iteritems()
+        ])
+
 # See https://code.google.com/p/google-reader-api/wiki/ItemId for the two forms
 # of item IDs.
 class ItemId(collections.namedtuple('ItemId', ['decimal_form', 'atom_form'])):
@@ -439,6 +456,10 @@ def to_json(self):
   def compact_form(self):
     return self.atom_form[len(_ITEM_ID_ATOM_FORM_PREFIX):]
 
+  @staticmethod
+  def from_json(item_id_json):
+    return item_id_from_compact_form(item_id_json)
+
 def item_id_from_decimal_form(decimal_form):
   int_form = int(decimal_form)
   if int_form < 0:
@@ -462,6 +483,21 @@ def item_id_from_compact_form(compact_form):
       decimal_form=decimal_form,
       atom_form=_ITEM_ID_ATOM_FORM_PREFIX + compact_form)
 
+def item_id_from_any_form(form):
+  if form.startswith(_ITEM_ID_ATOM_FORM_PREFIX):
+    return item_id_from_atom_form(form)
+
+  if form.startswith('0x'):
+    return item_id_from_compact_form(form[2:])
+
+  if re.match('^[0-9a-f]+$', form, re.I):
+    return item_id_from_compact_form(form)
+
+  if re.match('^-?[0-9]+$', form):
+    return item_id_from_decimal_form(form)
+
+  return None
+
 _TEST_DATA = [
   ('tag:google.com,2005:reader/item/5d0cfa30041d4348', '6705009029382226760'),
   ('tag:google.com,2005:reader/item/024025978b5e50d2', '162170919393841362'),
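For reference, a minimal sketch of how the new item_id_from_any_form helper resolves the different ID spellings (run from the repository root so that base.api is importable; the hex/decimal pair comes from the _TEST_DATA entries above):

import base.api

# Atom form, 0x-prefixed hex, and bare hex spellings of the same item.
for form in ('tag:google.com,2005:reader/item/5d0cfa30041d4348',
             '0x5d0cfa30041d4348',
             '5d0cfa30041d4348'):
  item_id = base.api.item_id_from_any_form(form)
  assert item_id.decimal_form == '6705009029382226760'

# Input matching none of the four patterns yields None rather than raising.
assert base.api.item_id_from_any_form('not-an-id') is None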
7 changes: 5 additions & 2 deletions base/atom.py
@@ -15,8 +15,11 @@ def init():
   ET.register_namespace('media', 'http://search.yahoo.com/mrss/')
   ET.register_namespace('thr', 'http://purl.org/syndication/thread/1.0')
 
-def parse(xml_text):
-  feed_element = ET.fromstring(xml_text)
+def parse(xml_text_or_file):
+  if hasattr(xml_text_or_file, 'read'):
+    feed_element = ET.parse(xml_text_or_file)
+  else:
+    feed_element = ET.fromstring(xml_text_or_file)
   entry_elements = feed_element.findall('{%s}entry' % ATOM_NS)
   entries = []
   for entry_element in entry_elements:
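A short usage sketch for the new signature; the archive file path is hypothetical, since the real layout is determined by base.paths:

import base.atom

base.atom.init()

with open('items/5d0cfa30041d4348.atom') as feed_file:  # hypothetical path
  # New: pass the open file directly (anything with a .read method works).
  feed = base.atom.parse(feed_file)

with open('items/5d0cfa30041d4348.atom') as feed_file:  # hypothetical path
  # Still supported: pass the feed's XML text, as callers did before.
  feed = base.atom.parse(feed_file.read())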
4 changes: 4 additions & 0 deletions bin/item_lookup
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+ROOT_DIR=$(dirname $0)/..
+PYTHONPATH=$ROOT_DIR /usr/bin/env python2.7 $ROOT_DIR/item_lookup/item_lookup.py "$@"
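With an archive previously generated by reader_archive (the directory below is only an example), the wrapper can be invoked with any ID form that item_id_from_any_form accepts:

bin/item_lookup --archive_directory=$HOME/reader_archive \
    tag:google.com,2005:reader/item/5d0cfa30041d4348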
121 changes: 121 additions & 0 deletions item_lookup/item_lookup.py
@@ -0,0 +1,121 @@
+import argparse
+import datetime
+import json
+import logging
+import os
+import os.path
+import sys
+import xml.etree.cElementTree as ET
+
+import base.api
+import base.atom
+import base.paths
+import base.log
+
+def main():
+  global archive_directory
+  base.atom.init()
+  base.log.init()
+
+  parser = argparse.ArgumentParser(
+      description='Tool for showing what was extracted for a given item in a '
+          'Google Reader account archive')
+
+  parser.add_argument('item_ids', metavar='item_id', nargs='*', default=[],
+      help='Item ID(s) to look up.')
+  parser.add_argument('--archive_directory',
+      help='Path to archive directory generated by '
+          'reader_archive to look up the item in.')
+
+  args = parser.parse_args()
+
+  if not args.archive_directory:
+    logging.error('--archive_directory was not specified')
+    sys.exit(1)
+  archive_directory = base.paths.normalize(args.archive_directory)
+  if not os.path.exists(archive_directory):
+    logging.error('Could not find archive directory %s', archive_directory)
+    sys.exit(1)
+
+  item_ids = []
+  for raw_item_id in args.item_ids:
+    item_id = base.api.item_id_from_any_form(raw_item_id)
+    if not item_id:
+      logging.error('%s is not a valid ID', raw_item_id)
+      sys.exit(1)
+    item_ids.append(item_id)
+  if not item_ids:
+    logging.error('No item IDs were specified.')
+    sys.exit(1)
+
+  logging.info('Looking up streams for items.')
+  streams_directory = os.path.join(archive_directory, 'streams')
+  item_ids_to_stream_ids_and_timestamps = {}
+  for stream_file_name in os.listdir(streams_directory):
+    with open(os.path.join(streams_directory, stream_file_name)) as stream_file:
+      stream_json = json.load(stream_file)
+      for item_id in item_ids:
+        timestamp_usec = stream_json['item_refs'].get(item_id.to_json())
+        if not timestamp_usec:
+          continue
+        item_ids_to_stream_ids_and_timestamps.setdefault(item_id, []).append(
+            (stream_json['stream_id'], timestamp_usec))
+
+  for item_id in item_ids:
+    logging.info('Item ID %s:', item_id)
+    stream_ids_and_timestamps = \
+        item_ids_to_stream_ids_and_timestamps.get(item_id)
+    if stream_ids_and_timestamps:
+      for stream_id, timestamp_usec in stream_ids_and_timestamps:
+        timestamp_date = datetime.datetime.utcfromtimestamp(
+            timestamp_usec/1000000.0)
+        logging.info('  In the stream %s with timestamp %d (%s)',
+            stream_id, timestamp_usec, timestamp_date.isoformat())
+    else:
+      logging.warning('  Not found in any streams')
+
+  logging.info('Looking up bodies for items.')
+  for item_id in item_ids:
+    item_body_path = base.paths.item_id_to_file_path(
+        os.path.join(archive_directory, 'items'), item_id)
+    if os.path.exists(item_body_path):
+      with open(item_body_path) as item_body_file:
+        feed = base.atom.parse(item_body_file)
+        found_entry = False
+        for entry in feed.entries:
+          if entry.item_id == item_id:
+            logging.info('Body for item %s:', item_id)
+            logging.info('  %s', ET.tostring(entry.element))
+            found_entry = True
+            break
+        if not found_entry:
+          logging.warning('Did not find item body for %s', item_id)
+    else:
+      logging.warning('No item body file found for %s', item_id)
+
+  logging.info('Looking up comments for items')
+  for item_id in item_ids:
+    item_comments_path = os.path.join(base.paths.item_id_to_file_path(
+        os.path.join(archive_directory, 'comments'), item_id),
+        item_id.compact_form())
+    if os.path.exists(item_comments_path):
+      logging.info('Comments on item %s:', item_id)
+      with open(item_comments_path) as item_comments_file:
+        comments_json = json.load(item_comments_file)
+        comments_by_venue = {}
+        for comment_json in comments_json:
+          comment = base.api.Comment.from_json(comment_json)
+          comments_by_venue.setdefault(comment.venue_stream_id, []).append(comment)
+
+        for venue_stream_id, comments in comments_by_venue.iteritems():
+          logging.info('  Venue %s', venue_stream_id)
+          for comment in comments:
+            logging.info('    "%s" by %s',
+                comment.plain_content, comment.author_name)
+    else:
+      logging.info('No comments for item %s', item_id)
+
+
+
+if __name__ == '__main__':
+  main()
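The comments pass above relies on the new Comment.from_json and on each comments file being a JSON list of per-comment dicts; a minimal standalone sketch (the file path is hypothetical, the real one is built via base.paths.item_id_to_file_path):

import json

import base.api

with open('comments/5d0cfa30041d4348') as item_comments_file:  # hypothetical path
  comments = [base.api.Comment.from_json(comment_json)
      for comment_json in json.load(item_comments_file)]
for comment in comments:
  print '"%s" by %s in %s' % (
      comment.plain_content, comment.author_name, comment.venue_stream_id)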
