Add an item_lookup tool for showing what was extracted for a given item.

mihaip committed Jun 30, 2013
1 parent 0802a04 commit ed16d866c064fa2adea5622f484a094fa2c268db
Showing with 166 additions and 2 deletions.
  1. +36 −0 base/
  2. +5 −2 base/
  3. +4 −0 bin/item_lookup
  4. +121 −0 item_lookup/
@@ -1,6 +1,7 @@
import collections
import json
import logging
import re
import urllib
import urllib2

@@ -411,6 +412,10 @@ class Comment(collections.namedtuple(
def to_json(self):
    """Return this tuple's fields as a mapping, ready for JSON encoding.

    The mapping comes straight from the namedtuple ``_asdict()`` helper, so
    its keys are the tuple's field names.
    """
    fields_by_name = self._asdict()
    return fields_by_name

def from_json(comment_json):
    """Build a Comment from a mapping of field names to values.

    Args:
        comment_json: mapping whose keys are passed as keyword arguments to
            the Comment constructor.

    Returns:
        The constructed Comment.
    """
    comment = Comment(**comment_json)
    return comment

# Immutable record with two fields: `user_id` and `email`.
UserInfo = collections.namedtuple('UserInfo', ['user_id', 'email'])

class ItemRef(collections.namedtuple('ItemRef', ['item_id', 'timestamp_usec'])):
@@ -430,6 +435,18 @@ def to_json(self):

def from_json(stream_json):
return Stream(
) for item_id_json, timestamp_usec
in stream_json['item_refs'].iteritems()

# See the Google Reader API documentation on item IDs for the two forms
# (decimal and atom) that an item ID can take.
class ItemId(collections.namedtuple('ItemId', ['decimal_form', 'atom_form'])):
@@ -439,6 +456,10 @@ def to_json(self):
def compact_form(self):
    """Return the atom form with the leading atom-form prefix removed.

    The first len(_ITEM_ID_ATOM_FORM_PREFIX) characters of ``atom_form`` are
    dropped; the remainder is returned unchanged.
    """
    prefix_length = len(_ITEM_ID_ATOM_FORM_PREFIX)
    return self.atom_form[prefix_length:]

def from_json(item_id_json):
    """Rebuild an item ID from its JSON representation.

    Args:
        item_id_json: the serialized ID; it is handed directly to
            item_id_from_compact_form.

    Returns:
        Whatever item_id_from_compact_form returns for that value.
    """
    item_id = item_id_from_compact_form(item_id_json)
    return item_id

def item_id_from_decimal_form(decimal_form):
int_form = int(decimal_form)
if int_form < 0:
@@ -462,6 +483,21 @@ def item_id_from_compact_form(compact_form):
atom_form=_ITEM_ID_ATOM_FORM_PREFIX + compact_form)

def item_id_from_any_form(form):
    """Parse an item ID written in any of the supported textual forms.

    Forms are tried in this order:
      1. Atom form: the string starts with _ITEM_ID_ATOM_FORM_PREFIX.
      2. Compact form with an explicit '0x' prefix (the prefix is dropped).
      3. Compact form: exactly 16 hexadecimal digits (case-insensitive).
      4. Decimal form: an optionally negative base-10 integer.
      5. Any other all-hexadecimal string, treated as compact form so that
         previously accepted inputs keep working.

    Previously the hexadecimal check ran before the decimal check for every
    length, so a non-negative decimal ID (which consists only of digits) was
    always misread as a compact ID and the decimal branch was reachable only
    for negative numbers.

    Args:
        form: the textual ID to parse.

    Returns:
        The result of the matching item_id_from_*_form helper, or None when
        the string matches none of the forms.
    """
    if form.startswith(_ITEM_ID_ATOM_FORM_PREFIX):
        return item_id_from_atom_form(form)

    if form.startswith('0x'):
        return item_id_from_compact_form(form[2:])

    # NOTE(review): 16 is the width of a zero-padded 64-bit hex ID, which is
    # what the compact IDs seen in this project look like; confirm it matches
    # what item_id_from_compact_form expects.
    compact_form_length = 16
    is_hex = re.match(r'^[0-9a-f]+$', form, re.I) is not None
    is_decimal = re.match(r'^-?[0-9]+$', form) is not None

    # A 16-character all-digit string is genuinely ambiguous; it keeps its
    # historical interpretation as a compact ID.
    if is_hex and len(form) == compact_form_length:
        return item_id_from_compact_form(form)

    if is_decimal:
        return item_id_from_decimal_form(form)

    if is_hex:
        return item_id_from_compact_form(form)

    return None

(',2005:reader/item/5d0cfa30041d4348', '6705009029382226760'),
(',2005:reader/item/024025978b5e50d2', '162170919393841362'),
@@ -15,8 +15,11 @@ def init():
ET.register_namespace('media', '')
ET.register_namespace('thr', '')

def parse(xml_text):
feed_element = ET.fromstring(xml_text)
def parse(xml_text_or_file):
if hasattr(xml_text_or_file, 'read'):
feed_element = ET.parse(xml_text_or_file)
feed_element = ET.fromstring(xml_text_or_file)
entry_elements = feed_element.findall('{%s}entry' % ATOM_NS)
entries = []
for entry_element in entry_elements:
@@ -0,0 +1,4 @@

# Resolve the repository root relative to this script. The expansions are
# quoted so that a checkout path containing spaces still works.
ROOT_DIR="$(dirname "$0")/.."
PYTHONPATH="$ROOT_DIR" /usr/bin/env python2.7 "$ROOT_DIR/item_lookup/" "$@"
@@ -0,0 +1,121 @@
import argparse
import datetime
import json
import logging
import os
import os.path
import sys
import xml.etree.cElementTree as ET

import base.api
import base.atom
import base.paths
import base.log

def main():
    """Look up and log what an archive holds for each requested item ID.

    Command-line arguments:
        item_id ...: item IDs in any form accepted by
            base.api.item_id_from_any_form.
        --archive_directory: path to the archive to search.

    For every item this logs the streams that reference it (with their
    timestamps), the item's Atom entry, and its comments grouped by venue
    stream.

    Exits with status 1 when the arguments are unusable (no or missing
    archive directory, an unparseable ID, or no IDs at all).
    """
    # NOTE(review): this function was reconstructed from a damaged copy in
    # which several statements were lost or fused together; every spot that
    # had to be inferred carries its own NOTE(review) below.
    # NOTE(review): base.log is imported by this file but not used in the
    # visible code, so logging setup may be missing here; confirm.
    global archive_directory

    parser = argparse.ArgumentParser(
        description='Looks up what was extracted into an archive of a '
                    'Google Reader account for the given item ID(s)')

    parser.add_argument('item_ids', metavar='item_id', nargs='*', default=[],
                        help='Item ID(s) to look up.')
    # NOTE(review): the flag name is inferred from args.archive_directory and
    # from the error message below.
    parser.add_argument('--archive_directory',
                        help='Path to archive directory generated by '
                             'reader_archive to look up the item in.')

    args = parser.parse_args()

    # NOTE(review): the exits after the next three errors are inferred;
    # without them execution would continue with unusable values.
    if not args.archive_directory:
        logging.error('--archive_directory was not specified')
        sys.exit(1)
    archive_directory = base.paths.normalize(args.archive_directory)
    if not os.path.exists(archive_directory):
        logging.error('Could not find archive directory %s', archive_directory)
        sys.exit(1)

    item_ids = []
    for raw_item_id in args.item_ids:
        item_id = base.api.item_id_from_any_form(raw_item_id)
        if not item_id:
            logging.error('%s is not a valid ID', raw_item_id)
            sys.exit(1)
        # NOTE(review): the append is inferred; item_ids is otherwise never
        # populated.
        item_ids.append(item_id)
    if not item_ids:
        logging.error('No item IDs were specified.')
        sys.exit(1)

    # Pass 1: scan every stream file and record which streams reference each
    # requested item, along with the timestamp stored for it.
    logging.info('Looking up streams for items.')
    streams_directory = os.path.join(archive_directory, 'streams')
    item_ids_to_stream_ids_and_timestamps = {}
    for stream_file_name in os.listdir(streams_directory):
        stream_path = os.path.join(streams_directory, stream_file_name)
        with open(stream_path) as stream_file:
            stream_json = json.load(stream_file)
        for item_id in item_ids:
            timestamp_usec = stream_json['item_refs'].get(item_id.to_json())
            # NOTE(review): the skip for items absent from this stream is
            # inferred; a falsy timestamp is treated as "not present".
            if not timestamp_usec:
                continue
            item_ids_to_stream_ids_and_timestamps.setdefault(
                item_id, []).append(
                    (stream_json['stream_id'], timestamp_usec))

    for item_id in item_ids:
        logging.info('Item ID %s:', item_id)
        # NOTE(review): the right-hand side is inferred from the dict built
        # above.
        stream_ids_and_timestamps = \
            item_ids_to_stream_ids_and_timestamps.get(item_id)
        if stream_ids_and_timestamps:
            for stream_id, timestamp_usec in stream_ids_and_timestamps:
                # Stored timestamps are in microseconds; convert to seconds.
                timestamp_date = datetime.datetime.utcfromtimestamp(
                    timestamp_usec / 1000000.0)
                logging.info(
                    ' In the stream %s with timestamp %d (%s)',
                    stream_id, timestamp_usec, timestamp_date.isoformat())
        else:
            logging.warning(' Not found in any streams')

    # Pass 2: find each item's entry inside the Atom file that should hold it.
    logging.info('Looking up bodies for items.')
    for item_id in item_ids:
        item_body_path = base.paths.item_id_to_file_path(
            os.path.join(archive_directory, 'items'), item_id)
        if os.path.exists(item_body_path):
            with open(item_body_path) as item_body_file:
                feed = base.atom.parse(item_body_file)
            found_entry = False
            for entry in feed.entries:
                if entry.item_id == item_id:
                    logging.info('Body for item %s:', item_id)
                    logging.info(' %s', ET.tostring(entry.element))
                    found_entry = True
            if not found_entry:
                logging.warning('Did not find item body for %s', item_id)
        else:
            logging.warning('No item body file found for %s', item_id)

    # Pass 3: load each item's comments and report them per venue stream.
    logging.info('Looking up comments for items')
    for item_id in item_ids:
        # NOTE(review): the last path component was lost;
        # item_id.compact_form() is an assumption to verify against the code
        # that writes the comments files.
        item_comments_path = os.path.join(
            base.paths.item_id_to_file_path(
                os.path.join(archive_directory, 'comments'), item_id),
            item_id.compact_form())
        if os.path.exists(item_comments_path):
            logging.info('Comments on item %s:', item_id)
            with open(item_comments_path) as item_comments_file:
                comments_json = json.load(item_comments_file)
            comments_by_venue = {}
            for comment_json in comments_json:
                comment = base.api.Comment.from_json(comment_json)
                comments_by_venue.setdefault(
                    comment.venue_stream_id, []).append(comment)

            for venue_stream_id, comments in comments_by_venue.items():
                logging.info(' Venue %s', venue_stream_id)
                for comment in comments:
                    logging.info(
                        ' "%s" by %s',
                        comment.plain_content, comment.author_name)
        else:
            logging.info('No comments for item %s', item_id)

# Run the lookup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

