Add an item_lookup tool for showing what was extracted for a given item.

mihaip committed Jun 30, 2013
1 parent 0802a04 commit ed16d86
Showing 4 changed files with 166 additions and 2 deletions.
36 changes: 36 additions & 0 deletions base/api.py
@@ -1,6 +1,7 @@
 import collections
 import json
 import logging
+import re
 import urllib
 import urllib2
 
@@ -411,6 +412,10 @@ class Comment(collections.namedtuple(
   def to_json(self):
     return self._asdict()
 
+  @staticmethod
+  def from_json(comment_json):
+    return Comment(**comment_json)
+
 UserInfo = collections.namedtuple('UserInfo', ['user_id', 'email'])
 
 class ItemRef(collections.namedtuple('ItemRef', ['item_id', 'timestamp_usec'])):
@@ -430,6 +435,18 @@ def to_json(self):
       },
     }
 
+  @staticmethod
+  def from_json(stream_json):
+    return Stream(
+        stream_id=stream_json['stream_id'],
+        item_refs=[
+            ItemRef(
+                item_id=ItemId.from_json(item_id_json),
+                timestamp_usec=timestamp_usec
+            ) for item_id_json, timestamp_usec
+            in stream_json['item_refs'].iteritems()
+        ])
+
 # See https://code.google.com/p/google-reader-api/wiki/ItemId for the two forms
 # of item IDs.
 class ItemId(collections.namedtuple('ItemId', ['decimal_form', 'atom_form'])):
@@ -439,6 +456,10 @@ def to_json(self):
   def compact_form(self):
     return self.atom_form[len(_ITEM_ID_ATOM_FORM_PREFIX):]
 
+  @staticmethod
+  def from_json(item_id_json):
+    return item_id_from_compact_form(item_id_json)
+
 def item_id_from_decimal_form(decimal_form):
   int_form = int(decimal_form)
   if int_form < 0:
@@ -462,6 +483,21 @@ def item_id_from_compact_form(compact_form):
       decimal_form=decimal_form,
       atom_form=_ITEM_ID_ATOM_FORM_PREFIX + compact_form)
 
+def item_id_from_any_form(form):
+  if form.startswith(_ITEM_ID_ATOM_FORM_PREFIX):
+    return item_id_from_atom_form(form)
+
+  if form.startswith('0x'):
+    return item_id_from_compact_form(form[2:])
+
+  if re.match('^[0-9a-f]+$', form, re.I):
+    return item_id_from_compact_form(form)
+
+  if re.match('^-?[0-9]+$', form):
+    return item_id_from_decimal_form(form)
+
+  return None
+
 _TEST_DATA = [
   ('tag:google.com,2005:reader/item/5d0cfa30041d4348', '6705009029382226760'),
   ('tag:google.com,2005:reader/item/024025978b5e50d2', '162170919393841362'),
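For reference, a minimal sketch of how the new item_id_from_any_form helper resolves the different ID spellings (run from the repository root so that base.api is importable; the hex/decimal pair comes from the _TEST_DATA entries above):

import base.api

# Atom form, 0x-prefixed hex, and bare hex spellings of the same item.
for form in ('tag:google.com,2005:reader/item/5d0cfa30041d4348',
             '0x5d0cfa30041d4348',
             '5d0cfa30041d4348'):
  item_id = base.api.item_id_from_any_form(form)
  assert item_id.decimal_form == '6705009029382226760'

# Input matching none of the four patterns yields None rather than raising.
assert base.api.item_id_from_any_form('not-an-id') is None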
7 changes: 5 additions & 2 deletions base/atom.py
@@ -15,8 +15,11 @@ def init():
   ET.register_namespace('media', 'http://search.yahoo.com/mrss/')
   ET.register_namespace('thr', 'http://purl.org/syndication/thread/1.0')
 
-def parse(xml_text):
-  feed_element = ET.fromstring(xml_text)
+def parse(xml_text_or_file):
+  if hasattr(xml_text_or_file, 'read'):
+    feed_element = ET.parse(xml_text_or_file)
+  else:
+    feed_element = ET.fromstring(xml_text_or_file)
   entry_elements = feed_element.findall('{%s}entry' % ATOM_NS)
   entries = []
   for entry_element in entry_elements:
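A short usage sketch for the new signature; the archive file path is hypothetical, since the real layout is determined by base.paths:

import base.atom

base.atom.init()

with open('items/5d0cfa30041d4348.atom') as feed_file:  # hypothetical path
  # New: pass the open file directly (anything with a .read method works).
  feed = base.atom.parse(feed_file)

with open('items/5d0cfa30041d4348.atom') as feed_file:  # hypothetical path
  # Still supported: pass the feed's XML text, as callers did before.
  feed = base.atom.parse(feed_file.read())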
4 changes: 4 additions & 0 deletions bin/item_lookup
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+ROOT_DIR=$(dirname $0)/..
+PYTHONPATH=$ROOT_DIR /usr/bin/env python2.7 $ROOT_DIR/item_lookup/item_lookup.py "$@"
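With an archive previously generated by reader_archive (the directory below is only an example), the wrapper can be invoked with any ID form that item_id_from_any_form accepts:

bin/item_lookup --archive_directory=$HOME/reader_archive \
    tag:google.com,2005:reader/item/5d0cfa30041d4348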
121 changes: 121 additions & 0 deletions item_lookup/item_lookup.py
@@ -0,0 +1,121 @@
+import argparse
+import datetime
+import json
+import logging
+import os
+import os.path
+import sys
+import xml.etree.cElementTree as ET
+
+import base.api
+import base.atom
+import base.paths
+import base.log
+
+def main():
+  global archive_directory
+  base.atom.init()
+  base.log.init()
+
+  parser = argparse.ArgumentParser(
+      description='Tool for showing what was extracted for a given item in a '
+          'Google Reader account archive')
+
+  parser.add_argument('item_ids', metavar='item_id', nargs='*', default=[],
+      help='Item ID(s) to look up.')
+  parser.add_argument('--archive_directory',
+      help='Path to archive directory generated by '
+          'reader_archive to look up the item in.')
+
+  args = parser.parse_args()
+
+  if not args.archive_directory:
+    logging.error('--archive_directory was not specified')
+    sys.exit(1)
+  archive_directory = base.paths.normalize(args.archive_directory)
+  if not os.path.exists(archive_directory):
+    logging.error('Could not find archive directory %s', archive_directory)
+    sys.exit(1)
+
+  item_ids = []
+  for raw_item_id in args.item_ids:
+    item_id = base.api.item_id_from_any_form(raw_item_id)
+    if not item_id:
+      logging.error('%s is not a valid ID', raw_item_id)
+      sys.exit(1)
+    item_ids.append(item_id)
+  if not item_ids:
+    logging.error('No item IDs were specified.')
+    sys.exit(1)
+
+  logging.info('Looking up streams for items.')
+  streams_directory = os.path.join(archive_directory, 'streams')
+  item_ids_to_stream_ids_and_timestamps = {}
+  for stream_file_name in os.listdir(streams_directory):
+    with open(os.path.join(streams_directory, stream_file_name)) as stream_file:
+      stream_json = json.load(stream_file)
+      for item_id in item_ids:
+        timestamp_usec = stream_json['item_refs'].get(item_id.to_json())
+        if not timestamp_usec:
+          continue
+        item_ids_to_stream_ids_and_timestamps.setdefault(item_id, []).append(
+            (stream_json['stream_id'], timestamp_usec))
+
+  for item_id in item_ids:
+    logging.info('Item ID %s:', item_id)
+    stream_ids_and_timestamps = \
+        item_ids_to_stream_ids_and_timestamps.get(item_id)
+    if stream_ids_and_timestamps:
+      for stream_id, timestamp_usec in stream_ids_and_timestamps:
+        timestamp_date = datetime.datetime.utcfromtimestamp(
+            timestamp_usec/1000000.0)
+        logging.info('  In the stream %s with timestamp %d (%s)',
+            stream_id, timestamp_usec, timestamp_date.isoformat())
+    else:
+      logging.warning('  Not found in any streams')
+
+  logging.info('Looking up bodies for items.')
+  for item_id in item_ids:
+    item_body_path = base.paths.item_id_to_file_path(
+        os.path.join(archive_directory, 'items'), item_id)
+    if os.path.exists(item_body_path):
+      with open(item_body_path) as item_body_file:
+        feed = base.atom.parse(item_body_file)
+        found_entry = False
+        for entry in feed.entries:
+          if entry.item_id == item_id:
+            logging.info('Body for item %s:', item_id)
+            logging.info('  %s', ET.tostring(entry.element))
+            found_entry = True
+            break
+        if not found_entry:
+          logging.warning('Did not find item body for %s', item_id)
+    else:
+      logging.warning('No item body file found for %s', item_id)
+
+  logging.info('Looking up comments for items')
+  for item_id in item_ids:
+    item_comments_path = os.path.join(base.paths.item_id_to_file_path(
+        os.path.join(archive_directory, 'comments'), item_id),
+        item_id.compact_form())
+    if os.path.exists(item_comments_path):
+      logging.info('Comments on item %s:', item_id)
+      with open(item_comments_path) as item_comments_file:
+        comments_json = json.load(item_comments_file)
+        comments_by_venue = {}
+        for comment_json in comments_json:
+          comment = base.api.Comment.from_json(comment_json)
+          comments_by_venue.setdefault(comment.venue_stream_id, []).append(comment)
+
+        for venue_stream_id, comments in comments_by_venue.iteritems():
+          logging.info('  Venue %s', venue_stream_id)
+          for comment in comments:
+            logging.info('    "%s" by %s',
+                comment.plain_content, comment.author_name)
+    else:
+      logging.info('No comments for item %s', item_id)
+
+
+
+if __name__ == '__main__':
+  main()
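The comments pass above relies on the new Comment.from_json and on each comments file being a JSON list of per-comment dicts; a minimal standalone sketch (the file path is hypothetical, the real one is built via base.paths.item_id_to_file_path):

import json

import base.api

with open('comments/5d0cfa30041d4348') as item_comments_file:  # hypothetical path
  comments = [base.api.Comment.from_json(comment_json)
      for comment_json in json.load(item_comments_file)]
for comment in comments:
  print '"%s" by %s in %s' % (
      comment.plain_content, comment.author_name, comment.venue_stream_id)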
