Skip to content
Permalink
Browse files

Add basic server and describe API

  • Loading branch information...
ariddell
ariddell committed Dec 29, 2014
1 parent a0965c4 commit 569a23fe9a0ccf4d3ee7b3dafcbd36a6b7cf4682
Showing with 124 additions and 0 deletions.
  1. +2 −0 Dockerfile
  2. +29 −0 README.md
  3. +92 −0 main.py
  4. +1 −0 requirements.txt
@@ -0,0 +1,2 @@
FROM gliderlabs/python-runtime:3.4
MAINTAINER Allen B. Riddell <allen.riddell@dartmouth.edu>
@@ -8,6 +8,35 @@ version control so all changes are recorded.
unimportant, the file `metadata.json` in each directory describes the
relationship between the volume and a *work*.

## Quickstart

Work metadata, volume metadata, and plaintext are exposed via a simple
read-only REST server. This server can be run with the following command
(Python 3.4 and `aiohttp` required):

python main.py

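The API has three endpoints:

- `/work/` (metadata for all works)
- `/work/<id>` (metadata for a single work, including its associated volumes)
- `/text/<sha1>` (plaintext of the volume whose file has the given SHA-1 hash)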

For example, the novel [Glenarvon](https://en.wikipedia.org/wiki/Glenarvon) has
id `1235` and metadata concerning it and a list of associated volumes may be
retrieved with:

curl http://127.0.0.1:8080/work/1235

As the plaintext of an edition of this novel is available, a list of associated
texts and their SHA-1 hashes is given in the response. The plain text version
of the third volume has hash `40d2491e07dd2f1c71413b65bc551804cb93b0f3` and may
be retrieved with:

curl -s http://127.0.0.1:8080/text/40d2491e07dd2f1c71413b65bc551804cb93b0f3

That's all there is!

## Works

The vast majority of records in `works.csv` are from the two volumes edited by
92 main.py
@@ -0,0 +1,92 @@
"""Simple REST API for novels

1. GET /work/ returns metadata for all works
2. GET /work/{id} returns metadata for an individual work
3. GET /text/{sha1sum} returns plaintext for volume
"""
import asyncio
import collections
import csv
import hashlib
import json
import operator
import os

from aiohttp import web


############################################################################
# load volume metadata stored on disk into memory, indexed by work id
# index text filenames by sha1sum
############################################################################
# _volumes maps an integer work id to the list of volume metadata dicts
# (one per `metadata.json` found); texts maps the SHA-1 hex digest of each
# `.txt` file's bytes to that file's path.
_volumes = collections.defaultdict(list)
texts = {}
for volumes_dir in ['volumes', 'nonfree']:
    # os.walk yields nothing for a missing directory, so either tree may be absent
    for dirpath, dirnames, filenames in os.walk(volumes_dir):
        for fn in filenames:
            if fn == 'metadata.json':
                # use a context manager so the handle is closed promptly
                with open(os.path.join(dirpath, fn)) as metadata_file:
                    metadata = json.load(metadata_file)
                work_id = int(metadata['work_id'])
                _volumes[work_id].append(metadata)
            elif os.path.splitext(fn)[-1] == '.txt':
                filename = os.path.join(dirpath, fn)
                # hash the raw bytes so the digest matches `sha1sum <file>`
                with open(filename, 'rb') as text_file:
                    sha1 = hashlib.sha1(text_file.read()).hexdigest()
                texts[sha1] = filename

# sort each work's volumes in ascending order by volume number; list.sort is
# stable, so sorting once here gives the same order as re-sorting per insert
for volume_list in _volumes.values():
    volume_list.sort(key=operator.itemgetter('volume'))

############################################################################
# load work metadata stored on disk into memory
############################################################################
# works maps an integer work id to its row from works.csv, in file order.
# The csv module expects files opened with newline=''; the context manager
# closes the handle once every row has been read.
with open('works.csv', newline='') as works_file:
    works = collections.OrderedDict(
        (int(row['id']), row) for row in csv.DictReader(works_file))

# insert volumes into relevant work dictionary
for work_id, record in works.items():
    # ensure the stored id is an integer (csv yields strings)
    record['id'] = work_id

    # inject related volume records; the membership test does not create
    # an empty entry in the defaultdict
    if work_id in _volumes:
        record['volumes'] = _volumes[work_id]

############################################################################
# endpoints
############################################################################

@asyncio.coroutine
def work(request):
    """Serve work metadata as JSON.

    Without an ``id`` in the matched route, respond with every work.
    With an ``id``, respond with that single work, or with 404 Not Found
    when the id is not an integer or no such work is loaded.
    """
    raw_id = request.match_info.get('id', None)
    if raw_id is None:
        return web.Response(text=json.dumps(works))

    try:
        work_id = int(raw_id)
    except ValueError:
        # a non-numeric id cannot name a work; answer 404 instead of crashing
        return web.HTTPNotFound()

    if work_id not in works:
        # return an instance: the bare class is not a valid response object
        return web.HTTPNotFound()
    return web.Response(text=json.dumps(works[work_id]))

@asyncio.coroutine
def text(request):
    """Serve the plaintext file indexed under the requested SHA-1 digest.

    Responds with 404 Not Found when the route carries no ``sha1`` or the
    digest is not a key of ``texts``.
    """
    sha1 = request.match_info.get('sha1', None)
    if sha1 is None or sha1 not in texts:
        return web.HTTPNotFound()

    # read inside a context manager so the file handle is closed per request
    with open(texts[sha1]) as text_file:
        content = text_file.read()
    return web.Response(text=content)


@asyncio.coroutine
def init(loop):
    """Build the application, register its GET routes and start listening.

    Creates a server on ``loop`` bound to 0.0.0.0:8080, prints a short
    start-up summary and returns the server object.
    """
    port = 8080
    app = web.Application(loop=loop)

    # (path, handler) pairs, registered in this order
    routes = (
        ('/work/', work),
        ('/work/{id}', work),
        ('/text/{sha1}', text),
    )
    for path, handler in routes:
        app.router.add_route('GET', path, handler)

    server = yield from loop.create_server(app.make_handler(), '0.0.0.0', port)
    print("Server listening on http://0.0.0.0:{}".format(port))
    # the "volumes" figure reported here is the number of indexed text files
    print("Serving {} works and {} volumes".format(len(works), len(texts)))
    return server

if __name__ == "__main__":
    # bring the server up on the default event loop, then keep the loop
    # running so it continues to serve requests
    event_loop = asyncio.get_event_loop()
    startup = init(event_loop)
    event_loop.run_until_complete(startup)
    event_loop.run_forever()
@@ -0,0 +1 @@
aiohttp>=0.12.0,<1.0

0 comments on commit 569a23f

Please sign in to comment.
You can’t perform that action at this time.