/
main.py
187 lines (165 loc) · 7.59 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""Simple REST API for novels
1. GET /work/ returns all works; GET /work/{id} returns an individual work
2. GET /text/{sha1} returns the plain text of the volume whose SHA-1 checksum is {sha1}
"""
import asyncio
import collections
import csv
import hashlib
import json
import logging
import operator
import os
import aiohttp.web
import requests
# Module-level logger: emits INFO and above through its own stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())
############################################################################
# urls for bibliographic records and identifiers
############################################################################

# The JSON files of bibliographic records all live in one directory of the
# british-fiction-1770-1836 repository, so the shared prefix is spelled once.
_british_fiction_data_root = (
    'https://raw.githubusercontent.com/novels-project/'
    'british-fiction-1770-1836/master/data/'
)
british_fiction_urls = tuple(
    _british_fiction_data_root + filename
    for filename in (
        'british-fiction-1800-1829.json',
        'british-fiction-1830-1836.json',
        'british-fiction-1800-1829-updates.json',
    )
)

# JSON containing the additional ("annex") bibliographic records.
british_fiction_annex_url = (
    'https://github.com/novels-project/british-fiction-1770-1915-annex'
    '/raw/master/british-fiction-1770-1915-riddell.json'
)

# mapping of novels project identifiers to other identifiers
ids_url = (
    'https://github.com/novels-project/identifiers'
    '/raw/master/novels-project-identifiers.json'
)
############################################################################
# load volume metadata stored on disk into memory, indexed by work id
# index text filenames by sha1sum
############################################################################
def fetch_texts():
    """Index the plain-text files on disk by the SHA-1 of their contents.

    Walks the ``volumes`` and ``nonfree`` directories (relative to the
    current working directory) and hashes every file whose extension starts
    with ``.txt``.  A directory that does not exist contributes nothing.

    Returns:
        dict mapping the hex SHA-1 digest of a file's bytes to the path of
        that file.  If several files have identical contents, the one
        visited last is kept.
    """
    texts = {}
    for volumes_dir in ['volumes', 'nonfree']:
        for dirpath, _dirnames, filenames in os.walk(volumes_dir):
            for fn in filenames:
                if not os.path.splitext(fn)[-1].startswith('.txt'):
                    continue
                filename = os.path.join(dirpath, fn)
                # Close the handle deterministically instead of leaving it
                # to the garbage collector; a large tree would otherwise
                # keep many descriptors open at once.
                with open(filename, 'rb') as f:
                    sha1 = hashlib.sha1(f.read()).hexdigest()
                texts[sha1] = filename
    return texts
def fetch_volumes():
    """Load the per-volume metadata stored on disk, grouped by work id.

    Walks the ``volumes`` and ``nonfree`` directories (relative to the
    current working directory) and parses every ``metadata.json`` found.
    NB: metadata.json contains sha1 of relevant text

    Returns:
        collections.defaultdict(list) mapping ``int(metadata['work_id'])``
        to the list of metadata dicts for that work, sorted in ascending
        order by their ``'volume'`` value.

    Raises:
        ValueError: if a ``metadata.json`` cannot be decoded; the offending
            path is logged at CRITICAL level before re-raising.
    """
    volumes = collections.defaultdict(list)
    for volumes_dir in ['volumes', 'nonfree']:
        for dirpath, _dirnames, filenames in os.walk(volumes_dir):
            # A directory holds at most one file of a given name, so a
            # membership test replaces the scan over every filename.
            if 'metadata.json' not in filenames:
                continue
            path = os.path.join(dirpath, 'metadata.json')
            try:
                # Context manager closes the handle even when parsing fails.
                with open(path, encoding='utf8') as f:
                    metadata = json.load(f)
            except ValueError:
                logger.critical("Failed to load {}".format(path))
                raise
            volumes[int(metadata['work_id'])].append(metadata)
    # sort volumes in ascending order by volume number; one stable sort per
    # work gives the same order as re-sorting after every insertion
    for work_volumes in volumes.values():
        work_volumes.sort(key=operator.itemgetter('volume'))
    return volumes
############################################################################
# fetch bibliographic records from version controlled repositories
############################################################################
def _fetch_json(url):
    """Download *url* and return its decoded JSON body.

    Fails with ``requests.HTTPError`` on a 4xx/5xx response instead of
    handing an error page to the JSON decoder, and uses a timeout so a
    stalled connection cannot hang start-up indefinitely.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return response.json()


def _store_work(works, work_id, source_id, record):
    """Put *record* into the placeholder slot ``works[work_id]``.

    Raises:
        RuntimeError: if *work_id* is not a known work id (this includes
            ``None``, i.e. *source_id* had no identifier mapping).
        ValueError: if the slot is already populated.
    """
    if work_id not in works:
        raise RuntimeError("Could not find mapping for {}".format(source_id))
    if works[work_id]:
        raise ValueError("Found duplicate entry: {}".format(work_id))
    works[work_id] = record


def fetch_works():
    """Download all bibliographic records and index them by work id.

    Fetches the identifier mapping (``ids_url``), the main record files
    (``british_fiction_urls``) and the annex (``british_fiction_annex_url``).
    Every id in the identifier mapping gets a slot; each slot must be filled
    by exactly one record.

    Returns:
        collections.OrderedDict mapping int work id (ascending) to its
        record dict.  Each record carries ``'source'``, ``'source_id'`` and
        ``'id'`` (the work id) keys in addition to its downloaded fields.

    Raises:
        RuntimeError: if a record's source id cannot be mapped to a work id.
        ValueError: if two records map to the same work id, or a work id
            ends up with no record.
        requests.RequestException: if a download fails.
    """
    british_fiction_annex = _fetch_json(british_fiction_annex_url)
    ids_same_as = {int(k): v for k, v in _fetch_json(ids_url).items()}
    ids_same_as_reverse = {
        v['garside-raven-schöwerling']: k
        for k, v in ids_same_as.items()
        if v.get('garside-raven-schöwerling')
    }

    # populate works dictionary with empty placeholders
    works = collections.OrderedDict(
        (work_id, dict()) for work_id in sorted(ids_same_as)
    )

    for url in british_fiction_urls:
        for source_id, record in _fetch_json(url).items():
            # .get() rather than [] so an unmapped source id reaches the
            # descriptive RuntimeError instead of a bare KeyError.
            work_id = ids_same_as_reverse.get(source_id)
            record['source'] = 'garside-raven-schöwerling'
            record['source_id'] = record.pop('id')
            _store_work(works, work_id, source_id, record)

    # populate with extra records; annex keys look like "<source>/<source id>"
    for annex_id, record in british_fiction_annex.items():
        source, source_id = annex_id.split('/')
        record['source'] = source
        if source == 'novels-project':
            # the project's own ids are the work ids themselves
            source_id = int(source_id)
            work_id = source_id
        else:
            work_id = ids_same_as_reverse.get(source_id)
        record['source_id'] = source_id
        _store_work(works, work_id, source_id, record)

    # verify that all records are populated and add id to work dictionary
    for work_id, record in works.items():
        # invariant: keys were built with int() above
        assert isinstance(work_id, int)
        if not record:
            raise ValueError("Found no information for id {}".format(work_id))
        record['id'] = work_id
    return works
############################################################################
# inject volume information into relevant work metadata
############################################################################
def inject_volumes(volumes, works):
    """Attach on-disk volume metadata to the works that have any.

    For every work whose id is a key of *volumes*, sets
    ``work['volumes']`` to ``volumes[id]``.  Works without volumes are left
    untouched.  *works* is modified in place; nothing is returned.

    Each key of *works* is asserted to be an int equal to its record's
    ``'id'`` value.
    """
    for work_id, record in works.items():
        assert work_id == record['id']
        assert isinstance(work_id, int)
        if record['id'] not in volumes:
            continue
        record['volumes'] = volumes[record['id']]
############################################################################
# load bibliographic records into memory and inject volume information
############################################################################
# NB: works is a global variable (int work id -> bibliographic record),
# read by the /work/ endpoints
works = fetch_works()
_volumes_by_work = fetch_volumes()
inject_volumes(_volumes_by_work, works)
############################################################################
# load sha1->filename mapping into memory
############################################################################
# NB: texts is a global variable (sha1 hex digest -> filename), read by the
# /text/ endpoint
texts = fetch_texts()
############################################################################
# endpoints
############################################################################
# NOTE(review): asyncio.coroutine was removed in Python 3.11; it is kept here
# to match the generator-based style used throughout this file.
@asyncio.coroutine
def work(request):
    """Handle GET /work/ and GET /work/{id}.

    Without an ``id`` in the route, responds with every work as one JSON
    object.  With an ``id``, responds with that work's JSON record, or with
    404 Not Found when the id is not an integer or is unknown.
    """
    raw_id = request.match_info.get('id', None)
    if raw_id is None:
        payload = works
    else:
        try:
            work_id = int(raw_id)
        except ValueError:
            # the route accepts any string; a non-numeric id cannot name a
            # work, so answer 404 instead of failing with a server error
            return aiohttp.web.HTTPNotFound()
        if work_id not in works:
            # return an instance (not the class), as the text handler does
            return aiohttp.web.HTTPNotFound()
        payload = works[work_id]
    body = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True)
    return aiohttp.web.Response(text=body)
@asyncio.coroutine
def text(request):
    """Handle GET /text/{sha1}.

    Responds with the UTF-8 decoded contents of the file registered under
    ``sha1`` in the module-level ``texts`` mapping, or with 404 Not Found
    when the checksum is missing from the route or unknown.
    """
    sha1 = request.match_info.get('sha1', None)
    if sha1 is None or sha1 not in texts:
        return aiohttp.web.HTTPNotFound()
    # Close the file as soon as it has been read rather than leaking the
    # handle on every request.
    with open(texts[sha1], encoding='utf8') as f:
        body = f.read()
    return aiohttp.web.Response(text=body)
@asyncio.coroutine
def index(request):
    """Handle GET /: respond with a one-line pointer to https://novels.io."""
    return aiohttp.web.Response(text="See https://novels.io for details.")
@asyncio.coroutine
def init(loop, host='0.0.0.0', port=8080):
    """Build the aiohttp application and start listening.

    Args:
        loop: asyncio event loop that runs the application and the server.
        host: interface to bind to; defaults to all interfaces.
        port: TCP port to listen on; defaults to 8080.

    Returns:
        The server object produced by ``loop.create_server``.
    """
    app = aiohttp.web.Application(loop=loop)
    routes = (
        ('/', index),
        ('/work/', work),
        ('/work/{id}', work),
        ('/text/{sha1}', text),
    )
    for path, handler in routes:
        app.router.add_route('GET', path, handler)
    srv = yield from loop.create_server(app.make_handler(), host, port)
    logger.info("Server listening on http://{}:{}".format(host, port))
    logger.info("Serving {} works".format(len(works)))
    return srv
# Script entry point: start the server, then keep the event loop alive so it
# continues to serve requests.
if __name__ == "__main__":
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(init(event_loop))
    event_loop.run_forever()