/
generator.js
451 lines (420 loc) · 16.9 KB
/
generator.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
import {Transform} from "stream"
import path from "path"
import fs from 'fs-extra'
import osm_parser from 'osm-pbf-parser'
import stringify from "json-stringify-pretty-compact"
import * as R from "ramda"
import { equivalentStreet } from "./equivalents.js"
import {stripAffixes} from "./affixes.js"
import { minEponymFrequency } from "./regions.js"
import { readFile, writeFile } from "fs/promises"
import M from 'mustache'
// Absolute path of the downloaded osm pbf extract for the given country.
export const osmPath = country =>
  path.resolve("data/osm_data", `${country}-latest.osm.pbf`)
// Absolute path of the inspection dump produced by inspectOsmData.
const inspectPath = country =>
  path.resolve("out", `${country}-inspect.json`)
// RW json data
// Curried writer: serialize `data` as compact-pretty json into `file`.
const write = file => data =>
  fs.writeFileSync(file, stringify(data, { maxLength: 120 }))
// Parse the json contents of `file` (readFileSync accepts the same extra
// options, forwarded as-is).
const read = (...args) => JSON.parse(fs.readFileSync(...args))
// RW out files
// NOTE(review): fs-extra's ensureDirSync returns undefined when the directory
// already exists, so rawPath then resolves to "./raw" rather than
// "./data/raw" — confirm this is the intended location of the raw dumps.
const rawPath = path.resolve(fs.ensureDirSync("data/raw") || "raw")
const rawFile = country => path.resolve(rawPath, `${country}.json`)
const readRaw = country => read(rawFile(country))
// Per-country eponyms files, data/countries/<country>.json.
const eponymsPath = "data/countries"
const eponymsFile = country => path.resolve(eponymsPath, `${country}.json`)
const writeEponyms = country => write(eponymsFile(country))
const readEponyms = country => read(eponymsFile(country))
// Shared persons db: wiki info for every person, keyed by wiki url.
const writePersons = write("data/persons.json")
const readPersons = () => read("data/persons.json")
// Get the file modified date from the OS; good enough to get a glimpse of the
// freshness of osm data; the alternative is to extract the modified date from
// the osm file (more complicated). Returns e.g. "Apr 03 2024" (the weekday
// prefix of toDateString is sliced off).
const modifiedDateOSM = country =>
  fs.statSync(osmPath(country)).mtime.toDateString().substring(4)
// Explore the osm data. I've used this to explore what osm fields contain
// street data. Streams the country's pbf file, keeps the entries whose
// serialized JSON matches `regex` (case-insensitive) and appends them,
// pretty-printed, to out/<country>-inspect.json. The output file is reset on
// every call.
export const inspectOsmData = (country, regex) => fs.createReadStream(osmPath(country))
  .on("open", () => fs.writeFileSync(inspectPath(country), "create new file"))
  .pipe(new osm_parser())
  .pipe(new Transform({
    objectMode: true,
    transform: (chunk, _encoding, callback) =>
      R.pipe(
        // Match against the serialized entry so the regex can target keys
        // and values alike.
        R.map(JSON.stringify),
        R.filter(R.test(new RegExp(regex, "i"))),
        R.map(JSON.parse),
        // Ignore non-interesting fields
        R.map(R.omit(["id", "lat", "lon", "info", "refs", "members"])),
        out => out.length > 0 ?
          fs.appendFileSync(inspectPath(country), JSON.stringify(out, null, 2))
          : "ignore",
        // Nothing is pushed downstream — this pipeline is side-effect only.
        () => callback(null, null)
      )(chunk)
  }))
// Stream the osm pbf extract for `country` and dump every [city, street]
// pair into data/raw/<country>.json. The json array is written incrementally:
// "[" plus a metadata header here, one ",\n[city, street]" per entry on
// "data", and the closing "]" on "finish".
export const extractOsmData = (country) => {
  const streets = fs.createWriteStream(rawFile(country))
  // Add metadata; only the osm last modified date, for now
  streets.write(`[["${modifiedDateOSM(country)}"]`)
  fs.createReadStream(osmPath(country))
    .pipe(new osm_parser())
    .pipe(new Transform({
      objectMode: true,
      transform: (chunk, _encoding, callback) =>
        R.pipe(
          // Keep only pbf entries of type `node` and `way`. If other pbf entries
          // are found to contain relevant street name data, include them here.
          R.filter(R.compose(
            R.includes(R.__, ["node", "way"]),
            R.prop("type")
          )),
          // Extract the `tags` key. If it exists, this key contains the street
          // names. If not, ignore this entry.
          R.filter(R.prop("tags")),
          R.map(R.prop("tags")),
          // There are multiple keys that contain the city+street name duo. Check
          // all known combinations on each entry and keep the ones that are valid.
          // If, in the future, other combinations are found to contain relevant
          // street name data, include them here.
          R.chain(R.juxt([
            R.props(["is_in:city", "name"]),
            R.props(["addr:city", "addr:street"])
          ])),
          // Drop pairs where either the city or the street is missing.
          R.reject(R.any(R.isNil)),
          // Pass the transformed chunk to the next stage.
          out => callback(null, out)
        )(chunk)
    }))
    .on("data", R.pipe(
      // Dedupe within this chunk only; cross-chunk duplicates are removed
      // later, in parseOsmData.
      R.uniqBy(R.join("-")),
      R.map(s => streets.write(",\n" + stringify(s)))
    ))
    .on("finish", () => {
      streets.write("]")
      streets.close()
    })
}
// Transform the raw [city, street] dump of `country` into a sorted
// [street, count, wikiLink] eponyms file, keeping the links already assigned
// in a previous run (see hydrateStreets).
export const parseOsmData = (country) => R.pipe(
  readRaw,
  // Skip the osm modified date
  R.tail,
  // Strip affixes and replace with equivalents; keep city unchanged
  R.map(R.adjust(1, R.compose(equivalentStreet(country), stripAffixes(country)))),
  // Even without the previous step, but moreso after it, there will be
  // identical [city, name] entries. I've decided to allow only a single
  // eponym per city, even though there might be eponyms for streets, squares
  // or other landmarks representing the same person, all in the same city.
  // This is a fair defense against misspelled streets or ones tagged multiple
  // times with slightly different affixes or equivalents, all in the same
  // city. Counting all these would be a mistake. Still, in some instances,
  // the city names themselves are spelled differently (with or without
  // cedilla, for example).
  R.uniqBy(R.join("-")),
  // The city name has served its purpose, discard it.
  R.map(R.prop(1)),
  // Count identical street names.
  R.groupBy(R.identity),
  R.mapObjIndexed(R.length),
  // Transform to [streetName, count] array
  R.toPairs,
  // Remove street names of 3 characters or fewer (numbers, single letters,
  // abbreviations) to reduce the output file size; more than likely these do
  // not designate persons. Note: it might not apply for China/Korea/Taiwan/etc.
  R.reject(R.compose(
    R.gte(3),          // i.e. 3 >= length(streetName)
    R.length,
    R.prop(0))
  ),
  // Protect against garbage entries and very long files (lots of streets):
  // drop names that appear fewer than minEponymFrequency times.
  R.reject(R.compose(
    R.gt(minEponymFrequency(country)),
    R.prop(1))
  ),
  // Sort by most frequent street names first.
  R.sortWith([R.descend(R.prop(1))]),
  hydrateStreets(country),
  // Add back the osm modified date
  R.prepend(R.head(readRaw(country))),
  writeEponyms(country),
)(country)
// Parse the eponyms json file of the given country.
const readCountry = country =>
  JSON.parse(fs.readFileSync(eponymsFile(country)))
// For every [street, count] entry, append the wikipedia link recorded in a
// previous version of the country's eponyms file (empty string when the
// street is new). Returns `streets` untouched when no previous file exists.
// NOTE(review): R.find scans prevStreets once per entry — O(n*m); fine for
// current file sizes.
const hydrateStreets = (country) => streets =>
  fs.existsSync(eponymsFile(country)) ?
    R.pipe(
      readEponyms,
      prevStreets =>
        R.map(entry =>
          R.pipe(
            // Locate the previous entry with the same street name.
            R.find(R.compose(R.equals(R.head(entry)), R.head)),
            // Add the existing link, if it exists. When adding links by hand,
            // it might happen that the url is encoded during copy/paste
            // (russia/bulgaria/etc); make sure to save it as decoded to save
            // space and visuals.
            prevStreet => R.append(prevStreet ? decodeURI(prevStreet[2]) : "", entry)
          )(prevStreets),
          streets)
    )(country)
    : streets
// Gather all persons from all countries and make a summary of the most frequent
// persons and the total number of streets they appear on. Returns entries of
// shape [wikiLink, entriesCount, streetsTotal].
const worldwideEponyms = () => R.pipe(
  // Country names, from the json files in data/countries.
  R.compose(R.map(R.replace(".json", "")), fs.readdirSync),
  // Concatenate all country entries; drop each file's last-updated header.
  R.chain(R.compose(R.tail, readCountry)),
  // Keep only streets with a person's wiki link attached.
  R.filter(R.compose(R.startsWith("http"), R.prop(2))),
  // Group by link: the same person may appear in multiple countries.
  R.groupBy(R.prop(2)),
  // Per person: [number of entries it appears in, total street count].
  R.mapObjIndexed(R.juxt([R.length, R.reduce((acc, el) => acc + el[1], 0)])),
  R.toPairs,
  // [link, [a, b]] -> [link, a, b]
  R.map(R.flatten),
  // Most frequent first.
  R.sortBy(R.prop(1)),
  R.reverse
)(eponymsPath)
// Check the `country`.json file for same link assigned to multiple entries. If
// found, manually include them under a single person in the equivalents
// section. Returns [link, occurrences] pairs.
// NOTE: R.propEq is used with the Ramda >= 0.29 argument order (value first,
// then property name).
export const linkDups = (country) => R.pipe(
  readCountry,
  R.groupBy(R.prop(2)),
  R.mapObjIndexed(R.length),
  R.toPairs,
  // Keep only links assigned more than once...
  R.reject(R.propEq(1, 1)),
  // ...and ignore entries with no link at all.
  R.reject(R.propEq('', 0))
)(country)
// Run linkDups over every country file; return the names of the countries
// that have at least one duplicated link.
export const linkDupsAll = () => R.pipe(
  R.compose(R.map(R.replace(".json", "")), fs.readdirSync),
  // Pair each country name with its duplicate report.
  R.map(R.juxt([R.identity, linkDups])),
  // Drop countries whose report is empty (propEq value-first, Ramda >= 0.29).
  R.reject(R.propEq([], 1)),
  R.map(R.head)
)(eponymsPath)
// Check if, for the given country name, any of the links contain special
// characters or do not contain wikipedia urls. Return them, if they do.
export const linksConsistency = R.pipe(
  readCountry,
  // Skip the last-updated header entry.
  R.tail,
  R.map(R.prop(2)),
  R.reject(R.isEmpty),
  // R.match returns [] on no match, so this double negative KEEPS links that
  // either contain a literal '%' (still url-encoded) or that do not mention
  // "wikipedia" anywhere.
  R.reject(R.compose(R.isEmpty, R.match(/%|^((?!wikipedia).)*$/i))),
)
// Check if any of the countries fail to pass the link consistency
// checks. Return the names of those that do not pass.
export const linksConsistencyAll = () => R.pipe(
  R.compose(R.map(R.replace(".json", "")), fs.readdirSync),
  // Pair each country with its offending links.
  R.map(R.juxt([R.identity, linksConsistency])),
  // Keep countries with at least one offending link (propEq value-first).
  R.reject(R.propEq([], 1)),
  R.map(R.head)
)(eponymsPath)
// Point-free promise helper: lift `fn` into a .then() call.
const then = fn => promise => promise.then(fn);
// Resolve after `ms` milliseconds.
const delay = ms => new Promise(resolve => setTimeout(resolve, ms))
// Upcase the first letter of `str`; empty strings pass through unchanged.
const upCase = str => str.slice(0, 1).toUpperCase() + str.slice(1);
// Given a valid wikipedia url, return info about the page, such as name, image
// and a summary. Resolves to { url, name, image, summary }; `image` falls
// back to a local placeholder when the page has no thumbnail.
const wiki = async url => {
  // Last path segment, e.g. ".../wiki/Mihai_Eminescu" -> "Mihai_Eminescu".
  const page_name = R.compose(R.last, R.split("/"))(url)
  // Subdomain, e.g. "https://ro.wikipedia.org/..." -> "ro".
  const page_language = R.pipe(
    R.split("//"),
    R.prop(1),
    R.split("."),
    R.head
  )(url)
  // Poor man's rate limiter to avoid the 200 requests / second limit for Wiki API
  // For really big countries, if this still doesn't work, temporarily remove
  // part of the streets in the country.json file and retrieve the data in pieces.
  // NOTE(review): callers start these concurrently and this delay is ~5s
  // +-0.2s for every call, so requests still land close together — confirm
  // this spreads the load as intended.
  await delay((Math.random() + 25) * 200)
  return fetch(`https://${page_language}.wikipedia.org/api/rest_v1/page/summary/${page_name}`)
    .then(v => v.json())
    .then(r => ({
      url,
      name: R.prop("title", r),
      image: R.pipe(
        R.propOr("", "thumbnail"),
        // Less popular wiki entries sometimes don't have a picture.
        R.propOr("../placeholder.png", "source"))
      (r),
      summary: R.propOr("", "extract")(r)
    }))
}
// Apply the html template to country and the list of persons and save it.
// Returns a promise that settles when the page has been written.
const applyHtmlTemplate = country => async persons => {
  const template = await readFile("./data/template.html", { encoding: 'utf8' })
  return writeFile(`./data/html/${country}.html`, M.render(template, persons), 'utf8')
}
// Generate an html page with all the eponyms, wiki summary, wiki link and
// thumbnail for the given country.
const htmlPage = country => entries => {
  // Read the persons db once. The previous version called readPersons()
  // inside the R.map below, i.e. one synchronous file read + JSON.parse per
  // street entry.
  const personsDb = readPersons()
  return R.pipe(
    // Add extra info, like summary and image from the persons db
    R.map(e => ({
      url: e[0],
      count: e[1],
      ...personsDb[e[0]]
    })),
    // Merge all keywords into `keywords`
    R.map(person => {
      person.keywords = R.pipe(
        R.props(["keywords", "keywords_extra"]),
        R.flatten,
        addKeywordsEquivalents,
        R.flatten,
        R.reject(R.isNil),
        R.uniq
      )(person)
      return person
    }),
    R.map(R.omit(["keywords_extra"])),
    // If the person is not in the persons.json db, do not include it.
    R.filter(R.has("name")),
    R.applySpec({
      country: () => upCase(country),
      persons_count: R.length,
      streets_count: R.compose(R.sum, R.map(R.prop("count"))),
      keywords: R.compose(keywordsCount, R.map(R.flatten), R.map(R.props(["keywords"]))),
      // Sometimes the summary is too short and it looks weird on the page
      persons: R.map(R.evolve({ summary: str => str.padEnd(100, ' ') }))
    }),
    applyHtmlTemplate(country),
  )(entries)
}
// Build the html page for a single country from its eponyms file.
export const htmlPageCountry = country => {
  const eponyms = readEponyms(country)
  // Drop the modified-date header and streets not named after a person
  // (empty link in position 2).
  const named = R.reject(R.compose(R.isEmpty, R.prop(2)), R.tail(eponyms))
  // Keep [url, count] only: drop the street name, then put the url first.
  const entries = R.map(R.compose(R.reverse, R.tail), named)
  return htmlPage(country)(entries)
}
// Regenerate the html page of every country that has an eponyms file.
export const htmlPageAllCountries = () =>
  fs.readdirSync(eponymsPath)
    .map(file => file.replace(".json", ""))
    .map(htmlPageCountry)
// Build the worldwide summary page.
export const htmlPageWorldwide = () => {
  // There are over 10000 entries; keep only persons appearing on 3+ entries.
  const frequent = worldwideEponyms().filter(entry => entry[1] > 2)
  return htmlPage("worldwide")(frequent)
}
// manually added
// psychologist
const ignore = ["she", "female"]
// Broad domains mapped to the narrower occupations they subsume. The values
// must match the spellings in the `keywords` list exactly, otherwise they can
// never fire (matching is plain string inclusion in addKeywordsEquivalents).
// Fixed typos that broke matching: "guerrilla figther" -> "guerrilla fighter",
// "violonist" -> "violinist", "footbaler" -> "footballer".
const equivalents = {
  artist: ["music", "dramatist", "actor", "painter", "sculptor", "theatre", "film director", "screenwriter"],
  music: ["organist", "conductor", "singer", "songwriter", "soprano", "tenor",
    "violinist", "guitarist", "composer"],
  military: ["colonel", "general", "marshal", "commander", "privateer",
    "admiral", "lieutenant", "guerrilla fighter", "officer", "conquistador"],
  writer: ["journalist", "poet", "novelist", "philosopher", "playwright", "historian", "folklorist", "translator", "publicist", "essayist", "dramatist"],
  woman: ["she", "noblewoman", "female", "nun", "sister", "duchess", "actress"],
  ruler: ["king", "queen", "voivode", "duke", "count", "sultan", "caliph", "chancellor"],
  religion: ["pastor", "theologian", "patriarch", "saint", "cleric", "abbot",
    "prelate", "bishop", "monk", "nun", "apostle", "archbishop"],
  scientist: ["anatomist", "archaeologist", "bacteriologist", "biochemist",
    "physicist", "entomologist", "zoologist", "agronomist", "architect",
    "astronomer", "biologist", "botanist", "chemist"],
  sport: ["athlete", "abbot", "footballer", "runner", "bicycle racer", "cyclist",
    "racing driver", "tennis player", "swimmer", "gymnast", "boxer"]
}
// All known occupation/attribute keywords searched for in the wiki summaries
// (see keywordsFromSummary). Multi-word entries are matched as whole phrases.
// Kept roughly alphabetical for easier manual maintenance.
const keywords = ["abbot", "actor", "activist", "actress", "admiral", "agronomist",
  "alchemist", "anarchist", "anatomist", "apostle", "archaeologist",
  "archbishop", "architect", "artist", "astrologer", "astronomer", "athlete",
  "author", "aviator", "bacteriologist", "biochemist", "businessman",
  "biologist", "bishop", "botanist", "boxer", "caliph", "cartographer",
  "chancellor", "cleric", "colonel", "conductor", "commander", "composer",
  "conquistador", "cosmonaut", "chemist", "cyclist", "designer", "diplomat",
  "doctor", "dramatist", "duchess", "duke", "economist", "educator", "engineer",
  "emperor", "entomologist", "entrepreneur", "essayist", "ethnologist",
  "explorer", "female", "filmmaker", "film director", "folklorist",
  "footballer", "friar", "general", "geographer", "geologist", "guerrilla fighter",
  "hajduk", "hero", "historian", "illustrator", "industrialist",
  "inventor", "organist", "jazz", "journalist", "judge", "jurist", "king",
  "knight", "lawyer", "legendary", "librarian", "linguist", "lieutenant",
  "magistrate", "marshal", "martyr", "mathematician", "mayor", "merchant",
  "microbiologist", "military", "missionary", "monk", "musician",
  "musicologist", "mythology", "nationalist", "naturalist", "navigator",
  "neurologist", "nobleman", "noblewoman", "novelist", "officer", "orator",
  "pastor", "painter", "partisan", "patriarch", "patriot", "pedagogue",
  "pharmacist", "philanthropist", "philologist", "philosopher", "photographer",
  "physician", "physicist", "pianist", "pilot", "playwright", "poet",
  "polymath", "politician", "preacher", "prelate", "president", "priest",
  "prince", "princess", "printmaker", "prime minister", "privateer",
  "professor", "publicist", "queen", "resistance fighter", "racing driver",
  "revolutionary", "risorgimento", "ruler", "sailor", "saint", "scholar",
  "scientist", "screenwriter", "sculptor", "she", "singer", "sociologist",
  "soldier", "songwriter", "soprano", "statesman", "sultan", "surgeon",
  "teacher", "tenor", "tennis player", "theatre", "theologian",
  "trade unionist", "union leader", "translator", "violinist", "voivode", "woman",
  "writer", "zoologist"
]
// Some keywords/occupations are subdomains of a higher domain, like a poet is
// a writer, too. Append, for every key, the list of parent domains (from
// `equivalents`) whose members include it; the caller flattens the result.
const addKeywordsEquivalents = keys => R.concat(keys,
  keys.map(key =>
    R.uniq(
      Object.entries(equivalents)
        .filter(([, subdomains]) => subdomains.includes(key))
        .map(([domain]) => domain))))
// Fetch wiki info (name, image, summary) for every linked person in the
// country's eponyms file, merge it into data/persons.json and refresh the
// keywords of the whole db. Returns a promise.
const updatePersonsDb = (country) => R.pipe(
  readEponyms,
  // Skip date
  R.tail,
  // Skip street names not named after a person
  R.reject(R.compose(R.isEmpty, R.prop(2))),
  // Send the url to wiki(). All requests start concurrently; the only
  // throttling is the random delay inside wiki() itself.
  R.map(R.compose(wiki, R.prop(2))),
  v => Promise.all(v),
  // Key each result by its wiki url. The value holds the image and the
  // summary of that person, as seen on wikipedia.
  then(R.map(w => ({
    [w.url]: R.omit(["url"], w)
  }))),
  // Add or update the persons db
  then(R.mergeAll),
  // mergeDeepRight: the freshly fetched data wins over existing db entries.
  then(R.mergeDeepRight(readPersons())),
  then(writePersons),
  then(keywordsUpdate)
)(country)
// Extract, from a wiki summary, every known occupation keyword it mentions,
// lowercased. A keyword must stand alone: delimited by a space, punctuation
// or the string boundaries, so "general" does not match "generally".
const keywordsFromSummary = (str) => R.pipe(
  // The previous regex required a leading space AND a trailing delimiter,
  // which silently missed keywords at the very start or end of the summary;
  // anchor on the string boundaries as well.
  R.map(k => str.match(new RegExp("(^| )" + k + "( |,|\\.|;|$)", "i"))),
  // Drop keywords with no match. (Previously R.filter(R.empty), which only
  // worked by accident: R.empty(matchArray) is [] — truthy — while
  // R.empty(null) is undefined — falsy.)
  R.reject(R.isNil),
  // The matched string
  R.map(R.prop(0)),
  R.map(R.trim),
  R.map(R.replace(/(,|\.|;)/, "")),
  R.map(s => s.toLowerCase())
)(keywords)
// Recompute the `keywords` field of every person in the db from its summary
// and persist the result.
const keywordsUpdate = () => {
  const persons = readPersons()
  const updated = R.map(
    person => R.assoc("keywords", keywordsFromSummary(person.summary), person),
    persons)
  return writePersons(updated)
}
// keywords - array of arrays of strings (one list per person).
// Count, for each unique keyword, how many of those lists contain it.
// Returns [{keyword, count}, ...] sorted by count, descending.
const keywordsCount = (keywords) => R.pipe(
  // Drop nil rows before flattening...
  R.reject(R.isNil),
  R.flatten,
  // ...and nil leaves surfaced by the flatten.
  R.reject(R.isNil),
  R.uniq,
  // Seed a {keyword: 0} object just to iterate the unique keywords by key.
  a => R.zipObj(a, R.repeat(0, a.length)),
  R.mapObjIndexed((_, key) => R.pipe(
    // For each person's list: does it include this keyword?
    R.map(R.includes(key)),
    R.count(R.equals(true)),
    R.applySpec({
      keyword: R.always(key),
      count: R.identity,
      // percent: c => ((c / keywords.length) * 100).toFixed(1)
    })
  )(keywords)),
  R.values,
  R.sortBy(R.prop("count")),
  R.reverse,
)(keywords)