This repository has been archived by the owner on Aug 17, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.js
127 lines (117 loc) · 4.15 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// This software is free software. See AUTHORS and LICENSE for more
// information on the copying conditions.
"use strict";
// Directory two levels above the one containing this file; only used
// below to locate node_modules (presumably the repository root -- confirm).
const basedir = `${__dirname}/../..`;
// file:// URL of the jQuery bundle under node_modules. It is handed to
// jsdom as the script to inject into every page we load.
const jquery = `file:///${basedir}/node_modules/jquery/dist/jquery.min.js`;
const jsdom = require("jsdom");
// URL of the roarmap advanced search page. It is both the first page we
// load and the base to which serialized form queries are appended.
const prefix = "https://roarmap.eprints.org/cgi/search/advanced";
const request = require("request");
// Value passed as the `timeout` option of every request we issue
// (presumably milliseconds, as in request's `timeout` option -- confirm).
const timeout = 5000;
// Basic building block to fetch URLs with timeout and SSL checks.
//
// Parameters:
//   url: the URL to retrieve.
//   callback(error, body): receives the error reported by the request
//     function, or an Error when the status code is not 200; otherwise
//     receives `null` and the response body.
//   requestIsh: optional replacement for `request` (e.g. a mock). It is
//     called as requestIsh(options, (error, response, body) => ...).
const fetch = exports.fetch = (url, callback, requestIsh) => {
  // Fall back to the real `request` only when no replacement was given.
  const doRequest = requestIsh === undefined ? request : requestIsh;
  const options = {
    uri: url,
    timeout: timeout,
  };
  doRequest(options, (error, response, body) => {
    if (error) {
      callback(error);
      return;
    }
    // Only a plain 200 counts as success; anything else is an error.
    if (response.statusCode === 200) {
      callback(null, body);
      return;
    }
    callback(new Error("invalid status code"));
  });
};
// Wrapper for jsdom.env() that uses our fetch() to retrieve data.
//
// Parameters:
//   url: the page to download.
//   cb: on download failure it is called with the error; otherwise it is
//     handed over to the jsdom environment function as its callback.
//   fetchIsh: optional replacement for fetch(), called as
//     fetchIsh(url, (error, body) => ...).
//   jsdomEnvIsh: optional replacement for jsdom.env(), called as
//     jsdomEnvIsh(body, scripts, cb).
const jsdomWrap = exports.jsdomWrap = (url, cb, fetchIsh, jsdomEnvIsh) => {
  const download = fetchIsh === undefined ? fetch : fetchIsh;
  const buildDom = jsdomEnvIsh === undefined ? jsdom.env : jsdomEnvIsh;
  download(url, (error, body) => {
    if (!error) {
      // XXX: I initially tried to specify timeout using the `pool`
      // parameter of jsdom.env()'s config but failed.
      // Note: the following does not use the network as long as
      // the `jquery` parameter references a file.
      buildDom(body, [jquery], cb);
      return;
    }
    cb(error);
  });
};
// Scrape all institutions in JSON format from roarmap website.
//
// The scrape proceeds in four steps: load the advanced search form, tick
// the checkboxes of every institution type and submit the form, select
// the JSON export on the results page and submit the export form, and
// finally validate the exported text and pass it to the caller.
//
// Parameters:
//   callback(err, json): called with an error as soon as a page fails to
//     load, an expected form is missing, or the exported text is not
//     valid JSON. On success it is called with `undefined` and the
//     exported JSON *text* (validated with JSON.parse, but not parsed).
//   jsdomWrapIsh: optional replacement for jsdomWrap (e.g. a mock),
//     called as jsdomWrapIsh(url, (err, window) => ...).
const scrape = exports.scrape = (callback, jsdomWrapIsh) => {
  // TODO: effective mocking of jsdomWrap seems not super trivial.
  const loadPage = jsdomWrapIsh === undefined ? jsdomWrap : jsdomWrapIsh;

  // Load `url` and hand the resulting window to `next`; a load failure
  // is reported to `callback` and stops the scrape.
  const load = (url, next) => {
    loadPage(url, (err, window) => {
      if (err) {
        callback(err);
        return;
      }
      console.log("ok");
      next(window);
    });
  };

  // Submit `form` (a jQuery selection): the query URL is built by hand
  // from the serialized form and loaded from the submit handler, then
  // the loaded window is passed to `next`.
  const submit = (form, description, next) => {
    // On an empty selection form.submit() would fire no handler at all
    // and `callback` would never be called, leaving the caller hanging.
    // Mocks that do not define `length` are unaffected by this check.
    if (form.length === 0) {
      callback(new Error(`${description} not found`));
      return;
    }
    const url = `${prefix}?${form.serialize()}`;
    form.on("submit", () => {
      console.log("on submit");
      load(url, next);
    });
    console.log("submit:", url);
    form.submit();
  };

  // 1. Load advanced search form
  console.log("load:", prefix);
  load(prefix, (searchWindow) => {
    // 2. Do advanced search for all institution types
    const institutionTypes = [
      "funder", "research_org", "funder_and_research_org",
      "multiple_research_orgs", "research_org_subunit"
    ];
    searchWindow.$("input.ep_form_checkbox").each((_, elem) => {
      if (institutionTypes.indexOf(elem.value) >= 0) {
        console.log("click:", elem.value);
        elem.click();
      }
    });
    const searchForm = searchWindow.$("form[action='/cgi/search/advanced']");
    submit(searchForm, "advanced search form", (resultsWindow) => {
      // 3. Select export to JSON
      resultsWindow.$("select[name='output']").val("JSON");
      const exportForm = resultsWindow.$(
        "form input.ep_form_action_button[value='Export']")
        .parents("form");
      submit(exportForm, "export form", (exportWindow) => {
        // 4. Pass JSON to caller
        const text = exportWindow.document.documentElement.textContent;
        // Parse only to validate; the caller receives the raw text.
        try {
          JSON.parse(text);
        } catch (err) {
          callback(err);
          return;
        }
        callback(undefined, text);
      });
    });
  });
};