-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.js
130 lines (115 loc) · 3.71 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
// const cheerio = require('cheerio');
const puppeteer = require('puppeteer');
var fs = require("fs");
const tabletojson = require('tabletojson').Tabletojson;
module.exports = class Scraper{
url = ''
tableIndex = 0;
replaces= [];
constructor(url,tableIndex,replaces){
this.url = url;
this.tableIndex = tableIndex;
this.replaces = replaces;
this.getPageContent = this.getPageContent.bind(this);
this.tableToJson = this.tableToJson.bind(this);
this.replace = this.replace.bind(this);
this.replaceAll = this.replaceAll.bind(this);
this.jsonToCurrencies = this.jsonToCurrencies.bind(this);
}
async getPageContent(){
var url = this.url;
return await new Promise((res,rej) => {
puppeteer
.launch()
.then(browser => {
console.log('browser launched')
return browser.newPage()})
.then(page => {
page.setDefaultNavigationTimeout(50000); // change timeout to 50 s
console.log('loading page')
return page.goto(url,{
waitUntil: 'networkidle0',
}).then(() => page)
})
.then((page) => {
console.log('getting page content')
return page.content();
})
.then(html => {
res(html);
})
.catch(rej);
})
}
tableToJson(html){
return tabletojson.convert(html)[this.tableIndex];
}
replace(val){
if(val)
this.replaces.forEach(({from,to, exactly, maxCharDiff}) => {
val = val.trim();
if(exactly && val === from) {
val = to;
}
else{
var replacedVal = val;
replacedVal = replacedVal.replace(from, to)
if(!exactly && (val.includes(from) || replacedVal.includes(to))){
if(Math.abs(val.length - from.length) <= maxCharDiff || Math.abs(replacedVal.length - to.length) <= maxCharDiff){
val = to;
}
}
}
})
return val.trim();
}
replaceAll(json){
return json.map(row => {
var newRow = {};
for(var key in row) {
newRow[this.replace(key)] = this.replace(row[key]);
}
return newRow;
})
}
jsonToCurrencies(replacedJson){
// return replacedJson;
var json = {};
//isNaN: isNaN(val),
var validCurrencies = [
'CAD',
'USD',
'AUD',
'EURO'
];
var validHeaders = [
'Date',
'Buy',
'Sell'
];
for(var row of replacedJson){
if(validCurrencies.includes(row.Currency)){
var newRow = {};
for(var key in row){
if(validHeaders.includes(key)){
newRow[key] = row[key];
}
}
json[row.Currency] = newRow;
}
}
return json;
}
async getCurrencies(){
return await this.getPageContent()
.then(this.tableToJson)
.then(this.replaceAll)
.then(this.jsonToCurrencies)
}
async getCurrenciesFromHTML(html){
var json = this.tableToJson(html);
json = this.replaceAll(json);
json = this.jsonToCurrencies(json);
return json;
}
}