-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlgrabr.ts
104 lines (87 loc) · 2.53 KB
/
htmlgrabr.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import fetch, { Headers, Request } from 'node-fetch'
import { URL } from 'url'
import { promisify } from 'util'
import * as DOM from './dom_handler'
import { clean } from './dom_cleaner'
import { isBlacklisted, BlacklistCtrlFunc } from './blacklist';
const pretty = require('pretty')
const h2p = require('html2plaintext')
const readability = promisify(require('node-readability'))
/**
 * Configuration accepted by {@link HTMLGrabr}.
 * All fields are optional; missing fields are filled from DefaultConfig.
 */
interface GrabberConfig {
  // When true, cleaning runs in debug mode (passed through to the DOM cleaner).
  debug?: boolean
  // Predicate deciding whether a URL/host is blacklisted and should be stripped.
  // NOTE(review): field name looks like a typo for `isBlacklisted` (it is
  // assigned the imported `isBlacklisted` function by default) — kept as-is
  // because renaming would break existing callers of this public interface.
  isBacklisted?: BlacklistCtrlFunc
  // HTTP headers sent with outgoing requests in grabUrl().
  headers?: Headers
}
/**
 * Result of grabbing a page: extracted content plus metadata.
 */
interface GrabbedPage {
  // Page title (Open Graph og:title when present, else Readability's title).
  title: string
  // Canonical page URL (Open Graph og:url or the document base URL), or null.
  url: string | null
  // Lead image URL from Open Graph og:image, or null when absent.
  image: string | null
  // Cleaned, pretty-printed HTML of the main content.
  html: string
  // Plain-text rendering of the main content.
  text: string
  // Metadata for images found in the cleaned document.
  images: DOM.ImageMeta[]
}
// Defaults merged under any user-supplied config in the HTMLGrabr constructor.
const DefaultConfig: GrabberConfig = {
  debug: false,
  // Default blacklist predicate imported from ./blacklist.
  isBacklisted: isBlacklisted,
  headers: new Headers({
    // Identify ourselves politely to remote servers.
    'User-Agent': 'Mozilla/5.0 (compatible; HTMLGrabr/1.0)',
  })
}
/**
 * Extracts the main content of an HTML page (from a string or a remote URL),
 * cleans it, and returns structured metadata alongside HTML and plain text.
 */
export default class HTMLGrabr {
  config: GrabberConfig

  /**
   * Creates a grabber.
   * @param config partial configuration; any field not provided falls back
   *               to the corresponding DefaultConfig value
   */
  constructor(config: GrabberConfig = DefaultConfig) {
    this.config = { ...DefaultConfig, ...config }
  }

  /**
   * Grabs the content of a page from HTML text.
   * @param html a string that contains HTML code
   * @returns a page object
   */
  async grab(html: string): Promise<GrabbedPage> {
    // Use Readability.js to extract the main article from the raw HTML
    const article = await readability(html)
    const doc = article.document

    // Extract base URL (undefined when the document declares none)
    const baseURL = DOM.extractBaseUrl(doc) || undefined

    // Extract Open Graph properties (og:title, og:url, og:image, ...)
    const ogProps = DOM.extractOpenGraphProps(doc)

    // Clean the DOM in place (debug flag and blacklist predicate pass through)
    clean(doc, { baseURL, debug: this.config.debug, isBacklisted: this.config.isBacklisted })

    // Extract images, seeding with the Open Graph image if any
    const images = DOM.extractImages(doc, ogProps.image)

    // Fall back to the cleaned <body> when Readability produced no content
    let content = doc.body.innerHTML
    if (article.content) {
      content = article.content
    }

    return {
      title: ogProps.title || article.title,
      // FIX: append `|| null` so the value conforms to GrabbedPage.url
      // (string | null) instead of leaking `undefined` when neither the
      // Open Graph URL nor a base URL is available.
      url: ogProps.url || baseURL || null,
      image: ogProps.image,
      html: pretty(content, { ocd: true }),
      text: h2p(content),
      images,
    }
  }

  /**
   * Grabs the content of a remote HTML page.
   * @param url the URL to fetch and process
   * @returns a page object
   * @throws Error when the HTTP response status is not OK (non-2xx)
   */
  async grabUrl(url: URL): Promise<GrabbedPage> {
    const req = new Request(url.toString(), {
      headers: this.config.headers,
    })
    const res = await fetch(req)
    if (!res.ok) {
      throw new Error(`bad status response: ${res.statusText}`)
    }
    const body = await res.text()
    const result = await this.grab(body)
    // When the page itself exposes no canonical URL, use the one we fetched
    if (!result.url) {
      result.url = url.toString()
    }
    return result
  }
}