Skip to content

Commit e370770

Browse files
committed
feat: add builtin adblock
1 parent 6318f56 commit e370770

File tree

6 files changed

+137
-38
lines changed

6 files changed

+137
-38
lines changed

README.md

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Although you can think [puppeteer](https://github.com/GoogleChrome/puppeteer) co
1919
- Sensible defaults, aborting unnecessary requests based on what you are doing (e.g., aborting image requests if you just want to get [`.html`](#htmlurl-options) content).
2020
- Privacy by default, blocking tracker requests.
2121
- Easily create a pool of instances (via [`@browserless/pool`](#pool-of-instances)).
22-
- Built-in AdBlocker ([soon](https://github.com/Kikobeats/browserless/issues/26)).
22+
- Built-in adblocker for aborting ad requests.
2323

2424
## Install
2525

@@ -165,12 +165,12 @@ default: `['image', 'media', 'stylesheet', 'font', 'xhr']`
165165

166166
A list of `resourceType` requests that can be aborted in order to make the process faster.
167167

168-
##### abortTrackers
168+
##### adblock
169169

170170
type: `boolean`</br>
171171
default: `true`
172172

173-
It will be abort request coming for [tracking domains](https://npm.im/is-tracking-domain).
173+
It will abort requests detected as ads.
174174

175175
### .text(url, options)
176176

@@ -366,21 +366,14 @@ The target URL
366366
type: `string`</br>
367367
default: `[]`
368368

369-
A list of `req.resourceType()` to be blocked.
369+
A list of `req.resourceType()` to be aborted.
370370

371-
##### abortTrackers
371+
##### adblock
372372

373373
type: `boolean`</br>
374374
default: `true`
375375

376-
It will be abort request coming for [tracking domains](https://npm.im/is-tracking-domain).
377-
378-
##### abortTrackers
379-
380-
type: `boolean`</br>
381-
default: `true`
382-
383-
It will be abort request coming for [tracking domains](https://npm.im/is-tracking-domain).
376+
It will abort requests detected as ads.
384377

385378
##### waitFor
386379

packages/browserless/src/index.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,14 @@ module.exports = ({
6262
const evaluate = fn =>
6363
wrapError(page => async (url, opts = {}) => {
6464
const {
65-
abortTrackers = true,
65+
adblock = true,
6666
abortTypes = ['image', 'imageset', 'media', 'stylesheet', 'font', 'object', 'sub_frame'],
6767
...args
6868
} = opts
6969

7070
const response = await goto(page, {
7171
url,
72-
abortTrackers,
72+
adblock,
7373
abortTypes,
7474
...args
7575
})
@@ -78,8 +78,8 @@ module.exports = ({
7878
})
7979

8080
const screenshot = wrapError(page => async (url, opts = {}) => {
81-
const { device = 'macbook pro 13', type = 'png', viewport, ...args } = opts
82-
await goto(page, { url, device, ...args })
81+
const { adblock = true, device = 'macbook pro 13', type = 'png', viewport, ...args } = opts
82+
await goto(page, { url, device, adblock, ...args })
8383
return page.screenshot({ type, ...args })
8484
})
8585

packages/goto/package.json

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@
2929
],
3030
"dependencies": {
3131
"@browserless/devices": "^5.2.3",
32+
"got": "~9.6.0",
33+
"tldts": "~4.0.5",
34+
"@cliqz/adblocker": "~0.8.0",
3235
"debug": "~4.1.0",
33-
"extract-domain": "~2.0.4",
34-
"is-tracking-domain": "~1.1.6",
3536
"p-timeout": "~2.0.1",
3637
"require-one-of": "~1.0.2"
3738
},
@@ -42,7 +43,8 @@
4243
"src"
4344
],
4445
"scripts": {
45-
"test": "exit 0"
46+
"test": "exit 0",
47+
"postinstall": "node scripts/postinstall"
4648
},
4749
"license": "MIT"
4850
}

packages/goto/scripts/postinstall.js

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
'use strict'
2+
3+
const { promisify } = require('util')
4+
const { EOL } = require('os')
5+
const got = require('got')
6+
const fs = require('fs')
7+
8+
const writeFile = promisify(fs.writeFile)
9+
10+
/**
 * Remote filter lists that are downloaded and merged into `src/rules.txt`.
 *
 * Every entry is fetched over HTTPS: the lists are downloaded by an
 * install-time script and decide which requests get aborted, so they must not
 * be retrievable over a channel that can be tampered with in transit.
 */
const FILTERS = [
  // uBlock Origin – https://github.com/uBlockOrigin/uAssets/tree/master/filters
  'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/filters.txt',
  'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/badware.txt',
  'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/privacy.txt',
  'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/resource-abuse.txt',
  'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/unbreak.txt',
  'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances.txt',

  // Easylist – https://easylist.to/
  'https://easylist.to/easylist/easylist.txt',
  'https://easylist.to/easylist/easyprivacy.txt',
  'https://easylist.to/easylist/fanboy-annoyance.txt',
  'https://easylist.to/easylist/fanboy-social.txt',

  // Other
  'https://pgl.yoyo.org/as/serverlist.php?hostformat=adblockplus;showintro=0&mimetype=plaintext',
  'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/recipes/recipes_en.txt',
  'https://www.i-dont-care-about-cookies.eu/abp/',
  'https://raw.githubusercontent.com/liamja/Prebake/master/obtrusive.txt',
  'https://gnuzilla.gnu.org/filters/blacklist.txt'
]
32+
33+
/**
 * Download a single filter list.
 *
 * @param {string} url - location of the filter list
 * @returns {Promise<string>} the `body` of the response resolved by `got`
 */
const rulesFromURL = async url => {
  const response = await got(url)
  return response.body
}
37+
38+
/**
 * Download every filter list in `urls` and merge them into one list of unique
 * rules.
 *
 * Empty lines and comment lines (those starting with `!`) are dropped. Rules
 * keep the order in which they first appear across the lists.
 *
 * @param {string[]} urls - locations of the filter lists to download
 * @returns {Promise<string[]>} the de-duplicated rules
 */
const rulesFromURLs = async urls => {
  const lists = await Promise.all(urls.map(rulesFromURL))

  // One Set filled in place removes duplicates without re-copying every
  // previously collected rule for each list.
  const uniqueRules = new Set()

  for (const list of lists) {
    // Remote lists use `\n` or `\r\n` regardless of the local platform, so the
    // split must not depend on `os.EOL`: that leaves `\r` residue on POSIX and
    // fails to split LF-only lists on Windows.
    for (const rule of list.split(/\r?\n/)) {
      // remove empty lines
      if (rule === '') continue
      // remove comments
      if (rule.startsWith('!')) continue
      uniqueRules.add(rule)
    }
  }

  return Array.from(uniqueRules)
}
56+
57+
/**
 * Write `data` to the file at `filepath`.
 *
 * @param {string} filepath - destination path
 * @param {string} data - content to write
 * @returns {Promise<void>} settles when the promisified `fs.writeFile` does
 */
const toTxt = function (filepath, data) {
  return writeFile(filepath, data)
}
58+
59+
/**
 * Build the rules file: download the lists in `urls`, merge them and write the
 * result, one rule per line, to `src/rules.txt` (relative to the current
 * working directory).
 *
 * @param {string[]} urls - locations of the filter lists to download
 * @returns {Promise<void>}
 */
const main = async urls => {
  const content = (await rulesFromURLs(urls)).join(EOL)
  await toTxt('src/rules.txt', content)
}
63+
64+
// Log the failure and terminate with a non-zero exit code.
const exitWithError = error => {
  console.error(error)
  process.exit(1)
}

// Script entry point: on success `process.exit` receives `undefined` and the
// process terminates with the default (success) exit code.
main(FILTERS)
  .catch(exitWithError)
  .then(process.exit)

packages/goto/src/index.js

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,47 @@
11
'use strict'
22

3+
const { FiltersEngine, makeRequest } = require('@cliqz/adblocker')
34
const { getDevice } = require('@browserless/devices')
45
const debug = require('debug')('browserless:goto')
5-
const extractDomain = require('extract-domain')
6+
const tldts = require('tldts')
7+
const path = require('path')
8+
const fs = require('fs')
69

7-
const isTracker = require('./is-tracker')
10+
const engine = FiltersEngine.parse(fs.readFileSync(path.resolve(__dirname, './rules.txt'), 'utf-8'))
811

912
const isEmpty = val => val == null || !(Object.keys(val) || val).length
1013

11-
const isExternalUrl = (domainOne, domainTwo) => domainOne !== domainTwo
14+
/**
 *
 * Mapping from puppeteer request types to adblocker. This is needed because not all
 * types from puppeteer have the same name as the webRequest APIs from browsers
 * (which the adblocker expects).
 *
 * Related:
 * - https://github.com/GoogleChrome/puppeteer/blob/v1.14.0/docs/api.md#requestresourcetype
 * - https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType
 */
const types = {
  document: 'main_frame',
  eventsource: 'other',
  fetch: 'xhr',
  font: 'font',
  image: 'image',
  manifest: 'other',
  media: 'media',
  other: 'other',
  script: 'script',
  stylesheet: 'stylesheet',
  texttrack: 'other',
  websocket: 'websocket',
  xhr: 'xhr'
}

/**
 * Translate a puppeteer resource type into its webRequest equivalent.
 *
 * @param {string} resourceType - value returned by `req.resourceType()`
 * @returns {string} the matching entry of `types`
 * @throws {Error} when `resourceType` has no entry in `types`
 */
const webRequestType = resourceType => {
  // Only own keys count: a plain truthiness lookup would also accept inherited
  // members such as `constructor` or `toString` and return a function.
  if (!Object.prototype.hasOwnProperty.call(types, resourceType)) {
    throw new Error(`Type ${resourceType} not mapped`)
  }

  return types[resourceType]
}
1245

1346
const WAIT_UNTIL = ['networkidle0']
1447

@@ -17,7 +50,7 @@ module.exports = async (
1750
{
1851
url,
1952
device,
20-
abortTrackers,
53+
adblock,
2154
abortTypes = [],
2255
waitFor = 0,
2356
waitUntil = WAIT_UNTIL,
@@ -31,22 +64,31 @@ module.exports = async (
3164

3265
page.on('request', req => {
3366
const resourceUrl = req.url()
34-
3567
const resourceType = req.resourceType()
3668

3769
if (abortTypes.includes(resourceType)) {
3870
debug(`abort:${resourceType}:${++reqCount.abort}`, resourceUrl)
3971
return req.abort()
4072
}
4173

42-
const urlDomain = extractDomain(url)
43-
const resourceDomain = extractDomain(resourceUrl)
44-
const isExternal = isExternalUrl(urlDomain, resourceDomain)
74+
if (adblock) {
75+
const { match: isMatch } = engine.match(
76+
makeRequest(
77+
{
78+
type: webRequestType(resourceType),
79+
sourceUrl: req.frame().url(),
80+
url: resourceUrl
81+
},
82+
url => tldts.parse(url)
83+
)
84+
)
4585

46-
if (abortTrackers && isExternal && isTracker(resourceDomain)) {
47-
debug(`abort:tracker:${++reqCount.abort}`, resourceUrl)
48-
return req.abort()
86+
if (isMatch) {
87+
debug(`abort:tracker:${++reqCount.abort}`, resourceUrl)
88+
return req.abort()
89+
}
4990
}
91+
5092
debug(`continue:${resourceType}:${++reqCount.continue}`, resourceUrl)
5193
return req.continue()
5294
})

packages/goto/src/is-tracker.js

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)