Skip to content

Commit 9bb261d

Browse files
committed
Support URL
1 parent 6149cde commit 9bb261d

File tree

63 files changed

+91
-83
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

63 files changed

+91
-83
lines changed

index.js

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
'use strict'
22

3-
const cb2promise = require('cb2promise')
43
const reduce = require('lodash.reduce')
54
const {ensureAsync} = require('async')
5+
const {promisify} = require('util')
66

77
const getData = require('./src/get-data')
88
const loadHtml = require('./src/html')
99
const {props} = getData
1010

11-
const getMetaData = ensureAsync((rawHtml, cb) => {
12-
const html = loadHtml(rawHtml)
11+
const getMetaData = ensureAsync(({url, html}, cb) => {
12+
const htmlDom = loadHtml(html)
1313

1414
const output = reduce(props, (acc, conditions, propName) => {
15-
const value = getData(html, conditions)
15+
const value = getData({htmlDom, url, conditions})
1616
// TODO: Avoid response nil values
1717
acc[propName] = value
1818
return acc
@@ -21,8 +21,8 @@ const getMetaData = ensureAsync((rawHtml, cb) => {
2121
return cb(null, output)
2222
})
2323

24-
module.exports = (html, cb) => {
25-
return cb
26-
? getMetaData(html, cb)
27-
: cb2promise(getMetaData, html)
28-
}
24+
const getMetaDataPromise = promisify(getMetaData)
25+
26+
module.exports = ({url, html}, cb) => (
27+
cb ? getMetaData({url, html}, cb) : getMetaDataPromise({url, html})
28+
)

package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
],
2020
"dependencies": {
2121
"async": "~2.5.0",
22-
"cb2promise": "~1.1.0",
2322
"cheerio": "~1.0.0-rc.2",
2423
"chrono-node": "~1.3.4",
2524
"condense-whitespace": "~1.0.0",

src/get-data/index.js

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,20 @@ const rules = require('req-all')('./rules')
44

55
const isValid = result => result !== null && result !== undefined && result !== ''
66

7-
const getValue = ($, conditions) => {
7+
const getValue = ({htmlDom, url, conditions}) => {
88
const size = conditions.length
99
let index = -1
1010
let value = null
1111

1212
while (!isValid(value) && index++ < size - 1) {
13-
value = conditions[index]($)
13+
value = conditions[index](htmlDom, url)
1414
}
15+
1516
return value
1617
}
1718

18-
const getData = ($, conditions) => {
19-
const data = getValue($, conditions)
19+
const getData = ({htmlDom, url, conditions}) => {
20+
const data = getValue({htmlDom, url, conditions})
2021
return isValid(data) ? data : null
2122
}
2223

src/get-data/rules/date.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,13 @@ module.exports = [
5050
wrap($ => $('time[datetime]').attr('datetime')),
5151
wrap($ => $('[class*="byline"]').text()),
5252
wrap($ => $('[class*="dateline"]').text()),
53-
wrap($ => $('[class*="date"]').text()),
5453
wrap($ => $('[id*="date"]').text()),
54+
wrap($ => $('[class*="date"]').text()),
55+
wrap($ => $('[id*="publish"]').text()),
56+
wrap($ => $('[class*="publish"]').text()),
5557
wrap($ => $('[class*="post-timestamp"]').text()),
5658
wrap($ => $('[class*="post-meta"]').text()),
57-
wrap($ => $('[class*="metadata"]').text())
59+
wrap($ => $('[class*="metadata"]').text()),
60+
wrap($ => $('[id*="time"]').text()),
61+
wrap($ => $('[class*="time"]').text())
5862
]

src/get-data/rules/image.js

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
'use strict'
22

3-
const normalizeUrl = require('normalize-url')
3+
const sanetizeUrl = require('normalize-url')
4+
const {resolve: resolveUrl} = require('url')
45
const urlRegex = require('url-regex')
56

67
const isUrl = value => urlRegex().test(value)
8+
const normalizeUrl = url => sanetizeUrl(url, {stripWWW: false})
79

810
/**
911
* Wrap a rule with validation and formatting logic.
@@ -12,19 +14,19 @@ const isUrl = value => urlRegex().test(value)
1214
* @return {Function} wrapped
1315
*/
1416

15-
const wrap = rule => $ => {
16-
const value = rule($)
17+
const wrap = rule => (htmlDom, url) => {
18+
const imageUrl = rule(htmlDom)
19+
if (!imageUrl) return
1720

18-
if (!isUrl(value)) return
19-
return normalizeUrl(value, {
20-
stripWWW: false
21-
})
21+
if (isUrl(imageUrl)) return normalizeUrl(imageUrl)
22+
23+
const absoluteImageUrl = resolveUrl(url, imageUrl)
24+
if (isUrl(absoluteImageUrl)) return normalizeUrl(absoluteImageUrl)
2225
}
2326

2427
/**
2528
* Rules.
2629
*/
27-
2830
module.exports = [
2931
wrap($ => $('meta[property="og:image:secure_url"]').attr('content')),
3032
wrap($ => $('meta[property="og:image:url"]').attr('content')),
@@ -37,5 +39,7 @@ module.exports = [
3739
wrap($ => $('meta[name="sailthru.image.full"]').attr('content')),
3840
wrap($ => $('meta[name="sailthru.image.thumb"]').attr('content')),
3941
wrap($ => $('article img[src]').first().attr('src')),
40-
wrap($ => $('#content img[src]').first().attr('src'))
42+
wrap($ => $('#content img[src]').first().attr('src')),
43+
wrap($ => $('img[alt*="author"]').attr('src')),
44+
wrap($ => $('img[src]').attr('src'))
4145
]

test/web/bloomberg/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ const readFile = promisify(fs.readFile)
1212
it(basename(__dirname), async () => {
1313
const html = await readFile(resolve(__dirname, 'input.html'))
1414
const json = await loadJSON(resolve(__dirname, 'output.json'))
15-
const metadata = await getMetaData(html)
15+
const metadata = await getMetaData({html})
1616
should(metadata).be.eql(json)
1717
})

test/web/business-today/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ const readFile = promisify(fs.readFile)
1212
it(basename(__dirname), async () => {
1313
const html = await readFile(resolve(__dirname, 'input.html'))
1414
const json = await loadJSON(resolve(__dirname, 'output.json'))
15-
const metadata = await getMetaData(html)
15+
const metadata = await getMetaData({html})
1616
should(metadata).be.eql(json)
1717
})

test/web/cbr/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ const readFile = promisify(fs.readFile)
1212
it(basename(__dirname), async () => {
1313
const html = await readFile(resolve(__dirname, 'input.html'))
1414
const json = await loadJSON(resolve(__dirname, 'output.json'))
15-
const metadata = await getMetaData(html)
15+
const metadata = await getMetaData({html})
1616
should(metadata).be.eql(json)
1717
})

test/web/cio/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ const readFile = promisify(fs.readFile)
1212
it(basename(__dirname), async () => {
1313
const html = await readFile(resolve(__dirname, 'input.html'))
1414
const json = await loadJSON(resolve(__dirname, 'output.json'))
15-
const metadata = await getMetaData(html)
15+
const metadata = await getMetaData({html})
1616
should(metadata).be.eql(json)
1717
})

test/web/cloud-pro/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ const readFile = promisify(fs.readFile)
1212
it(basename(__dirname), async () => {
1313
const html = await readFile(resolve(__dirname, 'input.html'))
1414
const json = await loadJSON(resolve(__dirname, 'output.json'))
15-
const metadata = await getMetaData(html)
15+
const metadata = await getMetaData({html})
1616
should(metadata).be.eql(json)
1717
})

0 commit comments

Comments
 (0)