Add optional per-field "processors" to the metadata rules and use them to make
relative icon_url / image_url values absolute.

parser.js
  * makeUrlAbsolute(base, relative) resolves `relative` against `base` using the
    Node `url` module; values that already carry a host are returned as-is.
  * buildRuleset(name, rules, processors) now returns (doc, context) => value and
    pipes the winning value through each processor(value, context) in order.
  * Each metadataRules entry becomes {rules: [...], processors: [...]}.
  * getMetadata(doc, rules, context = {}) forwards `context` to the rulesets.

Review fixes folded into this revision of the patch:
  * makeUrlAbsolute returns `relative` unchanged when `base` or `relative` is
    missing. Before, getMetadata(doc) with the default empty context threw from
    url.resolve() as soon as a page used a relative icon or image URL.
  * buildRuleset only runs processors on a truthy value. A matched element
    without the attribute yields null, which url.parse() rejects with a
    TypeError; an empty attribute would have resolved to the page URL itself.
  * The recursive getMetadata() call for nested rule groups now passes `context`
    along, so nested groups get absolute URLs too.

NOTE(review): this copy of the patch had its whitespace collapsed; indentation
and blank context lines below were reconstructed from the hunk counts - confirm
against the repository before applying.
NOTE(review): several template literals in the test hunks are empty here; they
presumably held HTML tags that were stripped from this copy - confirm.
NOTE(review): the post-image blob id on the parser.js index line predates the
review fixes above.

diff --git a/parser.js b/parser.js
index a2fa011..16db171 100644
--- a/parser.js
+++ b/parser.js
@@ -1,6 +1,24 @@
+const urlparse = require('url');
 const {dom, rule, ruleset} = require('fathom-web');
 
-function buildRuleset(name, rules) {
+/**
+ * Make a URL found in a document absolute by resolving it against the page URL.
+ *
+ * @param {string} [base] - Page URL; when falsy, `relative` is returned unchanged.
+ * @param {string} [relative] - Candidate URL; falsy values are returned unchanged.
+ * @returns {string} `relative` if it already names a host, otherwise the resolved URL.
+ */
+function makeUrlAbsolute(base, relative) {
+  if (!base || !relative) {
+    return relative;
+  }
+
+  const isRelative = urlparse.parse(relative).host === null;
+
+  return isRelative ? urlparse.resolve(base, relative) : relative;
+}
+
+function buildRuleset(name, rules, processors) {
   const reversedRules = Array.from(rules).reverse();
   const builtRuleset = ruleset(...reversedRules.map(([query, handler], order) => rule(
     dom(query),
@@ -11,11 +29,20 @@ function buildRuleset(name, rules) {
     }]
   )));
 
-  return doc => {
+  return (doc, context) => {
     const kb = builtRuleset.score(doc);
     const maxNode = kb.max(name);
+
     if (maxNode) {
-      const value = maxNode.flavors.get(name);
+      let value = maxNode.flavors.get(name);
+
+      // Skip processors for missing/empty values: URL helpers throw on non-strings.
+      if (value && processors) {
+        processors.forEach(processor => {
+          value = processor(value, context);
+        });
+      }
+
       if (value) {
         return value.trim();
       }
@@ -24,53 +51,79 @@ function buildRuleset(name, rules) {
 }
 
 const metadataRules = {
-  description: [
-    ['meta[property="og:description"]', node => node.element.getAttribute('content')],
-    ['meta[name="description"]', node => node.element.getAttribute('content')],
-  ],
-  icon_url: [
-    ['link[rel="apple-touch-icon"]', node => node.element.getAttribute('href')],
-    ['link[rel="apple-touch-icon-precomposed"]', node => node.element.getAttribute('href')],
-    ['link[rel="icon"]', node => node.element.getAttribute('href')],
-    ['link[rel="fluid-icon"]', node => node.element.getAttribute('href')],
-    ['link[rel="shortcut icon"]', node => node.element.getAttribute('href')],
-    ['link[rel="Shortcut Icon"]', node => node.element.getAttribute('href')],
-    ['link[rel="mask-icon"]', node => node.element.getAttribute('href')],
-  ],
-  image_url: [
-    ['meta[property="og:image:secure_url"]', node => node.element.getAttribute('content')],
-    ['meta[property="og:image:url"]', node => node.element.getAttribute('content')],
-    ['meta[property="og:image"]', node => node.element.getAttribute('content')],
-    ['meta[property="twitter:image"]', node => node.element.getAttribute('content')],
-    ['meta[name="thumbnail"]', node => node.element.getAttribute('content')],
-  ],
-  keywords: [
-    ['meta[name="keywords"]', node => node.element.getAttribute('content')],
-  ],
-  title: [
-    ['meta[property="og:title"]', node => node.element.getAttribute('content')],
-    ['meta[property="twitter:title"]', node => node.element.getAttribute('content')],
-    ['meta[name="hdl"]', node => node.element.getAttribute('content')],
-    ['title', node => node.element.text],
-  ],
-  type: [
-    ['meta[property="og:type"]', node => node.element.getAttribute('content')],
-  ],
-  url: [
-    ['meta[property="og:url"]', node => node.element.getAttribute('content')],
-    ['link[rel="canonical"]', node => node.element.getAttribute('href')],
-  ],
+  description: {
+    rules: [
+      ['meta[property="og:description"]', node => node.element.getAttribute('content')],
+      ['meta[name="description"]', node => node.element.getAttribute('content')],
+    ],
+  },
+
+  icon_url: {
+    rules: [
+      ['link[rel="apple-touch-icon"]', node => node.element.getAttribute('href')],
+      ['link[rel="apple-touch-icon-precomposed"]', node => node.element.getAttribute('href')],
+      ['link[rel="icon"]', node => node.element.getAttribute('href')],
+      ['link[rel="fluid-icon"]', node => node.element.getAttribute('href')],
+      ['link[rel="shortcut icon"]', node => node.element.getAttribute('href')],
+      ['link[rel="Shortcut Icon"]', node => node.element.getAttribute('href')],
+      ['link[rel="mask-icon"]', node => node.element.getAttribute('href')],
+    ],
+    processors: [
+      (icon_url, context) => makeUrlAbsolute(context.url, icon_url)
+    ]
+  },
+
+  image_url: {
+    rules: [
+      ['meta[property="og:image:secure_url"]', node => node.element.getAttribute('content')],
+      ['meta[property="og:image:url"]', node => node.element.getAttribute('content')],
+      ['meta[property="og:image"]', node => node.element.getAttribute('content')],
+      ['meta[property="twitter:image"]', node => node.element.getAttribute('content')],
+      ['meta[name="thumbnail"]', node => node.element.getAttribute('content')],
+    ],
+    processors: [
+      (image_url, context) => makeUrlAbsolute(context.url, image_url)
+    ],
+  },
+
+  keywords: {
+    rules: [
+      ['meta[name="keywords"]', node => node.element.getAttribute('content')],
+    ],
+  },
+
+  title: {
+    rules: [
+      ['meta[property="og:title"]', node => node.element.getAttribute('content')],
+      ['meta[property="twitter:title"]', node => node.element.getAttribute('content')],
+      ['meta[name="hdl"]', node => node.element.getAttribute('content')],
+      ['title', node => node.element.text],
+    ],
+  },
+
+  type: {
+    rules: [
+      ['meta[property="og:type"]', node => node.element.getAttribute('content')],
+    ],
+  },
+
+  url: {
+    rules: [
+      ['meta[property="og:url"]', node => node.element.getAttribute('content')],
+      ['link[rel="canonical"]', node => node.element.getAttribute('href')],
+    ],
+  },
 };
 
-function getMetadata(doc, rules) {
+function getMetadata(doc, rules, context = {}) {
   const metadata = {};
   const ruleSet = rules || metadataRules;
 
   Object.keys(ruleSet).map(metadataKey => {
     const metadataRule = ruleSet[metadataKey];
 
-    if(Array.isArray(metadataRule)) {
-      metadata[metadataKey] = buildRuleset(metadataKey, metadataRule)(doc);
+    if(Array.isArray(metadataRule.rules)) {
+      metadata[metadataKey] = buildRuleset(metadataKey, metadataRule.rules, metadataRule.processors)(doc, context);
     } else {
-      metadata[metadataKey] = getMetadata(doc, metadataRule);
+      metadata[metadataKey] = getMetadata(doc, metadataRule, context);
     }
diff --git a/tests/getMetadata.test.js b/tests/getMetadata.test.js
index dea360e..35c8de5 100644
--- a/tests/getMetadata.test.js
+++ b/tests/getMetadata.test.js
@@ -39,6 +39,27 @@ describe('Get Metadata Tests', function() {
     assert.equal(metadata.url, sampleUrl, `Unable to find ${sampleUrl} in ${sampleHtml}`);
   });
 
+  it('uses absolute URLs when url parameter passed in through context', () => {
+    const relativeHtml = `
+
+
+
+
+
+
+
+
+
+
+    `;
+
+    const doc = stringToDom(relativeHtml);
+    const metadata = getMetadata(doc, metadataRules, {url: 'http://www.example.com/'});
+
+    assert.equal(metadata.icon_url, sampleIcon, `Unable to find ${sampleIcon} in ${relativeHtml}`);
+    assert.equal(metadata.image_url, sampleImageHTTP, `Unable to find ${sampleImageHTTP} in ${relativeHtml}`);
+  });
+
   it('allows custom rules', () => {
     const doc = stringToDom(sampleHtml);
     const rules = {
diff --git a/tests/metadataRules.test.js b/tests/metadataRules.test.js
index 6488209..550e32b 100644
--- a/tests/metadataRules.test.js
+++ b/tests/metadataRules.test.js
@@ -17,8 +17,10 @@ function ruleTest(testName, testRule, expected, testTag) {
   it(`finds ${testName}`, () => {
     const html = buildHTML(testTag);
     const doc = stringToDom(html);
-    const rule = buildRuleset(testName, testRule);
-    const found = rule(doc);
+    const rule = buildRuleset(testName, testRule.rules, testRule.processors);
+    const found = rule(doc, {
+      url: 'http://www.example.com/'
+    });
     assert.equal(found, expected, `Unable to find ${testName} in ${html}`);
   });
 }
@@ -52,6 +54,7 @@ describe('Canonical URL Rule Tests', function() {
 
 describe('Icon Rule Tests', function() {
   const pageIcon = 'http://www.example.com/favicon.ico';
+  const relativeIcon = '/favicon.ico';
 
   const ruleTests = [
     ['apple-touch-icon', ``],
@@ -61,6 +64,7 @@ describe('Icon Rule Tests', function() {
     ['shortcut icon', ``],
     ['Shortcut Icon', ``],
     ['mask-icon', ``],
+    ['relative icon', ``],
   ];
 
   ruleTests.map(([testName, testTag]) => ruleTest(testName, metadataRules.icon_url, pageIcon, testTag));
@@ -69,6 +73,7 @@ describe('Icon Rule Tests', function() {
 
 describe('Image Rule Tests', function() {
   const pageImage = 'http://www.example.com/image.png';
+  const relativeImage = '/image.png';
 
   const ruleTests = [
     ['og:image', ``],
@@ -76,6 +81,7 @@ describe('Image Rule Tests', function() {
     ['og:image:secure_url', ` `],
     ['twitter:image', ``],
     ['thumbnail', ``],
+    ['relative image', ``],
   ];
 
   ruleTests.map(([testName, testTag]) => ruleTest(testName, metadataRules.image_url, pageImage, testTag));