Skip to content
This repository has been archived by the owner on Feb 24, 2022. It is now read-only.

Commit

Permalink
Remove Fathom 1.0 Dependency (fixes #90)
Browse files (browse the repository at this point in the history)
  • Loading branch information
jaredlockhart committed Aug 10, 2017
1 parent bc03298 commit ed29877
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 98 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ build/Release
# Dependency directories
node_modules
jspm_packages
package-lock.json

# Optional npm cache directory
.npm
Expand Down
1 change: 1 addition & 0 deletions circle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies:
- npm install --only=dev
- npm update
- sudo apt-get update && sudo apt-get install libpango1.0-0 libpangocairo-1.0-0 firefox
- sudo rm /usr/bin/firefox;sudo ln -s $(which firefox.ubuntu) /usr/bin/firefox

test:
pre:
Expand Down
6 changes: 2 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
{
"name": "page-metadata-parser",
"description": "A JavaScript library for parsing metadata in a Web Page.",
"version": "0.6.0",
"version": "1.0.0",
"author": "Jared Kerim",
"bugs": {
"url": "https://github.com/mozilla/page-metadata-parser/issues"
},
"dependencies": {
"fathom-web": "^1.1.2"
},
"dependencies": {},
"devDependencies": {
"babel": "^6.5.2",
"babel-core": "^6.17.0",
Expand Down
150 changes: 70 additions & 80 deletions parser.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,7 @@
const urlparse = require('url');
const {dom, rule, ruleset} = require('fathom-web');
const {makeUrlAbsolute, parseUrl} = require('./url-utils');

function makeUrlAbsolute(base, relative) {
const relativeParsed = urlparse.parse(relative);

if (relativeParsed.host === null) {
return urlparse.resolve(base, relative);
}

return relative;
}

function getProvider(url) {
return urlparse.parse(url)
.hostname
function getProvider(host) {
return host
.replace(/www[a-zA-Z0-9]*\./, '')
.replace('.co.', '.')
.split('.')
Expand All @@ -22,77 +10,77 @@ function getProvider(url) {
}

function buildRuleset(name, rules, processors, scorers) {
const reversedRules = Array.from(rules).reverse();
const builtRuleset = ruleset(...reversedRules.map(([query, handler], order) => rule(
dom(query),
node => {
let score = order;

if (scorers) {
scorers.forEach(scorer => {
const newScore = scorer(node, score);

if (newScore) {
score = newScore;
}
});
}
return (doc, context) => {
let maxScore = 0;
let maxValue;

return [{
flavor: name,
score: score,
notes: handler(node),
}];
}
)));
for (let currRule = 0; currRule < rules.length; currRule++) {
const [query, handler] = rules[currRule];

return (doc, context) => {
const kb = builtRuleset.score(doc);
const maxNode = kb.max(name);
const elements = Array.from(doc.querySelectorAll(query));

if (maxNode) {
let value = maxNode.flavors.get(name);
if(elements.length) {
for (const element of elements) {
let score = rules.length - currRule;

if (processors) {
processors.forEach(processor => {
value = processor(value, context);
});
if (scorers) {
for (const scorer of scorers) {
const newScore = scorer(element, score);

if (newScore) {
score = newScore;
}
}
}

if (score > maxScore) {
maxScore = score;
maxValue = handler(element);
}
}
}
}

if (value) {
if (value.trim) {
return value.trim();
if (maxValue) {
if (processors) {
for (const processor of processors) {
maxValue = processor(maxValue, context);
}
return value;
}

if (maxValue.trim) {
return maxValue.trim();
}

return maxValue;
}
};
}

const metadataRules = {
description: {
rules: [
['meta[property="og:description"]', node => node.element.getAttribute('content')],
['meta[name="description"]', node => node.element.getAttribute('content')],
['meta[property="og:description"]', element => element.getAttribute('content')],
['meta[name="description"]', element => element.getAttribute('content')],
],
},

icon_url: {
rules: [
['link[rel="apple-touch-icon"]', node => node.element.getAttribute('href')],
['link[rel="apple-touch-icon-precomposed"]', node => node.element.getAttribute('href')],
['link[rel="icon"]', node => node.element.getAttribute('href')],
['link[rel="fluid-icon"]', node => node.element.getAttribute('href')],
['link[rel="shortcut icon"]', node => node.element.getAttribute('href')],
['link[rel="Shortcut Icon"]', node => node.element.getAttribute('href')],
['link[rel="mask-icon"]', node => node.element.getAttribute('href')],
['link[rel="apple-touch-icon"]', element => element.getAttribute('href')],
['link[rel="apple-touch-icon-precomposed"]', element => element.getAttribute('href')],
['link[rel="icon"]', element => element.getAttribute('href')],
['link[rel="fluid-icon"]', element => element.getAttribute('href')],
['link[rel="shortcut icon"]', element => element.getAttribute('href')],
['link[rel="Shortcut Icon"]', element => element.getAttribute('href')],
['link[rel="mask-icon"]', element => element.getAttribute('href')],
],
scorers: [
// Handles the case where multiple icons are listed with specific sizes ie
// <link rel="icon" href="small.png" sizes="16x16">
// <link rel="icon" href="large.png" sizes="32x32">
(node, score) => {
const sizes = node.element.getAttribute('sizes');
(element, score) => {
const sizes = element.getAttribute('sizes');

if (sizes) {
const sizeMatches = sizes.match(/\d+/g);
Expand All @@ -110,12 +98,12 @@ const metadataRules = {

image_url: {
rules: [
['meta[property="og:image:secure_url"]', node => node.element.getAttribute('content')],
['meta[property="og:image:url"]', node => node.element.getAttribute('content')],
['meta[property="og:image"]', node => node.element.getAttribute('content')],
['meta[name="twitter:image"]', node => node.element.getAttribute('content')],
['meta[property="twitter:image"]', node => node.element.getAttribute('content')],
['meta[name="thumbnail"]', node => node.element.getAttribute('content')],
['meta[property="og:image:secure_url"]', element => element.getAttribute('content')],
['meta[property="og:image:url"]', element => element.getAttribute('content')],
['meta[property="og:image"]', element => element.getAttribute('content')],
['meta[name="twitter:image"]', element => element.getAttribute('content')],
['meta[property="twitter:image"]', element => element.getAttribute('content')],
['meta[name="thumbnail"]', element => element.getAttribute('content')],
],
processors: [
(image_url, context) => makeUrlAbsolute(context.url, image_url)
Expand All @@ -124,7 +112,7 @@ const metadataRules = {

keywords: {
rules: [
['meta[name="keywords"]', node => node.element.getAttribute('content')],
['meta[name="keywords"]', element => element.getAttribute('content')],
],
processors: [
(keywords) => keywords.split(',').map((keyword) => keyword.trim()),
Expand All @@ -133,24 +121,24 @@ const metadataRules = {

title: {
rules: [
['meta[property="og:title"]', node => node.element.getAttribute('content')],
['meta[name="twitter:title"]', node => node.element.getAttribute('content')],
['meta[property="twitter:title"]', node => node.element.getAttribute('content')],
['meta[name="hdl"]', node => node.element.getAttribute('content')],
['title', node => node.element.text],
['meta[property="og:title"]', element => element.getAttribute('content')],
['meta[name="twitter:title"]', element => element.getAttribute('content')],
['meta[property="twitter:title"]', element => element.getAttribute('content')],
['meta[name="hdl"]', element => element.getAttribute('content')],
['title', element => element.text],
],
},

type: {
rules: [
['meta[property="og:type"]', node => node.element.getAttribute('content')],
['meta[property="og:type"]', element => element.getAttribute('content')],
],
},

url: {
rules: [
['meta[property="og:url"]', node => node.element.getAttribute('content')],
['link[rel="canonical"]', node => node.element.getAttribute('href')],
['meta[property="og:url"]', element => element.getAttribute('content')],
['link[rel="canonical"]', element => element.getAttribute('href')],
],
processors: [
(url, context) => makeUrlAbsolute(context.url, url)
Expand All @@ -159,14 +147,17 @@ const metadataRules = {

provider: {
rules: [
['meta[property="og:site_name"]', node => node.element.getAttribute('content')]
['meta[property="og:site_name"]', element => element.getAttribute('content')]
]
},
};

function getMetadata(doc, url, rules) {
const metadata = {};
const context = {url};
const context = {
url,
};

const ruleSet = rules || metadataRules;

Object.keys(ruleSet).map(metadataKey => {
Expand All @@ -191,7 +182,7 @@ function getMetadata(doc, url, rules) {
}

if(url && !metadata.provider) {
metadata.provider = getProvider(url);
metadata.provider = getProvider(parseUrl(url));
}

if(url && !metadata.icon_url) {
Expand All @@ -205,6 +196,5 @@ module.exports = {
buildRuleset,
getMetadata,
getProvider,
makeUrlAbsolute,
metadataRules
};
23 changes: 12 additions & 11 deletions tests/getMetadata.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,27 @@
const {assert} = require('chai');
const {getProvider, getMetadata, metadataRules} = require('../parser');
const {stringToDom} = require('./test-utils');
const {parseUrl} = require('../url-utils');

describe('Get Provider Tests', function() {
it('gets a provider with no subdomain', function() {
assert.equal(getProvider('https://example.com/this/?id=that'), 'example');
assert.equal(getProvider(parseUrl('https://example.com/this/?id=that')), 'example');
});

it('removes www as a subdomain', function() {
assert.equal(getProvider('https://www.example.com/this/?id=that'), 'example');
assert.equal(getProvider(parseUrl('https://www.example.com/this/?id=that')), 'example');
});

it('removes www1 as a subdomain', function() {
assert.equal(getProvider('https://www1.example.com/this/?id=that'), 'example');
assert.equal(getProvider(parseUrl('https://www1.example.com/this/?id=that')), 'example');
});

it('preserves non-www subdomains', function() {
assert.equal(getProvider('https://things.example.com/this/?id=that'), 'things example');
assert.equal(getProvider(parseUrl('https://things.example.com/this/?id=that')), 'things example');
});

it('removes secondary TLDs', function() {
assert.equal(getProvider('https://things.example.co.uk/this/?id=that'), 'things example');
assert.equal(getProvider(parseUrl('https://things.example.co.uk/this/?id=that')), 'things example');
});
});

Expand Down Expand Up @@ -53,7 +54,7 @@ describe('Get Metadata Tests', function() {

it('parses metadata', () => {
const doc = stringToDom(sampleHtml);
const metadata = getMetadata(doc);
const metadata = getMetadata(doc, sampleUrl, metadataRules);

assert.equal(metadata.description, sampleDescription, `Unable to find ${sampleDescription} in ${sampleHtml}`);
assert.equal(metadata.icon_url, sampleIcon, `Unable to find ${sampleIcon} in ${sampleHtml}`);
Expand All @@ -79,7 +80,7 @@ describe('Get Metadata Tests', function() {
`;

const doc = stringToDom(relativeHtml);
const metadata = getMetadata(doc, sampleUrl);
const metadata = getMetadata(doc, sampleUrl, metadataRules);

assert.equal(metadata.icon_url, sampleIcon, `Unable to find ${sampleIcon} in ${relativeHtml}`);
assert.equal(metadata.image_url, sampleImageHTTP, `Unable to find ${sampleImageHTTP} in ${relativeHtml}`);
Expand All @@ -95,7 +96,7 @@ describe('Get Metadata Tests', function() {

const sampleProvider = 'example';
const doc = stringToDom(emptyHtml);
const metadata = getMetadata(doc, sampleUrl);
const metadata = getMetadata(doc, sampleUrl, metadataRules);

assert.equal(metadata.provider, sampleProvider, `Unable to find ${sampleProvider} in ${sampleUrl}`);
});
Expand All @@ -111,7 +112,7 @@ describe('Get Metadata Tests', function() {
`;

const doc = stringToDom(providerHtml);
const metadata = getMetadata(doc, sampleUrl);
const metadata = getMetadata(doc, sampleUrl, metadataRules);

assert.equal(metadata.provider, sampleProvider, `Unable to find ${sampleProvider} in ${providerHtml}`);
});
Expand All @@ -125,7 +126,7 @@ describe('Get Metadata Tests', function() {
`;

const doc = stringToDom(noIconHtml);
const metadata = getMetadata(doc, sampleUrl);
const metadata = getMetadata(doc, sampleUrl, metadataRules);

assert.equal(metadata.icon_url, sampleIcon, `Unable to find ${sampleIcon} in ${metadata.icon_url}`);
});
Expand All @@ -138,7 +139,7 @@ describe('Get Metadata Tests', function() {
`;

const doc = stringToDom(html);
const metadata = getMetadata(doc, sampleUrl);
const metadata = getMetadata(doc, sampleUrl, metadataRules);

assert.equal(metadata.url, sampleUrl, `Unable to find ${sampleUrl} in ${JSON.stringify(metadata)}`);
});
Expand Down
1 change: 0 additions & 1 deletion tests/metadataRules.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,3 @@ describe('Provider Rule Tests', function() {

ruleTests.map(([testName, testTag]) => ruleTest(testName, metadataRules.provider, provider, testTag));
});

4 changes: 3 additions & 1 deletion tests/test-utils.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
if (global.DOMParser !== undefined) {
const parser = new DOMParser();
// We're in Firefox
module.exports = {
stringToDom(str) {
const parser = new DOMParser();
return parser.parseFromString(str, 'text/html');
}
};
} else {
// We're in Node.js
const domino = require('domino');
module.exports = {
stringToDom(str) {
Expand Down

0 comments on commit ed29877

Please sign in to comment.