Skip to content

Commit

Permalink
Fix headers being removed (#650)
Browse files Browse the repository at this point in the history
* add testcase (that we need to fix)

* fix our testcase

* tweak similarity and remove first heading

* add testcase from #464

* add testcase from #615

* also keep first headers and headers that are a superset of the title
  • Loading branch information
jakubriedl committed Dec 3, 2020
1 parent 290724c commit 11093f0
Show file tree
Hide file tree
Showing 35 changed files with 2,895 additions and 28 deletions.
64 changes: 40 additions & 24 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ Readability.prototype = {
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
tokenize: /\W+/g,
whitespace: /^\s*$/,
hasContent: /\S$/,
hashUrl: /^#.+/,
Expand Down Expand Up @@ -675,7 +676,6 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
this._clean(articleContent, "aside");
Expand All @@ -691,25 +691,6 @@ Readability.prototype = {
});
});

// If there is only one h2 and its text content substantially equals article title,
// they are probably using it as a header and not a subheader,
// so remove it since we already extract the title separately.
var h2 = articleContent.getElementsByTagName("h2");
if (h2.length === 1) {
var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
if (Math.abs(lengthSimilarRate) < 0.5) {
var titlesMatch = false;
if (lengthSimilarRate > 0) {
titlesMatch = h2[0].textContent.includes(this._articleTitle);
} else {
titlesMatch = this._articleTitle.includes(h2[0].textContent);
}
if (titlesMatch) {
this._clean(articleContent, "h2");
}
}
}

this._clean(articleContent, "iframe");
this._clean(articleContent, "input");
this._clean(articleContent, "textarea");
Expand All @@ -723,6 +704,9 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");

// Replace H1 with H2: H1 should be reserved for the article title, which is displayed separately
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");

// Remove extra paragraphs
this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
var imgCount = paragraph.getElementsByTagName("img").length;
Expand Down Expand Up @@ -832,6 +816,21 @@ Readability.prototype = {
return node && node.nextElementSibling;
},

// compares second text to first one
// 1 = same text, 0 = completely different text
// works the way that it splits both texts into words and then finds words that are unique in second text
// the result is given by the lower length of unique parts
_textSimilarity: function(textA, textB) {
var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
if (!tokensA.length || !tokensB.length) {
return 0;
}
var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
return 1 - distanceB;
},

_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
Expand Down Expand Up @@ -2001,6 +2000,17 @@ Readability.prototype = {
});
},

_getTextDensity: function(e, tags) {
var textLength = this._getInnerText(e, true).length;
if (textLength === 0) {
return 0;
}
var childrenLength = 0;
var children = this._getAllNodesWithTag(e, tags);
this._forEachNode(children, (child) => childrenLength += this._getInnerText(child, true).length);
return childrenLength / textLength;
},

/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
Expand Down Expand Up @@ -2061,6 +2071,7 @@ Readability.prototype = {
var img = node.getElementsByTagName("img").length;
var li = node.getElementsByTagName("li").length - 100;
var input = node.getElementsByTagName("input").length;
var headingDensity = this._getTextDensity(node, ["h1", "h2", "h3", "h4", "h5", "h6"]);

var embedCount = 0;
var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
Expand Down Expand Up @@ -2088,7 +2099,7 @@ Readability.prototype = {
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
Expand Down Expand Up @@ -2118,15 +2129,20 @@ Readability.prototype = {
},

/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
* Clean out spurious headers from an Element.
*
* @param Element
* @return void
**/
_cleanHeaders: function(e) {
this._removeNodes(this._getAllNodesWithTag(e, ["h1", "h2"]), function (header) {
return this._getClassWeight(header) < 0;
var headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
var nodeToRemove = this._findNode(headingNodes, (node) => {
var heading = this._getInnerText(node, false);
return this._textSimilarity(this._articleTitle, heading) > 0.75 || this._getClassWeight(node) < 0;
});
if (nodeToRemove) {
this._removeNodes([nodeToRemove]);
}
},

_flagIsActive: function(flag) {
Expand Down
1 change: 1 addition & 0 deletions test/test-pages/003-metadata-preferred/expected.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div id="readability-page-1" class="page">
<article>
<h2>Test document title</h2>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
</article>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div id="readability-page-1" class="page">
<article>
<h2>Test document title</h2>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
</article>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div id="readability-page-1" class="page">
<article>
<h2>Lorem</h2>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<p>Links</p>
<p><a href="http://fakehost/test/base/foo/bar/baz.html">link</a></p>
Expand Down
1 change: 1 addition & 0 deletions test/test-pages/base-url-base-element/expected.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div id="readability-page-1" class="page">
<article>
<h2>Lorem</h2>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<p>Links</p>
<p><a href="http://fakehost/foo/bar/baz.html">link</a></p>
Expand Down
1 change: 1 addition & 0 deletions test/test-pages/base-url/expected.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div id="readability-page-1" class="page">
<article>
<h2>Lorem</h2>
<p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
<p>Links</p>
<p><a href="http://fakehost/test/foo/bar/baz.html">link</a></p>
Expand Down
3 changes: 3 additions & 0 deletions test/test-pages/citylab-1/expected.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
<span itemprop="caption">The Moulin Rouge cabaret in Paris</span> <span itemprop="creator">Benoit Tessier/Reuters</span>
</figcaption>
</figure>
<div>
<h2 itemprop="headline"> Why Neon Is the Ultimate Symbol of the 20th Century </h2>
</div>
<h2 itemprop="description"> The once-ubiquitous form of lighting was novel when it first emerged in the early 1900s, though it has since come to represent decline. </h2>
<section id="article-section-1">
<p> In the summer of 1898, the Scottish chemist Sir William Ramsay made a discovery that would eventually give the Moulin Rouge in Paris, the Las Vegas Strip, and New York’s Times Square their perpetual nighttime glow. Using the boiling point of argon as a reference point, Ramsay and his colleague Morris W. Travers isolated three more noble gases and gave them evocative Greek names: neon, krypton, and xenon. In so doing, the scientists bestowed a label of permanent novelty on the most famous of the trio—neon, which translates as “new.” This discovery was the foundation on which the French engineer Georges Claude crafted a new form of illumination over the next decade. He designed glass tubes in which neon gas could be trapped, then electrified, to create a light that glowed reliably for more than 1,000 hours. </p>
Expand Down
1 change: 1 addition & 0 deletions test/test-pages/daringfireball-1/expected.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div id="readability-page-1" class="page">
<div id="Box">
<h2>About This Site</h2>
<p>Daring Fireball is written and produced by John Gruber.</p>
<p>
<a href="http://fakehost/graphics/author/addison-bw.jpg"> <img src="http://fakehost/graphics/author/addison-bw-425.jpg" alt="Photograph of the author." /></a>
Expand Down
8 changes: 8 additions & 0 deletions test/test-pages/dropbox-blog/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"title": "How we designed Dropbox’s ATF - an async task framework",
"byline": "Arun Sai Krishnan",
"dir": null,
"excerpt": "I joined Dropbox not long after graduating with a Master’s degree in computer science. Aside from an internship, this was my first big-league engineering job. My team had already begun designing a critical internal service that most of our software would use: It would handle asynchronous computing requests behind the scenes, powering everything from dragging a file into a Dropbox folder to scheduling a marketing campaign.",
"siteName": null,
"readerable": true
}
Loading

0 comments on commit 11093f0

Please sign in to comment.