Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 106 additions & 4 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,11 @@ Readability.prototype = {
_prepArticle: function(articleContent) {
this._cleanStyles(articleContent);

// Check for data tables before we continue, to avoid removing items in
// those tables, which will often be isolated even though they're
// visually linked to other content-ful elements (text, images, etc.).
this._markDataTables(articleContent);

// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
Expand Down Expand Up @@ -1673,23 +1678,111 @@ Readability.prototype = {
* @param HTMLElement node
* @param String tagName
* @param Number maxDepth
* @param Function filterFn a filter to invoke to determine whether this node 'counts'
* @return Boolean
*/
_hasAncestorTag: function(node, tagName, maxDepth) {
_hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
maxDepth = maxDepth || 3;
tagName = tagName.toUpperCase();
var depth = 0;
while (node.parentNode) {
if (depth > maxDepth)
if (maxDepth > 0 && depth > maxDepth)
return false;
if (node.parentNode.tagName === tagName)
if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
return true;
node = node.parentNode;
depth++;
}
return false;
},

/**
* Return an object indicating how many rows and columns this table has.
*/
_getRowAndColumnCount: function(table) {
var rows = 0;
var columns = 0;
var trs = table.getElementsByTagName("tr");
for (var i = 0; i < trs.length; i++) {
var rowspan = trs[i].getAttribute("rowspan") || 0;
if (rowspan) {
rowspan = parseInt(rowspan, 10);
}
rows += (rowspan || 1);

// Now look for column-related info
var columnsInThisRow = 0;
var cells = trs[i].getElementsByTagName("td");
for (var j = 0; j < cells.length; j++) {
var colspan = cells[j].getAttribute("colspan") || 0;
if (colspan) {
colspan = parseInt(colspan, 10);
}
columnsInThisRow += (colspan || 1);
}
columns = Math.max(columns, columnsInThisRow);
}
return {rows: rows, columns: columns};
},

/**
* Look for 'data' (as opposed to 'layout') tables, for which we use
* similar checks as
* https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
*/
_markDataTables: function(root) {
var tables = root.getElementsByTagName("table");
for (var i = 0; i < tables.length; i++) {
var table = tables[i];
var role = table.getAttribute("role");
if (role == "presentation") {
table._readabilityDataTable = false;
continue;
}
var datatable = table.getAttribute("datatable");
if (datatable == "0") {
table._readabilityDataTable = false;
continue;
}
var summary = table.getAttribute("summary");
if (summary) {
table._readabilityDataTable = true;
continue;
}

var caption = table.getElementsByTagName("caption")[0];
if (caption && caption.childNodes.length > 0) {
table._readabilityDataTable = true;
continue;
}

// If the table has a descendant with any of these tags, consider a data table:
var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
var descendantExists = function(tag) {
return !!table.getElementsByTagName(tag)[0];
};
if (dataTableDescendants.some(descendantExists)) {
this.log("Data table because found data-y descendant");
table._readabilityDataTable = true;
continue;
}

// Nested tables indicate a layout table:
if (table.getElementsByTagName("table")[0]) {
table._readabilityDataTable = false;
continue;
}

var sizeInfo = this._getRowAndColumnCount(table);
if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
table._readabilityDataTable = true;
continue;
}
// Now just go by size entirely:
table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
}
},

/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
Expand All @@ -1708,6 +1801,15 @@ Readability.prototype = {
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(e.getElementsByTagName(tag), function(node) {
// First check if we're in a data table, in which case don't remove us.
var isDataTable = function(t) {
return t._readabilityDataTable;
};

if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
return false;
}

var weight = this._getClassWeight(node);
var contentScore = 0;

Expand All @@ -1723,7 +1825,7 @@ Readability.prototype = {
// ominous signs, remove the element.
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
var li = node.getElementsByTagName("li").length-100;
var li = node.getElementsByTagName("li").length - 100;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add new ESLint rules to prevent bad code style like this. I've already filed a follow-up issue.

var input = node.getElementsByTagName("input").length;

var embedCount = 0;
Expand Down
7 changes: 7 additions & 0 deletions test/test-pages/links-in-tables/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"title": "Saving Data: Reducing the size of App Updates by 65%",
"byline": null,
"dir": "ltr",
"excerpt": "Posted by Andrew Hayden, Software Engineer on Google Play",
"readerable": true
}
155 changes: 155 additions & 0 deletions test/test-pages/links-in-tables/expected.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
<div id="readability-page-1" class="page">
<div class="post-body entry-content" id="post-body-2701400044422363572" itemprop="articlesBody">
<p> <em>Posted by Andrew Hayden, Software Engineer on Google Play</em> </p>
<p> Android users are downloading tens of billions of apps and games on Google Play. We're also seeing developers update their apps frequently in order to provide users with great content, improve security, and enhance the overall user experience. It takes a lot of data to download these updates and we know users care about how much data their devices are using. Earlier this year, we announced that we started using <a href="https://android-developers.blogspot.com/2016/07/improvements-for-smaller-app-downloads.html">the
bsdiff algorithm</a> <a href="https://android-developers.blogspot.com/2016/07/improvements-for-smaller-app-downloads.html">(by
Colin Percival)</a>. Using bsdiff, we were able to reduce the size of app updates on average by 47% compared to the full APK size. </p>
<p> Today, we're excited to share a new approach that goes further — <strong><a href="https://github.com/andrewhayden/archive-patcher/blob/master/README.md">File-by-File
patching</a></strong>. App Updates using File-by-File patching are, <strong>on average,</strong> <strong>65% smaller than the full app</strong>, and in some cases more than 90% smaller. </p>
<p> The savings, compared to our previous approach, add up to 6 petabytes of user data saved per day! </p>
<p> In order to get the new version of the app, Google Play sends your device a patch that describes the <em>differences</em> between the old and new versions of the app. </p>
<p> Imagine you are an author of a book about to be published, and wish to change a single sentence - it's much easier to tell the editor which sentence to change and what to change, rather than send an entirely new book. In the same way, patches are much smaller and much faster to download than the entire APK. </p>
<p> <strong><span>Techniques used in File-by-File
patching </span></strong> </p>
<p> Android apps are packaged as APKs, which are ZIP files with special conventions. Most of the content within the ZIP files (and APKs) is compressed using a technology called <a href="https://en.wikipedia.org/w/index.php?title=DEFLATE&amp;oldid=735386036">Deflate</a>. Deflate is really good at compressing data but it has a drawback: it makes identifying changes in the original (uncompressed) content really hard. Even a tiny change to the original content (like changing one word in a book) can make the compressed output of deflate look <em>completely different</em>. Describing the differences between the <em>original</em> content is easy, but describing the differences between the <em>compressed</em> content is so hard that it leads to inefficient patches. </p>
<p> Watch how much the compressed text on the right side changes from a one-letter change in the uncompressed text on the left: </p>
<div class="separator">
<a href="https://2.bp.blogspot.com/-chCZZinlUTg/WEcxvJo9gdI/AAAAAAAADnk/3ND_BspqN6Y2j5xxkLFW3RyS2Ig0NHZpQCLcB/s1600/ipsum-opsum.gif" imageanchor="1"><img src="https://2.bp.blogspot.com/-chCZZinlUTg/WEcxvJo9gdI/AAAAAAAADnk/3ND_BspqN6Y2j5xxkLFW3RyS2Ig0NHZpQCLcB/s640/ipsum-opsum.gif" width="640" height="105" border="0" /></a>
</div>
<p> File-by-File therefore is based on detecting changes in the uncompressed data. To generate a patch, we first decompress both old and new files before computing the delta (we still use bsdiff here). Then to apply the patch, we decompress the old file, apply the delta to the uncompressed content and then recompress the new file. In doing so, we need to make sure that the APK on your device is a perfect match, byte for byte, to the one on the Play Store (see <a href="https://source.android.com/security/apksigning/v2.html">APK Signature
Schema v2 </a>for why). </p>
<p> When recompressing the new file, we hit two complications. First, Deflate has a number of settings that affect output; and we don't know which settings were used in the first place. Second, many versions of deflate exist and we need to know whether the version on your device is suitable. </p>
<p> Fortunately, after analysis of the apps on the Play Store, we've discovered that recent and compatible versions of deflate based on zlib (the most popular deflate library) account for almost all deflated content in the Play Store. In addition, the default settings (level=6) and maximum compression settings (level=9) are the only settings we encountered in practice. </p>
<p> Knowing this, we can detect and reproduce the original deflate settings. This makes it possible to uncompress the data, apply a patch, and then recompress the data back to <em>exactly the same bytes</em> as originally uploaded. </p>
<p> However, there is one trade off; extra processing power is needed on the device. On modern devices (e.g. from 2015), recompression can take a little over a second per megabyte and on older or less powerful devices it can be longer. Analysis so far shows that, on average, if the patch size is halved then the time spent applying the patch (which for File-by-File includes recompression) is doubled. </p>
<p> For now, we are limiting the use of this new patching technology to auto-updates only, i.e. the updates that take place in the background, usually at night when your phone is plugged into power and you're not likely to be using it. This ensures that users won't have to wait any longer than usual for an update to finish when manually updating an app. </p>
<p> <strong><span>How effective is File-by-File
Patching?</span></strong> </p>
<p> Here are examples of app updates already using File-by-File Patching: </p>
<div dir="ltr" trbidi="on">
<div dir="ltr">
<table>
<colgroup>
<col width="142" />
<col width="102" />
<col width="176" />
<col width="176" />
</colgroup>
<tbody>
<tr>
<td>
<p dir="ltr"> <span>Application</span></p>
</td>
<td>
<p dir="ltr"> <span>Original Size</span></p>
</td>
<td>
<p dir="ltr"> <span>Previous (BSDiff) Patch Size</span></p>
<p dir="ltr"> <span>(% vs original)</span></p>
</td>
<td>
<p dir="ltr"> <span>File-by-File Patch Size (% vs original)</span></p>
</td>
</tr>
<tr>
<td>
<div dir="ltr"><a href="https://play.google.com/store/apps/details?id=com.king.farmheroessupersaga&amp;hl=en"><span>Farm Heroes Super Saga</span></a></div>
</td>
<td>
<p dir="ltr"> <span>71.1 MB</span></p>
</td>
<td>
<p dir="ltr"> <span>13.4 MB (-81%)</span></p>
</td>
<td>
<p dir="ltr"> <span>8.0 MB (-89%)</span></p>
</td>
</tr>
<tr>
<td>
<div dir="ltr"><a href="https://play.google.com/store/apps/details?id=com.google.android.apps.maps"><span>Google Maps</span></a></div>
</td>
<td>
<p dir="ltr"> <span>32.7 MB</span></p>
</td>
<td>
<p dir="ltr"> <span>17.5 MB (-46%)</span></p>
</td>
<td>
<p dir="ltr"> <span>9.6 MB (-71%)</span></p>
</td>
</tr>
<tr>
<td>
<div dir="ltr"><a href="https://play.google.com/store/apps/details?id=com.google.android.gm"><span>Gmail</span></a></div>
</td>
<td>
<p dir="ltr"> <span>17.8 MB</span></p>
</td>
<td>
<p dir="ltr"> <span>7.6 MB (-57%)</span></p>
</td>
<td>
<p dir="ltr"> <span>7.3 MB (-59%)</span></p>
</td>
</tr>
<tr>
<td>
<div dir="ltr"><a href="https://play.google.com/store/apps/details?id=com.google.android.tts"><span>Google TTS</span></a></div>
</td>
<td>
<p dir="ltr"> <span>18.9 MB</span></p>
</td>
<td>
<p dir="ltr"> <span>17.2 MB (-9%)</span></p>
</td>
<td>
<p dir="ltr"> <span>13.1 MB (-31%)</span></p>
</td>
</tr>
<tr>
<td>
<div dir="ltr"><a href="https://play.google.com/store/apps/details?id=com.amazon.kindle"><span>Kindle</span></a></div>
</td>
<td>
<p dir="ltr"> <span>52.4 MB</span></p>
</td>
<td>
<p dir="ltr"> <span>19.1 MB (-64%)</span></p>
</td>
<td>
<p dir="ltr"> <span>8.4 MB (-84%)</span></p>
</td>
</tr>
<tr>
<td>
<div dir="ltr"><a href="https://play.google.com/store/apps/details?id=com.netflix.mediaclient"><span>Netflix</span></a></div>
</td>
<td>
<p dir="ltr"> <span>16.2 MB</span></p>
</td>
<td>
<p dir="ltr"> <span>7.7 MB (-52%)</span></p>
</td>
<td>
<p dir="ltr"> <span>1.2 MB (-92%)</span></p>
</td>
</tr>
</tbody>
</table>
</div><span id="docs-internal-guid-de7f0210-d587-05da-d332-146959aa303f"></span><br/></div><em>Disclaimer: if you see different patch sizes when you press "update"
manually, that is because we are not currently using File-by-file for
interactive updates, only those done in the background.</em>
<p> <strong><span>Saving data and making our
users (&amp; developers!) happy</span></strong> </p>
<p> These changes are designed to ensure our community of over a billion Android users use as little data as possible for regular app updates. The best thing is that as a developer you don't need to do anything. You get these reductions to your update size for free! </p>
<p> If you'd like to know more about File-by-File patching, including the technical details, head over to the <a href="https://github.com/andrewhayden/archive-patcher">Archive Patcher GitHub
project</a> where you can find information, including the source code. Yes, File-by-File patching is completely open-source! </p>
<p> As a developer if you're interested in reducing your APK size still further, here are some <a href="https://developer.android.com/topic/performance/reduce-apk-size.html?utm_campaign=android_discussion_filebyfile_120616&amp;utm_source=anddev&amp;utm_medium=blog">general
tips on reducing APK size</a>. </p>
<div class="separator">
<a href="https://2.bp.blogspot.com/-5aRh1dM6Unc/WEcNs55RGhI/AAAAAAAADnI/tzr_oOJjZwgWd9Vu25ydY0UwB3eXKupXwCLcB/s1600/image01.png" imageanchor="1"><img src="https://2.bp.blogspot.com/-5aRh1dM6Unc/WEcNs55RGhI/AAAAAAAADnI/tzr_oOJjZwgWd9Vu25ydY0UwB3eXKupXwCLcB/s200/image01.png" width="191" height="200" border="0" /></a>
</div><span itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person">
<meta content="https://plus.google.com/116899029375914044550" itemprop="url"/>
</span></div>
</div>
Loading