Improve HTML5 extraction: extract <main> if it exists and no article was found.

Also account for nested <main><article></article></main> cases, where we only want the inner article.
lwindolf · May 12, 2022 · a4ac661 · a4ac661
1 parent d94b9ca
commit a4ac661
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 7 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -12,6 +12,9 @@ To be released
 	  vars LIFEREA_UA and LIFEREA_UA_ANONYMOUS now also affect the internal
 	  browsing.
 
+	* Improve HTML5 extraction: extract <main> if it exists and no article
+	  was found.
+
 
 2022-04-05   Lars Windolf <lars.windolf@gmx.de>
 

diff --git a/src/tests/parse_html.c b/src/tests/parse_html.c
@@ -122,6 +122,16 @@ gchar *tc_article[] = {
 	"<p>1</p>\n"
 };
 
+gchar *tc_article_main[] = {
+	"<html lang='fr'><script>blabla</script><style>body { background:red }</style><body><main><p>1</p></main></body></html>",
+	"<p>1</p>\n"
+};
+
+gchar *tc_article_main2[] = {
+	"<html lang='fr'><script>blabla</script><style>body { background:red }</style><body><main><article><p>1</p></article></main></body></html>",
+	"<p>1</p>\n"
+};
+
 gchar *tc_article_micro_format[] = {
 	"<html><head></head><body><div property='articleBody'><p>1</p></div></body></html>",
 	"<p>1</p>\n"
@@ -217,6 +227,8 @@ main (int argc, char *argv[])
 	g_test_add_data_func ("/html/auto_discover_link_xml_atom3", &tc_xml_atom3, &tc_auto_discover_link);
 
 	g_test_add_data_func ("/html/html5_extract_article", &tc_article, &tc_get_article);
+	g_test_add_data_func ("/html/html5_extract_article_main", &tc_article_main, &tc_get_article);
+	g_test_add_data_func ("/html/html5_extract_article_main2", &tc_article_main2, &tc_get_article);
 	g_test_add_data_func ("/html/html5_extract_article_micro_format", &tc_article_micro_format, &tc_get_article);
 	g_test_add_data_func ("/html/html5_extract_article_cms_content_id", &tc_article_cms_content_id, &tc_get_article);
 	g_test_add_data_func ("/html/html5_extract_article_missing", &tc_article_missing, &tc_get_article);

diff --git a/xslt/html5-extract.xml.in b/xslt/html5-extract.xml.in
@@ -38,15 +38,34 @@
             omit-xml-declaration="yes" />
 
 <xsl:template name='copy'>
-	<xsl:choose>
-		<!-- identity copy nodes only for the following extraction locations:
+	<!-- identity copy nodes only for the following extraction locations:
+
+		1.) "//article" for HTML5
+		2.) "//main" for HTML5 when there is no "//article"
+		3a.) "//div[@property='articleBody']" for microformats
+		3b.) "//div[@id='content']" for just guessing CMS main div
 
-			1.) "//article" for HTML5
-			2.) "//div[@property='articleBody']" for microformats
-			3.) "//div[@id='content']" for just guessing CMS main div
+	-->
+	<xsl:variable name="mode">
+		<xsl:choose>
+			<xsl:when test="//article">1</xsl:when>
+			<xsl:when test="//main">2</xsl:when>
+			<xsl:otherwise>3</xsl:otherwise>
+		</xsl:choose>
+	</xsl:variable>
 
-		-->
-		<xsl:when test="ancestor::article | ancestor::div[@property='articleBody'] | ancestor::div[@id='content']">
+	<xsl:choose>
+		<xsl:when test="$mode != '2' and ancestor::article | ancestor::div[@property='articleBody'] | ancestor::div[@id='content']">
+			<xsl:copy>
+				<xsl:apply-templates select="node()|@*"/>
+				<!-- Fill empty tags with a space to ensure we can
+				     output to HTML5 and get no self-closing tags -->
+				<xsl:if test="not(*) and not(text())">
+					<xsl:text> </xsl:text>
+				</xsl:if>
+			</xsl:copy>
+		</xsl:when>
+		<xsl:when test="$mode = '2' and ancestor::main">
 			<xsl:copy>
 				<xsl:apply-templates select="node()|@*"/>
 				<!-- Fill empty tags with a space to ensure we can
@@ -62,6 +81,7 @@
 	</xsl:choose>
 </xsl:template>
 
+
 <xsl:template match="node()|@*">
 	<xsl:call-template name='copy'/>
 </xsl:template>