Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Support for the "robots-nocontent" value in the class attribute of HTML tags.

Everything inside an element whose class attribute includes "robots-nocontent"
is skipped when text is extracted from the parsed DOM.

See http://www.ysearchblog.com/2007/05/02/introducing-robots-nocontent-for-page-sections/
 for more details.
  • Loading branch information...
commit f1b0e126600006f55ddc0c1d13fcd83771e0f136 1 parent 2501c9e
@mj authored
View
24 src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -134,16 +134,28 @@ private boolean getTextHelper(StringBuffer sb, Node node,
while (walker.hasNext()) {
Node currentNode = walker.nextNode();
- String nodeName = currentNode.getNodeName();
+ String nodeName = currentNode.getNodeName().toLowerCase();
short nodeType = currentNode.getNodeType();
- if ("script".equalsIgnoreCase(nodeName)) {
- walker.skipChildren();
- }
- if ("style".equalsIgnoreCase(nodeName)) {
+ if ("script".equals(nodeName) || "style".equals(nodeName)) {
walker.skipChildren();
+ } else if (nodeType == Node.ELEMENT_NODE && currentNode.hasAttributes()) {
+ NamedNodeMap attributes = currentNode.getAttributes();
+
+ Node classAttribute = attributes.getNamedItem("class");
+ if (classAttribute != null) {
+ String[] classes = ((Attr)classAttribute).getValue().split(" ");
+
+ for (int i = 0; i < classes.length; i++) {
+ if ("robots-nocontent".equals(classes[i])) {
+ walker.skipChildren();
+ break;
+ }
+ }
+ }
}
- if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+
+ if (abortOnNestedAnchors && "a".equals(nodeName)) {
anchorDepth++;
if (anchorDepth > 1) {
abort = true;
View
18 src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -154,6 +154,10 @@
+ "<a href=\"g1\"> <!--whitespace--> </a>"
+ "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
+ "</body></html>"),
+ new String("<html><head><title> title </title>"
+ + "</head><body>"
+ + "something <div class=\"foo robots-nocontent bar\">somethingelse</div>"
+ + "</body></html>"),
};
private static int SKIP = 9;
@@ -171,7 +175,8 @@
"http://www.nutch.org/",
"http://www.nutch.org/",
"http://www.nutch.org/;something",
- "http://www.nutch.org/"
+ "http://www.nutch.org/",
+ "http://www.nutch.org/",
};
private static final DocumentFragment testDOMs[]=
@@ -199,7 +204,8 @@
"test1 test2",
"title anchor1 anchor2 anchor3",
"title anchor1 anchor2 anchor3 anchor4 anchor5",
- "title"
+ "title",
+ "title something",
};
private static final String[] answerTitle= {
@@ -215,7 +221,9 @@
"",
"title",
"title",
- "title"
+ "title",
+ "title",
+ "title",
};
// note: should be in page-order
@@ -306,7 +314,9 @@ private static void setup() {
new Outlink("http://www.nutch.org/g1", ""),
new Outlink("http://www.nutch.org/g2", "bla bla"),
new Outlink("http://www.nutch.org/test.gif", "bla bla"),
- }
+ },
+ {
+ },
};
} catch (MalformedURLException e) {
Please sign in to comment.
Something went wrong with that request. Please try again.