Permalink
Browse files

Changes to get the plugin working with Nutch 2.2.1 and Firefox 24

  • Loading branch information...
momer committed Jul 31, 2014
1 parent 3a3555f commit 88758df8db8acc8679841d57ab740d681121c762
View
@@ -1 +1,4 @@
-*.class
+.DS_Store
+*.iml
+.idea/
+*.class
@@ -20,6 +20,9 @@
<import file="../build-plugin.xml"/>
<!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-http/*.jar" />
+ </fileset>
</path>
</project>
@@ -7,6 +7,7 @@
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
+import com.gargoylesoftware.htmlunit.BrowserVersion;
/**
* Htmlunit WebClient Helper
@@ -23,7 +24,7 @@ public static HtmlPage getHtmlPage(String url, Configuration conf) {
WebClient webClient = threadWebClient.get();
if (webClient == null) {
LOG.info("Initing web client for thread: {}", Thread.currentThread().getId());
- webClient = new WebClient();
+ webClient = new WebClient(BrowserVersion.FIREFOX_24);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setAppletEnabled(false);
webClient.getOptions().setThrowExceptionOnScriptError(false);
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>groupId</groupId>
+ <artifactId>lib-htmlunit</artifactId>
+ <version>1.0-SNAPSHOT</version>
+
+
+</project>
@@ -22,7 +22,7 @@
<!-- Build compilation dependencies -->
<target name="deps-jar">
<ant target="jar" inheritall="false" dir="../lib-http"/>
- <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
+ <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
</target>
<!-- Add compilation dependencies to classpath -->
@@ -32,7 +32,7 @@
<publications>
<!--get the artifact from our module name-->
- <artifact conf="master"/>
+ <artifact conf="default"/>
</publications>
<dependencies>
@@ -29,10 +29,11 @@
<requires>
<import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-http"/>
<import plugin="lib-htmlunit"/>
</requires>
- <extension id="org.apache.nutch.protocol.http"
+ <extension id="org.apache.nutch.protocol.htmlunit"
name="HttpProtocol"
point="org.apache.nutch.protocol.Protocol">
@@ -3,14 +3,19 @@
// JDK imports
import java.io.IOException;
import java.net.URL;
+import java.util.Collection;
+import java.util.HashSet;
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
+// import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.http.api.HttpBase;
+// import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.htmlunit.HttpResponse;
-import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// Commons Logging imports
@@ -22,11 +27,18 @@
public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.MODIFIED_TIME);
+ FIELDS.add(WebPage.Field.HEADERS);
+ }
public Http() {
super(LOG);
}
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
// Level logLevel = Level.WARNING;
@@ -42,9 +54,14 @@ public static void main(String[] args) throws Exception {
main(http, args);
}
- protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+ protected Response getResponse(URL url, WebPage page, boolean redirect)
throws ProtocolException, IOException {
- return new HttpResponse(this, url, datum,getConf());
+ LOG.info("fetching this url " + url);
+ return new HttpResponse(this, url, page, getConf());
}
+ @Override
+ public Collection<WebPage.Field> getFields() {
+ return FIELDS;
+ }
}
@@ -12,7 +12,8 @@
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
+// import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.storage.WebPage;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
@@ -31,7 +32,7 @@
/** An HTTP response. */
public class HttpResponse implements Response {
- private HttpBase http;
+ private Http http;
private URL url;
private String orig;
private String base;
@@ -41,8 +42,8 @@
/** The nutch configuration */
private Configuration conf = null;
-
- public HttpResponse(HttpBase http, URL url, CrawlDatum datum,Configuration conf) throws ProtocolException, IOException {
+
+ public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws ProtocolException, IOException {
this.conf = conf;
this.http = http;
@@ -123,8 +124,8 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum,Configuration conf)
reqStr.append(this.http.getAccept());
reqStr.append("\r\n");
- if (datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+ if (page.getModifiedTime() > 0) {
+ reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(page.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>groupId</groupId>
+ <artifactId>protocol-htmlunit</artifactId>
+ <version>1.0-SNAPSHOT</version>
+
+
+</project>
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin that supports retrieving documents via HtmlUnit.</p><p></p>
+</body>
+</html>

0 comments on commit 88758df

Please sign in to comment.