diff --git a/deploy/deploy-core/src/main/java/dk/netarkivet/deploy/DeployApplication.java b/deploy/deploy-core/src/main/java/dk/netarkivet/deploy/DeployApplication.java index 36ae0eb4fd..4556c100cf 100644 --- a/deploy/deploy-core/src/main/java/dk/netarkivet/deploy/DeployApplication.java +++ b/deploy/deploy-core/src/main/java/dk/netarkivet/deploy/DeployApplication.java @@ -451,15 +451,12 @@ public static void initJarFolder(String folderName) { public static void initBundlerZip(String defaultBundlerZipName) { if (defaultBundlerZipName != null) { defaultBundlerZip = new File(defaultBundlerZipName); - if (!defaultBundlerZip.exists()) { System.err.print(Constants.MSG_ERROR_NO_BUNDLER_ZIP_FILE); System.out.println(); System.out.println("Couldn't find the default bundler file: " + defaultBundlerZip.getAbsolutePath()); System.exit(1); } - } else { - defaultBundlerZip = null; } } diff --git a/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/DomainnameQueueAssignmentPolicy.java b/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/DomainnameQueueAssignmentPolicy.java index f47e9b446e..928b249bcb 100644 --- a/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/DomainnameQueueAssignmentPolicy.java +++ b/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/DomainnameQueueAssignmentPolicy.java @@ -40,18 +40,18 @@ * nn.nn.nn.nn -> nn.nn.nn.nn * */ -public class DomainnameQueueAssignmentPolicy - extends HostnameQueueAssignmentPolicy { - /** A key used for the cases when we can't figure out the URI. +public class DomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy { + + /** A key used for the cases when we can't figure out the URI. * This is taken from parent, where it has private access. Parent returns * this on things like about:blank. 
*/ static final String DEFAULT_CLASS_KEY = "default..."; - private Log log - = LogFactory.getLog(getClass()); + private Log log = LogFactory.getLog(getClass()); - /** Return a key for queue names based on domain names (last two parts of + /** + * Return a key for queue names based on domain names (last two parts of * host name) or IP address. They key may include a # at the end. * * @param basis A potential URI. @@ -65,8 +65,7 @@ protected String getCoreKey(UURI basis) { try { candidate = super.getCoreKey(basis); } catch (NullPointerException e) { - log.debug("Heritrix broke getting class key candidate for " - + basis); + log.debug("Heritrix broke getting class key candidate for " + basis); candidate = DEFAULT_CLASS_KEY; } if (candidate == null) { //FIXME the candidate should not be null with dns: schema @@ -86,8 +85,7 @@ protected String getCoreKey(UURI basis) { } String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]); if (domainName == null) { // Not valid according to our rules - log.debug("Illegal class key candidate '" + candidate - + "' for '" + basis + "'"); + log.debug("Illegal class key candidate '" + candidate + "' for '" + basis + "'"); return candidate; } return domainName; diff --git a/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/NASSurtPrefixedDecideRule.java b/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/NASSurtPrefixedDecideRule.java new file mode 100644 index 0000000000..4de30bdf9b --- /dev/null +++ b/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/NASSurtPrefixedDecideRule.java @@ -0,0 +1,74 @@ +package dk.netarkivet.harvester.harvesting; + +import java.util.logging.Logger; + +import org.archive.modules.CrawlURI; +import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule; + +/** + * Extended SurtPrefixedDecideRule class. 
+ * Modifies a small subset of SURT seeds so it is possible to define http(s)://(tld,host,..., seeds. + * Only seeds given without a path, such as http(s)://www.host.tld, are converted to http(s)://(tld,host,www, instead of http(s)://(tld,host,www,)/ + * + * @author nicl + */ +public class NASSurtPrefixedDecideRule extends SurtPrefixedDecideRule { + + /** + * Serial version UID. + */ + private static final long serialVersionUID = 3334790462876505839L; + + private static final Logger logger = Logger.getLogger(NASSurtPrefixedDecideRule.class.getName()); + + @Override + public void addedSeed(final CrawlURI curi) { + if(getSeedsAsSurtPrefixes()) { + addedSeedImpl(curi); + } + } + + /** + * Implementation of {@code addedSeed}. + * @param curi {@code CrawlURI} object to convert + * @return URI converted to SURT string + */ + protected String addedSeedImpl(final CrawlURI curi) { + String originalUri = curi.getSourceTag(); + String surt = prefixFrom(curi.getURI()); + int idx; + int idx2; + String scheme; + String surtHost; + String path; + if (surt != null) { + idx = surt.indexOf("://"); + if (idx != -1) { + scheme = surt.substring(0, idx); + idx += "://".length(); + idx2 = surt.indexOf(')', idx); + if (idx2 != -1 && surt.charAt(idx++) == '(') { + surtHost = surt.substring(idx, idx2); + path = surt.substring(idx2 + 1); + if ("/".compareTo(path) == 0) { + if (originalUri != null) { + idx = originalUri.indexOf("://"); + if (idx != -1) { + idx += "://".length(); + idx = originalUri.indexOf('/', idx); + if (idx == -1) { + surt = scheme + "://(" + surtHost; + } + } + } else { + logger.warning("originalUri not available"); + } + } + } + } + } + surtPrefixes.add(surt); + return surt; + } + +} diff --git a/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/SeedUriDomainnameQueueAssignmentPolicy.java b/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/SeedUriDomainnameQueueAssignmentPolicy.java index c81a1d5ce4..ad5d6a49c5 100644 --- 
a/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/SeedUriDomainnameQueueAssignmentPolicy.java +++ b/harvester/heritrix3/heritrix3-extensions/src/main/java/dk/netarkivet/harvester/harvesting/SeedUriDomainnameQueueAssignmentPolicy.java @@ -49,8 +49,7 @@ * nn.nn.nn.nn -> nn.nn.nn.nn * */ -public class SeedUriDomainnameQueueAssignmentPolicy - extends HostnameQueueAssignmentPolicy { +public class SeedUriDomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy { /** A key used for the cases when we can't figure out the URI. * This is taken from parent, where it has private access. Parent returns @@ -58,10 +57,10 @@ public class SeedUriDomainnameQueueAssignmentPolicy */ static final String DEFAULT_CLASS_KEY = "default..."; - private Log log - = LogFactory.getLog(getClass()); + private Log log = LogFactory.getLog(getClass()); - /** Return a key for queue names based on domain names (last two parts of + /** + * Return a key for queue names based on domain names (last two parts of * host name) or IP address. They key may include a # at the end. * * @param cauri A potential URI. @@ -72,16 +71,12 @@ public class SeedUriDomainnameQueueAssignmentPolicy public String getClassKey(CrawlURI cauri) { String candidate; - boolean ignoreSourceSeed = - cauri != null && - cauri.getCanonicalString().startsWith("dns"); + boolean ignoreSourceSeed = cauri != null && cauri.getCanonicalString().startsWith("dns"); try { - // Since getClassKey has no contract, we must encapsulate it from - // errors. + // Since getClassKey has no contract, we must encapsulate it from errors. 
candidate = super.getClassKey(cauri); } catch (NullPointerException e) { - log.debug("Heritrix broke getting class key candidate for " - + cauri); + log.debug("Heritrix broke getting class key candidate for " + cauri); candidate = DEFAULT_CLASS_KEY; } @@ -102,8 +97,7 @@ public String getClassKey(CrawlURI cauri) { String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]); if (domainName == null) { // Not valid according to our rules - log.debug("Illegal class key candidate '" + candidate - + "' for '" + cauri + "'"); + log.debug("Illegal class key candidate '" + candidate + "' for '" + cauri + "'"); return candidate; } return domainName; @@ -128,8 +122,7 @@ private String getCandidateFromSource(CrawlURI cauri) { try { hostname = UURIFactory.getInstance(sourceCandidate).getHost(); } catch (URIException e) { - log.warn("Hostname could not be extracted from sourceCandidate: " - + sourceCandidate); + log.warn("Hostname could not be extracted from sourceCandidate: " + sourceCandidate); return null; } return DomainUtils.domainNameFromHostname(hostname); diff --git a/harvester/heritrix3/heritrix3-extensions/src/test/java/dk/netarkivet/harvester/harvesting/NASSurtPrefixedDecideRuleTester.java b/harvester/heritrix3/heritrix3-extensions/src/test/java/dk/netarkivet/harvester/harvesting/NASSurtPrefixedDecideRuleTester.java new file mode 100644 index 0000000000..4b15bad1aa --- /dev/null +++ b/harvester/heritrix3/heritrix3-extensions/src/test/java/dk/netarkivet/harvester/harvesting/NASSurtPrefixedDecideRuleTester.java @@ -0,0 +1,228 @@ +package dk.netarkivet.harvester.harvesting; + +import java.lang.reflect.Field; +import java.util.Arrays; + +import org.apache.commons.httpclient.URIException; +import org.archive.modules.CrawlURI; +import org.archive.modules.SchedulingConstants; +import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.util.SurtPrefixSet; +import 
org.junit.Assert; +import org.junit.Test; + +public class NASSurtPrefixedDecideRuleTester { + + @Test + public void test_nas_surtprefixeddeciderule() { + Object[][] cases; + NASSurtPrefixedDecideRule decideRule; + Field field; + SurtPrefixSet surtPrefixSet; + UURI uuri; + CrawlURI curi; + String surt; + + cases = new Object[][] { + { + "http://sport.tv2.dk/mokeybusiness/", + "http://(dk,tv2,sport,)/mokeybusiness/", + new String[] { + "http://(dk,tv2,sport,)/mokeybusiness/" + } + }, + { + "http://nyheder.tv2.dk/business", + "http://(dk,tv2,nyheder,)/", + new String[] { + "http://(dk,tv2,nyheder,)/", + "http://(dk,tv2,sport,)/mokeybusiness/", + } + }, + { + "http://www.tv2.dk/", + "http://(dk,tv2,www,)/", + new String[] { + "http://(dk,tv2,nyheder,)/", + "http://(dk,tv2,sport,)/mokeybusiness/", + "http://(dk,tv2,www,)/" + } + }, + { + "http://www.tv2.dk", + "http://(dk,tv2,www,", + new String[] { + "http://(dk,tv2,nyheder,)/", + "http://(dk,tv2,sport,)/mokeybusiness/", + "http://(dk,tv2,www," + } + }, + { + "http://tv2.dk/", + "http://(dk,tv2,)/", + new String[] { + "http://(dk,tv2,)/", + "http://(dk,tv2,nyheder,)/", + "http://(dk,tv2,sport,)/mokeybusiness/", + "http://(dk,tv2,www," + } + }, + { + "http://tv2.dk", + "http://(dk,tv2,", + new String[] { + "http://(dk,tv2," + } + }, + }; + + try { + decideRule = new NASSurtPrefixedDecideRule(); + // Use reflection to read protected field. + field = SurtPrefixedDecideRule.class.getDeclaredField("surtPrefixes"); + field.setAccessible(true); + surtPrefixSet = (SurtPrefixSet)field.get(decideRule); + + for (int i=0; i