Skip to content

Commit

Permalink
NAS-2528: Extend SurtPrefixedDecideRule to rewrite certain seeds afte…
Browse files Browse the repository at this point in the history
…r they have been converted to SURTs.
  • Loading branch information
nclarkekb committed May 19, 2016
1 parent 0d5466b commit ec9cef4
Show file tree
Hide file tree
Showing 5 changed files with 319 additions and 29 deletions.
Expand Up @@ -451,15 +451,12 @@ public static void initJarFolder(String folderName) {
/**
 * Registers the default bundler zip file.
 * A <code>null</code> name clears the current default. Otherwise the name is wrapped in a
 * <code>File</code>; if that file does not exist, an error is printed and the JVM is
 * terminated with exit status 1.
 *
 * @param defaultBundlerZipName path of the default bundler zip file, or <code>null</code> for none
 */
public static void initBundlerZip(String defaultBundlerZipName) {
    // No name given: there is no default bundler.
    if (defaultBundlerZipName == null) {
        defaultBundlerZip = null;
        return;
    }

    defaultBundlerZip = new File(defaultBundlerZipName);
    if (defaultBundlerZip.exists()) {
        return;
    }

    // The configured file is missing: report it and abort.
    System.err.print(Constants.MSG_ERROR_NO_BUNDLER_ZIP_FILE);
    System.out.println();
    System.out.println("Couldn't find the default bundler file: " + defaultBundlerZip.getAbsolutePath());
    System.exit(1);
}

Expand Down
Expand Up @@ -40,18 +40,18 @@
* nn.nn.nn.nn -> nn.nn.nn.nn
*
*/
public class DomainnameQueueAssignmentPolicy
extends HostnameQueueAssignmentPolicy {
/** A key used for the cases when we can't figure out the URI.
public class DomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy {

/** A key used for the cases when we can't figure out the URI.
* This is taken from parent, where it has private access. Parent returns
* this on things like about:blank.
*/
static final String DEFAULT_CLASS_KEY = "default...";

private Log log
= LogFactory.getLog(getClass());
private Log log = LogFactory.getLog(getClass());

/** Return a key for queue names based on domain names (last two parts of
/**
* Return a key for queue names based on domain names (last two parts of
* host name) or IP address. The key may include a #<portnr> at the end.
*
* @param basis A potential URI.
Expand All @@ -65,8 +65,7 @@ protected String getCoreKey(UURI basis) {
try {
candidate = super.getCoreKey(basis);
} catch (NullPointerException e) {
log.debug("Heritrix broke getting class key candidate for "
+ basis);
log.debug("Heritrix broke getting class key candidate for " + basis);
candidate = DEFAULT_CLASS_KEY;
}
if (candidate == null) { //FIXME the candidate should not be null with dns: schema
Expand All @@ -86,8 +85,7 @@ protected String getCoreKey(UURI basis) {
}
String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]);
if (domainName == null) { // Not valid according to our rules
log.debug("Illegal class key candidate '" + candidate
+ "' for '" + basis + "'");
log.debug("Illegal class key candidate '" + candidate + "' for '" + basis + "'");
return candidate;
}
return domainName;
Expand Down
@@ -0,0 +1,74 @@
package dk.netarkivet.harvester.harvesting;

import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule;

/**
 * Extended <code>SurtPrefixedDecideRule</code> class.
 * Modifies a small subset of SURT seeds so it is possible to define
 * <code>http(s)://(tld,host,...,</code> seeds.
 * Only seeds given as a bare host without any path, e.g. <code>http(s)://www.host.tld</code>, are
 * converted to <code>http(s)://(tld,host,www,</code> instead of <code>http(s)://(tld,host,www,)/</code>.
 * Every other seed keeps the SURT prefix computed by <code>prefixFrom</code>.
 *
 * @author nicl
 */
public class NASSurtPrefixedDecideRule extends SurtPrefixedDecideRule {

    /**
     * Serialization version identifier.
     */
    private static final long serialVersionUID = 3334790462876505839L;

    private static final Logger logger = Logger.getLogger(NASSurtPrefixedDecideRule.class.getName());

    /** Separator between the scheme and the rest of a URI or SURT. */
    private static final String SCHEME_SEPARATOR = "://";

    @Override
    public void addedSeed(final CrawlURI curi) {
        if (getSeedsAsSurtPrefixes()) {
            addedSeedImpl(curi);
        }
    }

    /**
     * <code>addedSeed</code> implementation method to facilitate unit testing.
     * Converts the seed URI to a SURT prefix, possibly rewrites it (see class description) and adds
     * the result to <code>surtPrefixes</code>.
     *
     * @param curi <code>CrawlURI</code> object to convert
     * @return URI converted to SURT string, or <code>null</code> if no SURT prefix could be derived
     *         (in which case nothing is added)
     */
    protected String addedSeedImpl(final CrawlURI curi) {
        String originalUri = curi.getSourceTag();
        String surt = rewriteSurt(prefixFrom(curi.getURI()), originalUri);
        if (surt == null) {
            // Nothing usable to add; a null entry must not end up in the prefix set.
            logger.warning("No SURT prefix could be derived for seed, nothing added");
            return null;
        }
        surtPrefixes.add(surt);
        return surt;
    }

    /**
     * Opens up a host-only SURT prefix when the original seed had no path.
     * The SURT is only changed when it has the exact form <code>scheme://(host)/</code> and the
     * original seed contains a scheme separator but no '/' after it.
     *
     * @param surt SURT prefix derived from the seed, may be <code>null</code>
     * @param originalUri the seed as originally written, may be <code>null</code>
     * @return the rewritten SURT, or <code>surt</code> unchanged if no rewrite applies
     */
    private static String rewriteSurt(final String surt, final String originalUri) {
        if (surt == null) {
            return null;
        }
        int schemeEnd = surt.indexOf(SCHEME_SEPARATOR);
        if (schemeEnd == -1) {
            return surt;
        }
        int hostStart = schemeEnd + SCHEME_SEPARATOR.length();
        int hostEnd = surt.indexOf(')', hostStart);
        // The host part must be enclosed in parentheses: "(" right after the scheme, ")" later on.
        if (hostEnd == -1 || surt.charAt(hostStart) != '(') {
            return surt;
        }
        // Only SURTs whose path is exactly the root path are candidates.
        if (!"/".equals(surt.substring(hostEnd + 1))) {
            return surt;
        }
        if (originalUri == null) {
            logger.warning("originalUri not available");
            return surt;
        }
        if (!isHostOnly(originalUri)) {
            return surt;
        }
        // Drop the trailing ")/" so the prefix stays open: scheme://(host
        return surt.substring(0, hostEnd);
    }

    /**
     * Tells whether a URI consists of a scheme and an authority only.
     *
     * @param uri the URI as originally written
     * @return <code>true</code> if the URI has a scheme separator and no '/' after it
     */
    private static boolean isHostOnly(final String uri) {
        int schemeEnd = uri.indexOf(SCHEME_SEPARATOR);
        if (schemeEnd == -1) {
            return false;
        }
        return uri.indexOf('/', schemeEnd + SCHEME_SEPARATOR.length()) == -1;
    }

}
Expand Up @@ -49,19 +49,18 @@
* nn.nn.nn.nn -> nn.nn.nn.nn
*
*/
public class SeedUriDomainnameQueueAssignmentPolicy
extends HostnameQueueAssignmentPolicy {
public class SeedUriDomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy {

/** A key used for the cases when we can't figure out the URI.
* This is taken from parent, where it has private access. Parent returns
* this on things like about:blank.
*/
static final String DEFAULT_CLASS_KEY = "default...";

private Log log
= LogFactory.getLog(getClass());
private Log log = LogFactory.getLog(getClass());

/** Return a key for queue names based on domain names (last two parts of
/**
* Return a key for queue names based on domain names (last two parts of
* host name) or IP address. The key may include a #<portnr> at the end.
*
* @param cauri A potential URI.
Expand All @@ -72,16 +71,12 @@ public class SeedUriDomainnameQueueAssignmentPolicy
public String getClassKey(CrawlURI cauri) {
String candidate;

boolean ignoreSourceSeed =
cauri != null &&
cauri.getCanonicalString().startsWith("dns");
boolean ignoreSourceSeed = cauri != null && cauri.getCanonicalString().startsWith("dns");
try {
// Since getClassKey has no contract, we must encapsulate it from
// errors.
// Since getClassKey has no contract, we must encapsulate it from errors.
candidate = super.getClassKey(cauri);
} catch (NullPointerException e) {
log.debug("Heritrix broke getting class key candidate for "
+ cauri);
log.debug("Heritrix broke getting class key candidate for " + cauri);
candidate = DEFAULT_CLASS_KEY;
}

Expand All @@ -102,8 +97,7 @@ public String getClassKey(CrawlURI cauri) {

String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]);
if (domainName == null) { // Not valid according to our rules
log.debug("Illegal class key candidate '" + candidate
+ "' for '" + cauri + "'");
log.debug("Illegal class key candidate '" + candidate + "' for '" + cauri + "'");
return candidate;
}
return domainName;
Expand All @@ -128,8 +122,7 @@ private String getCandidateFromSource(CrawlURI cauri) {
try {
hostname = UURIFactory.getInstance(sourceCandidate).getHost();
} catch (URIException e) {
log.warn("Hostname could not be extracted from sourceCandidate: "
+ sourceCandidate);
log.warn("Hostname could not be extracted from sourceCandidate: " + sourceCandidate);
return null;
}
return DomainUtils.domainNameFromHostname(hostname);
Expand Down

0 comments on commit ec9cef4

Please sign in to comment.