Skip to content

Commit

Permalink
Fix to NAS-2794 - correct queue assignment when url scheme is missing
Browse files Browse the repository at this point in the history
  • Loading branch information
csrster committed Sep 7, 2018
1 parent ba50012 commit a37f279
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy;
import org.archive.modules.CrawlURI;
import org.archive.net.UURIFactory;
import org.archive.url.UsableURI;

import dk.netarkivet.common.utils.DomainUtils;

Expand Down Expand Up @@ -135,10 +136,14 @@ private String getKeyFromUriHostname(CrawlURI cauri) {
*/
private String getKeyFromSeed(CrawlURI cauri) {
    // Derives the queue key from the seed recorded in the URI's source tag.
    // Returns the domain name of the seed's host, or null when no key can be extracted.
    String seed = cauri.getSourceTag();
    if (seed == null) {
        // No seed recorded: report it and yield the same null result as the failure path
        // below, instead of handing null to the scheme check.
        log.debug("Could not extract a domain key from seed '" + seed + "'");
        return null;
    }
    String key = null;
    try {
        // Seeds may be listed without a scheme (e.g. "www.example.dk"); prepend a default
        // one so the URI factory can parse the string and expose its host.
        if (!UsableURI.hasScheme(seed)) {
            seed = "http://" + seed;
        }
        key = DomainUtils.domainNameFromHostname(UURIFactory.getInstance(seed).getHost());
    } catch (Exception e) {
        // Best effort: an unparsable seed yields a null key. Keep the cause in the log so the
        // reason is not lost.
        log.debug("Could not extract a domain key from seed '" + seed + "'", e);
    }
    return key;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,23 @@ public void getClassKey() throws Exception {
assertEquals(policy.getClassKey(curi1), policy.getClassKey(curi2));
}

/**
 * Verifies that the queue key is taken from the seed (the source tag) even when that seed
 * is written without a URL scheme, and not from the URL actually being harvested.
 *
 * @throws Exception if the harvested URI cannot be constructed or the policy lookup fails
 */
@Test
public void testGetKeyNoSchema() throws Exception {
    // The harvested URL and the seed deliberately point at different domains.
    CrawlURI harvested = new CrawlURI(UURIFactory.getInstance("http://www.ssup.dk"));
    harvested.setSeed(true);
    harvested.setSourceTag("www.ssdown.dk");

    SeedUriDomainnameQueueAssignmentPolicy queuePolicy = new SeedUriDomainnameQueueAssignmentPolicy();
    String actualKey = queuePolicy.getClassKey(harvested);

    // The key must reflect the scheme-less seed's domain.
    assertEquals("ssdown.dk", actualKey);
}

@Test
public void getClassKeyTestChain() throws Exception {
SeedUriDomainnameQueueAssignmentPolicy policy = new SeedUriDomainnameQueueAssignmentPolicy();
Expand Down

0 comments on commit a37f279

Please sign in to comment.