Skip to content

Commit

Permalink
NAS-2463: Fix of unit test
Browse files — browse the repository at this point in the history
  • Loading branch information
jrg committed Dec 19, 2017
1 parent b581dd3 commit 38e4903
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 20 deletions.
Expand Up @@ -248,7 +248,9 @@ public Stream<String> getCrawledUrls(long jobId, Heritrix3JobMonitor h3Job) {

try {
Stream<String> attemptedHarvestedUrlsFromCrawllog = Files.lines(Paths.get(crawlLogPath),
Charset.forName("UTF-8")).filter(line -> urlInLineIsAttemptedHarvested(line));
Charset.forName("UTF-8"))
.filter(line -> urlInLineIsAttemptedHarvested(line))
.map(line -> line.split("\\s+")[3]);

return attemptedHarvestedUrlsFromCrawllog;
} catch (java.io.IOException e) {
Expand Down Expand Up @@ -290,12 +292,12 @@ private String normalizeDomainUrl(String url) {
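* @param domainName The domain
* @param h3Job The Heritrix3 job monitor that is handed on to getCrawledUrls (some callers pass null)
* @return whether the given job harvests the given domain
*/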
public boolean jobHarvestsDomain(long jobId, String domainName) {
public boolean jobHarvestsDomain(long jobId, String domainName, Heritrix3JobMonitor h3Job) {
// Normalize search URL
String searchedDomain = normalizeDomainUrl(domainName);

// Return whether or not the crawled URLs contain the searched URL
return getCrawledUrls(jobId, null)
return getCrawledUrls(jobId, h3Job)
.map(url -> normalizeDomainUrl(url))
.anyMatch(url -> searchedDomain.equalsIgnoreCase(url));
}
Expand Down
Expand Up @@ -42,9 +42,9 @@ public void testGetCrawledUrls() throws Exception {

// Create a mock crawllog file
String mockCrawllogContent
= "2005-05-06T11:47:26.550Z 1 53 dns:www.kb.dk P http://www.kb.dk/ text/dns #002 20050506114726441+2 - -\n"
= "2005-05-06T11:47:26.550Z 1 53 dns:www.sb.dk P http://www.sb.dk/ text/dns #002 20050506114726441+2 - -\n"
+ "2005-05-06T11:47:28.464Z 404 278 http://www.netarkivet.dk/robots.txt P http://www.netarkivet.dk/ text/html #028 20050506114728458+5 NYN2HPNQGIPJTPMGAV4QPBUCVJVNMM54 -\n"
+ "2005-05-06T11:47:34.753Z -9998 - https://rex.kb.dk/F L http://www.kb.dk/ no-type #030 - - 3t\n"
+ "2005-05-06T11:47:34.753Z -9998 - https://rex.qb.dk/F L http://www.qb.dk/ no-type #030 - - 3t\n"
+ "2005-05-06T11:47:30.544Z 200 13750 http://www.kb.dk/ - - text/html #001 20050506114730466+32 U4X3Z5EGCNUYTMIXST6BJXGA5SBKTEAJ 3t\n";

File tempFile = File.createTempFile("NASEnvironmentTest-mock-crawllog-", ".tmp");
Expand All @@ -68,20 +68,10 @@ public void testGetCrawledUrls() throws Exception {
Heritrix3JobMonitor h3Job = new Heritrix3JobMonitor();
h3Job.setCrawlLogFilePath(crawlLogFilePath);

Stream<String> crawledUrls = environment.getCrawledUrls(1, h3Job);
List<String> crawled = crawledUrls.collect(Collectors.toList());

// Check whether output corresponds to the input crawllog-mock-file
if (crawled.size() != 2) {
fail("Wrong amount (" + crawled.size() + ") of URLs extracted from crawllog!");
return;
}

if (!crawled.get(0).equalsIgnoreCase("http://www.netarkivet.dk/robots.txt")
|| !crawled.get(1).equalsIgnoreCase("http://www.kb.dk/")) {
fail("URL(s) extracted from crawllog do not match!");
return;
}
assertTrue(environment.jobHarvestsDomain(1, "netarkivet.dk", h3Job));
assertTrue(environment.jobHarvestsDomain(1, "kb.dk", h3Job));
assertFalse(environment.jobHarvestsDomain(1, "rex.qb.dk", h3Job));
assertFalse(environment.jobHarvestsDomain(1, "sb.dk", h3Job));
}

}
Expand Up @@ -304,7 +304,7 @@ This page displays a list of running jobs.
if (searchedDomainName != null && !searchedDomainName.equals("")) {
// Something's been searched for, so let's see if this job should be skipped according to the search...
if (HistoryServlet.environment != null
&& !HistoryServlet.environment.jobHarvestsDomain(jobId, searchedDomainName)) {
&& !HistoryServlet.environment.jobHarvestsDomain(jobId, searchedDomainName, null)) {
// Current job doesn't harvest searched domain, so don't show it. Continue from the next job.
continue;
}
Expand Down

0 comments on commit 38e4903

Please sign in to comment.