Changed domain match to element 3 of crawl log line
csrster committed Jul 8, 2022
1 parent 85b7a3a commit afd2015
Showing 2 changed files with 84 additions and 52 deletions.
@@ -23,13 +23,15 @@
package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
@@ -312,39 +314,9 @@ private static File createTempResultFile(String uuidSuffix) {
        return tempFile;
    }

-    /**
-     * Helper method to get sorted File of crawllog lines.
-     *
-     * @param crawlLogLines The crawllog lines output from a job.
-     * @return A File containing the sorted lines.
-     */
-    private static File createSortedResultFile(List<String> crawlLogLines) {
-        final String uuid = UUID.randomUUID().toString();
-        File tempFile = createTempResultFile(uuid);
-        File sortedTempFile = createTempResultFile(uuid + "-sorted");
-        FileUtils.writeCollectionToFile(tempFile, crawlLogLines);
-        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
-        FileUtils.remove(tempFile);
-        return sortedTempFile;
-    }
-
-    /**
-     * Helper method to get result from a batchjob.
-     *
-     * @param batchJob a certain FileBatchJob
-     * @return a file with the result.
-     */
-    private static File createSortedResultFile(FileBatchJob batchJob) {
-        final String uuid = UUID.randomUUID().toString();
-        File tempFile = createTempResultFile(uuid);
-        File sortedTempFile = createTempResultFile(uuid);
-        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
-                Settings.get(CommonSettings.USE_REPLICA_ID));
-        status.getResultFile().copyTo(tempFile);
-        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
-        FileUtils.remove(tempFile);
-        return sortedTempFile;
-    }



    /**
     * Return any crawllog lines for a given jobid matching the given regular expression.
@@ -385,13 +357,82 @@ private static File getCrawlLogCache(long jobid) {
     */
    private static File getCrawlLogLinesUsingHadoop(long jobID, String regex) {
        File cacheFile = getCrawlLogFromCacheOrHdfs(jobID);
-        //TODO Use a pattern like https://stackoverflow.com/questions/65111979/write-a-streamstring-to-a-file-java
-        //to write the results directly to a file and then sort the file externally
-        //Otherwise we get an OOM here !!!!
-        List<String> matches = getMatchingStringsFromFile(cacheFile, regex);
-        return createSortedResultFile(matches);
+        Pattern regexp = Pattern.compile(regex);
+        log.info("Filtering cache file {} with regexp {}", cacheFile.getAbsolutePath(), regex);
+        final String uuid = UUID.randomUUID().toString();
+        File tempFile = createTempResultFile(uuid);
+        log.info("Unsorted results in {}.", tempFile.getAbsolutePath());
+        File sortedTempFile = createTempResultFile(uuid + "-sorted");
+        log.info("Sorted results in {}.", sortedTempFile.getAbsolutePath());
+        try (BufferedWriter writer = Files.newBufferedWriter(tempFile.toPath())) {
+            try (BufferedReader reader = Files.newBufferedReader(cacheFile.toPath())) {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    if (regexp.matcher(line).matches()) {
+                        writer.write(line);
+                        writer.newLine();
+                    }
+                }
+            } catch (IOException e) {
+                throw new RuntimeException("Error reading file " + cacheFile.getAbsolutePath(), e);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Error writing to file " + tempFile.getAbsolutePath(), e);
+        }
+        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
+        FileUtils.remove(tempFile);
+        return sortedTempFile;
    }
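
The TODO removed above links to a stream-to-file idiom; for comparison, the same filter step written with java.nio streams might look like this sketch (reusing the cacheFile, regexp, and tempFile names from the method body; it is not part of the commit, and java.util.stream.Stream and java.io.UncheckedIOException would need imports):

    // Sketch only: stream the matching lines straight to tempFile.
    try (Stream<String> lines = Files.lines(cacheFile.toPath());
            BufferedWriter writer = Files.newBufferedWriter(tempFile.toPath())) {
        lines.filter(line -> regexp.matcher(line).matches()).forEachOrdered(line -> {
            try {
                writer.write(line);
                writer.newLine();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        });
    } catch (IOException e) {
        throw new RuntimeException("Error filtering " + cacheFile.getAbsolutePath(), e);
    }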

+    private static List<String> getMatchingStringsFromFile(File cacheFile, String regex) {
+        List<String> matches = null;
+        Pattern regexp = Pattern.compile(regex);
+        try {
+            matches = org.apache.commons.io.FileUtils.readLines(cacheFile).stream()
+                    .filter(s -> regexp.matcher(s).matches())
+                    .collect(Collectors.toList());
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return matches;
+    }

+    /**
+     * Helper method to get sorted File of crawllog lines.
+     *
+     * @param crawlLogLines The crawllog lines output from a job.
+     * @return A File containing the sorted lines.
+     */
+    private static File createSortedResultFile(List<String> crawlLogLines) {
+        final String uuid = UUID.randomUUID().toString();
+        File tempFile = createTempResultFile(uuid);
+        File sortedTempFile = createTempResultFile(uuid + "-sorted");
+        FileUtils.writeCollectionToFile(tempFile, crawlLogLines);
+        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
+        FileUtils.remove(tempFile);
+        return sortedTempFile;
+    }


+    /**
+     * Helper method to get result from a batchjob.
+     *
+     * @param batchJob a certain FileBatchJob
+     * @return a file with the result.
+     */
+    private static File createSortedResultFile(FileBatchJob batchJob) {
+        final String uuid = UUID.randomUUID().toString();
+        File tempFile = createTempResultFile(uuid);
+        File sortedTempFile = createTempResultFile(uuid);
+        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
+                Settings.get(CommonSettings.USE_REPLICA_ID));
+        status.getResultFile().copyTo(tempFile);
+        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
+        FileUtils.remove(tempFile);
+        return sortedTempFile;
+    }

    //Called from .jsp
    public static File getCrawlLogLinesMatchingDomain(long jobID, String domain) {
        log.info("Finding matching crawl log lines for {} in job {}", domain, jobID);
        File cacheFile = getCrawlLogFromCacheOrHdfs(jobID);
@@ -401,6 +442,7 @@ public static File getCrawlLogLinesMatchingDomain(long jobID, String domain) {
        return createSortedResultFile(matches);
    }

+    //TODO this is also a walking oom
    private static List<String> getMatchingDomainStringsFromFile(File cacheFile, String domain) {
        try {
            return org.apache.commons.io.FileUtils.readLines(cacheFile).stream()
@@ -412,8 +454,9 @@ private static List<String> getMatchingDomainStringsFromFile(File cacheFile, String domain) {
    }
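
The TODO above flags the same out-of-memory risk that getCrawlLogLinesUsingHadoop just fixed. A streaming variant in the same style could look like the following sketch (the helper name getMatchingDomainLinesFile is hypothetical and not part of this commit):

    // Hypothetical sketch: filter by domain without reading the whole file into memory.
    private static File getMatchingDomainLinesFile(File cacheFile, String domain) {
        File tempFile = createTempResultFile(UUID.randomUUID().toString());
        try (BufferedReader reader = Files.newBufferedReader(cacheFile.toPath());
                BufferedWriter writer = Files.newBufferedWriter(tempFile.toPath())) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (lineMatchesDomain(line, domain)) {
                    writer.write(line);
                    writer.newLine();
                }
            }
        } catch (IOException e) {
            throw new RuntimeException("Error filtering " + cacheFile.getAbsolutePath(), e);
        }
        return tempFile;
    }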

    private static boolean lineMatchesDomain(String crawlLine, String domain) {
+        int urlElement = 3;
+        String urlS = crawlLine.split("\\s+")[urlElement];
        try {
-            String urlS = crawlLine.split("\\s+")[10];
            URL url = new URL(urlS);
            if (url.getHost().equals(domain) || url.getHost().endsWith("."+domain)) {
                log.debug("Domain {} found in crawlline {}", domain, crawlLine);
@@ -423,23 +466,12 @@ private static boolean lineMatchesDomain(String crawlLine, String domain) {
                return false;
            }
        } catch (Exception e) {
-            log.debug("No domain to match found in {}", crawlLine);
+            log.debug("No domain to match found in element {} of '{}' which is '{}'", urlElement, crawlLine, urlS);
            return false;
        }
    }
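
For reference, the urlElement = 3 convention assumes the standard Heritrix crawl.log layout, where the whitespace-separated fields are timestamp, fetch status, size, URI, discovery path, referrer, mime type, and so on, making index 3 the fetched URI. A minimal illustration with a made-up log line:

    // Made-up crawl.log line, illustrative only.
    String crawlLine = "2022-07-08T11:14:00.123Z 200 5123 https://www.example.org/page.html"
            + " LLE https://www.example.org/ text/html #042 20220708111400000+123"
            + " sha1:EXAMPLEDIGEST - -";
    String urlS = crawlLine.split("\\s+")[3];  // "https://www.example.org/page.html"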

-    private static List<String> getMatchingStringsFromFile(File cacheFile, String regex) {
-        List<String> matches = null;
-        Pattern regexp = Pattern.compile(regex);
-        try {
-            matches = org.apache.commons.io.FileUtils.readLines(cacheFile).stream()
-                    .filter(s -> regexp.matcher(s).matches())
-                    .collect(Collectors.toList());
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-        return matches;
-    }


    private static File getCrawlLogFromCacheOrHdfs(long jobID) {
        File cacheFile = getCrawlLogCache(jobID);
2 changes: 1 addition & 1 deletion pom.xml
@@ -20,7 +20,7 @@
        <!-- Heritrix versions are from https://github.com/netarchivesuite/heritrix3 which tracks the official
             repository at https://github.com/internetarchive/heritrix3 as closely as we can -->
        <heritrix3.version>3.4.0-NAS-7.3</heritrix3.version>
-       <heritrix3-wrapper.version>1.0.4</heritrix3-wrapper.version>
+       <heritrix3-wrapper.version>1.0.5-SNAPSHOT</heritrix3-wrapper.version>
        <wayback.version>1.8.0-20130411</wayback.version>
        <openwayback.version>2.0.0</openwayback.version>
        <jwat.version>1.0.4</jwat.version>
