Changed domain match to element 3 of crawl log line
csrster committed Jul 8, 2022
1 parent 85b7a3a commit afd2015
Showing 2 changed files with 84 additions and 52 deletions.
@@ -23,13 +23,15 @@
package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
@@ -312,39 +314,9 @@ private static File createTempResultFile(String uuidSuffix) {
        return tempFile;
    }

-    /**
-     * Helper method to get sorted File of crawllog lines.
-     *
-     * @param crawlLogLines The crawllog lines output from a job.
-     * @return A File containing the sorted lines.
-     */
-    private static File createSortedResultFile(List<String> crawlLogLines) {
-        final String uuid = UUID.randomUUID().toString();
-        File tempFile = createTempResultFile(uuid);
-        File sortedTempFile = createTempResultFile(uuid + "-sorted");
-        FileUtils.writeCollectionToFile(tempFile, crawlLogLines);
-        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
-        FileUtils.remove(tempFile);
-        return sortedTempFile;
-    }
-
-    /**
-     * Helper method to get result from a batchjob.
-     *
-     * @param batchJob a certain FileBatchJob
-     * @return a file with the result.
-     */
-    private static File createSortedResultFile(FileBatchJob batchJob) {
-        final String uuid = UUID.randomUUID().toString();
-        File tempFile = createTempResultFile(uuid);
-        File sortedTempFile = createTempResultFile(uuid);
-        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
-                Settings.get(CommonSettings.USE_REPLICA_ID));
-        status.getResultFile().copyTo(tempFile);
-        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
-        FileUtils.remove(tempFile);
-        return sortedTempFile;
-    }



    /**
     * Return any crawllog lines for a given jobid matching the given regular expression.
@@ -385,13 +357,82 @@ private static File getCrawlLogCache(long jobid) {
     */
    private static File getCrawlLogLinesUsingHadoop(long jobID, String regex) {
        File cacheFile = getCrawlLogFromCacheOrHdfs(jobID);
-        //TODO Use a pattern like https://stackoverflow.com/questions/65111979/write-a-streamstring-to-a-file-java
-        //to write the results directly to a file and then sort the file externally
-        //Otherwise we get an OOM here !!!!
-        List<String> matches = getMatchingStringsFromFile(cacheFile, regex);
-        return createSortedResultFile(matches);
+        Pattern regexp = Pattern.compile(regex);
+        log.info("Filtering cache file {} with regexp {}", cacheFile.getAbsolutePath(), regex);
+        final String uuid = UUID.randomUUID().toString();
+        File tempFile = createTempResultFile(uuid);
+        log.info("Unsorted results in {}.", tempFile.getAbsolutePath());
+        File sortedTempFile = createTempResultFile(uuid + "-sorted");
+        log.info("Sorted results in {}.", sortedTempFile.getAbsolutePath());
+        try (BufferedWriter writer = Files.newBufferedWriter(tempFile.toPath())) {
+            try (BufferedReader reader = Files.newBufferedReader(cacheFile.toPath())) {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    if (regexp.matcher(line).matches()) {
+                        writer.write(line);
+                        writer.newLine();
+                    }
+                }
+            } catch (IOException e) {
+                throw new RuntimeException("Error reading file " + cacheFile.getAbsolutePath(), e);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Error writing to file " + tempFile.getAbsolutePath(), e);
+        }
+        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
+        FileUtils.remove(tempFile);
+        return sortedTempFile;
    }
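
The TODO removed above links to a stream-to-file idiom; for comparison, the same filter step written with java.nio streams might look like this sketch (reusing the cacheFile, regexp, and tempFile names from the method body; it is not part of the commit, and java.util.stream.Stream and java.io.UncheckedIOException would need imports):

    // Sketch only: stream the matching lines straight to tempFile.
    try (Stream<String> lines = Files.lines(cacheFile.toPath());
            BufferedWriter writer = Files.newBufferedWriter(tempFile.toPath())) {
        lines.filter(line -> regexp.matcher(line).matches()).forEachOrdered(line -> {
            try {
                writer.write(line);
                writer.newLine();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        });
    } catch (IOException e) {
        throw new RuntimeException("Error filtering " + cacheFile.getAbsolutePath(), e);
    }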

+    private static List<String> getMatchingStringsFromFile(File cacheFile, String regex) {
+        List<String> matches = null;
+        Pattern regexp = Pattern.compile(regex);
+        try {
+            matches = org.apache.commons.io.FileUtils.readLines(cacheFile).stream()
+                    .filter(s -> regexp.matcher(s).matches())
+                    .collect(Collectors.toList());
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return matches;
+    }

+    /**
+     * Helper method to get sorted File of crawllog lines.
+     *
+     * @param crawlLogLines The crawllog lines output from a job.
+     * @return A File containing the sorted lines.
+     */
+    private static File createSortedResultFile(List<String> crawlLogLines) {
+        final String uuid = UUID.randomUUID().toString();
+        File tempFile = createTempResultFile(uuid);
+        File sortedTempFile = createTempResultFile(uuid + "-sorted");
+        FileUtils.writeCollectionToFile(tempFile, crawlLogLines);
+        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
+        FileUtils.remove(tempFile);
+        return sortedTempFile;
+    }


+    /**
+     * Helper method to get result from a batchjob.
+     *
+     * @param batchJob a certain FileBatchJob
+     * @return a file with the result.
+     */
+    private static File createSortedResultFile(FileBatchJob batchJob) {
+        final String uuid = UUID.randomUUID().toString();
+        File tempFile = createTempResultFile(uuid);
+        File sortedTempFile = createTempResultFile(uuid);
+        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
+                Settings.get(CommonSettings.USE_REPLICA_ID));
+        status.getResultFile().copyTo(tempFile);
+        FileUtils.sortCrawlLogOnTimestamp(tempFile, sortedTempFile);
+        FileUtils.remove(tempFile);
+        return sortedTempFile;
+    }

    //Called from .jsp
    public static File getCrawlLogLinesMatchingDomain(long jobID, String domain) {
        log.info("Finding matching crawl log lines for {} in job {}", domain, jobID);
        File cacheFile = getCrawlLogFromCacheOrHdfs(jobID);
@@ -401,6 +442,7 @@ public static File getCrawlLogLinesMatchingDomain(long jobID, String domain) {
        return createSortedResultFile(matches);
    }

+    //TODO this is also a walking oom
    private static List<String> getMatchingDomainStringsFromFile(File cacheFile, String domain) {
        try {
            return org.apache.commons.io.FileUtils.readLines(cacheFile).stream()
@@ -412,8 +454,9 @@ private static List<String> getMatchingDomainStringsFromFile(File cacheFile, String domain) {
    }
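
The TODO above flags the same out-of-memory risk that getCrawlLogLinesUsingHadoop just fixed. A streaming variant in the same style could look like the following sketch (the helper name getMatchingDomainLinesFile is hypothetical and not part of this commit):

    // Hypothetical sketch: filter by domain without reading the whole file into memory.
    private static File getMatchingDomainLinesFile(File cacheFile, String domain) {
        File tempFile = createTempResultFile(UUID.randomUUID().toString());
        try (BufferedReader reader = Files.newBufferedReader(cacheFile.toPath());
                BufferedWriter writer = Files.newBufferedWriter(tempFile.toPath())) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (lineMatchesDomain(line, domain)) {
                    writer.write(line);
                    writer.newLine();
                }
            }
        } catch (IOException e) {
            throw new RuntimeException("Error filtering " + cacheFile.getAbsolutePath(), e);
        }
        return tempFile;
    }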

    private static boolean lineMatchesDomain(String crawlLine, String domain) {
+        int urlElement = 3;
+        String urlS = crawlLine.split("\\s+")[urlElement];
        try {
-            String urlS = crawlLine.split("\\s+")[10];
            URL url = new URL(urlS);
            if (url.getHost().equals(domain) || url.getHost().endsWith("."+domain)) {
                log.debug("Domain {} found in crawlline {}", domain, crawlLine);
@@ -423,23 +466,12 @@ private static boolean lineMatchesDomain(String crawlLine, String domain) {
                return false;
            }
        } catch (Exception e) {
-            log.debug("No domain to match found in {}", crawlLine);
+            log.debug("No domain to match found in element {} of '{}' which is '{}'", urlElement, crawlLine, urlS);
            return false;
        }
    }
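
For reference, the urlElement = 3 convention assumes the standard Heritrix crawl.log layout, where the whitespace-separated fields are timestamp, fetch status, size, URI, discovery path, referrer, mime type, and so on, making index 3 the fetched URI. A minimal illustration with a made-up log line:

    // Made-up crawl.log line, illustrative only.
    String crawlLine = "2022-07-08T11:14:00.123Z 200 5123 https://www.example.org/page.html"
            + " LLE https://www.example.org/ text/html #042 20220708111400000+123"
            + " sha1:EXAMPLEDIGEST - -";
    String urlS = crawlLine.split("\\s+")[3];  // "https://www.example.org/page.html"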

-    private static List<String> getMatchingStringsFromFile(File cacheFile, String regex) {
-        List<String> matches = null;
-        Pattern regexp = Pattern.compile(regex);
-        try {
-            matches = org.apache.commons.io.FileUtils.readLines(cacheFile).stream()
-                    .filter(s -> regexp.matcher(s).matches())
-                    .collect(Collectors.toList());
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-        return matches;
-    }


    private static File getCrawlLogFromCacheOrHdfs(long jobID) {
        File cacheFile = getCrawlLogCache(jobID);
2 changes: 1 addition & 1 deletion pom.xml
@@ -20,7 +20,7 @@
        <!-- Heritrix versions are from https://github.com/netarchivesuite/heritrix3 which tracks the official
             repository at https://github.com/internetarchive/heritrix3 as closely as we can -->
        <heritrix3.version>3.4.0-NAS-7.3</heritrix3.version>
-       <heritrix3-wrapper.version>1.0.4</heritrix3-wrapper.version>
+       <heritrix3-wrapper.version>1.0.5-SNAPSHOT</heritrix3-wrapper.version>
        <wayback.version>1.8.0-20130411</wayback.version>
        <openwayback.version>2.0.0</openwayback.version>
        <jwat.version>1.0.4</jwat.version>
