Commit 1d15d60

Also cache crawl logs

csrster committed May 19, 2021
1 parent ad347f1
Showing 1 changed file with 35 additions and 3 deletions.
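In outline, this commit gives crawl logs the same treatment CDX records already get: run the expensive Hadoop extraction once per job, keep the raw output in a per-job cache file, and answer later regex queries by filtering the cached copy locally. A minimal, self-contained sketch of that pattern follows; the class and method names (CrawlLogCacheSketch, extractFullCrawlLog), the cache paths, and the temp-file stand-in for the Hadoop job are hypothetical illustrations, not the API of the code being changed.

    import java.io.File;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.util.List;
    import java.util.stream.Collectors;

    public class CrawlLogCacheSketch {

        /** Per-job cache file; create only the parent directory, never the file itself. */
        static File cacheFileForJob(long jobId) {
            File cacheDir = new File(new File("metadata_cache"), "crawllog_cache");
            cacheDir.mkdirs();
            return new File(cacheDir, Long.toString(jobId));
        }

        /** Stand-in for the full, unfiltered Hadoop extraction (hypothetical). */
        static File extractFullCrawlLog(long jobId) throws IOException {
            File out = File.createTempFile("crawllog-" + jobId + "-", ".txt");
            Files.write(out.toPath(),
                    List.of("2021-05-19 200 https://example.com/foo", "2021-05-19 404 https://example.com/bar"),
                    StandardCharsets.UTF_8);
            return out;
        }

        /** Extract once, cache the full log, then filter the cached copy per query. */
        static List<String> linesMatching(long jobId, String regex) throws IOException {
            File cacheFile = cacheFileForJob(jobId);
            if (!cacheFile.exists()) {
                File extracted = extractFullCrawlLog(jobId);
                Files.copy(extracted.toPath(), cacheFile.toPath());
            }
            return Files.readAllLines(cacheFile, StandardCharsets.UTF_8).stream()
                    .filter(s -> s.matches(regex)) // matches() spans the whole line
                    .collect(Collectors.toList());
        }

        public static void main(String[] args) throws IOException {
            System.out.println(linesMatching(42L, ".*foo.*")); // [2021-05-19 200 https://example.com/foo]
        }
    }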
@@ -173,8 +173,9 @@ public static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
     private static File getCDXCache(long jobid) {
         String cacheDir = "metadata_cache";
         String cdxcache = "cdxcache";
-        File cacheFile = new File(new File(new File(cacheDir), cdxcache), "" + jobid);
-        cacheFile.mkdirs();
+        File cdxdir = new File(new File(cacheDir), cdxcache);
+        cdxdir.mkdirs();
+        File cacheFile = new File(cdxdir, "" + jobid);
         return cacheFile;
     }

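The first hunk is a bug fix in its own right: the old code called mkdirs() on the cache file path itself, which creates a directory named after the job id at exactly the spot where the cache entry should later be written as a regular file. The new code creates only the parent directory (metadata_cache/cdxcache) and leaves the per-job path untouched; the new getCrawlLogCache below follows the same corrected pattern.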
@@ -363,6 +364,17 @@ public static File getCrawlLoglinesMatchingRegexp(long jobid, String regexp) {
         }
     }
 
+    private static File getCrawlLogCache(long jobid) {
+        String cacheDir = "metadata_cache";
+        String crawllog_cache = "crawllog_cache";
+        File crawllog_dir = new File(new File(cacheDir), crawllog_cache);
+        crawllog_dir.mkdirs();
+        File cacheFile = new File(crawllog_dir, "" + jobid);
+        return cacheFile;
+    }
+
+
+
     /**
      * Using Hadoop, gets crawllog lines for a given jobID matching a given regular expression.
      *
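getCrawlLogCache is a near copy of getCDXCache from the first hunk, differing only in the subdirectory name. If a third cache ever appears, the shared shape could be factored into one parameterized helper along these lines (getJobCacheFile is a hypothetical name, not something this commit introduces):

    // Hypothetical consolidation of getCDXCache/getCrawlLogCache; not in the commit.
    private static File getJobCacheFile(String subdir, long jobid) {
        File dir = new File(new File("metadata_cache"), subdir);
        dir.mkdirs(); // ensure the parent exists; the per-job file is written by the caller
        return new File(dir, Long.toString(jobid));
    }
    // e.g. getJobCacheFile("cdxcache", jobid) or getJobCacheFile("crawllog_cache", jobid)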
@@ -371,9 +383,29 @@ public static File getCrawlLoglinesMatchingRegexp(long jobid, String regexp) {
      * @return a File with the matching lines.
      */
     private static File getCrawlLogLinesUsingHadoop(long jobID, String regex) {
+        File cacheFile = getCrawlLogCache(jobID);
+        if (!cacheFile.exists()) {
+            File outputFile = getCrawlLogUsingHadoop(jobID);
+            try {
+                org.apache.commons.io.FileUtils.copyFile(outputFile, cacheFile);
+            } catch (IOException e) {
+                throw new RuntimeException((e));
+            }
+        }
+        List<String> matches = null;
+        try {
+            matches = org.apache.commons.io.FileUtils.readLines(cacheFile).stream().filter(s -> s.matches(regex)).collect(
+                    Collectors.toList());
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return getResultFile(matches);
+    }
+
+    private static File getCrawlLogUsingHadoop(long jobID) {
         String metadataFileSearchPattern = getMetadataFilePatternForJobId(jobID);
         Configuration hadoopConf = HadoopJobUtils.getConf();
-        hadoopConf.setPattern("regex", Pattern.compile(regex));
+        hadoopConf.setPattern("regex", Pattern.compile(".*"));
         try (FileSystem fileSystem = FileSystem.newInstance(hadoopConf)) {
             HadoopJobStrategy jobStrategy = new CrawlLogExtractionStrategy(jobID, fileSystem);
             HadoopJob job = new HadoopJob(jobID, jobStrategy);
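Two things are worth noting about the rewritten extraction path. First, the Hadoop job now always runs with Pattern.compile(".*"), so the cached file holds the complete crawl log and every subsequent regex query for that job is answered from local disk without another cluster round trip. Second, filtering now happens client-side via String.matches, which anchors the pattern to the entire line, so callers must supply patterns that span whole lines. A small demonstration of that semantic:

    public class MatchesSemanticsDemo {
        public static void main(String[] args) {
            String line = "2021-05-19T12:00:00 200 https://example.com/robots.txt";
            System.out.println(line.matches("robots"));     // false: pattern must cover the whole line
            System.out.println(line.matches(".*robots.*")); // true
        }
    }

(Also worth knowing: the charset-less FileUtils.readLines(File) overload used here is deprecated in recent commons-io releases in favour of readLines(File, Charset), since it silently uses the platform default encoding.)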
