Make sure we don't keep cached zero-length result files for crawl log…

…s and metadata indices
netarchivesuite · Jun 8, 2021 · 5d54c34 · 5d54c34
1 parent b575541
commit 5d54c34
Showing 1 changed file with 5 additions and 2 deletions.
diff --git a/harvester/harvester-core/src/main/java/dk/netarkivet/viewerproxy/webinterface/Reporting.java b/harvester/harvester-core/src/main/java/dk/netarkivet/viewerproxy/webinterface/Reporting.java
@@ -180,7 +180,7 @@ private static File getCDXCacheFile(long jobid) {
     private static List<CDXRecord> getCachedCDXRecords(long jobid) {
         List<String> cdxLines;
         File cacheFile = getCDXCacheFile(jobid);
-        if (cacheFile.exists()) {
+        if (cacheFile.exists() && cacheFile.length() != 0) {
             try {
                 cdxLines = org.apache.commons.io.FileUtils.readLines(cacheFile);
                 return HadoopJobUtils.getCDXRecordListFromCDXLines(cdxLines);
@@ -384,7 +384,10 @@ private static File getCrawlLogCache(long jobid) {
      */
     private static File getCrawlLogLinesUsingHadoop(long jobID, String regex) {
         File cacheFile = getCrawlLogCache(jobID);
-        if (!cacheFile.exists()) {
+        if (cacheFile.exists() && cacheFile.length() == 0) {
+            log.info("Overwriting empty cache file {}.", cacheFile.getAbsolutePath());
+        }
+        if (cacheFile.length()==0 || !cacheFile.exists()) { //The || part of this is strictly unnecessary
             File outputFile = getCrawlLogUsingHadoop(jobID);
             try {
                 org.apache.commons.io.FileUtils.copyFile(outputFile, cacheFile);