
Commit

Fixed NAS-2752 - disabled by default the search for duplicationmigration records
Søren Vejrup Carlsen committed May 18, 2018
1 parent 86c5de2 commit ceeed43
Showing 5 changed files with 36 additions and 10 deletions.
@@ -422,6 +422,7 @@
<indexingtimeout>259200000</indexingtimeout>
<maxsegments>42</maxsegments>
<satisfactorythresholdpercentage>70</satisfactorythresholdpercentage>
<tryToMigrateDuplicationRecords>false</tryToMigrateDuplicationRecords>
<lookfordataInAllBitarchiveReplicas>false</lookfordataInAllBitarchiveReplicas>
<indexrequestserver>
<class>dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer</class>
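The new <tryToMigrateDuplicationRecords> element above ships disabled. When it is false, RawMetadataCache (changed below) copies batch results straight to the cache file instead of first searching for duplicationmigration records. An installation that still needs the old migration behaviour can opt back in from its settings file; a minimal sketch, with the element name and placement exactly as in the hunk above and only the value changed:

<tryToMigrateDuplicationRecords>true</tryToMigrateDuplicationRecords>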
@@ -697,10 +697,17 @@ public class HarvesterSettings {
*/
public static String INDEXSERVER_INDEXING_LISTENING_INTERVAL = "settings.harvester.indexserver.listeningcheckinterval";
/**
* <b>settings.archive.indexserver.satisfactorythresholdpercentage</b>: <br>
* <b>settings.harvester.indexserver.satisfactorythresholdpercentage</b>: <br>
* Setting for the satisfactory threshold of the indexing result as a percentage. The default is 70 percent
*/
public static String INDEXSERVER_INDEXING_SATISFACTORYTHRESHOLD_PERCENTAGE = "settings.harvester.indexserver.satisfactorythresholdpercentage";

/**
* <b>settings.harvester.indexserver.tryToMigrateDuplicationRecords</b>: <br>
* Setting for trying to migrate deduplication information from old jobs with duplicationmigration records.
* The default is false.
*/
public static String INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS = "settings.harvester.indexserver.tryToMigrateDuplicationRecords";

/**
* <b>settings.harvester.indexserver.indexrequestserver.class</b>: <br>
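For reference, a minimal sketch (not part of the commit) of reading the new constant; Settings.getBoolean and the constant are the ones this diff touches, while the wrapper class and main method are hypothetical:

import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

public class DuplicationMigrationToggleSketch {
    public static void main(String[] args) {
        // Resolves to false unless the settings file overrides it.
        boolean tryToMigrate = Settings.getBoolean(
                HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS);
        System.out.println("tryToMigrateDuplicationRecords = " + tryToMigrate);
    }
}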
@@ -73,9 +73,11 @@ public class RawMetadataCache extends FileBasedCache<Long> implements RawDataCac
/** The actual pattern to be used for matching the url in the metadata record */
private Pattern urlPattern;

/** The actual pattern to be used for matching the mimetype in the metadata record */
private Pattern mimePattern;

/** Try to migrate jobs with a duplicationmigration record. */
private boolean tryToMigrateDuplicationRecords;
/**
* Create a new RawMetadataCache. For a given job ID, this will fetch and cache selected content from metadata files
* (&lt;ID&gt;-metadata-[0-9]+.arc). Any entry in a metadata file that matches both patterns will be returned. The
@@ -107,6 +109,8 @@ public RawMetadataCache(String prefix, Pattern urlMatcher, Pattern mimeMatcher)
log.info("Metadata cache for '{}' is fetching metadata with urls matching '{}' and mimetype matching '{}'",
prefix, urlMatcher1.toString(), mimeMatcher1);
job = new GetMetadataArchiveBatchJob(urlMatcher1, mimeMatcher1);
// Should we try to migrate duplication records?
tryToMigrateDuplicationRecords = Settings.getBoolean(HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS);
}

/**
@@ -133,6 +137,7 @@ public File getCacheFile(Long id) {
protected Long cacheData(Long id) {
final String replicaUsed = Settings.get(CommonSettings.USE_REPLICA_ID);
final String metadataFilePatternSuffix = Settings.get(CommonSettings.METADATAFILE_REGEX_SUFFIX);

//FIXME The current specifiedPattern also accepts files that include the Id in the metadatafile name, either
// as a prefix, infix, or suffix (NAS-1712)
final String specifiedPattern = ".*" + id + ".*" + metadataFilePatternSuffix;
@@ -148,7 +153,13 @@ protected Long cacheData(Long id) {
// Mind you, the data may be empty, but at least one file was
// successfully processed.
if (b.hasResultFile() && b.getNoOfFilesProcessed() > b.getFilesFailed().size()) {
migrateDuplicates(id, replicaUsed, specifiedPattern, b);
File cacheFileName = getCacheFile(id);
if (tryToMigrateDuplicationRecords) {
migrateDuplicates(id, replicaUsed, specifiedPattern, b, cacheFileName);
} else {
b.copyResults(cacheFileName);
}
log.debug("Cached data for job '{}' for '{}'", id, prefix);
return id;
} else {
// Look for data in other bitarchive replicas, if this option is enabled
@@ -166,7 +177,13 @@
// Perform same check as for the batchresults from
// the default replica.
if (b.hasResultFile() && (b.getNoOfFilesProcessed() > b.getFilesFailed().size())) {
migrateDuplicates(id, rep.getId(), specifiedPattern, b);
File cacheFileName = getCacheFile(id);
if (tryToMigrateDuplicationRecords) {
migrateDuplicates(id, rep.getId(), specifiedPattern, b, cacheFileName);
} else {
b.copyResults(cacheFileName);
}
log.debug("Cached data for job '{}' for '{}'", id, prefix);
return id;
} else {
log.trace("No data found for job '{}' for '{}' in bitarchive '{}'. ", id, prefix, rep);
@@ -189,9 +206,9 @@ protected Long cacheData(Long id) {
* @param specifiedPattern the pattern specifying the files to be found
* @param originalBatchJob the original batch job which returned the unmigrated data.
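* @param cacheFileName the file the cached (and possibly migrated) data should be copied into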
*/
private void migrateDuplicates(Long id, String replicaUsed, String specifiedPattern, BatchStatus originalBatchJob) {
File cacheFileName = getCacheFile(id);
private void migrateDuplicates(Long id, String replicaUsed, String specifiedPattern, BatchStatus originalBatchJob, File cacheFileName) {
Pattern duplicatePattern = Pattern.compile(".*duplicate:\"([^,]+),([0-9]+).*");
log.debug("Looking for a duplicationmigration record for id {}", id);
if (urlPattern.pattern().equals(MetadataFile.CRAWL_LOG_PATTERN)) {
GetMetadataArchiveBatchJob job2 = new GetMetadataArchiveBatchJob(Pattern.compile(".*duplicationmigration.*"), Pattern.compile("text/plain"));
job2.processOnlyFilesMatching(specifiedPattern);
@@ -208,7 +225,7 @@ private void migrateDuplicates(Long id, String replicaUsed, String specifiedPatt
boolean doMigration = migration.exists() && migration.length() > 0;
Hashtable<Pair<String, Long>, Long> lookup = new Hashtable<>();
if (doMigration) {
log.info("Doing migration for {}", id);
log.info("Found a nonempty duplicationmigration record. Now we do the migration for job {}", id);
try {
final List<String> migrationLines = org.apache.commons.io.FileUtils.readLines(migration);
log.info("{} migration records found for job {}", migrationLines.size(), id);
@@ -271,7 +288,5 @@ private void migrateDuplicates(Long id, String replicaUsed, String specifiedPatt
} else {
originalBatchJob.copyResults(cacheFileName);
}
log.debug("Cached data for job '{}' for '{}'", id, prefix);
}

}
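As an aside on the migration branch above: duplicatePattern is what pulls a file name and offset out of a crawl-log "duplicate:" annotation. A self-contained sketch, under the assumption that the annotation format is the one implied by the regex itself; the input line is made up:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DuplicatePatternSketch {
    public static void main(String[] args) {
        Pattern duplicatePattern = Pattern.compile(".*duplicate:\"([^,]+),([0-9]+).*");
        // Hypothetical crawl-log line carrying a duplicate annotation.
        String line = "2018-05-18T12:00:00.000Z 200 - duplicate:\"example-job-42.arc,1234\",content-size:512";
        Matcher m = duplicatePattern.matcher(line);
        if (m.matches()) {
            String file = m.group(1);                 // "example-job-42.arc"
            long offset = Long.parseLong(m.group(2)); // 1234
            System.out.println(file + " @ " + offset);
        }
    }
}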
@@ -223,6 +223,7 @@ National Library.
<indexingtimeout>259200000</indexingtimeout>
<maxsegments>42</maxsegments>
<satisfactorythresholdpercentage>70</satisfactorythresholdpercentage>
<tryToMigrateDuplicationRecords>false</tryToMigrateDuplicationRecords>
<lookfordataInAllBitarchiveReplicas>false</lookfordataInAllBitarchiveReplicas>
<indexrequestserver>
<class>dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer</class>
@@ -36,8 +36,9 @@
import org.junit.Test;

import dk.netarkivet.common.arcrepository.TestArcRepositoryClient;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.testutils.FileAsserts;
import dk.netarkivet.testutils.ReflectUtils;
@@ -91,6 +92,7 @@ public void testGetCacheDir() throws Exception {
*/
@Test
public void testCacheMigratedMetadata() throws Exception {
Settings.set(HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS, "true");
TestArcRepositoryClient tarc = new TestArcRepositoryClient(new File(TestInfo.WORKING_DIR, "arcfiles"));
Field arcrepfield = ReflectUtils.getPrivateField(RawMetadataCache.class, "arcrep");
// Try one with just URL pattern.
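Because the default flipped to false, tests that exercise the migration path now have to opt in explicitly, as the added Settings.set line above does. A hedged sketch of the pattern (the test class and method names are made up; restoring the previous value is assumed to be handled by the suite's existing setup and teardown):

import org.junit.Before;
import org.junit.Test;

import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

public class MigrationToggleTestSketch {
    @Before
    public void enableMigration() {
        // The product default is now false, so switch the migration on for these tests.
        Settings.set(HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS, "true");
    }

    @Test
    public void migratesWhenEnabled() {
        // ... construct a RawMetadataCache and assert on the migrated records,
        // as testCacheMigratedMetadata does above ...
    }
}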
