
Commit

Fixed NAS-2752 - disabled by default the search for duplicationmigration records
Søren Vejrup Carlsen committed May 18, 2018
1 parent 86c5de2 commit ceeed43
Showing 5 changed files with 36 additions and 10 deletions.
@@ -422,6 +422,7 @@
<indexingtimeout>259200000</indexingtimeout>
<maxsegments>42</maxsegments>
<satisfactorythresholdpercentage>70</satisfactorythresholdpercentage>
<tryToMigrateDuplicationRecords>false</tryToMigrateDuplicationRecords>
<lookfordataInAllBitarchiveReplicas>false</lookfordataInAllBitarchiveReplicas>
<indexrequestserver>
<class>dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer</class>
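The new <tryToMigrateDuplicationRecords> element above ships disabled. When it is false, RawMetadataCache (changed below) copies batch results straight to the cache file instead of first searching for duplicationmigration records. An installation that still needs the old migration behaviour can opt back in from its settings file; a minimal sketch, with the element name and placement exactly as in the hunk above and only the value changed:

<tryToMigrateDuplicationRecords>true</tryToMigrateDuplicationRecords>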
@@ -697,10 +697,17 @@ public class HarvesterSettings {
*/
public static String INDEXSERVER_INDEXING_LISTENING_INTERVAL = "settings.harvester.indexserver.listeningcheckinterval";
/**
* <b>settings.archive.indexserver.satisfactorythresholdpercentage</b>: <br>
* <b>settings.harvester.indexserver.satisfactorythresholdpercentage</b>: <br>
* Setting for the satisfactory threshold of the indexing result as a percentage. The default is 70 percent
*/
public static String INDEXSERVER_INDEXING_SATISFACTORYTHRESHOLD_PERCENTAGE = "settings.harvester.indexserver.satisfactorythresholdpercentage";

/**
* <b>settings.harvester.indexserver.tryToMigrateDuplicationRecords</b>: <br>
* Setting for trying to migrate deduplication information from old jobs with duplicationmigration records.
* The default is false.
*/
public static String INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS = "settings.harvester.indexserver.tryToMigrateDuplicationRecords";

/**
* <b>settings.harvester.indexserver.indexrequestserver.class</b>: <br>
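For reference, a minimal sketch (not part of the commit) of reading the new constant; Settings.getBoolean and the constant are the ones this diff touches, while the wrapper class and main method are hypothetical:

import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

public class DuplicationMigrationToggleSketch {
    public static void main(String[] args) {
        // Resolves to false unless the settings file overrides it.
        boolean tryToMigrate = Settings.getBoolean(
                HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS);
        System.out.println("tryToMigrateDuplicationRecords = " + tryToMigrate);
    }
}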
@@ -73,9 +73,11 @@ public class RawMetadataCache extends FileBasedCache<Long> implements RawDataCac
/** The actual pattern to be used for matching the url in the metadata record */
private Pattern urlPattern;

/** The actual pattern to be used for matching the mimetype in the metadata record */
private Pattern mimePattern;

/** Try to migrate jobs with a duplicationmigration record. */
private boolean tryToMigrateDuplicationRecords;
/**
* Create a new RawMetadataCache. For a given job ID, this will fetch and cache selected content from metadata files
* (&lt;ID&gt;-metadata-[0-9]+.arc). Any entry in a metadata file that matches both patterns will be returned. The
@@ -107,6 +109,8 @@ public RawMetadataCache(String prefix, Pattern urlMatcher, Pattern mimeMatcher)
log.info("Metadata cache for '{}' is fetching metadata with urls matching '{}' and mimetype matching '{}'",
prefix, urlMatcher1.toString(), mimeMatcher1);
job = new GetMetadataArchiveBatchJob(urlMatcher1, mimeMatcher1);
// Should we try to migrate duplication records?
tryToMigrateDuplicationRecords = Settings.getBoolean(HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS);
}

/**
@@ -133,6 +137,7 @@ public File getCacheFile(Long id) {
protected Long cacheData(Long id) {
final String replicaUsed = Settings.get(CommonSettings.USE_REPLICA_ID);
final String metadataFilePatternSuffix = Settings.get(CommonSettings.METADATAFILE_REGEX_SUFFIX);

//FIXME The current specifiedPattern also accepts files that include the Id in the metadatafile name, either
// as a prefix, infix, or suffix (NAS-1712)
final String specifiedPattern = ".*" + id + ".*" + metadataFilePatternSuffix;
@@ -148,7 +153,13 @@ protected Long cacheData(Long id) {
// Mind you, the data may be empty, but at least one file was
// successfully processed.
if (b.hasResultFile() && b.getNoOfFilesProcessed() > b.getFilesFailed().size()) {
migrateDuplicates(id, replicaUsed, specifiedPattern, b);
File cacheFileName = getCacheFile(id);
if (tryToMigrateDuplicationRecords) {
migrateDuplicates(id, replicaUsed, specifiedPattern, b, cacheFileName);
} else {
b.copyResults(cacheFileName);
}
log.debug("Cached data for job '{}' for '{}'", id, prefix);
return id;
} else {
// Look for data in other bitarchive replicas, if this option is enabled
@@ -166,7 +177,13 @@
// Perform same check as for the batchresults from
// the default replica.
if (b.hasResultFile() && (b.getNoOfFilesProcessed() > b.getFilesFailed().size())) {
migrateDuplicates(id, rep.getId(), specifiedPattern, b);
File cacheFileName = getCacheFile(id);
if (tryToMigrateDuplicationRecords) {
migrateDuplicates(id, rep.getId(), specifiedPattern, b, cacheFileName);
} else {
b.copyResults(cacheFileName);
}
log.debug("Cached data for job '{}' for '{}'", id, prefix);
return id;
} else {
log.trace("No data found for job '{}' for '{}' in bitarchive '{}'. ", id, prefix, rep);
@@ -189,9 +206,9 @@ protected Long cacheData(Long id) {
* @param specifiedPattern the pattern specifying the files to be found
* @param originalBatchJob the original batch job which returned the unmigrated data.
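* @param cacheFileName the file the cached (and possibly migrated) data should be copied into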
*/
private void migrateDuplicates(Long id, String replicaUsed, String specifiedPattern, BatchStatus originalBatchJob) {
File cacheFileName = getCacheFile(id);
private void migrateDuplicates(Long id, String replicaUsed, String specifiedPattern, BatchStatus originalBatchJob, File cacheFileName) {
Pattern duplicatePattern = Pattern.compile(".*duplicate:\"([^,]+),([0-9]+).*");
log.debug("Looking for a duplicationmigration record for id {}", id);
if (urlPattern.pattern().equals(MetadataFile.CRAWL_LOG_PATTERN)) {
GetMetadataArchiveBatchJob job2 = new GetMetadataArchiveBatchJob(Pattern.compile(".*duplicationmigration.*"), Pattern.compile("text/plain"));
job2.processOnlyFilesMatching(specifiedPattern);
@@ -208,7 +225,7 @@ private void migrateDuplicates(Long id, String replicaUsed, String specifiedPatt
boolean doMigration = migration.exists() && migration.length() > 0;
Hashtable<Pair<String, Long>, Long> lookup = new Hashtable<>();
if (doMigration) {
log.info("Doing migration for {}", id);
log.info("Found a nonempty duplicationmigration record. Now we do the migration for job {}", id);
try {
final List<String> migrationLines = org.apache.commons.io.FileUtils.readLines(migration);
log.info("{} migration records found for job {}", migrationLines.size(), id);
@@ -271,7 +288,5 @@ private void migrateDuplicates(Long id, String replicaUsed, String specifiedPatt
} else {
originalBatchJob.copyResults(cacheFileName);
}
log.debug("Cached data for job '{}' for '{}'", id, prefix);
}

}
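As an aside on the migration branch above: duplicatePattern is what pulls a file name and offset out of a crawl-log "duplicate:" annotation. A self-contained sketch, under the assumption that the annotation format is the one implied by the regex itself; the input line is made up:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DuplicatePatternSketch {
    public static void main(String[] args) {
        Pattern duplicatePattern = Pattern.compile(".*duplicate:\"([^,]+),([0-9]+).*");
        // Hypothetical crawl-log line carrying a duplicate annotation.
        String line = "2018-05-18T12:00:00.000Z 200 - duplicate:\"example-job-42.arc,1234\",content-size:512";
        Matcher m = duplicatePattern.matcher(line);
        if (m.matches()) {
            String file = m.group(1);                 // "example-job-42.arc"
            long offset = Long.parseLong(m.group(2)); // 1234
            System.out.println(file + " @ " + offset);
        }
    }
}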
@@ -223,6 +223,7 @@ National Library.
<indexingtimeout>259200000</indexingtimeout>
<maxsegments>42</maxsegments>
<satisfactorythresholdpercentage>70</satisfactorythresholdpercentage>
<tryToMigrateDuplicationRecords>false</tryToMigrateDuplicationRecords>
<lookfordataInAllBitarchiveReplicas>false</lookfordataInAllBitarchiveReplicas>
<indexrequestserver>
<class>dk.netarkivet.harvester.indexserver.distribute.IndexRequestServer</class>
@@ -36,8 +36,9 @@
import org.junit.Test;

import dk.netarkivet.common.arcrepository.TestArcRepositoryClient;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.testutils.FileAsserts;
import dk.netarkivet.testutils.ReflectUtils;
@@ -91,6 +92,7 @@ public void testGetCacheDir() throws Exception {
*/
@Test
public void testCacheMigratedMetadata() throws Exception {
Settings.set(HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS, "true");
TestArcRepositoryClient tarc = new TestArcRepositoryClient(new File(TestInfo.WORKING_DIR, "arcfiles"));
Field arcrepfield = ReflectUtils.getPrivateField(RawMetadataCache.class, "arcrep");
// Try one with just URL pattern.
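Because the default flipped to false, tests that exercise the migration path now have to opt in explicitly, as the added Settings.set line above does. A hedged sketch of the pattern (the test class and method names are made up; restoring the previous value is assumed to be handled by the suite's existing setup and teardown):

import org.junit.Before;
import org.junit.Test;

import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

public class MigrationToggleTestSketch {
    @Before
    public void enableMigration() {
        // The product default is now false, so switch the migration on for these tests.
        Settings.set(HarvesterSettings.INDEXSERVER_INDEXING_TRY_TO_MIGRATE_DUPLICATION_RECORDS, "true");
    }

    @Test
    public void migratesWhenEnabled() {
        // ... construct a RawMetadataCache and assert on the migrated records,
        // as testCacheMigratedMetadata does above ...
    }
}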
