Skip to content

Commit

Permalink
fixing in h1 and h3 issues #2 and #3
Browse files Browse the repository at this point in the history
  • Loading branch information
scheylord committed Jul 13, 2016
1 parent 5c4f353 commit 1c98140
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,12 @@ public class HarvesterSettings {
*/
public static String METADATA_COMPRESSION = "settings.harvester.harvesting.metadata.compression";

/**
* <b>settings.harvester.harvesting.heritrix.archiveNaming.collectionName</b>
* The collection name used to prefix the metadata filename when METADATA_FILENAME_FORMAT is set to "prefix".
*/
public static String HERITRIX_METADATA_PREFIX_COLLECTION_NAME = "settings.harvester.harvesting.heritrix.archiveNaming.collectionName";

/**
* <b>settings.harvester.harvesting.heritrix.archiveFormat</b> The dataformat used by heritrix to write the
* harvested data. default: warc (alternative: arc)
Expand Down Expand Up @@ -748,7 +754,9 @@ public class HarvesterSettings {
public static String HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS = "settings.harvester.harvesting.heritrix3.warc.skipIdenticalDigests";

public static String HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT = "settings.harvester.harvesting.heritrix3.warc.startNewFilesOnCheckpoint";


public static String HERITRIX3_METADATA_PREFIX_COLLECTION_NAME = "settings.harvester.harvesting.heritrix3.archiveNaming.collectionName";

/**
* <b>settings.harvester.harvesting.heritrix.archiveFormat</b> The dataformat used by heritrix to write the
* harvested data. default: warc (alternative: arc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,24 @@ protected static synchronized void initializeMetadataFormat() {
* file is ever made.
* @throws ArgumentNotValid if any parameter was null.
*/
public static String getMetadataArchiveFileName(String jobID) throws ArgumentNotValid {
public static String getMetadataArchiveFileName(String jobID, Long harvestID) throws ArgumentNotValid {
ArgumentNotValid.checkNotNull(jobID, "jobID");
//retrieving the collectionName
String collectionName = "";
boolean isPrefix = false;
//try to retrieve settings for prefixing or not metadata files
if("prefix".equals(Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT))) {
try {
//try to retrieve in both <heritrix> and <heritrix3> tags
collectionName = Settings.get(HarvesterSettings.HERITRIX_METADATA_PREFIX_COLLECTION_NAME);
if(collectionName == null || collectionName.length() == 0) {
collectionName = Settings.get(HarvesterSettings.HERITRIX3_METADATA_PREFIX_COLLECTION_NAME);
}
isPrefix = true;
} catch(UnknownID e) {
//nothing
}
}
if (metadataFormat == 0) {
initializeMetadataFormat();
}
Expand All @@ -104,9 +120,17 @@ public static String getMetadataArchiveFileName(String jobID) throws ArgumentNot
}
switch (metadataFormat) {
case MDF_ARC:
return jobID + "-metadata-" + 1 + ".arc" + possibleGzSuffix;
if(isPrefix) {
return collectionName + "-" + jobID + "-" + harvestID + "-metadata-" + 1 + possibleGzSuffix;
} else {
return jobID + "-metadata-" + 1 + ".arc" + possibleGzSuffix;
}
case MDF_WARC:
return jobID + "-metadata-" + 1 + ".warc" + possibleGzSuffix;
if(isPrefix) {
return collectionName + "-" + jobID + "-" + harvestID + "-metadata-" + 1 + possibleGzSuffix;
} else {
return jobID + "-metadata-" + 1 + ".warc" + possibleGzSuffix;
}
default:
throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.METADATA_FORMAT + "' is invalid!");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

/**
Expand Down Expand Up @@ -68,6 +70,8 @@ public class IngestableFiles {

private String harvestnamePrefix;

public static final String METADATA_FILENAME_FORMAT = Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT);

private Long harvestId;

/**
Expand Down Expand Up @@ -193,7 +197,7 @@ private File getMetadataDir() {
* @return metadata arc file as a File
*/
protected File getMetadataFile() {
return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
}

/**
Expand All @@ -211,7 +215,7 @@ public File getTmpMetadataDir() {
* @return tmp-metadata arc file as a File
*/
private File getTmpMetadataFile() {
return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ public WARCWriterProcessor(final String name) {
e.setExpertSetting(true);
e = addElementToDefinition(new SimpleType(ATTR_WRITE_METADATA,
"Whether to write 'metadata' type records. Default is true.", new Boolean(true)));
e.setOverrideable(true);
e.setExpertSetting(true);
e = addElementToDefinition(new SimpleType(ATTR_WRITE_METADATA_OUTLINKS,
"Whether to write 'metadata-outlinks' type records. Default is true.", new Boolean(true)));
e.setOverrideable(true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

/**
Expand Down Expand Up @@ -69,6 +71,8 @@ public class IngestableFiles {

private String harvestnamePrefix;

public static final String METADATA_FILENAME_FORMAT = Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT);

private Long harvestId;

private File heritrixJobDir;
Expand Down Expand Up @@ -196,7 +200,7 @@ private File getMetadataDir() {
* @return metadata arc file as a File
*/
protected File getMetadataFile() {
return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
}

/**
Expand All @@ -214,7 +218,7 @@ public File getTmpMetadataDir() {
* @return tmp-metadata arc file as a File
*/
private File getTmpMetadataFile() {
return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public class NasWARCProcessor extends WARCWriterProcessor {
private static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";

public boolean getWriteMetadataOutlinks() {
return (Boolean) kp.get("writeMetadata");
return (Boolean) kp.get("writeMetadataOutlinks");
}
public void setWriteMetadataOutlinks(boolean writeMetadataOutlinks) {
kp.put("writeMetadataOutlinks",writeMetadataOutlinks);
Expand Down

0 comments on commit 1c98140

Please sign in to comment.