Skip to content

Commit

Permalink
Implemented NAS-2470
Browse files Browse the repository at this point in the history
  • Loading branch information
svcarlsen committed Nov 17, 2015
1 parent 62751dc commit f4251db
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 7 deletions.
Expand Up @@ -497,7 +497,7 @@ public class HarvesterSettings {
/**
* <b>settings.harvester.harvesting.metadata.archiveFilesReportName</b> If
* {@link #METADATA_GENERATE_ARCHIVE_FILES_REPORT} is set to true, sets the header of the generated report file.
* This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Opened] [Closed] [Size]'.
* This setting should generally be left to its default value, which is '[ARCHIVEFILE] [Closed] [Size]'.
*
* @see HarvestDocumentation#documentHarvest(dk.netarkivet.harvester.harvesting.IngestableFiles)
*/
Expand Down
@@ -0,0 +1,83 @@
package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.Date;

import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;



/**
 * Generates a report that lists the ARC and WARC files of a harvest, along with the time each file was last
 * modified (reported as its closing time) and its size in bytes.
 * <p>
 * The report consists of the configured header line followed by one line per archive file, for example:
 * <p>
 * [ARCHIVEFILE] [Closed] [Size]<br>
 * 5-1-20100720161253-00000-bnf_test.arc.gz 2010-07-20T16:14:31.792Z 162928
 * <p>
 * The name of the report file and its header line are read from the settings (by default the file is named
 * "archivefiles-report.txt").
 */
class ArchiveFilesReportGenerator {

    /**
     * Pattern used for the timestamp column. The trailing 'Z' is a literal UTC designator, so every formatter
     * built from this pattern must be switched to the UTC time zone.
     */
    private static final String ISO_8601_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";

    /**
     * The name of the report file. It will be generated in the crawl directory.
     */
    public static final String REPORT_FILE_NAME = Settings.get(Heritrix3Settings.METADATA_ARCHIVE_FILES_REPORT_NAME);

    /**
     * The header line of the report file.
     */
    public static final String REPORT_FILE_HEADER = Settings
            .get(Heritrix3Settings.METADATA_ARCHIVE_FILES_REPORT_HEADER);

    /** The files of the harvest to report on. */
    private final IngestableFiles ingestablefiles;

    /**
     * Builds an archive files report generator, given the ingestable files object.
     *
     * @param ingestableFiles files belonging to a Heritrix harvest
     */
    public ArchiveFilesReportGenerator(IngestableFiles ingestableFiles) {
        this.ingestablefiles = ingestableFiles;
    }

    /**
     * Lists the ARC and WARC files of the harvest and writes the archive files report to the crawl directory.
     *
     * @return the generated report file.
     * @throws IOFailure if the report file already exists, cannot be created, or cannot be written.
     */
    protected File generateReport() {

        File reportFile = new File(ingestablefiles.getCrawlDir(), REPORT_FILE_NAME);

        try {
            if (!reportFile.createNewFile()) {
                throw new IOException("Unable to create '" + reportFile.getAbsolutePath() + "'.");
            }

            // SimpleDateFormat is not thread-safe, so a fresh instance is used for each report.
            // The pattern ends in a literal 'Z', hence the timestamps must be rendered in UTC.
            SimpleDateFormat dateFormat = new SimpleDateFormat(ISO_8601_DATE_PATTERN);
            dateFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));

            PrintWriter out = new PrintWriter(reportFile);
            try {
                out.println(REPORT_FILE_HEADER);

                for (File arcfile : ingestablefiles.getArcFiles()) {
                    out.println(formatEntry(arcfile, dateFormat));
                }
                for (File warcfile : ingestablefiles.getWarcFiles()) {
                    out.println(formatEntry(warcfile, dateFormat));
                }

                // PrintWriter never throws on write errors; it only records them. Surface them here.
                if (out.checkError()) {
                    throw new IOException("Unable to write to '" + reportFile.getAbsolutePath() + "'.");
                }
            } finally {
                // Always release the file handle, also when writing fails half-way.
                out.close();
            }
        } catch (IOException e) {
            throw new IOFailure("Failed to create " + reportFile.getName(), e);
        }

        return reportFile;
    }

    /**
     * Builds the report line for a single archive file: name, last-modified timestamp and size in bytes,
     * separated by single spaces.
     *
     * @param archiveFile the ARC or WARC file to describe
     * @param dateFormat the formatter used for the last-modified timestamp
     * @return the report line, without line terminator
     */
    private static String formatEntry(File archiveFile, SimpleDateFormat dateFormat) {
        return archiveFile.getName() + " " + dateFormat.format(new Date(archiveFile.lastModified())) + " "
                + archiveFile.length();
    }

}
Expand Up @@ -320,16 +320,15 @@ public boolean accept(File f) {
}

// Generate an arcfiles-report.txt if configured to do so.
// FIXME This is not possible to extract from the crawl.log (Is this list available in any other way?)
// This is not possible to extract from the crawl.log, but we will make one from just listing the files harvested by Heritrix3

boolean genArcFilesReport = Settings.getBoolean(Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
if (genArcFilesReport) {
log.debug("Arcfiles-report.txt generation Not currently supported by Heritrix3");
/*

log.debug("Creating an arcfiles-report.txt");
files.add(new MetadataFile(new ArchiveFilesReportGenerator(crawlDir).generateReport(), harvestID, jobID,
files.add(new MetadataFile(new ArchiveFilesReportGenerator(ingestableFiles).generateReport(), harvestID, jobID,
heritrixVersion));
*/

} else {
log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!",
Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
Expand Down
Expand Up @@ -154,7 +154,7 @@ National Library.
<archiveFilesReport>
<generate>true</generate>
<fileName>archivefiles-report.txt</fileName>
<fileHeader>[ARCHIVEFILE] [Opened] [Closed] [Size]</fileHeader>
<fileHeader>[ARCHIVEFILE] [Closed] [Size]</fileHeader>
</archiveFilesReport>
<metadataFormat>warc</metadataFormat>
</metadata>
Expand Down

0 comments on commit f4251db

Please sign in to comment.