Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
csrster committed Jan 20, 2016
2 parents 9148f88 + 10c8953 commit 212efe2
Show file tree
Hide file tree
Showing 9 changed files with 182 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,36 @@ public static String toHex(final byte[] ba) {
return new String(hexchars);
}

/**
 * Compute the digest of a file's contents with the named algorithm.
 *
 * The file is opened here, handed to {@link #digestInputStream(InputStream, String)},
 * and always closed quietly afterwards, whether or not digesting succeeded.
 *
 * @param src the file to digest; checked to be non-null and a regular file
 * @param digestAlgorithm the algorithm name, passed on unchanged to digestInputStream
 * @return the raw digest bytes produced by digestInputStream
 * @throws IOFailure if the file cannot be opened for reading
 */
public static byte[] digestFile(File src, String digestAlgorithm) {
    ArgumentNotValid.checkNotNull(src, "File src");
    ArgumentNotValid.checkTrue(src.isFile(), "Argument should be a file");

    // Opening is the only step that can raise FileNotFoundException, so it is handled on its own.
    final FileInputStream source;
    try {
        source = new FileInputStream(src);
    } catch (FileNotFoundException e) {
        throw new IOFailure("Could not read file '" + src.getAbsolutePath() + "'", e);
    }

    try {
        return digestInputStream(source, digestAlgorithm);
    } finally {
        // Close failures are deliberately ignored; the digest result (or failure) takes precedence.
        IOUtils.closeQuietly(source);
    }
}

/**
 * Read a stream to its end and return the digest of everything read.
 *
 * The stream is consumed in chunks of {@code Constants.IO_BUFFER_SIZE} bytes and is
 * not closed by this method.
 *
 * @param instream the stream to read until end-of-stream
 * @param algorithm the algorithm name used to look up the MessageDigest
 * @return the raw digest bytes
 * @throws IOFailure if reading from the stream fails with an IOException
 */
public static byte[] digestInputStream(InputStream instream, String algorithm) {
    final byte[] chunk = new byte[Constants.IO_BUFFER_SIZE];
    final MessageDigest digester = getMessageDigest(algorithm);
    digester.reset();
    try {
        // read() returns -1 at end-of-stream; each chunk read is fed to the digest as-is.
        for (int count = instream.read(chunk); count != -1; count = instream.read(chunk)) {
            digester.update(chunk, 0, count);
        }
    } catch (IOException e) {
        throw new IOFailure("Error making a '" + algorithm + "' digest on the inputstream", e);
    }
    return digester.digest();
}

/**
* Get a MessageDigest for a specific algorithm.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.archive.util.Base32;
import org.jwat.common.ANVLRecord;
import org.jwat.common.ContentType;
import org.jwat.common.Uri;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcDigest;
import org.jwat.warc.WarcFileNaming;
import org.jwat.warc.WarcFileNamingSingleFile;
import org.jwat.warc.WarcFileWriter;
Expand Down Expand Up @@ -122,12 +124,13 @@ public void insertInfoRecord(ANVLRecord payloadToInfoRecord) {
try {
recordId = new Uri("urn:uuid:" + UUID.randomUUID().toString());
} catch (URISyntaxException e) {
throw new IllegalState("Epic fail creating URI from UUID!");
throw new IllegalState("Epic fail creating URI from UUID!", e);
}
warcInfoUID = recordId;
try {
byte[] payloadAsBytes = payloadToInfoRecord.getUTF8Bytes();
String blockDigest = ChecksumCalculator.calculateSha1(new ByteArrayInputStream(payloadAsBytes));
byte[] blockDigestBytes = ChecksumCalculator.digestInputStream(new ByteArrayInputStream(payloadAsBytes), "SHA1");
WarcDigest blockDigest = WarcDigest.createWarcDigest("SHA1", blockDigestBytes, "base32", Base32.encode(blockDigestBytes));
WarcRecord record = WarcRecord.createRecord(writer.writer);
WarcHeader header = record.header;
header.warcTypeIdx = WarcConstants.RT_IDX_WARCINFO;
Expand All @@ -136,7 +139,7 @@ public void insertInfoRecord(ANVLRecord payloadToInfoRecord) {
header.addHeader(WarcConstants.FN_WARC_FILENAME, filename);
header.addHeader(WarcConstants.FN_CONTENT_TYPE, ContentType.parseContentType(WarcConstants.CT_APP_WARC_FIELDS), null);
header.addHeader(WarcConstants.FN_CONTENT_LENGTH, new Long(payloadAsBytes.length), null);
header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, blockDigest);
header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, blockDigest, null);
writer.writer.writeHeader(record);
ByteArrayInputStream bin = new ByteArrayInputStream(payloadAsBytes);
writer.writer.streamPayload(bin);
Expand All @@ -160,12 +163,13 @@ public boolean writeTo(File fileToArchive, String URL, String mimetype) {
throw new IllegalState("An WarcInfo record has not been inserted yet");
}
log.info("{} {}", fileToArchive, fileToArchive.length());
String blockDigest = ChecksumCalculator.calculateSha1(fileToArchive);
byte[] blockDigestBytes = ChecksumCalculator.digestFile(fileToArchive, "SHA1");
WarcDigest blockDigest = WarcDigest.createWarcDigest("SHA1", blockDigestBytes, "base32", Base32.encode(blockDigestBytes));
Uri recordId;
try {
recordId = new Uri("urn:uuid:" + UUID.randomUUID().toString());
} catch (URISyntaxException e) {
throw new IllegalState("Epic fail creating URI from UUID!");
throw new IllegalState("Epic fail creating URI from UUID!", e);
}
InputStream in = null;
try {
Expand All @@ -177,7 +181,7 @@ public boolean writeTo(File fileToArchive, String URL, String mimetype) {
header.addHeader(WarcConstants.FN_WARC_WARCINFO_ID, warcInfoUID, null);
header.addHeader(WarcConstants.FN_WARC_IP_ADDRESS, SystemUtils.getLocalIP());
header.addHeader(WarcConstants.FN_WARC_TARGET_URI, URL);
header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, blockDigest);
header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, blockDigest, null);
header.addHeader(WarcConstants.FN_CONTENT_TYPE, ContentType.parseContentType(mimetype), null);
header.addHeader(WarcConstants.FN_CONTENT_LENGTH, new Long(fileToArchive.length()), null);
writer.writer.writeHeader(record);
Expand All @@ -198,12 +202,13 @@ public boolean writeTo(File fileToArchive, String URL, String mimetype) {
public void write(String uri, String contentType, String hostIP, long fetchBeginTimeStamp, byte[] payload)
throws java.io.IOException {
ByteArrayInputStream in = new ByteArrayInputStream(payload);
String blockDigest = ChecksumCalculator.calculateSha1(in);
byte[] blockDigestBytes = ChecksumCalculator.digestInputStream(in, "SHA1");
WarcDigest blockDigest = WarcDigest.createWarcDigest("SHA1", blockDigestBytes, "base32", Base32.encode(blockDigestBytes));
Uri recordId;
try {
recordId = new Uri("urn:uuid:" + UUID.randomUUID().toString());
} catch (URISyntaxException e) {
throw new IllegalState("Epic fail creating URI from UUID!");
throw new IllegalState("Epic fail creating URI from UUID!", e);
}
WarcRecord record = WarcRecord.createRecord(writer.writer);
WarcHeader header = record.header;
Expand All @@ -213,7 +218,7 @@ public void write(String uri, String contentType, String hostIP, long fetchBegin
header.addHeader(WarcConstants.FN_WARC_WARCINFO_ID, warcInfoUID, null);
header.addHeader(WarcConstants.FN_WARC_IP_ADDRESS, hostIP);
header.addHeader(WarcConstants.FN_WARC_TARGET_URI, uri);
header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, blockDigest);
header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, blockDigest, null);
header.addHeader(WarcConstants.FN_CONTENT_TYPE, ContentType.parseContentType(contentType), null);
header.addHeader(WarcConstants.FN_CONTENT_LENGTH, new Long(payload.length), null);
writer.writer.writeHeader(record);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
Expand All @@ -40,20 +41,26 @@

import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.SlowTest;
import dk.netarkivet.harvester.harvesting.Heritrix1ControllerTestInfo;
import dk.netarkivet.testutils.TestResourceUtils;

@SuppressWarnings({"unchecked", "rawtypes"})
public class MetadataFileWriterTester {
@Rule public TestName test = new TestName();

@Rule
public TestName test = new TestName();

private static File logsDir = TestResourceUtils.getFile("crawldir/logs");

private File WORKING_DIR;
private static File logsDir = new File(Heritrix1ControllerTestInfo.CRAWLDIR_ORIGINALS_DIR, "logs");

@Before
public void initialize() {
WORKING_DIR = new File(TestResourceUtils.OUTPUT_DIR, getClass().getSimpleName() + "/" + test.getMethodName());
WORKING_DIR = new File(TestResourceUtils.OUTPUT_DIR, getClass().getSimpleName() + "/" + test.getMethodName());
FileUtils.removeRecursively(WORKING_DIR);
FileUtils.createDir(WORKING_DIR);
if (!logsDir.exists() || !logsDir.isDirectory()) {
Assert.fail("Test resource directory missing 'crawldir/logs'!");
}
}

@Test
Expand Down Expand Up @@ -143,4 +150,5 @@ private File getOutputArcFile(String name) {
}
return arcfile;
}

}
Loading

0 comments on commit 212efe2

Please sign in to comment.