Skip to content
Permalink
Browse files

fixing issue netarchivesuite/netarchivesuite-svngit-migration#2 in both h1 and h3
  • Loading branch information...
bnfklm committed Jul 13, 2016
1 parent 6c6ad0a commit 5c4f3534b9707636d8822581bd85b89fbd2b2d74
@@ -391,6 +391,9 @@ private void setWarcArchiveformat() {
propertyBuilder.append(propertyName + "writeMetadata" + valuePrefix
+ Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA)
+ valueSuffix + propertyEnd);
propertyBuilder.append(propertyName + "writeMetadataOutlinks" + valuePrefix
+ Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA_OUTLINKS)
+ valueSuffix + propertyEnd);
propertyBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix
+ Settings.get(HarvesterSettings.HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS)
+ valueSuffix + propertyEnd);
@@ -118,6 +118,9 @@ public long getDefaultMaxFileSize() {

/** Key for whether to write 'metadata' type records where possible */
public static final String ATTR_WRITE_METADATA = "write-metadata";

/**
 * Key for whether to write 'metadata-outlinks' type records where possible.
 * The Boolean read through this key is handed to writeMetadata as its
 * writeMetadataOutlinks argument, where it decides whether "outlink" entries
 * are added to the 'metadata' record.
 */
public static final String ATTR_WRITE_METADATA_OUTLINKS = "write-metadata-outlinks";

/**
* Key for whether to write 'revisit' type records when consecutive identical digest
@@ -178,6 +181,8 @@ public WARCWriterProcessor(final String name) {
e.setExpertSetting(true);
e = addElementToDefinition(new SimpleType(ATTR_WRITE_METADATA,
"Whether to write 'metadata' type records. Default is true.", new Boolean(true)));
e = addElementToDefinition(new SimpleType(ATTR_WRITE_METADATA_OUTLINKS,
"Whether to write 'metadata-outlinks' type records. Default is true.", new Boolean(true)));
e.setOverrideable(true);
e.setExpertSetting(true);
e = addElementToDefinition(new SimpleType(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
@@ -321,7 +326,7 @@ private void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid
if (((Boolean) getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
headers = new ANVLRecord(1);
headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
writeMetadata(w, timestamp, baseid, curi, headers);
writeMetadata(w, timestamp, baseid, curi, headers, ((Boolean) getUncheckedAttribute(curi, ATTR_WRITE_METADATA_OUTLINKS)));
}
}

@@ -374,7 +379,7 @@ private void writeHttpRecords(WARCWriter w, final CrawlURI curi, final URI basei
writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE, baseid, curi, headers);
}
if (((Boolean) getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
writeMetadata(w, timestamp, baseid, curi, headers);
writeMetadata(w, timestamp, baseid, curi, headers, ((Boolean) getUncheckedAttribute(curi, ATTR_WRITE_METADATA_OUTLINKS)));
}
}

@@ -493,7 +498,7 @@ protected void saveHeader(String origName, HttpMethodBase method, ANVLRecord hea
}

protected URI writeMetadata(final WARCWriter w, final String timestamp, final URI baseid, final CrawlURI curi,
final ANVLRecord namedFields) throws IOException {
final ANVLRecord namedFields, final boolean writeMetadataOutlinks) throws IOException {
final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
// Get some metadata from the curi.
// TODO: Get all curi metadata.
@@ -521,12 +526,15 @@ protected URI writeMetadata(final WARCWriter w, final String timestamp, final UR
r.addLabelValue("ftpFetchStatus", curi.getString(A_FTP_FETCH_STATUS));
}

// Add outlinks though they are effectively useless without anchor text.
Collection<Link> links = curi.getOutLinks();
if (links != null && links.size() > 0) {
for (Link link : links) {
r.addLabelValue("outlink", link.toString());
}
//only if parameter is true, add the outlinks
if (writeMetadataOutlinks == true) {
// Add outlinks though they are effectively useless without anchor text.
Collection<Link> links = curi.getOutLinks();
if (links != null && links.size() > 0) {
for (Link link : links) {
r.addLabelValue("outlink", link.toString());
}
}
}

// TODO: Other curi fields to write to metadata.
@@ -1,13 +1,26 @@
package dk.netarkivet.harvester.harvesting;

import static org.archive.format.warc.WARCConstants.TYPE;
import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS;
import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.writer.WARCWriterProcessor;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;
@@ -37,6 +50,13 @@
private static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
private static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";

/**
 * Tells whether outlinks are to be added to the WARC 'metadata' records.
 *
 * @return the value stored under the "writeMetadataOutlinks" key, or
 *         {@code true} when no value has been stored under that key
 */
public boolean getWriteMetadataOutlinks() {
    // Read the key that setWriteMetadataOutlinks(boolean) writes. Looking up
    // "writeMetadata" here would return an unrelated setting and make the
    // outlinks switch ineffective.
    final Object configured = kp.get("writeMetadataOutlinks");
    // Fall back to true instead of unboxing null when the property was never set.
    return configured == null || (Boolean) configured;
}
/**
 * Stores the outlinks switch under the "writeMetadataOutlinks" key.
 *
 * @param writeMetadataOutlinks the flag value to store
 */
public void setWriteMetadataOutlinks(boolean writeMetadataOutlinks) {
    final Boolean flag = Boolean.valueOf(writeMetadataOutlinks);
    kp.put("writeMetadataOutlinks", flag);
}

public NasWARCProcessor() {
super();
}
@@ -165,4 +185,95 @@ public void setMetadataItems(Map<String,String> metadataItems) {
return cachedMetadata;
}

/**
 * Writes the WARC 'metadata' record for a crawled URI, replacing the default
 * implementation so that outlinks are only included when
 * {@link #getWriteMetadataOutlinks()} returns true.
 *
 * The record body is an ANVL record holding, where available: the seed marker,
 * force-fetch marker, via, hopsFromSeed, sourceTag, fetchTimeMs,
 * ftpFetchStatus, charsetForLinkExtraction, the charset-related annotations
 * and (optionally) one "outlink" entry per outlink.
 *
 * @param w the writer the record is written to
 * @param timestamp the 14-digit creation date set on the record
 * @param baseid the base id the record id is derived from
 * @param curi the crawled URI being described
 * @param namedFields extra headers set on the record
 * @return the id of the record that was written
 * @throws IOException declared by the overridden method; presumably raised
 *         when the writer fails to write the record
 */
@Override
protected URI writeMetadata(final WARCWriter w,
        final String timestamp,
        final URI baseid, final CrawlURI curi,
        final ANVLRecord namedFields)
        throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setType(WARCRecordType.metadata);
    recordInfo.setUrl(curi.toString());
    recordInfo.setCreate14DigitDate(timestamp);
    recordInfo.setMimetype(ANVLRecord.MIMETYPE);
    recordInfo.setExtraHeaders(namedFields);
    recordInfo.setEnforceLength(true);

    recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));

    // Get some metadata from the curi.
    // TODO: Get all curi metadata.
    // TODO: Use other than ANVL (or rename ANVL as NameValue or use
    // RFC822 (commons-httpclient?).
    ANVLRecord r = new ANVLRecord();
    if (curi.isSeed()) {
        r.addLabel("seed");
    } else {
        if (curi.forceFetch()) {
            r.addLabel("force-fetch");
        }
        final String via = flattenVia(curi);
        if (StringUtils.isNotBlank(via)) {
            r.addLabelValue("via", via);
        }
        final String pathFromSeed = curi.getPathFromSeed();
        if (StringUtils.isNotBlank(pathFromSeed)) {
            r.addLabelValue("hopsFromSeed", pathFromSeed);
        }
        if (curi.containsDataKey(A_SOURCE_TAG)) {
            r.addLabelValue("sourceTag",
                    (String) curi.getData().get(A_SOURCE_TAG));
        }
    }

    // A negative duration is not recorded.
    final long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime();
    if (duration > -1) {
        r.addLabelValue("fetchTimeMs", Long.toString(duration));
    }

    final Object ftpFetchStatus = curi.getData().get(A_FTP_FETCH_STATUS);
    if (ftpFetchStatus != null || curi.getData().containsKey(A_FTP_FETCH_STATUS)) {
        // A key mapped to null still fails on toString(), exactly as before.
        r.addLabelValue("ftpFetchStatus", ftpFetchStatus.toString());
    }

    if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) {
        r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name());
    }

    for (String annotation : curi.getAnnotations()) {
        if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
            String[] kv = annotation.split(":", 2);
            if (kv.length == 2) {
                r.addLabelValue(kv[0], kv[1]);
            } else {
                // No ':' separator: keep the annotation as a bare label
                // instead of failing on the missing value part.
                r.addLabel(kv[0]);
            }
        }
    }

    // Only if the setting is true, add the outlinks.
    if (getWriteMetadataOutlinks()) {
        // Add outlinks though they are effectively useless without anchor text.
        Collection<CrawlURI> links = curi.getOutLinks();
        if (links != null) {
            for (CrawlURI link : links) {
                r.addLabelValue("outlink",
                        link.getURI() + " " + link.getLastHop() + " " + link.getViaContext());
            }
        }
    }

    // TODO: Other curi fields to write to metadata.
    //
    // Credentials
    //
    // fetch-began-time: 1154569278774
    // fetch-completed-time: 1154569281816
    //
    // Annotations.

    final byte[] b = r.getUTF8Bytes();
    recordInfo.setContentStream(new ByteArrayInputStream(b));
    recordInfo.setContentLength((long) b.length);

    w.writeRecord(recordInfo);

    return recordInfo.getRecordId();
}

}

0 comments on commit 5c4f353

Please sign in to comment.
You can’t perform that action at this time.