Skip to content
Permalink
Browse files

Reformatted the date currently stored in our lucene-indices to a prope…

…r WARC-date. This is a follow-up on the review of the revisit record produced by the new Deduplicator (NAS-2290)
  • Loading branch information...
svcarlsen committed Aug 3, 2016
1 parent 765f566 commit 5c16fe598c62185b6b06fd3c15c02ee930585990
@@ -0,0 +1,82 @@
package is.hi.bok.deduplicator;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.TimeZone;

/**
 * Supplies per-thread {@link DateFormat} instances for the three timestamp layouts handled here: the 14-digit ARC
 * date, the WARC date and the 17-digit Heritrix date.
 * <p>
 * All formatters are strict (non-lenient), operate in UTC and are bound to {@link Locale#ROOT}, so their output does
 * not depend on the default locale of the JVM (which would otherwise also select the calendar system).
 * <p>
 * The returned formatters are cached per thread; a formatter obtained on one thread must not be handed to another
 * thread, because <code>DateFormat</code> is not thread safe.
 *
 * TODO merge with dk.netarkivet.common.utils.archive.ArchiveDateConverter
 */
public final class ArchiveDateConverter {
    /** ARC date format string as specified in the ARC documentation (14 digits). */
    public static final String ARC_DATE_FORMAT = "yyyyMMddHHmmss";

    /** WARC date format string as specified by the WARC ISO standard. */
    public static final String WARC_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";

    /** Date format string used by Heritrix with 17 digits. */
    public static final String HERITRIX_DATE_FORMAT = "yyyyMMddHHmmssSSS";

    /** ARC <code>DateFormat</code> as specified in the ARC documentation. */
    private final DateFormat arcDateFormat;

    /** WARC <code>DateFormat</code> as specified in the WARC ISO standard. */
    private final DateFormat warcDateFormat;

    /** <code>DateFormat</code> as used by Heritrix (17 digits). */
    private final DateFormat d17DateFormat;

    /**
     * Creates a new <code>ArchiveDateConverter</code> holding one formatter per supported pattern.
     * Private: instances are only created through the thread-local below.
     */
    private ArchiveDateConverter() {
        arcDateFormat = newUtcFormat(ARC_DATE_FORMAT);
        warcDateFormat = newUtcFormat(WARC_DATE_FORMAT);
        d17DateFormat = newUtcFormat(HERITRIX_DATE_FORMAT);
    }

    /**
     * Builds a strict, UTC-based formatter for the given pattern.
     *
     * @param pattern a <code>SimpleDateFormat</code> pattern
     * @return a non-lenient <code>DateFormat</code> in the UTC time zone
     */
    private static DateFormat newUtcFormat(String pattern) {
        // Locale.ROOT instead of the default locale: the one-argument SimpleDateFormat constructor picks up the
        // JVM default locale, and with it a possibly non-Gregorian calendar, which would change the year written.
        DateFormat format = new SimpleDateFormat(pattern, Locale.ROOT);
        format.setLenient(false);
        format.setTimeZone(TimeZone.getTimeZone("UTC"));
        return format;
    }

    /**
     * <code>DateFormat</code> is not thread safe, so we wrap its construction inside a <code>ThreadLocal</code> object.
     */
    private static final ThreadLocal<ArchiveDateConverter> DateParserTL = new ThreadLocal<ArchiveDateConverter>() {
        @Override
        public ArchiveDateConverter initialValue() {
            return new ArchiveDateConverter();
        }
    };

    /**
     * Returns a <code>DateFormat</code> object for ARC date conversion.
     *
     * @return the calling thread's <code>DateFormat</code> object for ARC date conversion
     */
    public static DateFormat getArcDateFormat() {
        return DateParserTL.get().arcDateFormat;
    }

    /**
     * Returns a <code>DateFormat</code> object for WARC date conversion.
     *
     * @return the calling thread's <code>DateFormat</code> object for WARC date conversion
     */
    public static DateFormat getWarcDateFormat() {
        return DateParserTL.get().warcDateFormat;
    }

    /**
     * Returns a <code>DateFormat</code> object for Heritrix 17-digit date conversion.
     *
     * @return the calling thread's <code>DateFormat</code> object for Heritrix 17-digit date conversion
     */
    public static DateFormat getHeritrixDateFormat() {
        return DateParserTL.get().d17DateFormat;
    }
}
@@ -88,9 +88,30 @@
<property name="originHandling" value="INDEX"/> Other options: NONE,PROCESSOR
<property name="statsPerHost" value="true"/>
*
*
*/
// /**
// (FROM deduplicator-commons/src/main/java/is/landsbokasafn/deduplicator/IndexFields.java)
// * These enums correspond to the names of fields in the Lucene index
// */
// public enum IndexFields {
// /** The URL
// * This value is suitable for use in warc/revisit records as the WARC-Refers-To-Target-URI
// **/
// URL,
// /** The content digest as String **/
// DIGEST,
// /** The URL's timestamp (time of fetch). Suitable for use in WARC-Refers-To-Date. Encoded according to
// * w3c-iso8601
// */
// DATE,
// /** The document's etag **/
// ETAG,
// /** A canonicalized version of the URL **/
// URL_CANONICALIZED,
// /** WARC Record ID of original payload capture. Suitable for WARC-Refers-To field. **/
// ORIGINAL_RECORD_ID;
//
// }

@SuppressWarnings({"unchecked"})
public class DeDuplicator extends Processor implements InitializingBean {

@@ -301,8 +322,6 @@ public void setServerCache(ServerCache serverCache) {


// Member variables.

//protected IndexSearcher searcher = null;
protected IndexSearcher indexSearcher = null;
protected IndexReader indexReader = null;

@@ -448,15 +467,29 @@ protected ProcessResult innerProcessResult(CrawlURI curi) throws InterruptedExce

duplicateRevisit.setRefersToTargetURI(
duplicate.get("url")); // URL.name()
duplicateRevisit.setRefersToDate(
duplicate.get("date")); // DATE.name()

/* TODO enable a ORIGINAL_RECORD_ID field during indexing
* Requires the record ID information to be available to the indexer.
String refersToRecordID = duplicate.get(ORIGINAL_RECORD_ID.name());
String indexedDate = duplicate.get("date"); // DATE.name()
Date readDate = null;
try {
readDate = ArchiveDateConverter.getHeritrixDateFormat().parse(indexedDate);
} catch (ParseException e) {
logger.warning("Unable to parse the indexed date '" + indexedDate
+ "' as a 17-digit date: " + e);
}
String refersToDateString = indexedDate;
if (readDate != null) {
refersToDateString = ArchiveDateConverter.getWarcDateFormat().format(readDate);
}

duplicateRevisit.setRefersToDate(refersToDateString);


//Check if the record ID information is available in the index.
// This requires that record information is available during indexing
String refersToRecordID = duplicate.get("orig_record_id"); // ORIGINAL_RECORD_ID.name());

if (refersToRecordID!=null && !refersToRecordID.isEmpty()) {
duplicateRevisit.setRefersToRecordID(refersToRecordID);
} */
}


// Increment statistics counters

0 comments on commit 5c16fe5

Please sign in to comment.
You can’t perform that action at this time.