Skip to content
Permalink
Browse files

Enable programmatic disabling of deduplication.

  • Loading branch information...
csrster committed Nov 24, 2016
1 parent e909ec5 commit 1b893ff86d2b80b00ed632cfe886c99d48e29286
@@ -206,6 +206,7 @@ public boolean canAccept(Job job, DomainConfiguration cfg, DomainConfiguration p
*/
protected void editJobOrderXml(Job job) {
HeritrixTemplate doc = job.getOrderXMLdoc();
doc.enableOrDisableDeduplication(DEDUPLICATION_ENABLED);
if (DEDUPLICATION_ENABLED) {
// Check that the Deduplicator element is present in the
// OrderXMl and enabled. If missing or disabled log a warning
@@ -812,7 +812,12 @@ public void removeDeduplicatorIfPresent() {
}
}

/**
 * Does nothing except write a debug message: in H1 templates deduplication
 * is not enabled or disabled through this method.
 *
 * @param enabled ignored by this implementation
 */
@Override
public void enableOrDisableDeduplication(boolean enabled) {
    // Intentionally a no-op; only record that the call was made.
    log.debug("In H1 templates we don't enable/disable deduplication.");
}

@Override
public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName,
String scheduleName, String performer) {

@@ -78,7 +78,11 @@
// Placeholder tokens of the form %{NAME}. Modifier order is kept as
// "public static final" to match the other constants in this class.
public static final String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}";
public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}";
public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}";


/**
 * TODO: The next two patterns are very fragile. They are literal strings, so a
 * single extra whitespace character in the text being searched means they no
 * longer match. Replace them with a more robust regexp match,
 * e.g. ".*ref.*bean.*DeDuplicator.*".
 */
public static final String DEDUPLICATION_BEAN_REFERENCE_PATTERN = "<ref bean=\"DeDuplicator\"/>";
public static final String DEDUPLICATION_BEAN_PATTERN = "<bean id=\"DeDuplicator\"";
public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER
@@ -93,7 +97,9 @@
"%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}";

public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER
= "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}";
= "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}";

/**
 * Placeholder token that {@link #enableOrDisableDeduplication(boolean)} replaces
 * in the template with the text "true" or "false".
 */
public static final String DEDUPLICATION_ENABLED_PLACEHOLDER = "%{DEDUPLICATION_ENABLED_PLACEHOLDER}";


// PLACEHOLDERS for archiver beans (Maybe not necessary)
@@ -521,10 +527,14 @@ public void setDiskPath(String absolutePath) {
/**
 * Deliberately does nothing for H3 templates: the deduplicator is never
 * removed from the template. Use
 * {@link #enableOrDisableDeduplication(boolean)} to switch it off instead.
 */
@Override
public void removeDeduplicatorIfPresent() {
    // No-op by design. This is the expected path for H3, so it is reported
    // once at debug level rather than as a warning.
    log.debug("In H3 we don't remove the deduplicator, but just disable it.");
}

//<property name="metadataItems">

/**
 * Replaces {@link #DEDUPLICATION_ENABLED_PLACEHOLDER} in the template with
 * the text "true" or "false".
 *
 * @param enabled the value to write in place of the placeholder
 */
@Override
public void enableOrDisableDeduplication(boolean enabled) {
    // Boolean.toString(boolean) already returns lower-case "true"/"false",
    // so no (default-locale) case conversion is needed.
    template = template.replace(DEDUPLICATION_ENABLED_PLACEHOLDER, Boolean.toString(enabled));
}

//<property name="metadataItems">
// <map>
// <entry key="harvestInfo.version" value="1.03"/> <!-- TODO maybe not add this one -->
// <entry key="harvestInfo.jobId" value="1"/>
@@ -254,6 +254,11 @@ public static HeritrixTemplate read(long template_id, Reader orderTemplateReader
*/
public abstract void removeDeduplicatorIfPresent();

/**
 * Enable or disable deduplication in this template.
 * How (and whether) the request takes effect is up to the concrete
 * template implementation.
 *
 * @param enabled true to enable deduplication, false to disable it
 */
public abstract void enableOrDisableDeduplication(boolean enabled);

/**
* Method to add settings to the WARCWriterProcesser, so that it can generate a proper WARCINFO record.
* @param ajob a HarvestJob
@@ -358,8 +358,23 @@ extractorJs.enabled=true
</list>
</property>
</bean>

<!-- DISPOSITION CHAIN -->

<bean id="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator">
<!-- DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is replaced by path on harvest-server -->
<property name="indexLocation" value="%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"/>
<property name="matchingMethod" value="URL"/>
<property name="tryEquivalent" value="TRUE"/>
<property name="changeContentSize" value="false"/>
<property name="mimeFilter" value="^text/.*"/>
<property name="filterMode" value="BLACKLIST"/>
<!-- <property name="analysisMode" value="TIMESTAMP"/> TODO does not work. but isn't a problem, as the default is always USED -->
<property name="origin" value=""/>
<property name="originHandling" value="INDEX"/>
<property name="statsPerHost" value="true"/>
<!-- DEDUPLICATION_ENABLED_PLACEHOLDER is replaced by "true" or "false" by enableOrDisableDeduplication(boolean) -->
<property name="enabled" value="%{DEDUPLICATION_ENABLED_PLACEHOLDER}" />
</bean>

<!-- DISPOSITION CHAIN -->
<!-- processors declared as named beans -->

<!-- Here the (W)arc writer is inserted -->
@@ -377,6 +392,8 @@ extractorJs.enabled=true
<bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
<property name="processors">
<list>
<!-- remove the reference below, and the DeDuplicator bean itself to disable Deduplication -->
<ref bean="DeDuplicator"/>
<!-- write to aggregate archival files... -->
<!-- Here the reference to the (w)arcWriter bean is inserted -->

0 comments on commit 1b893ff

Please sign in to comment.
You can’t perform that action at this time.