integration-test/system-test/src/test/resources/default_orderxml.cxml

<?xml version="1.0" encoding="UTF-8"?>
<!-- 
  HERITRIX 3 CRAWL JOB CONFIGURATION FILE - MIGRATE TEMPLATE
  
   This is a relatively minimal configuration suitable for many crawls.
   
   Commented-out beans and properties are provided as an example; values
   shown in comments reflect the actual defaults which are in effect
   without specification. (To change from the default behavior, 
   uncomment AND alter the shown values.)   

   This is also the first step towards a way of migrating our NetarchiveSuite H1 templates to H3.3.0

   This means adding beans for a QuotaEnforcer, a DeDuplicator, a WARCWriterProcessor with added WarcInfo metadata.

 -->
<beans xmlns="http://www.springframework.org/schema/beans"
	     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns:context="http://www.springframework.org/schema/context"
	     xmlns:aop="http://www.springframework.org/schema/aop"
	     xmlns:tx="http://www.springframework.org/schema/tx"
	     xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
           http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd
           http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd
           http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">
 
 <!-- Turn on Spring's annotation-based configuration processing for the
      beans declared in this file. -->
 <context:annotation-config/>

<!-- 
  OVERRIDES
   Values elsewhere in the configuration may be replaced ('overridden') 
   by a Properties map declared in a PropertiesOverrideConfigurer, 
   using a dotted-bean-path to address individual bean properties. 
   This allows us to collect a few of the most-often changed values
   in an easy-to-edit format here at the beginning of the model
   configuration.    
 -->
 <!-- overrides from a text property list.
      Each key is a dotted bean path (beanId.property, or
      beanId.listProperty[index].property) addressing a bean declared in this
      file; the value given here replaces the one in the bean definition. -->
 <bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
  <property name="properties">
<!-- Overrides the default values used by Heritrix.
     Values of the form %{..._PLACEHOLDER} are template placeholders, not
     literal settings (see the "inserted by NetarchiveSuite" note below).
     The scope.rules[N] keys address rules by their position in the list of
     the 'scope' bean, so they must be kept in step with that list. -->
   <value>
# This Properties map is specified in the Java 'property list' text format
# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29

###
### some of these overrides is actually just the default value, so they can be skipped
###

## Q: can overrides like 'fetchDns.enabled=false' be used to disable the beans?

metadata.jobName=default_orderxml
metadata.description=Default Profile
metadata.operator=Admin
metadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/3.3.0 +@OPERATOR_CONTACT_URL@)
## Edit the two following lines to match your setup.
metadata.operatorContactUrl=http://netarkivet.dk/webcrawler/
metadata.operatorFrom=info@netarkivet.dk

loggerModule.path=logs

crawlLimiter.maxBytesDownload=0
crawlLimiter.maxDocumentsDownload=0
## MaxTimeseconds inserted by NetarchiveSuite (Delete line, if behaviour unwanted)
crawlLimiter.maxTimeSeconds=%{MAX_TIME_SECONDS_PLACEHOLDER}

crawlController.maxToeThreads=50
crawlController.recorderOutBufferBytes=4096
crawlController.recorderInBufferBytes=65536
crawlController.pauseAtStart=false
crawlController.scratchDir=scratch

## org.archive.bdb.BdbModule overrides
bdb.dir=state
bdb.cachePercent=40

## seeds properties
seeds.sourceTagSeeds=false

scope.rules[2].maxHops=25
scope.rules[6].maxRepetitions=3
scope.rules[3].maxTransHops=5
scope.rules[3].maxSpeculativeHops=1

## Politeness overrides
disposition.delayFactor=1.0
disposition.maxDelayMs=1000
disposition.minDelayMs=300
disposition.maxPerHostBandwidthUsageKbSec=500

preparer.preferenceEmbedHops=1
preparer.preferenceDepthHops=-1

frontier.maxRetries=3
frontier.retryDelaySeconds=300
frontier.recoveryLogEnabled=false
frontier.balanceReplenishAmount=3000
frontier.errorPenaltyAmount=100
## Can be used instead of the QuotaEnforcer module. In this case the following line should look 
## like: frontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}
## instead of: frontier.queueTotalBudget=

frontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}
frontier.snoozeLongMs=300000

preselector.enabled=true
preselector.logToFile=false
preselector.recheckScope=true
preselector.blockAll=false

preconditions.enabled=true
preconditions.ipValidityDurationSeconds=21600
preconditions.robotsValidityDurationSeconds=86400
preconditions.calculateRobotsOnly=false

fetchDns.enabled=true
fetchDns.acceptNonDnsResolves=false
fetchDns.digestContent=true
fetchDns.digestAlgorithm=sha1

fetchHttp.enabled=true
fetchHttp.timeoutSeconds=1200
fetchHttp.soTimeoutMs=20000
fetchHttp.maxFetchKBSec=0
fetchHttp.maxLengthBytes=0
fetchHttp.ignoreCookies=false
fetchHttp.sslTrustLevel=OPEN
fetchHttp.defaultEncoding=ISO-8859-1
fetchHttp.digestContent=true
fetchHttp.digestAlgorithm=sha1
fetchHttp.sendIfModifiedSince=true
fetchHttp.sendIfNoneMatch=true
fetchHttp.sendConnectionClose=true
fetchHttp.sendReferer=true
fetchHttp.sendRange=false
extractorHttp.enabled=true
extractorHtml.enabled=true
extractorHtml.extractJavascript=true
extractorHtml.treatFramesAsEmbedLinks=false
extractorHtml.ignoreFormActionUrls=true
extractorHtml.extractValueAttributes=false
extractorHtml.ignoreUnexpectedHtml=true
extractorCss.enabled=true
extractorJs.enabled=true
     extractorSwf.enabled=true
     candidates.seedsRedirectNewSeeds=false
     statisticsTracker.intervalSeconds=20
     warcWriter.compress=true
   </value>
  </property>
 </bean>

 <!-- Overrides given as individual <prop> elements; this form is handier
      for multi-line values or for values that are themselves declared beans.
      No such overrides are currently defined. -->
 <bean id="longerOverrides"
       class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
  <property name="properties">
   <props/>
  </property>
 </bean>

 <!-- CRAWL METADATA: identifies the crawler and its operator.
      The "[see override above]" values are stand-ins only: jobName,
      description and operatorContactUrl (as well as operator, operatorFrom
      and userAgentTemplate) are supplied by the metadata.* entries of the
      simpleOverrides bean. -->
 <bean id="metadata"
       class="org.archive.modules.CrawlMetadata"
       autowire="byName">
  <property name="operatorContactUrl" value="[see override above]"/>
  <property name="jobName" value="[see override above]"/>
  <property name="description" value="[see override above]"/>
  <property name="robotsPolicyName" value="ignore"/>

  <!-- Further properties, shown commented out: -->
  <!-- <property name="operator" value=""/> -->
  <!-- <property name="operatorFrom" value=""/> -->
  <!-- <property name="organization" value=""/> -->
  <!-- <property name="audience" value=""/> -->
  <!-- <property name="userAgentTemplate" 
         value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> -->
 </bean>
 
 <!-- SEEDS: crawl starting points, using the ConfigFile approach, i.e. an
      external seeds.txt file. sourceTagSeeds is also listed (with the same
      value) under seeds.sourceTagSeeds in the simpleOverrides bean. -->
 <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
  <property name="sourceTagSeeds" value="false"/>
  <property name="textSource">
   <bean class="org.archive.spring.ConfigFile">
    <property name="path" value="seeds.txt"/>
   </bean>
  </property>
 </bean>

 <!-- SCOPE: rules for which discovered URIs to crawl; order is very 
      important because last decision returned other than 'NONE' wins.
      List position matters for a second reason: the simpleOverrides bean
      addresses rules by index (scope.rules[2].maxHops,
      scope.rules[3].maxTransHops, scope.rules[3].maxSpeculativeHops and
      scope.rules[6].maxRepetitions), so inserting, removing or reordering
      rules requires updating those keys. The index of each rule is noted
      in its comment below. -->
 <bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">
  <property name="rules">
   <list>
    <!-- rules[0]: Begin by REJECTing all... -->
    <bean class="org.archive.modules.deciderules.RejectDecideRule">
    </bean>
    <!-- rules[1]: ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
    <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
     <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
     <!-- <property name="alsoCheckVia" value="true" /> -->
     <!-- <property name="surtsSourceFile" value="" /> -->
     <!-- <property name="surtsDumpFile" value="surts.dump" /> -->
    </bean>
    <!-- rules[2]: ...but REJECT those more than a configured link-hop-count from start
         (maxHops is set to 25 via scope.rules[2].maxHops in simpleOverrides)... -->
    <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">
     <!-- <property name="maxHops" value="20" /> -->
    </bean>
    <!-- rules[3]: ...but ACCEPT transcluded URIs within the configured
         maxTransHops / maxSpeculativeHops limits (set to 5 and 1 via
         scope.rules[3].* in simpleOverrides)...
         NOTE(review): the previous text here was a copy of the TooManyHops
         comment; this wording is inferred from the rule and property names,
         please confirm against the TransclusionDecideRule documentation. -->
    <bean class="org.archive.modules.deciderules.TransclusionDecideRule">
     <!-- <property name="maxTransHops" value="2" /> -->
     <!-- <property name="maxSpeculativeHops" value="1" /> -->
    </bean>
    <!-- rules[4]: ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->
    <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
          <property name="decision" value="REJECT"/>
          <property name="seedsAsSurtPrefixes" value="false"/>
          <property name="surtsDumpFile" value="negative-surts.dump" />
     <!-- <property name="surtsSourceFile" value="" /> -->
    </bean>
    <!-- rules[5]: ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
    <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
     <property name="listLogicalOr" value="true" />
     <property name="regexList">
           <list>	
<!-- Here we inject our global crawlertraps and domain specific crawlertraps:
     the placeholder on the next line is a template token and must be kept as is. -->
%{CRAWLERTRAPS_PLACEHOLDER}
           </list>
          </property> 
    </bean>

    <!-- rules[6]: ...and REJECT those with suspicious repeating path-segments
         (maxRepetitions is set to 3 via scope.rules[6].maxRepetitions in simpleOverrides)... -->
    <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">
     <!-- <property name="maxRepetitions" value="2" /> -->
    </bean>
    <!-- rules[7]: ...and REJECT those with more than threshold number of path-segments... -->
    <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">
     <!-- <property name="maxPathDepth" value="20" /> -->
    </bean>
    <!-- rules[8]: ...but always ACCEPT those marked as prerequisites for another URI... -->
    <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">
    </bean>
    <!-- rules[9]: ...but always REJECT those with unsupported URI schemes -->
    <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">
    </bean>
   </list>
  </property>
 </bean>
 
 <!-- 
   PROCESSING CHAINS
    Much of the crawler's work is specified by the sequential 
    application of swappable Processor modules. These Processors
    are collected into three 'chains'. The CandidateChain is applied 
    to URIs being considered for inclusion, before a URI is enqueued
    for collection. The FetchChain is applied to URIs when their 
    turn for collection comes up. The DispositionChain is applied 
    after a URI is fetched and analyzed/link-extracted.
  -->
  
 <!-- CANDIDATE CHAIN --> 
 <!-- processors declared as named beans -->
 <!-- candidateScoper: first step of the CandidateChain, applying the scoping
      rules to each individual candidate URI -->
 <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper"/>
 <!-- preparer: second step of the CandidateChain, preparing ACCEPTed URIs
      for enqueuing to the frontier. preferenceDepthHops and
      preferenceEmbedHops are set via the preparer.* entries in
      simpleOverrides. -->
 <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
  <!-- The queue assignment policy is the bean with id 'queueAssignmentPolicy'.
       Two queue assignment policies are bundled with NAS (the code is in
       heritrix3-extensions):
         dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy
         dk.netarkivet.harvester.harvesting.SeedUriDomainnameQueueAssignmentPolicy -->
  <property name="queueAssignmentPolicy" ref="queueAssignmentPolicy"/>

  <!-- Optional settings, shown commented out: -->
  <!-- <property name="preferenceDepthHops" value="-1" /> -->
  <!-- <property name="preferenceEmbedHops" value="1" /> -->
  <!-- <property name="canonicalizationPolicy"> 
        <ref bean="canonicalizationPolicy" />
       </property> -->
  <!-- <property name="uriPrecedencePolicy"> 
        <ref bean="uriPrecedencePolicy" />
       </property> -->
  <!-- <property name="costAssignmentPolicy"> 
        <ref bean="costAssignmentPolicy" />
       </property> -->
 </bean>
 <!-- the CandidateChain bean: the named processors, in execution order -->
 <bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
  <property name="processors">
   <list>
    <ref bean="candidateScoper"/>  <!-- 1. apply scoping rules to each individual candidate URI -->
    <ref bean="preparer"/>         <!-- 2. prepare those ACCEPTed for enqueuing to frontier -->
   </list>
  </property>
 </bean>
  
 <!-- FETCH CHAIN --> 
 <!-- processors declared as named beans -->
 <!-- preselector: first step of the FetchChain (rechecks scope, if so
      enabled). enabled, logToFile, recheckScope and blockAll are set via the
      preselector.* entries in simpleOverrides. -->
 <bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
  <!-- Optional settings, shown commented out: -->
  <!-- <property name="recheckScope" value="false" /> -->
  <!-- <property name="blockAll" value="false" /> -->
  <!-- <property name="blockByRegex" value="" /> -->
  <!-- <property name="allowByRegex" value="" /> -->
 </bean>
 <!-- preconditions: verifies or triggers the fetching of prerequisite URIs.
      Further settings come from the preconditions.* entries in
      simpleOverrides. -->
 <bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">
  <!-- the list of credentials is held by the 'credentialStore' bean -->
  <property name="credentialStore" ref="credentialStore"/>
 </bean>
 <!-- Fetchers and link extractors of the FetchChain. None of them carries
      inline properties; their settings (including 'enabled') are given by
      the fetchDns.*, fetchHttp.* and extractor*.* entries in simpleOverrides. -->

 <!-- fetches DNS URIs -->
 <bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS"/>

 <!-- fetches HTTP URIs -->
 <bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP"/>

 <!-- extracts outlinks from HTTP headers -->
 <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP"/>

 <!-- extracts outlinks from HTML content -->
 <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML"/>

 <!-- extracts outlinks from CSS content -->
 <bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS"/>

 <!-- extracts outlinks from Javascript content -->
 <bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS"/>

 <!-- extracts outlinks from Flash content -->
 <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF"/>

 <!-- the FetchChain bean: the named processors, in execution order -->
 <bean id="fetchProcessors" class="org.archive.modules.FetchChain">
  <property name="processors">
   <list>
    <ref bean="preselector"/>    <!-- recheck scope, if so enabled -->
    <ref bean="preconditions"/>  <!-- verify or trigger prerequisite URIs fetched, allow crawling -->
    <ref bean="quotaenforcer"/>  <!-- check whether quotas are already exceeded (always required by NAS ?) -->
    <ref bean="fetchDns"/>       <!-- fetch if DNS URI -->
    <ref bean="fetchHttp"/>      <!-- fetch if HTTP URI -->
    <ref bean="extractorHttp"/>  <!-- extract outlinks from HTTP headers -->
    <ref bean="extractorHtml"/>  <!-- extract outlinks from HTML content -->
    <ref bean="extractorCss"/>   <!-- extract outlinks from CSS content -->
    <ref bean="extractorJs"/>    <!-- extract outlinks from Javascript content -->
    <ref bean="extractorSwf"/>   <!-- extract outlinks from Flash content -->
   </list>
  </property>
 </bean>

  <!-- DeDuplicator: referenced from the DispositionChain
       (dispositionProcessors). indexLocation and enabled are template
       placeholders; DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is replaced by
       a path on the harvest-server. -->
  <bean id="DeDuplicator"
        class="is.hi.bok.deduplicator.DeDuplicator">
    <property name="indexLocation" value="%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"/>

    <!-- matching settings -->
    <property name="matchingMethod" value="URL"/>
    <!-- NOTE(review): upper-case "TRUE" differs from the lower-case booleans
         used elsewhere in this file; kept unchanged here -->
    <property name="tryEquivalent" value="TRUE"/>
    <property name="changeContentSize" value="false"/>

    <!-- mime filter settings -->
    <property name="mimeFilter" value="^text/.*"/>
    <property name="filterMode" value="BLACKLIST"/>

    <!-- TODO: setting analysisMode does not work, but that is not a problem
         as the default is always used:
         <property name="analysisMode" value="TIMESTAMP"/> -->

    <!-- origin settings -->
    <property name="origin" value=""/>
    <property name="originHandling" value="INDEX"/>

    <property name="statsPerHost" value="true"/>
    <property name="enabled" value="%{DEDUPLICATION_ENABLED_PLACEHOLDER}"/>
  </bean>

  <!-- DISPOSITION CHAIN -->
 <!-- processors declared as named beans -->

<!-- Here the (W)arc writer is inserted -->

%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}


 <!-- candidates: sends each outlink candidate URI to the CandidateChain.
      seedsRedirectNewSeeds is set to false via
      candidates.seedsRedirectNewSeeds in simpleOverrides. -->
 <bean id="candidates"
       class="org.archive.crawler.postprocessor.CandidatesProcessor">
  <!-- Optional setting, shown commented out: -->
  <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
 </bean>
 <!-- disposition: last step of the DispositionChain; its politeness settings
      are given by the disposition.* entries in simpleOverrides -->
 <bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor"/>

 <!-- the DispositionChain bean: the named processors, in execution order -->
 <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
  <property name="processors">
   <list>
    <!-- 1. deduplication; to disable it, remove this reference together
            with the DeDuplicator bean itself -->
    <ref bean="DeDuplicator"/>

    <!-- 2. write to aggregate archival files: the reference to the
            (w)arcWriter bean is inserted in place of the placeholder below -->
    %{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}

    <!-- 3. send each outlink candidate URI to the CandidateChain and
            enqueue those ACCEPTed to the frontier -->
    <ref bean="candidates"/>

    <!-- 4. then update stats, shared-structures, frontier decisions -->
    <ref bean="disposition"/>
   </list>
  </property>
 </bean>
 
 <!-- CRAWLCONTROLLER: control interface, unifying context.
      Configured through the crawlController.* entries in simpleOverrides. -->
 <bean id="crawlController" class="org.archive.crawler.framework.CrawlController"/>
 
 <!-- FRONTIER: record of all URIs discovered and queued-for-collection.
      Configured through the frontier.* entries in simpleOverrides. -->
 <bean id="frontier" class="org.archive.crawler.frontier.BdbFrontier"/>
 
 <!-- URI UNIQ FILTER: lets the frontier remember already-included URIs -->
 <bean id="uriUniqFilter" class="org.archive.crawler.util.BdbUriUniqFilter"/>

 <!-- 
   OPTIONAL BUT RECOMMENDED BEANS
  -->
  
 <!-- ACTIONDIRECTORY: disk directory for mid-crawl operations.
      A running job watches the directory for new files with URIs,
      scripts, and other data to be processed during the crawl. -->
 <bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory"/>
 
 <!-- CRAWLLIMITENFORCER: stops the crawl when it reaches configured limits.
      The limits are given by the crawlLimiter.* entries in simpleOverrides. -->
 <bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer"/>

 <!-- CHECKPOINTSERVICE: assists with checkpointing -->
 <bean id="checkpointService" class="org.archive.crawler.framework.CheckpointService"/>
 
 <!-- 
   OPTIONAL BEANS
    Uncomment and expand as needed, or if non-default alternate 
    implementations are preferred.
  -->
  
 <!-- QUEUE ASSIGNMENT POLICY -->
 
<!-- NAS queue assignment policy, referenced by the 'preparer' bean.
     The default H3 policy is
     org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy. -->

 <bean id="queueAssignmentPolicy"
       class="dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy">
  <!-- TODO evaluate the defaults chosen for the three properties below -->
  <property name="forceQueueAssignment" value=""/>
  <property name="deferToPrevious" value="true"/>
  <property name="parallelQueues" value="1"/>
 </bean>

 <!-- URI PRECEDENCE POLICY -->
 <!--
 <bean id="uriPrecedencePolicy" 
   class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy">
 </bean>
 -->
 
 <!-- COST ASSIGNMENT POLICY: the unit-cost implementation -->
 <bean id="costAssignmentPolicy" class="org.archive.crawler.frontier.UnitCostAssignmentPolicy"/>

 
<!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials -->

<!-- sample use of credentialStore http://stackoverflow.com/questions/17756520/use-of-heritrixs-htmlformcredential-and-credentialstore  -->

 <!-- credentialStore: referenced by the 'preconditions' bean. Each entry
      maps a name to the credential bean declared further down. -->
 <bean id="credentialStore" class="org.archive.modules.credential.CredentialStore">
  <property name="credentials">
   <map>
    <entry key="licitationen" value-ref="licitationen_login_1"/>
    <entry key="mymusic" value-ref="mymusic_login_1"/>
    <entry key="arto" value-ref="arto_login_1"/>
    <entry key="heerfordt" value-ref="heerfordt_login_1"/>
   </map>
  </property>
 </bean>

<!-- HTTP authentication credential for www.licitationen.dk
     (the class was renamed from Rfc2617Credential) -->
<bean id="licitationen_login_1"
      class="org.archive.modules.credential.HttpAuthenticationCredential">
  <property name="domain" value="www.licitationen.dk"/>
  <property name="realm" value="Dagbladet Licitationen"/>
  <property name="login" value="*****"/>
  <property name="password" value="*****"/>
</bean>

<!-- HTML form credential for www.mymusic.dk -->
<bean id="mymusic_login_1"
      class="org.archive.modules.credential.HtmlFormCredential">
  <property name="domain" value="www.mymusic.dk"/>
  <property name="loginUri" value="http://www.mymusic.dk/konto/login2.asp"/>
  <!-- <property name="httpMethod" value="Method.POST"/> -->
  <!-- form fields sent with the login request -->
  <property name="formItems">
    <map>
      <entry key="username" value="*****"/>
      <entry key="password" value="*****"/>
      <entry key="autologin" value="y"/>
    </map>
  </property>
</bean>

<!-- HTML form credential for www.arto.dk -->
<bean id="arto_login_1"
      class="org.archive.modules.credential.HtmlFormCredential">
  <property name="domain" value="www.arto.dk"/>
  <property name="loginUri" value="http://www.arto.dk/r2/frames/navigation.asp"/>
  <!-- <property name="httpMethod" value="Method.POST"/> -->
  <!-- form fields sent with the login request -->
  <property name="formItems">
    <map>
      <entry key="action" value="submit"/>
      <entry key="brugernavn" value="****"/>
      <entry key="kodeord" value="*****"/>
      <entry key="AutoLogin" value="ja"/>
      <entry key="loginKnap" value="Log ind"/>
    </map>
  </property>
</bean>

<!-- HTML form credential for heerfordt.dk -->
<bean id="heerfordt_login_1"
      class="org.archive.modules.credential.HtmlFormCredential">
  <property name="domain" value="heerfordt.dk"/>
  <property name="loginUri" value="http://heerfordt.dk/"/>
  <!-- <property name="http-method" value="POST"/> -->
  <!-- form fields sent with the login request -->
  <property name="formItems">
    <map>
      <entry key="Brugernavn" value="*****"/>
      <entry key="Pw" value="*****"/>
      <entry key="Login" value="Login"/>
    </map>
  </property>
</bean>

<!-- sample credentials ended -->

<!-- QUOTA ENFORCER BEAN -->

<!-- quotaenforcer: referenced from the FetchChain (fetchProcessors).
     Only groupMaxFetchSuccesses and groupMaxAllKb are filled in from
     template placeholders; every other limit is fixed at -1 here
     (presumably meaning "no limit"; confirm against the QuotaEnforcer
     documentation). -->
<bean id="quotaenforcer" class="org.archive.crawler.prefetch.QuotaEnforcer">
  <property name="forceRetire" value="false"/>

  <!-- per-server limits -->
  <property name="serverMaxFetchSuccesses" value="-1"/>
  <property name="serverMaxSuccessKb" value="-1"/>
  <property name="serverMaxFetchResponses" value="-1"/>
  <property name="serverMaxAllKb" value="-1"/>

  <!-- per-host limits -->
  <property name="hostMaxFetchSuccesses" value="-1"/>
  <property name="hostMaxSuccessKb" value="-1"/>
  <property name="hostMaxFetchResponses" value="-1"/>
  <property name="hostMaxAllKb" value="-1"/>

  <!-- per-group limits -->
  <property name="groupMaxFetchSuccesses" value="%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}"/>
  <property name="groupMaxSuccessKb" value="-1"/>
  <property name="groupMaxFetchResponses" value="-1"/>
  <property name="groupMaxAllKb" value="%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}"/>
</bean>

 <!-- 
   REQUIRED STANDARD BEANS
    It will be very rare to replace or reconfigure the following beans.
  -->

 <!-- STATISTICSTRACKER: standard stats/reporting collector.
      intervalSeconds is set via statisticsTracker.intervalSeconds in
      simpleOverrides. -->
 <bean id="statisticsTracker"
       class="org.archive.crawler.reporting.StatisticsTracker"
       autowire="byName"/>
 
 <!-- CRAWLERLOGGERMODULE: shared logging facility.
      Its path is set via loggerModule.path in simpleOverrides. -->
 <bean id="loggerModule" class="org.archive.crawler.reporting.CrawlerLoggerModule"/>
 
 <!-- SHEETOVERLAYMANAGER: manager of sheets of contextual overlays.
      Autowired (byType) so that any SheetForSurtPrefix or
      SheetForDecideRuled beans are included. -->
 <bean id="sheetOverlaysManager"
       class="org.archive.crawler.spring.SheetOverlaysManager"
       autowire="byType"/>

 <!-- BDBMODULE: shared BDB-JE disk persistence manager.
      dir and cachePercent are set via the bdb.* entries in simpleOverrides. -->
 <bean id="bdb" class="org.archive.bdb.BdbModule"/>
 
 <!-- BDBCOOKIESTORAGE: disk-based cookie storage used by FetchHTTP -->
 <bean id="cookieStorage" class="org.archive.modules.fetcher.BdbCookieStore"/>
 
 <!-- SERVERCACHE: shared cache of server/host info -->
 <bean id="serverCache" class="org.archive.modules.net.BdbServerCache"/>

 <!-- CONFIG PATH CONFIGURER: required helper that makes crawl paths relative
      to the crawler-beans.cxml file and tracks crawl files for the web UI -->
 <bean id="configPathConfigurer" class="org.archive.spring.ConfigPathConfigurer"/>

</beans>