<?xml version="1.0" encoding="UTF-8"?>
<!--
* File: $Id$
* Revision: $Revision$
* Author: $Author$
* Date: $Date$
*
* The Netarchive Suite - Software to harvest and preserve websites
* Copyright 2004-2012 The Royal Danish Library, the Danish State and
University Library, the National Library of France and the Austrian
National Library.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-->
<!--
This file contains the default settings used by the harvester module.
For documentation of the individual settings, please refer to the
Javadoc of the class dk.netarkivet.harvester.HarvesterSettings.
-->
<settings>
    <harvester>
        <!-- Harvest monitor defaults. -->
        <monitor>
            <!-- In seconds: 60 = 1 minute -->
            <refreshInterval>60</refreshInterval>
            <displayedHistorySize>30</displayedHistorySize>
            <!-- In seconds: 300 = 5 minutes -->
            <historySampleRate>300</historySampleRate>
            <!-- In seconds: 300 = 5 minutes -->
            <historyChartGenInterval>300</historyChartGenInterval>
        </monitor>

        <!-- Harvesting defaults. -->
        <harvesting>
            <serverDir>server</serverDir>
            <!-- In bytes: the default is 400000000 (~400 Mbytes). -->
            <minSpaceLeft>400000000</minSpaceLeft>
            <oldjobsDir>oldjobs</oldjobsDir>
            <!-- Intentionally empty in these defaults. -->
            <channel></channel>
            <sendReadyInterval>30</sendReadyInterval>
            <sendReadyDelay>300</sendReadyDelay>
            <continuationFromHeritrixRecoverlogEnabled>false</continuationFromHeritrixRecoverlogEnabled>

            <heritrix>
                <!-- In seconds: 1800 = 30 minutes -->
                <inactivityTimeout>1800</inactivityTimeout>
                <!-- In seconds: 1800 = 30 minutes -->
                <noresponseTimeout>1800</noresponseTimeout>
                <!-- In seconds: 600 = 10 minutes -->
                <waitForReportGenerationTimeout>600</waitForReportGenerationTimeout>
                <!-- In seconds: 20 -->
                <crawlLoopWaitTime>20</crawlLoopWaitTime>
                <abortIfConnectionLost>true</abortIfConnectionLost>
                <adminName>admin</adminName>
                <!-- NOTE(review): this is a literal default credential, whereas
                     jmxPassword below holds a placeholder. Presumably it should
                     be overridden in every deployment; confirm. -->
                <adminPassword>adminPassword</adminPassword>
                <guiPort>8090</guiPort>
                <jmxPort>8091</jmxPort>
                <jmxUsername>controlRole</jmxUsername>
                <!-- Placeholder text, not a usable password. NOTE(review):
                     presumably substituted at deployment time; confirm. -->
                <jmxPassword>JMX_CONTROL_ROLE_PASSWORD_PLACEHOLDER</jmxPassword>
                <heapSize>1598M</heapSize>
                <!-- Intentionally empty in these defaults. -->
                <javaOpts></javaOpts>
                <archiveFormat>warc</archiveFormat>
                <archiveNaming>
                    <class>dk.netarkivet.harvester.harvesting.LegacyNamingConvention</class>
                </archiveNaming>
                <!-- Every option in this group is set to false in these defaults. -->
                <warc>
                    <skipIdenticalDigests>false</skipIdenticalDigests>
                    <writeRequests>false</writeRequests>
                    <writeMetadata>false</writeMetadata>
                    <writeMetadataOutlinks>false</writeMetadataOutlinks>
                    <writeRevisitForIdenticalDigests>false</writeRevisitForIdenticalDigests>
                    <writeRevisitForNotModified>false</writeRevisitForNotModified>
                </warc>
            </heritrix>

            <frontier>
                <!-- In seconds: 150 = 2.5 minutes -->
                <frontierReportWaitTime>150</frontierReportWaitTime>
                <filter>
                    <class>dk.netarkivet.harvester.harvesting.frontier.TopTotalEnqueuesFilter</class>
                    <!-- Intentionally empty in these defaults. -->
                    <args></args>
                </filter>
            </frontier>

            <heritrixLauncher>
                <class>dk.netarkivet.harvester.heritrix3.controller.HeritrixLauncher</class>
            </heritrixLauncher>

            <heritrixController>
                <class>dk.netarkivet.harvester.heritrix3.controller.HeritrixController</class>
            </heritrixController>

            <harvestReport>
                <class>dk.netarkivet.harvester.harvesting.report.LegacyHarvestReport</class>
                <disregardSeedURLInfo>false</disregardSeedURLInfo>
            </harvestReport>

            <deduplication>
                <enabled>true</enabled>
            </deduplication>

            <metadata>
                <!-- The three patterns below are regular expressions. Dots that
                     stand for a literal '.' are escaped as '\.'. -->
                <heritrixFilePattern>.*(\.xml|\.txt|\.log|\.out|\.cxml)</heritrixFilePattern>
                <!-- The dot before 'txt' is escaped so that it matches only a
                     literal '.', consistent with the sibling patterns. -->
                <reportFilePattern>.*-report\.txt</reportFilePattern>
                <logFilePattern>.*(\.log|\.out)</logFilePattern>
                <archiveFilesReport>
                    <generate>true</generate>
                    <fileName>archivefiles-report.txt</fileName>
                    <fileHeader>[ARCHIVEFILE] [Closed] [Size]</fileHeader>
                </archiveFilesReport>
                <metadataFormat>warc</metadataFormat>
            </metadata>
        </harvesting>
    </harvester>
</settings>