SET statement_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = off;
SET check_function_bodies = false;
SET client_min_messages = warning;
SET escape_string_warning = off;
SET search_path = public, pg_catalog;
SET default_tablespace = '';
SET default_with_oids = false;
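--
-- The SET statements above are the standard pg_dump session preamble; they
-- configure only the loading session (encoding, search_path, string handling)
-- and leave no permanent state behind. As an illustrative check, any of these
-- settings can be read back after loading, e.g.:
--
SHOW search_path;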
--
-- Data for Name: config_seedlists; Type: TABLE DATA; Schema: public; Owner: test
--
COPY config_seedlists (config_id, seedlist_id) FROM stdin;
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
17 17
\.
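--
-- config_seedlists is a join table mapping each harvest configuration to its
-- seedlist. Note that the ids above run 1-15 and 17; id 16 is absent
-- throughout this fixture. Illustrative row count for the block just loaded
-- (a full referential check appears at the end of this file, once the
-- referenced tables are populated):
--
SELECT count(*) AS mapping_rows FROM config_seedlists;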
--
-- Data for Name: configurations; Type: TABLE DATA; Schema: public; Owner: test
--
COPY configurations (config_id, name, comments, domain_id, template_id, maxobjects, maxrate, overridelimits, maxbytes) FROM stdin;
1 defaultconfig 1 4 2000 100 0 500000000
2 defaultconfig 2 4 2000 100 0 500000000
3 defaultconfig 3 4 2000 100 0 500000000
4 defaultconfig 4 4 2000 100 0 500000000
5 defaultconfig 5 4 2000 100 0 500000000
6 defaultconfig 6 4 2000 100 0 500000000
7 defaultconfig 7 4 2000 100 0 500000000
8 defaultconfig 8 4 2000 100 0 500000000
9 defaultconfig 9 4 2000 100 0 500000000
10 defaultconfig 10 4 2000 100 0 500000000
11 defaultconfig 11 4 2000 100 0 500000000
12 defaultconfig 12 4 2000 100 0 500000000
13 defaultconfig 13 4 2000 100 0 500000000
14 defaultconfig 14 4 2000 100 0 500000000
15 defaultconfig 15 4 2000 100 0 500000000
17 defaultconfig 17 4 2000 100 0 500000000
\.
SELECT setval('configurations_id_seq', 17);
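--
-- setval() aligns the sequence with the highest config_id inserted above so
-- that later INSERTs receive non-colliding ids. Illustrative: reading the
-- sequence back (without advancing it) reports the value just set.
--
SELECT last_value FROM configurations_id_seq;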
--
-- Data for Name: domains; Type: TABLE DATA; Schema: public; Owner: test
--
COPY domains (domain_id, name, comments, defaultconfig, crawlertraps, edition, alias, lastaliasupdate) FROM stdin;
1 kb.dk 1 1 \N \N
2 statsbiblioteket.dk 2 1 \N \N
3 netarkivet.dk 3 1 \N \N
4 kum.dk 4 1 \N \N
5 raeder.dk 5 1 \N \N
6 kaarefc.dk 6 1 \N \N
7 trineogkaare.dk 7 1 \N \N
8 kaareogtrine.dk 8 1 \N \N
9 trinekc.dk 9 1 \N \N
10 sulnudu.dk 10 1 \N \N
11 slothchristensen.dk 11 1 \N \N
12 oernhoej.dk 12 1 \N \N
13 pligtaflevering.dk 13 1 \N \N
14 dbc.dk 14 1 \N \N
15 bs.dk 15 1 \N \N
17 sy-jonna.dk 17 1 \N \N
\.
SELECT setval('domains_id_seq', 17);
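--
-- Each domain's defaultconfig should reference one of that domain's own
-- configurations. An illustrative consistency check (returns no rows when the
-- fixture is coherent):
--
SELECT d.domain_id, d.name
  FROM domains d
  LEFT JOIN configurations c
    ON c.config_id = d.defaultconfig AND c.domain_id = d.domain_id
 WHERE c.config_id IS NULL;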
--
-- Data for Name: harvestchannel; Type: TABLE DATA; Schema: public; Owner: test
--
COPY harvestchannel (id, name, issnapshot, isdefault, comments) FROM stdin;
1 LOWPRIORITY t t Channel for snapshot harvests
2 HIGHPRIORITY f t Channel for selective harvests
\.
SELECT setval('harvestchannel_id_seq', 17);
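--
-- Two channels are defined above: LOWPRIORITY is the default channel for
-- snapshot harvests (issnapshot = t) and HIGHPRIORITY the default for
-- selective harvests. Illustrative query listing the default channel per type:
--
SELECT name, issnapshot FROM harvestchannel WHERE isdefault;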
--
-- Data for Name: ordertemplates; Type: TABLE DATA; Schema: public; Owner: test
--
COPY ordertemplates (template_id, name, orderxml, isactive) FROM stdin;
2 default_obeyrobots <?xml version="1.0" encoding="UTF-8"?>\n<!-- \n HERITRIX 3 CRAWL JOB CONFIGURATION FILE - For use with NetarchiveSuite 5.0\n\n -->\n<beans xmlns="http://www.springframework.org/schema/beans"\n\t xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n xmlns:context="http://www.springframework.org/schema/context"\n\t xmlns:aop="http://www.springframework.org/schema/aop"\n\t xmlns:tx="http://www.springframework.org/schema/tx"\n\t xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd\n http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd\n http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd\n http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">\n \n <context:annotation-config/>\n\n<!-- \n OVERRIDES\n Values elsewhere in the configuration may be replaced ('overridden') \n by a Properties map declared in a PropertiesOverrideConfigurer, \n using a dotted-bean-path to address individual bean properties. \n This allows us to collect a few of the most-often changed values\n in an easy-to-edit format here at the beginning of the model\n configuration. \n -->\n <!-- overrides from a text property list -->\n <bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">\n <property name="properties">\n<!-- Overrides the default values used by Heritrix -->\n <value>\n# This Properties map is specified in the Java 'property list' text format\n# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29\n\n###\n### some of these overrides is actually just the default value, so they can be skipped\n###\n\nmetadata.jobName=default_orderxml\nmetadata.description=Default Profile \nmetadata.operator=Admin\nmetadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/3.3.0 +@OPERATOR_CONTACT_URL@)\n## Edit the two following lines to match your setup.\nmetadata.operatorContactUrl=http://netarkivet.dk/webcrawler/\nmetadata.operatorFrom=info@netarkivet.dk\n\n## Replace YOUR_ORGANIZATION with the name of your organization\nmetadata.organization=YOUR_ORGANIZATION\n\n## This field is not available in the CrawlMetadata class bundled with heritrix\n## So we extended the class to add this field.\nmetadata.date=20080118111217\n\n## Select robots policy here (one of: default seems to be obey)\nmetadata.robotsPolicyName=%{HONOR_ROBOTS_DOT_TXT}\n\ncrawlLimiter.maxBytesDownload=0\ncrawlLimiter.maxDocumentsDownload=0\n## MaxTimeseconds inserted by NetarchiveSuite (Delete line, if behaviour unwanted)\ncrawlLimiter.maxTimeSeconds=%{MAX_TIME_SECONDS_PLACEHOLDER}\n\ncrawlController.maxToeThreads=50\ncrawlController.recorderOutBufferBytes=4096\ncrawlController.recorderInBufferBytes=65536\ncrawlController.pauseAtStart=false\ncrawlController.runWhileEmpty=false\ncrawlController.scratchDir=scratch\n\n## org.archive.bdb.BdbModule overrides\nbdb.dir=state\nbdb.cachePercent=40\n\n## seeds properties\n## no source-report.txt if this is false\nseeds.sourceTagSeeds=true\n\n## Override properties for org.archive.modules.deciderules.TooManyHopsDecideRule\nscope.rules[2].maxHops=%{MAX_HOPS}\n\n## Override properties for org.archive.modules.deciderules.TransclusionDecideRule\nscope.rules[3].maxTransHops=5\nscope.rules[3].maxSpeculativeHops=1\n\n## Override properties 
org.archive.modules.deciderules.PathologicalPathDecideRule\nscope.rules[6].maxRepetitions=3\n\n## Politeness overrides\ndisposition.delayFactor=1.0\ndisposition.maxDelayMs=1000\ndisposition.minDelayMs=300\ndisposition.maxPerHostBandwidthUsageKbSec=500\n\npreparer.preferenceEmbedHops=1\npreparer.preferenceDepthHops=-1\n\n## Frontier settings\nfrontier.maxRetries=3\nfrontier.retryDelaySeconds=300\nfrontier.recoveryLogEnabled=false\nfrontier.balanceReplenishAmount=3000\nfrontier.errorPenaltyAmount=100\nfrontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}\nfrontier.snoozeLongMs=300000\nfrontier.extract404s=false\nfrontier.extractIndependently=false\n\npreselector.enabled=true\npreselector.logToFile=false\npreselector.recheckScope=true\npreselector.blockAll=false\n\npreconditions.enabled=true\npreconditions.ipValidityDurationSeconds=21600\npreconditions.robotsValidityDurationSeconds=86400\npreconditions.calculateRobotsOnly=false\n\nfetchDns.enabled=true\nfetchDns.acceptNonDnsResolves=false\nfetchDns.digestContent=true\nfetchDns.digestAlgorithm=sha1\n\nfetchHttp.enabled=true\nfetchHttp.timeoutSeconds=1200\n#fetchHttp.soTimeoutMs=20000\nfetchHttp.soTimeoutMs=120000\nfetchHttp.maxFetchKBSec=0\nfetchHttp.maxLengthBytes=0\nfetchHttp.ignoreCookies=false\nfetchHttp.sslTrustLevel=OPEN\n\n#fetchHttp.defaultEncoding=ISO-8859-1\nfetchHttp.defaultEncoding=UTF-8\nfetchHttp.digestContent=true\nfetchHttp.digestAlgorithm=sha1\nfetchHttp.sendIfModifiedSince=true\nfetchHttp.sendIfNoneMatch=true\nfetchHttp.sendConnectionClose=true\nfetchHttp.sendReferer=true\nfetchHttp.sendRange=false\n\n\n## Accept headers for HTTP fetching\nfetchHttp.acceptHeaders[0]=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\n\nextractorHttp.enabled=true\nextractorHtml.enabled=true\nextractorHtml.extractJavascript=%{EXTRACT_JAVASCRIPT}\nextractorHtml.treatFramesAsEmbedLinks=false\nextractorHtml.ignoreFormActionUrls=true\nextractorHtml.extractValueAttributes=false\nextractorHtml.ignoreUnexpectedHtml=true\nextractorCss.enabled=true\nextractorJs.enabled=true\nextractorSwf.enabled=true\n\n# allow redirected seeds to be accepted as seeds\n# In H1, this property belonged to the LinkScoper object, in H3, it is part of the CandidatesProcessor object\ncandidates.seedsRedirectNewSeeds=true\n\nstatisticsTracker.intervalSeconds=20\n\n## Quotaenforcing\nquotaenforcer.groupMaxFetchSuccesses=%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}\nquotaenforcer.groupMaxAllKb=%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}\n\n## sample overrides of the warcwriter\nwarcWriter.template=${prefix}-${timestamp17}-${serialno}-ciblee_2015_${heritrix.hostname}\nwarcWriter.writeRequests=false\nwarcWriter.writeMetadata=false\nwarcWriter.poolMaxActive=3\n\nloggerModule.path=logs\n\n </value>\n </property>\n </bean>\n\n <!-- overrides from declared <prop> elements, more easily allowing\n multiline values or even declared beans -->\n <bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">\n <property name="properties">\n <props>\n </props>\n </property>\n </bean>\n\n <!-- CRAWL METADATA: including identification of crawler/operator \n Using NetarchiveSuites own extended version of the org.archive.modules.CrawlMetadata\n -->\n <bean id="metadata" class="dk.netarkivet.harvester.harvesting.NasCrawlMetadata" autowire="byName">\n <property name="operatorContactUrl" value="[see override above]"/>\n <property name="jobName" value="[see override above]"/>\n <property name="description" value="[see 
override above]"/>\n <!-- <property name="robotsPolicyName" value="ignore"/> -->\n <!-- <property name="operator" value=""/> -->\n <!-- <property name="operatorFrom" value=""/> -->\n <!-- <property name="organization" value=""/> -->\n <!-- <property name="audience" value=""/> -->\n <!-- <property name="userAgentTemplate" \n value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> -->\n \n </bean>\n \n <!-- SEEDS: crawl starting points -->\n <!-- ConfigFile approach: specifying external seeds.txt file -->\n <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">\n <property name="textSource">\n <bean class="org.archive.spring.ConfigFile">\n <property name="path" value="seeds.txt" />\n </bean>\n </property>\n <property name="sourceTagSeeds" value="false"/> \n </bean>\n\n <!-- SCOPE: rules for which discovered URIs to crawl; order is very \n important because last decision returned other than 'NONE' wins. -->\n <bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">\n <property name="logToFile" value="true" />\n <property name="logExtraInfo" value="true" />\n <property name="rules">\n <list>\n <!-- Begin by REJECTing all... -->\n <bean class="org.archive.modules.deciderules.RejectDecideRule">\n </bean>\n <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->\n <!-- <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> -->\n <bean class="dk.netarkivet.harvester.harvesting.OnNSDomainsDecideRule"> \n <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->\n <!-- <property name="alsoCheckVia" value="true" /> -->\n <!-- <property name="surtsSourceFile" value="" /> -->\n <!-- <property name="surtsDumpFile" value="surts.dump" /> -->\n </bean>\n <!-- ...but REJECT those more than a configured link-hop-count from start... -->\n <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">\n <!-- <property name="maxHops" value="20" /> -->\n </bean>\n <!-- ...but ACCEPT those more than a configured link-hop-count from start... -->\n <bean class="org.archive.modules.deciderules.TransclusionDecideRule">\n <!-- <property name="maxTransHops" value="2" /> -->\n <!-- <property name="maxSpeculativeHops" value="1" /> -->\n </bean>\n <!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->\n <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">\n <property name="decision" value="REJECT"/>\n <property name="seedsAsSurtPrefixes" value="false"/>\n <property name="surtsDumpFile" value="negative-surts.dump" />\n <!-- <property name="surtsSourceFile" value="" /> -->\n </bean>\n <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... 
-->\n <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">\n <property name="decision" value="REJECT"/>\n <property name="listLogicalOr" value="true" />\n <property name="regexList">\n <list>\n\t\t<value>.*core\\.UserAdmin.*core\\.UserLogin.*</value>\n\t\t<value>.*core\\.UserAdmin.*register\\.UserSelfRegistration.*</value>\n\t\t<value>.*\\/w\\/index\\.php\\?title=Speci[ae]l:Recentchanges.*</value>\n\t\t<value>.*act=calendar&cal_id=.*</value>\n\t\t<value>.*advCalendar_pi.*</value>\n\t\t<value>.*cal\\.asp\\?date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=monthly&date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=weekly&date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=yearly&date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=yearly&year=.*</value>\n\t\t<value>.*cal\\/cal_day\\.php\\?op=day&date=.*</value>\n\t\t<value>.*cal\\/cal_week\\.php\\?op=week&date=.*</value>\n\t\t<value>.*cal\\/calendar\\.php\\?op=cal&month=.*</value>\n\t\t<value>.*cal\\/yearcal\\.php\\?op=yearcal&ycyear=.*</value>\n\t\t<value>.*calendar\\.asp\\?calmonth=.*</value>\n\t\t<value>.*calendar\\.asp\\?qMonth=.*</value>\n\t\t<value>.*calendar\\.php\\?sid=.*</value>\n\t\t<value>.*calendar\\.php\\?start=.*</value>\n\t\t<value>.*calendar\\.php\\?Y=.*</value>\n\t\t<value>.*calendar\\/\\?CLmDemo_horizontal=.*</value>\n\t\t<value>.*calendar_menu\\/calendar\\.php\\?.*</value>\n\t\t<value>.*calendar_scheduler\\.php\\?d=.*</value>\n\t\t<value>.*calendar_year\\.asp\\?qYear=.*</value>\n\t\t<value>.*calendarix\\/calendar\\.php\\?op=.*</value>\n\t\t<value>.*calendarix\\/yearcal\\.php\\?op=.*</value>\n\t\t<value>.*calender\\/default\\.asp\\?month=.*</value>\n\t\t<value>.*Default\\.asp\\?month=.*</value>\n\t\t<value>.*events\\.asp\\?cat=0&mDate=.*</value>\n\t\t<value>.*events\\.asp\\?cat=1&mDate=.*</value>\n\t\t<value>.*events\\.asp\\?MONTH=.*</value>\n\t\t<value>.*events\\.asp\\?month=.*</value>\n\t\t<value>.*index\\.php\\?iDate=.*</value>\n\t\t<value>.*index\\.php\\?module=PostCalendar&func=view.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_day&year=.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_detail&year=.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_month&year=.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_week&year=.*</value>\n <value>.*index\\.php\\?option=com_events&task=view_year&year=.*</value>\n <value>.*index\\.php\\?option=com_extcalendar&Itemid.*</value>\n <value>.*modules\\.php\\?name=Calendar&op=modload&file=index.*</value>\n <value>.*modules\\.php\\?name=vwar&file=calendar&action=list&month=.*</value>\n <value>.*modules\\.php\\?name=vwar&file=calendar.*</value>\n <value>.*modules\\.php\\?name=vWar&mod=calendar.*</value>\n <value>.*modules\\/piCal\\/index\\.php\\?caldate=.*</value>\n <value>.*modules\\/piCal\\/index\\.php\\?cid=.*</value>\n <value>.*option,com_events\\/task,view_day\\/year.*</value>\n <value>.*option,com_events\\/task,view_month\\/year.*</value>\n <value>.*option,com_extcalendar\\/Itemid.*</value>\n <value>.*task,view_month\\/year.*</value>\n <value>.*shopping_cart\\.php.*</value>\n <value>.*action.add_product.*</value>\n <value>.*action.remove_product.*</value>\n <value>.*action.buy_now.*</value>\n <value>.*checkout_payment\\.php.*</value>\n <value>.*login.*login.*login.*login.*</value>\n <value>.*homepage_calendar\\.asp.*</value>\n <value>.*MediaWiki.*Movearticle.*</value>\n <value>.*index\\.php.*action=edit.*</value>\n 
<value>.*comcast\\.net.*othastar.*</value>\n <value>.*Login.*Login.*Login.*</value>\n <value>.*redir.*redir.*redir.*</value>\n <value>.*bookingsystemtime\\.asp\\?dato=.*</value>\n <value>.*bookingsystem\\.asp\\?date=.*</value>\n <value>.*cart\\.asp\\?mode=add.*</value>\n <value>.*\\/photo.*\\/photo.*\\/photo.*</value>\n <value>.*\\/skins.*\\/skins.*\\/skins.*</value>\n <value>.*\\/scripts.*\\/scripts.*\\/scripts.*</value>\n <value>.*\\/styles.*\\/styles.*\\/styles.*</value>\n <value>.*\\/coppermine\\/login\\.php\\?referer=.*</value>\n <value>.*\\/images.*\\/images.*\\/images.*</value>\n <value>.*\\/stories.*\\/stories.*\\/stories.*</value>\t\n<!-- Here we inject our global crawlertraps, domain specific crawlertraps -->\n%{CRAWLERTRAPS_PLACEHOLDER}\n </list>\n </property> \n </bean>\n\n <!-- ...and REJECT those with suspicious repeating path-segments... -->\n <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">\n <!-- <property name="maxRepetitions" value="2" /> -->\n </bean>\n <!-- ...and REJECT those with more than threshold number of path-segments... -->\n <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">\n <!-- <property name="maxPathDepth" value="20" /> -->\n </bean>\n <!-- ...but always ACCEPT those marked as prerequisites for another URI... -->\n <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">\n </bean>\n <!-- ...but always REJECT those with unsupported URI schemes -->\n <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">\n </bean>\n </list>\n </property>\n </bean>\n \n <!-- \n PROCESSING CHAINS\n Much of the crawler's work is specified by the sequential \n application of swappable Processor modules. These Processors\n are collected into three 'chains. The CandidateChain is applied \n to URIs being considered for inclusion, before a URI is enqueued\n for collection. The FetchChain is applied to URIs when their \n turn for collection comes up. The DispositionChain is applied \n after a URI is fetched and analyzed/link-extracted.\n -->\n \n <!-- CANDIDATE CHAIN --> \n <!-- processors declared as named beans -->\n <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">\n </bean>\n <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">\n <!-- <property name="preferenceDepthHops" value="-1" /> -->\n <!-- <property name="preferenceEmbedHops" value="1" /> -->\n <!-- <property name="canonicalizationPolicy"> \n <ref bean="canonicalizationPolicy" />\n </property> -->\n <property name="queueAssignmentPolicy"> <ref bean="ourQueueAssignmentPolicy" /> \n<!-- Bundled with NAS is two queueAssignPolicies (code is in heritrix3-extensions): \n dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy\n dk.netarkivet.harvester.harvesting.SeedUriDomainnameQueueAssignmentPolicy \n-->\n </property>\n \n <!-- <property name="uriPrecedencePolicy"> \n <ref bean="uriPrecedencePolicy" />\n </property> -->\n <!-- <property name="costAssignmentPolicy"> \n <ref bean="costAssignmentPolicy" />\n </property> -->\n </bean>\n <!-- assembled into ordered CandidateChain bean -->\n <bean id="candidateProcessors" class="org.archive.modules.CandidateChain">\n <property name="processors">\n <list>\n <!-- apply scoping rules to each individual candidate URI... -->\n <ref bean="candidateScoper"/>\n <!-- ...then prepare those ACCEPTed for enqueuing to frontier. 
-->\n <ref bean="preparer"/>\n </list>\n </property>\n </bean>\n \n <!-- FETCH CHAIN --> \n <!-- processors declared as named beans -->\n <bean id="preselector" class="org.archive.crawler.prefetch.Preselector">\n <!-- <property name="recheckScope" value="false" /> -->\n <!-- <property name="blockAll" value="false" /> -->\n <!-- <property name="blockByRegex" value="" /> -->\n <!-- <property name="allowByRegex" value="" /> -->\n </bean>\n <bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">\n </bean>\n\n\n<!-- set username and password set for the FTP fetcher. \n should probably be configured using overlays to allow different username/passwords for\ndifferent sites. \nThe username/password values is for Publizon pubhub.dk using ftp://ftp.pubhub.dk \n-->\n <bean id="fetchFtp" class="org.archive.modules.fetcher.FetchFTP">\t\n\t<property name="username" value="Pligtaflevering"/>\n\t<property name="password" value="Sund2010Hed"/>\n\t<property name="extractFromDirs" value="true"/>\n\t<property name="extractParent" value="false"/>\n\t<property name="maxLengthBytes" value="0"/>\n\t<property name="maxFetchKBSec" value="0"/>\n\t<property name="timeoutSeconds" value="1200"/>\n \n </bean>\n\n\n <bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">\n </bean>\n <bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">\n </bean>\n \n <bean id="extractorOAI" class="dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI">\n </bean>\n\n <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">\n </bean>\n <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">\n </bean>\n <bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">\n </bean> \n <bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">\n </bean>\n <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">\n </bean> \n\n <bean id="extractorXML" class="org.archive.modules.extractor.ExtractorXML">\n</bean>\n \n<!-- assembled into ordered FetchChain bean -->\n <bean id="fetchProcessors" class="org.archive.modules.FetchChain">\n <property name="processors">\n <list>\n <!-- recheck scope, if so enabled... -->\n <ref bean="preselector"/>\n <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->\n <ref bean="preconditions"/>\n\n <!-- check, if quotas is already superseded --> \n <ref bean="quotaenforcer"/> <!-- always required by NAS ? -->\n\n <!-- ...fetch if DNS URI... -->\n <ref bean="fetchDns"/>\n <!-- ...fetch if HTTP URI... -->\n <ref bean="fetchHttp"/>\n <!-- ...fetch if FTP URI... -->\t\n <ref bean="fetchFtp"/> \n <!-- ...extract oulinks from HTTP headers... -->\n <ref bean="extractorHttp"/>\n <!-- ...extract oulinks from HTML content... -->\n <ref bean="extractorHtml"/>\n <!-- ...extract oulinks from CSS content... -->\n <ref bean="extractorCss"/>\n <!-- ...extract oulinks from Javascript content... -->\n <ref bean="extractorJs"/>\n <!-- ...then extract oulinks from extractorOAI content... -->\n <ref bean="extractorOAI"/>\n <!-- ...then extract oulinks from extractorXML content... -->\n <ref bean="extractorXML" />\n <!-- ...extract oulinks from Flash content... 
-->\n <ref bean="extractorSwf"/>\n \n </list>\n </property>\n </bean>\n \n <!-- DISPOSITION CHAIN -->\n <!-- processors declared as named beans -->\n\n<!-- Here the (W)arc writer is inserted -->\n%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}\n\n<bean id="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator">\n<!-- DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is replaced by path on harvest-server -->\n <property name="indexLocation" value="%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"/> \n <property name="matchingMethod" value="URL"/> \n <property name="tryEquivalent" value="TRUE"/> \n <property name="changeContentSize" value="false"/>\n <property name="mimeFilter" value="^text/.*"/>\n <property name="filterMode" value="BLACKLIST"/>\n<!-- <property name="analysisMode" value="TIMESTAMP"/> TODO does not work. but isn't a problem, as the default is always USED --> \n <property name="origin" value=""/>\n <property name="originHandling" value="INDEX"/>\n <property name="statsPerHost" value="true"/>\n</bean> \n\n <bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">\n <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->\n </bean>\n <bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">\n </bean>\n\n <!-- assembled into ordered DispositionChain bean -->\n <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">\n <property name="processors">\n <list>\n <!-- write to aggregate archival files... -->\n\n <!-- remove the reference below, and the DeDuplicator bean itself to disable Deduplication -->\n <ref bean="DeDuplicator"/>\n\n <!-- Here the reference to the (w)arcWriter bean is inserted during job-generation -->\t\n\n %{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}\n \n <!-- This bean is required to report back the number of bytes harvested for each domain. -->\n <bean id="ContentSizeAnnotationPostProcessor" class="dk.netarkivet.harvester.harvesting.ContentSizeAnnotationPostProcessor"/>\n\n <!-- ...send each outlink candidate URI to CandidatesChain, \n and enqueue those ACCEPTed to the frontier... -->\n <ref bean="candidates"/>\n <!-- ...then update stats, shared-structures, frontier decisions -->\n <ref bean="disposition"/>\n </list>\n </property>\n </bean>\n \n <!-- CRAWLCONTROLLER: Control interface, unifying context -->\n <bean id="crawlController" \n class="org.archive.crawler.framework.CrawlController">\n </bean>\n \n <!-- FRONTIER: Record of all URIs discovered and queued-for-collection -->\n <bean id="frontier" \n class="org.archive.crawler.frontier.BdbFrontier">\n </bean>\n \n <!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs --> \n <bean id="uriUniqFilter" \n class="org.archive.crawler.util.BdbUriUniqFilter">\n </bean>\n\n <!-- \n OPTIONAL BUT RECOMMENDED BEANS\n -->\n \n <!-- ACTIONDIRECTORY: disk directory for mid-crawl operations\n Running job will watch directory for new files with URIs, \n scripts, and other data to be processed during a crawl. 
-->\n <bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory">\n </bean> \n \n <!-- CRAWLLIMITENFORCER: stops crawl when it reaches configured limits -->\n <bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer">\n </bean>\n\n <!-- CHECKPOINTSERVICE: checkpointing assistance -->\n <bean id="checkpointService" \n class="org.archive.crawler.framework.CheckpointService">\n </bean>\n \n <!-- \n OPTIONAL BEANS\n Uncomment and expand as needed, or if non-default alternate \n implementations are preferred.\n -->\n \n <!-- QUEUE ASSIGNMENT POLICY -->\n \n<!-- NAS queue assignement policy. \ndefault H3 policy is org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy\n-->\n\n <bean id="ourQueueAssignmentPolicy"\n class="dk.netarkivet.harvester.harvesting.SeedUriDomainnameQueueAssignmentPolicy"> \n <property name="forceQueueAssignment" value=""/> <!-- the default is "" -->\n <property name="deferToPrevious" value="true"/> <!-- the default is true -->\n <property name="parallelQueues" value="1" /> <!-- the default is 1 -->\n </bean>\n\n <!-- URI PRECEDENCE POLICY -->\n <!--\n <bean id="uriPrecedencePolicy" \n class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy">\n </bean>\n -->\n \n <!-- COST ASSIGNMENT POLICY -->\n \n <bean id="costAssignmentPolicy" \n class="org.archive.crawler.frontier.UnitCostAssignmentPolicy">\n </bean>\n\n<!-- QUOTA ENFORCER BEAN -->\n\n<bean id="quotaenforcer" \n class="org.archive.crawler.prefetch.QuotaEnforcer">\n <property name="forceRetire" value="false"></property>\n\n <property name="serverMaxFetchSuccesses" value="-1"></property>\n <property name="serverMaxSuccessKb" value="-1"></property>\n <property name="serverMaxFetchResponses" value="-1"></property>\n <property name="serverMaxAllKb" value="-1"></property>\n\n <property name="hostMaxFetchSuccesses" value="-1"></property>\n <property name="hostMaxSuccessKb" value="-1"></property>\n <property name="hostMaxFetchResponses" value="-1"></property>\n <property name="hostMaxAllKb" value="-1"></property>\n <property name="groupMaxFetchSuccesses" value="-1"></property>\n <property name="groupMaxSuccessKb" value="-1"></property>\n <property name="groupMaxFetchResponses" value="-1"></property>\n <property name="groupMaxAllKb" value="-1"></property>\n </bean>\n\n <!-- \n REQUIRED STANDARD BEANS\n It will be very rare to replace or reconfigure the following beans.\n -->\n\n <!-- STATISTICSTRACKER: standard stats/reporting collector -->\n <bean id="statisticsTracker" \n class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName">\n </bean>\n \n <!-- CRAWLERLOGGERMODULE: shared logging facility -->\n <bean id="loggerModule" \n class="org.archive.crawler.reporting.CrawlerLoggerModule">\n </bean>\n \n <!-- SHEETOVERLAYMANAGER: manager of sheets of contextual overlays\n Autowired to include any SheetForSurtPrefix or \n SheetForDecideRuled beans -->\n <bean id="sheetOverlaysManager" autowire="byType"\n class="org.archive.crawler.spring.SheetOverlaysManager">\n </bean>\n\n <!-- BDBMODULE: shared BDB-JE disk persistence manager -->\n <bean id="bdb" \n class="org.archive.bdb.BdbModule">\n </bean>\n \n <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->\n <bean id="cookieStorage" \n class="org.archive.modules.fetcher.BdbCookieStore">\n </bean>\n \n <!-- SERVERCACHE: shared cache of server/host info -->\n <bean id="serverCache" \n class="org.archive.modules.net.BdbServerCache">\n </bean>\n\n <!-- CONFIG PATH CONFIGURER: required helper 
making crawl paths relative\n to crawler-beans.cxml file, and tracking crawl files for web UI -->\n <bean id="configPathConfigurer" \n class="org.archive.spring.ConfigPathConfigurer">\n </bean>\n\n<!-- A processor to enforce runtime limits on crawls if wanted \nThe operations available is Pause, Terminate, Block_Uris\n\nTODO: CHECK, if this bean can coexist with the crawlLimitenforcer\n-->\n<!--\n<bean id="runtimeLimitEnforcer" class="org.archive.crawler.prefetch.RuntimeLimitEnforcer">\n<property name="runtimeSeconds" value="82800"/>\n<property name="operation" value="Terminate"/>\n</bean>\n-->\n\n\n\n</beans>\n t
4 default_orderxml <?xml version="1.0" encoding="UTF-8"?>\n<!-- \n HERITRIX 3 CRAWL JOB CONFIGURATION FILE - For use with NetarchiveSuite 5.0\n\n -->\n<beans xmlns="http://www.springframework.org/schema/beans"\n\t xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n xmlns:context="http://www.springframework.org/schema/context"\n\t xmlns:aop="http://www.springframework.org/schema/aop"\n\t xmlns:tx="http://www.springframework.org/schema/tx"\n\t xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd\n http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd\n http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd\n http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">\n \n <context:annotation-config/>\n\n<!-- \n OVERRIDES\n Values elsewhere in the configuration may be replaced ('overridden') \n by a Properties map declared in a PropertiesOverrideConfigurer, \n using a dotted-bean-path to address individual bean properties. \n This allows us to collect a few of the most-often changed values\n in an easy-to-edit format here at the beginning of the model\n configuration. \n -->\n <!-- overrides from a text property list -->\n <bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">\n <property name="properties">\n<!-- Overrides the default values used by Heritrix -->\n <value>\n# This Properties map is specified in the Java 'property list' text format\n# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29\n\n###\n### some of these overrides is actually just the default value, so they can be skipped\n###\n\nmetadata.jobName=default_orderxml\nmetadata.description=Default Profile \nmetadata.operator=Admin\nmetadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/3.3.0 +@OPERATOR_CONTACT_URL@)\n## Edit the two following lines to match your setup.\nmetadata.operatorContactUrl=http://netarkivet.dk/webcrawler/\nmetadata.operatorFrom=info@netarkivet.dk\n\n## Replace YOUR_ORGANIZATION with the name of your organization\nmetadata.organization=YOUR_ORGANIZATION\n\n## This field is not available in the CrawlMetadata class bundled with heritrix\n## So we extended the class to add this field.\nmetadata.date=20080118111217\n\n## Select robots policy here (one of: default seems to be obey)\nmetadata.robotsPolicyName=%{HONOR_ROBOTS_DOT_TXT}\n\ncrawlLimiter.maxBytesDownload=0\ncrawlLimiter.maxDocumentsDownload=0\n## MaxTimeseconds inserted by NetarchiveSuite (Delete line, if behaviour unwanted)\ncrawlLimiter.maxTimeSeconds=%{MAX_TIME_SECONDS_PLACEHOLDER}\n\ncrawlController.maxToeThreads=50\ncrawlController.recorderOutBufferBytes=4096\ncrawlController.recorderInBufferBytes=65536\ncrawlController.pauseAtStart=false\ncrawlController.runWhileEmpty=false\ncrawlController.scratchDir=scratch\n\n## org.archive.bdb.BdbModule overrides\nbdb.dir=state\nbdb.cachePercent=40\n\n## seeds properties\n## no source-report.txt if this is false\nseeds.sourceTagSeeds=true\n\n## Override properties for org.archive.modules.deciderules.TooManyHopsDecideRule\nscope.rules[2].maxHops=%{MAX_HOPS}\n\n## Override properties for org.archive.modules.deciderules.TransclusionDecideRule\nscope.rules[3].maxTransHops=5\nscope.rules[3].maxSpeculativeHops=1\n\n## Override properties 
org.archive.modules.deciderules.PathologicalPathDecideRule\nscope.rules[6].maxRepetitions=3\n\n## Politeness overrides\ndisposition.delayFactor=1.0\ndisposition.maxDelayMs=1000\ndisposition.minDelayMs=300\ndisposition.maxPerHostBandwidthUsageKbSec=500\n\npreparer.preferenceEmbedHops=1\npreparer.preferenceDepthHops=-1\n\n## Frontier settings\nfrontier.maxRetries=3\nfrontier.retryDelaySeconds=300\nfrontier.recoveryLogEnabled=false\nfrontier.balanceReplenishAmount=3000\nfrontier.errorPenaltyAmount=100\nfrontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}\nfrontier.snoozeLongMs=300000\nfrontier.extract404s=false\nfrontier.extractIndependently=false\n\npreselector.enabled=true\npreselector.logToFile=false\npreselector.recheckScope=true\npreselector.blockAll=false\n\npreconditions.enabled=true\npreconditions.ipValidityDurationSeconds=21600\npreconditions.robotsValidityDurationSeconds=86400\npreconditions.calculateRobotsOnly=false\n\nfetchDns.enabled=true\nfetchDns.acceptNonDnsResolves=false\nfetchDns.digestContent=true\nfetchDns.digestAlgorithm=sha1\n\nfetchHttp.enabled=true\nfetchHttp.timeoutSeconds=1200\n#fetchHttp.soTimeoutMs=20000\nfetchHttp.soTimeoutMs=120000\nfetchHttp.maxFetchKBSec=0\nfetchHttp.maxLengthBytes=0\nfetchHttp.ignoreCookies=false\nfetchHttp.sslTrustLevel=OPEN\n\n#fetchHttp.defaultEncoding=ISO-8859-1\nfetchHttp.defaultEncoding=UTF-8\nfetchHttp.digestContent=true\nfetchHttp.digestAlgorithm=sha1\nfetchHttp.sendIfModifiedSince=true\nfetchHttp.sendIfNoneMatch=true\nfetchHttp.sendConnectionClose=true\nfetchHttp.sendReferer=true\nfetchHttp.sendRange=false\n\n\n## Accept headers for HTTP fetching\nfetchHttp.acceptHeaders[0]=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\n\nextractorHttp.enabled=true\nextractorHtml.enabled=true\nextractorHtml.extractJavascript=%{EXTRACT_JAVASCRIPT}\nextractorHtml.treatFramesAsEmbedLinks=false\nextractorHtml.ignoreFormActionUrls=true\nextractorHtml.extractValueAttributes=false\nextractorHtml.ignoreUnexpectedHtml=true\nextractorCss.enabled=true\nextractorJs.enabled=true\nextractorSwf.enabled=true\n\n# allow redirected seeds to be accepted as seeds\n# In H1, this property belonged to the LinkScoper object, in H3, it is part of the CandidatesProcessor object\ncandidates.seedsRedirectNewSeeds=true\n\nstatisticsTracker.intervalSeconds=20\n\n## Quotaenforcing\nquotaenforcer.groupMaxFetchSuccesses=%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}\nquotaenforcer.groupMaxAllKb=%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}\n\n## sample overrides of the warcwriter\nwarcWriter.template=${prefix}-${timestamp17}-${serialno}-ciblee_2015_${heritrix.hostname}\nwarcWriter.writeRequests=false\nwarcWriter.writeMetadata=false\nwarcWriter.poolMaxActive=3\n\nloggerModule.path=logs\n\n </value>\n </property>\n </bean>\n\n <!-- overrides from declared <prop> elements, more easily allowing\n multiline values or even declared beans -->\n <bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">\n <property name="properties">\n <props>\n </props>\n </property>\n </bean>\n\n <!-- CRAWL METADATA: including identification of crawler/operator \n Using NetarchiveSuites own extended version of the org.archive.modules.CrawlMetadata\n -->\n <bean id="metadata" class="dk.netarkivet.harvester.harvesting.NasCrawlMetadata" autowire="byName">\n <property name="operatorContactUrl" value="[see override above]"/>\n <property name="jobName" value="[see override above]"/>\n <property name="description" value="[see 
override above]"/>\n <!-- <property name="robotsPolicyName" value="ignore"/> -->\n <!-- <property name="operator" value=""/> -->\n <!-- <property name="operatorFrom" value=""/> -->\n <!-- <property name="organization" value=""/> -->\n <!-- <property name="audience" value=""/> -->\n <!-- <property name="userAgentTemplate" \n value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> -->\n \n </bean>\n \n <!-- SEEDS: crawl starting points -->\n <!-- ConfigFile approach: specifying external seeds.txt file -->\n <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">\n <property name="textSource">\n <bean class="org.archive.spring.ConfigFile">\n <property name="path" value="seeds.txt" />\n </bean>\n </property>\n <property name="sourceTagSeeds" value="false"/> \n </bean>\n\n <!-- SCOPE: rules for which discovered URIs to crawl; order is very \n important because last decision returned other than 'NONE' wins. -->\n <bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">\n <property name="logToFile" value="true" />\n <property name="logExtraInfo" value="true" />\n <property name="rules">\n <list>\n <!-- Begin by REJECTing all... -->\n <bean class="org.archive.modules.deciderules.RejectDecideRule">\n </bean>\n <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->\n <!-- <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> -->\n <bean class="dk.netarkivet.harvester.harvesting.OnNSDomainsDecideRule"> \n <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->\n <!-- <property name="alsoCheckVia" value="true" /> -->\n <!-- <property name="surtsSourceFile" value="" /> -->\n <!-- <property name="surtsDumpFile" value="surts.dump" /> -->\n </bean>\n <!-- ...but REJECT those more than a configured link-hop-count from start... -->\n <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">\n <!-- <property name="maxHops" value="20" /> -->\n </bean>\n <!-- ...but ACCEPT those more than a configured link-hop-count from start... -->\n <bean class="org.archive.modules.deciderules.TransclusionDecideRule">\n <!-- <property name="maxTransHops" value="2" /> -->\n <!-- <property name="maxSpeculativeHops" value="1" /> -->\n </bean>\n <!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->\n <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">\n <property name="decision" value="REJECT"/>\n <property name="seedsAsSurtPrefixes" value="false"/>\n <property name="surtsDumpFile" value="negative-surts.dump" />\n <!-- <property name="surtsSourceFile" value="" /> -->\n </bean>\n <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... 
-->\n <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">\n <property name="decision" value="REJECT"/>\n <property name="listLogicalOr" value="true" />\n <property name="regexList">\n <list>\n\t\t<value>.*core\\.UserAdmin.*core\\.UserLogin.*</value>\n\t\t<value>.*core\\.UserAdmin.*register\\.UserSelfRegistration.*</value>\n\t\t<value>.*\\/w\\/index\\.php\\?title=Speci[ae]l:Recentchanges.*</value>\n\t\t<value>.*act=calendar&cal_id=.*</value>\n\t\t<value>.*advCalendar_pi.*</value>\n\t\t<value>.*cal\\.asp\\?date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=monthly&date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=weekly&date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=yearly&date=.*</value>\n\t\t<value>.*cal\\.asp\\?view=yearly&year=.*</value>\n\t\t<value>.*cal\\/cal_day\\.php\\?op=day&date=.*</value>\n\t\t<value>.*cal\\/cal_week\\.php\\?op=week&date=.*</value>\n\t\t<value>.*cal\\/calendar\\.php\\?op=cal&month=.*</value>\n\t\t<value>.*cal\\/yearcal\\.php\\?op=yearcal&ycyear=.*</value>\n\t\t<value>.*calendar\\.asp\\?calmonth=.*</value>\n\t\t<value>.*calendar\\.asp\\?qMonth=.*</value>\n\t\t<value>.*calendar\\.php\\?sid=.*</value>\n\t\t<value>.*calendar\\.php\\?start=.*</value>\n\t\t<value>.*calendar\\.php\\?Y=.*</value>\n\t\t<value>.*calendar\\/\\?CLmDemo_horizontal=.*</value>\n\t\t<value>.*calendar_menu\\/calendar\\.php\\?.*</value>\n\t\t<value>.*calendar_scheduler\\.php\\?d=.*</value>\n\t\t<value>.*calendar_year\\.asp\\?qYear=.*</value>\n\t\t<value>.*calendarix\\/calendar\\.php\\?op=.*</value>\n\t\t<value>.*calendarix\\/yearcal\\.php\\?op=.*</value>\n\t\t<value>.*calender\\/default\\.asp\\?month=.*</value>\n\t\t<value>.*Default\\.asp\\?month=.*</value>\n\t\t<value>.*events\\.asp\\?cat=0&mDate=.*</value>\n\t\t<value>.*events\\.asp\\?cat=1&mDate=.*</value>\n\t\t<value>.*events\\.asp\\?MONTH=.*</value>\n\t\t<value>.*events\\.asp\\?month=.*</value>\n\t\t<value>.*index\\.php\\?iDate=.*</value>\n\t\t<value>.*index\\.php\\?module=PostCalendar&func=view.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_day&year=.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_detail&year=.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_month&year=.*</value>\n\t\t<value>.*index\\.php\\?option=com_events&task=view_week&year=.*</value>\n <value>.*index\\.php\\?option=com_events&task=view_year&year=.*</value>\n <value>.*index\\.php\\?option=com_extcalendar&Itemid.*</value>\n <value>.*modules\\.php\\?name=Calendar&op=modload&file=index.*</value>\n <value>.*modules\\.php\\?name=vwar&file=calendar&action=list&month=.*</value>\n <value>.*modules\\.php\\?name=vwar&file=calendar.*</value>\n <value>.*modules\\.php\\?name=vWar&mod=calendar.*</value>\n <value>.*modules\\/piCal\\/index\\.php\\?caldate=.*</value>\n <value>.*modules\\/piCal\\/index\\.php\\?cid=.*</value>\n <value>.*option,com_events\\/task,view_day\\/year.*</value>\n <value>.*option,com_events\\/task,view_month\\/year.*</value>\n <value>.*option,com_extcalendar\\/Itemid.*</value>\n <value>.*task,view_month\\/year.*</value>\n <value>.*shopping_cart\\.php.*</value>\n <value>.*action.add_product.*</value>\n <value>.*action.remove_product.*</value>\n <value>.*action.buy_now.*</value>\n <value>.*checkout_payment\\.php.*</value>\n <value>.*login.*login.*login.*login.*</value>\n <value>.*homepage_calendar\\.asp.*</value>\n <value>.*MediaWiki.*Movearticle.*</value>\n <value>.*index\\.php.*action=edit.*</value>\n 
<value>.*comcast\\.net.*othastar.*</value>\n <value>.*Login.*Login.*Login.*</value>\n <value>.*redir.*redir.*redir.*</value>\n <value>.*bookingsystemtime\\.asp\\?dato=.*</value>\n <value>.*bookingsystem\\.asp\\?date=.*</value>\n <value>.*cart\\.asp\\?mode=add.*</value>\n <value>.*\\/photo.*\\/photo.*\\/photo.*</value>\n <value>.*\\/skins.*\\/skins.*\\/skins.*</value>\n <value>.*\\/scripts.*\\/scripts.*\\/scripts.*</value>\n <value>.*\\/styles.*\\/styles.*\\/styles.*</value>\n <value>.*\\/coppermine\\/login\\.php\\?referer=.*</value>\n <value>.*\\/images.*\\/images.*\\/images.*</value>\n <value>.*\\/stories.*\\/stories.*\\/stories.*</value>\t\n<!-- Here we inject our global crawlertraps, domain specific crawlertraps -->\n%{CRAWLERTRAPS_PLACEHOLDER}\n </list>\n </property> \n </bean>\n\n <!-- ...and REJECT those with suspicious repeating path-segments... -->\n <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">\n <!-- <property name="maxRepetitions" value="2" /> -->\n </bean>\n <!-- ...and REJECT those with more than threshold number of path-segments... -->\n <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">\n <!-- <property name="maxPathDepth" value="20" /> -->\n </bean>\n <!-- ...but always ACCEPT those marked as prerequisites for another URI... -->\n <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">\n </bean>\n <!-- ...but always REJECT those with unsupported URI schemes -->\n <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">\n </bean>\n </list>\n </property>\n </bean>\n \n <!-- \n PROCESSING CHAINS\n Much of the crawler's work is specified by the sequential \n application of swappable Processor modules. These Processors\n are collected into three 'chains. The CandidateChain is applied \n to URIs being considered for inclusion, before a URI is enqueued\n for collection. The FetchChain is applied to URIs when their \n turn for collection comes up. The DispositionChain is applied \n after a URI is fetched and analyzed/link-extracted.\n -->\n \n <!-- CANDIDATE CHAIN --> \n <!-- processors declared as named beans -->\n <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">\n </bean>\n <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">\n <!-- <property name="preferenceDepthHops" value="-1" /> -->\n <!-- <property name="preferenceEmbedHops" value="1" /> -->\n <!-- <property name="canonicalizationPolicy"> \n <ref bean="canonicalizationPolicy" />\n </property> -->\n <property name="queueAssignmentPolicy"> <ref bean="ourQueueAssignmentPolicy" /> \n<!-- Bundled with NAS is two queueAssignPolicies (code is in heritrix3-extensions): \n dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy\n dk.netarkivet.harvester.harvesting.SeedUriDomainnameQueueAssignmentPolicy \n-->\n </property>\n \n <!-- <property name="uriPrecedencePolicy"> \n <ref bean="uriPrecedencePolicy" />\n </property> -->\n <!-- <property name="costAssignmentPolicy"> \n <ref bean="costAssignmentPolicy" />\n </property> -->\n </bean>\n <!-- assembled into ordered CandidateChain bean -->\n <bean id="candidateProcessors" class="org.archive.modules.CandidateChain">\n <property name="processors">\n <list>\n <!-- apply scoping rules to each individual candidate URI... -->\n <ref bean="candidateScoper"/>\n <!-- ...then prepare those ACCEPTed for enqueuing to frontier. 
-->\n <ref bean="preparer"/>\n </list>\n </property>\n </bean>\n \n <!-- FETCH CHAIN --> \n <!-- processors declared as named beans -->\n <bean id="preselector" class="org.archive.crawler.prefetch.Preselector">\n <!-- <property name="recheckScope" value="false" /> -->\n <!-- <property name="blockAll" value="false" /> -->\n <!-- <property name="blockByRegex" value="" /> -->\n <!-- <property name="allowByRegex" value="" /> -->\n </bean>\n <bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">\n </bean>\n\n\n<!-- set username and password set for the FTP fetcher. \n should probably be configured using overlays to allow different username/passwords for\ndifferent sites. \nThe username/password values is for Publizon pubhub.dk using ftp://ftp.pubhub.dk \n-->\n <bean id="fetchFtp" class="org.archive.modules.fetcher.FetchFTP">\t\n\t<property name="username" value="Pligtaflevering"/>\n\t<property name="password" value="Sund2010Hed"/>\n\t<property name="extractFromDirs" value="true"/>\n\t<property name="extractParent" value="false"/>\n\t<property name="maxLengthBytes" value="0"/>\n\t<property name="maxFetchKBSec" value="0"/>\n\t<property name="timeoutSeconds" value="1200"/>\n \n </bean>\n\n\n <bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">\n </bean>\n <bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">\n </bean>\n \n <bean id="extractorOAI" class="dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI">\n </bean>\n\n <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">\n </bean>\n <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">\n </bean>\n <bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">\n </bean> \n <bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">\n </bean>\n <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">\n </bean> \n\n <bean id="extractorXML" class="org.archive.modules.extractor.ExtractorXML">\n</bean>\n \n<!-- assembled into ordered FetchChain bean -->\n <bean id="fetchProcessors" class="org.archive.modules.FetchChain">\n <property name="processors">\n <list>\n <!-- recheck scope, if so enabled... -->\n <ref bean="preselector"/>\n <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->\n <ref bean="preconditions"/>\n\n <!-- check, if quotas is already superseded --> \n <ref bean="quotaenforcer"/> <!-- always required by NAS ? -->\n\n <!-- ...fetch if DNS URI... -->\n <ref bean="fetchDns"/>\n <!-- ...fetch if HTTP URI... -->\n <ref bean="fetchHttp"/>\n <!-- ...fetch if FTP URI... -->\t\n <ref bean="fetchFtp"/> \n <!-- ...extract oulinks from HTTP headers... -->\n <ref bean="extractorHttp"/>\n <!-- ...extract oulinks from HTML content... -->\n <ref bean="extractorHtml"/>\n <!-- ...extract oulinks from CSS content... -->\n <ref bean="extractorCss"/>\n <!-- ...extract oulinks from Javascript content... -->\n <ref bean="extractorJs"/>\n <!-- ...then extract oulinks from extractorOAI content... -->\n <ref bean="extractorOAI"/>\n <!-- ...then extract oulinks from extractorXML content... -->\n <ref bean="extractorXML" />\n <!-- ...extract oulinks from Flash content... 
-->\n <ref bean="extractorSwf"/>\n \n </list>\n </property>\n </bean>\n \n <!-- DISPOSITION CHAIN -->\n <!-- processors declared as named beans -->\n\n<!-- Here the (W)arc writer is inserted -->\n%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}\n\n<bean id="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator">\n<!-- DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is replaced by path on harvest-server -->\n <property name="indexLocation" value="%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"/> \n <property name="matchingMethod" value="URL"/> \n <property name="tryEquivalent" value="TRUE"/> \n <property name="changeContentSize" value="false"/>\n <property name="mimeFilter" value="^text/.*"/>\n <property name="filterMode" value="BLACKLIST"/>\n<!-- <property name="analysisMode" value="TIMESTAMP"/> TODO does not work. but isn't a problem, as the default is always USED --> \n <property name="origin" value=""/>\n <property name="originHandling" value="INDEX"/>\n <property name="statsPerHost" value="true"/>\n</bean> \n\n <bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">\n <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->\n </bean>\n <bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">\n </bean>\n\n <!-- assembled into ordered DispositionChain bean -->\n <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">\n <property name="processors">\n <list>\n <!-- write to aggregate archival files... -->\n\n <!-- remove the reference below, and the DeDuplicator bean itself to disable Deduplication -->\n <ref bean="DeDuplicator"/>\n\n <!-- Here the reference to the (w)arcWriter bean is inserted during job-generation -->\t\n\n %{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}\n \n <!-- This bean is required to report back the number of bytes harvested for each domain. -->\n <bean id="ContentSizeAnnotationPostProcessor" class="dk.netarkivet.harvester.harvesting.ContentSizeAnnotationPostProcessor"/>\n\n <!-- ...send each outlink candidate URI to CandidatesChain, \n and enqueue those ACCEPTed to the frontier... -->\n <ref bean="candidates"/>\n <!-- ...then update stats, shared-structures, frontier decisions -->\n <ref bean="disposition"/>\n </list>\n </property>\n </bean>\n \n <!-- CRAWLCONTROLLER: Control interface, unifying context -->\n <bean id="crawlController" \n class="org.archive.crawler.framework.CrawlController">\n </bean>\n \n <!-- FRONTIER: Record of all URIs discovered and queued-for-collection -->\n <bean id="frontier" \n class="org.archive.crawler.frontier.BdbFrontier">\n </bean>\n \n <!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs --> \n <bean id="uriUniqFilter" \n class="org.archive.crawler.util.BdbUriUniqFilter">\n </bean>\n\n <!-- \n OPTIONAL BUT RECOMMENDED BEANS\n -->\n \n <!-- ACTIONDIRECTORY: disk directory for mid-crawl operations\n Running job will watch directory for new files with URIs, \n scripts, and other data to be processed during a crawl. 
-->\n <bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory">\n </bean> \n \n <!-- CRAWLLIMITENFORCER: stops crawl when it reaches configured limits -->\n <bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer">\n </bean>\n\n <!-- CHECKPOINTSERVICE: checkpointing assistance -->\n <bean id="checkpointService" \n class="org.archive.crawler.framework.CheckpointService">\n </bean>\n \n <!-- \n OPTIONAL BEANS\n Uncomment and expand as needed, or if non-default alternate \n implementations are preferred.\n -->\n \n <!-- QUEUE ASSIGNMENT POLICY -->\n \n<!-- NAS queue assignement policy. \ndefault H3 policy is org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy\n-->\n\n <bean id="ourQueueAssignmentPolicy"\n class="dk.netarkivet.harvester.harvesting.SeedUriDomainnameQueueAssignmentPolicy"> \n <property name="forceQueueAssignment" value=""/> <!-- the default is "" -->\n <property name="deferToPrevious" value="true"/> <!-- the default is true -->\n <property name="parallelQueues" value="1" /> <!-- the default is 1 -->\n </bean>\n\n <!-- URI PRECEDENCE POLICY -->\n <!--\n <bean id="uriPrecedencePolicy" \n class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy">\n </bean>\n -->\n \n <!-- COST ASSIGNMENT POLICY -->\n \n <bean id="costAssignmentPolicy" \n class="org.archive.crawler.frontier.UnitCostAssignmentPolicy">\n </bean>\n\n<!-- QUOTA ENFORCER BEAN -->\n\n<bean id="quotaenforcer" \n class="org.archive.crawler.prefetch.QuotaEnforcer">\n <property name="forceRetire" value="false"></property>\n\n <property name="serverMaxFetchSuccesses" value="-1"></property>\n <property name="serverMaxSuccessKb" value="-1"></property>\n <property name="serverMaxFetchResponses" value="-1"></property>\n <property name="serverMaxAllKb" value="-1"></property>\n\n <property name="hostMaxFetchSuccesses" value="-1"></property>\n <property name="hostMaxSuccessKb" value="-1"></property>\n <property name="hostMaxFetchResponses" value="-1"></property>\n <property name="hostMaxAllKb" value="-1"></property>\n <property name="groupMaxFetchSuccesses" value="-1"></property>\n <property name="groupMaxSuccessKb" value="-1"></property>\n <property name="groupMaxFetchResponses" value="-1"></property>\n <property name="groupMaxAllKb" value="-1"></property>\n </bean>\n\n <!-- \n REQUIRED STANDARD BEANS\n It will be very rare to replace or reconfigure the following beans.\n -->\n\n <!-- STATISTICSTRACKER: standard stats/reporting collector -->\n <bean id="statisticsTracker" \n class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName">\n </bean>\n \n <!-- CRAWLERLOGGERMODULE: shared logging facility -->\n <bean id="loggerModule" \n class="org.archive.crawler.reporting.CrawlerLoggerModule">\n </bean>\n \n <!-- SHEETOVERLAYMANAGER: manager of sheets of contextual overlays\n Autowired to include any SheetForSurtPrefix or \n SheetForDecideRuled beans -->\n <bean id="sheetOverlaysManager" autowire="byType"\n class="org.archive.crawler.spring.SheetOverlaysManager">\n </bean>\n\n <!-- BDBMODULE: shared BDB-JE disk persistence manager -->\n <bean id="bdb" \n class="org.archive.bdb.BdbModule">\n </bean>\n \n <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->\n <bean id="cookieStorage" \n class="org.archive.modules.fetcher.BdbCookieStore">\n </bean>\n \n <!-- SERVERCACHE: shared cache of server/host info -->\n <bean id="serverCache" \n class="org.archive.modules.net.BdbServerCache">\n </bean>\n\n <!-- CONFIG PATH CONFIGURER: required helper 
making crawl paths relative\n to crawler-beans.cxml file, and tracking crawl files for web UI -->\n <bean id="configPathConfigurer" \n class="org.archive.spring.ConfigPathConfigurer">\n </bean>\n\n<!-- A processor to enforce runtime limits on crawls if wanted \nThe operations available is Pause, Terminate, Block_Uris\n\nTODO: CHECK, if this bean can coexist with the crawlLimitenforcer\n-->\n<!--\n<bean id="runtimeLimitEnforcer" class="org.archive.crawler.prefetch.RuntimeLimitEnforcer">\n<property name="runtimeSeconds" value="82800"/>\n<property name="operation" value="Terminate"/>\n</bean>\n-->\n\n\n\n</beans>\n t
\.
SELECT setval('ordertemplates_id_seq', 4);
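--
-- The two rows above store complete Heritrix 3 crawler-beans XML in the
-- orderxml column (with \n as embedded newlines inside the COPY data); the
-- trailing 't' on each row is the isactive flag. Illustrative query listing
-- the active templates without the bulky XML:
--
SELECT template_id, name FROM ordertemplates WHERE isactive;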
--
-- Data for Name: schedules; Type: TABLE DATA; Schema: public; Owner: test
--
COPY schedules (schedule_id, name, comments, startdate, enddate, maxrepeats, timeunit, numtimeunits, anytime, onminute, onhour, ondayofweek, ondayofmonth, edition) FROM stdin;
1 Once_a_day Once a day, starting immediately \N \N \N 2 1 t \N \N \N \N 1
2 Once_an_hour Once an hour, starting immediately \N \N \N 1 1 t \N \N \N \N 1
3 Once_a_week Once a week, starting immediately \N \N \N 3 1 t \N \N \N \N 1
4 Once_a_month Once a month, starting immediately \N \N \N 4 1 t \N \N \N \N 1
\.
SELECT setval('schedules_id_seq', 4);
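--
-- The timeunit codes above line up with the schedule names in this data:
-- 1 = hour, 2 = day, 3 = week, 4 = month (inferred from the rows themselves).
-- Illustrative decoding query:
--
SELECT name,
       CASE timeunit
         WHEN 1 THEN 'hour'
         WHEN 2 THEN 'day'
         WHEN 3 THEN 'week'
         WHEN 4 THEN 'month'
       END AS unit
  FROM schedules;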
--
-- Data for Name: schemaversions; Type: TABLE DATA; Schema: public; Owner: test
--
COPY schemaversions (tablename, version) FROM stdin;
domains 3
configurations 5
seedlists 1
passwords 1
ownerinfo 1
historyinfo 2
config_passwords 1
config_seedlists 1
partialharvests 1
fullharvests 5
harvest_configs 1
schedules 1
ordertemplates 1
job_configs 1
global_crawler_trap_lists 1
global_crawler_trap_expressions 1
runningjobshistory 2
runningjobsmonitor 2
frontierreportmonitor 1
extendedfieldtype 1
harvestdefinitions 4
jobs 10
extendedfield 2
extendedfieldvalue 2
harvestchannel 1
\.
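--
-- schemaversions records a schema version per table (presumably consumed by
-- the application's migration checks). Illustrative query:
--
SELECT tablename, version FROM schemaversions ORDER BY tablename;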
--
-- Data for Name: seedlists; Type: TABLE DATA; Schema: public; Owner: test
--
COPY seedlists (seedlist_id, name, comments, domain_id, seeds) FROM stdin;
1 defaultseeds 1 www.kb.dk\n
2 defaultseeds 2 www.statsbiblioteket.dk\n
3 defaultseeds 3 www.netarkivet.dk\n
4 defaultseeds 4 www.kum.dk\n
5 defaultseeds 5 www.raeder.dk\n
6 defaultseeds 6 www.kaarefc.dk\n
7 defaultseeds 7 www.trineogkaare.dk\n
8 defaultseeds 8 www.kaareogtrine.dk\n
9 defaultseeds 9 www.trinekc.dk\n
10 defaultseeds 10 www.sulnudu.dk\n
11 defaultseeds 11 www.slothchristensen.dk\n
12 defaultseeds 12 www.oernhoej.dk\n
13 defaultseeds 13 www.pligtaflevering.dk\n
14 defaultseeds 14 www.dbc.dk\n
15 defaultseeds 15 www.bs.dk\n
17 defaultseeds 17 www.sy-jonna.dk\n
\.
SELECT setval('seedlists_id_seq', 17);
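--
-- With all tables loaded, the config_seedlists mapping at the top of the file
-- should now resolve fully. An illustrative end-of-load check (returns no
-- rows when every mapping points at an existing configuration and seedlist):
--
SELECT cs.config_id, cs.seedlist_id
  FROM config_seedlists cs
  LEFT JOIN configurations c ON c.config_id = cs.config_id
  LEFT JOIN seedlists s ON s.seedlist_id = cs.seedlist_id
 WHERE c.config_id IS NULL OR s.seedlist_id IS NULL;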