-
Notifications
You must be signed in to change notification settings - Fork 23
/
CrawlLogLinesMatchingRegexp.java
144 lines (128 loc) · 4.84 KB
/
CrawlLogLinesMatchingRegexp.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/*
* #%L
* Netarchivesuite - harvester
* %%
* Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
* the National Library of France and the Austrian National Library.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 2.1 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Lesser Public License for more details.
*
* You should have received a copy of the GNU General Lesser Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/lgpl-2.1.html>.
* #L%
*/
package dk.netarkivet.viewerproxy.webinterface;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;
/**
 * Batch job that extracts the lines of a crawl log that match a regular expression. The batch job should be
 * restricted to run on metadata files for a specific job only, using the
 * {@link #processOnlyFilesMatching(String)} construct.
 */
@SuppressWarnings({"serial"})
public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob {

    /** The logger. */
    private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class);

    /**
     * URL prefix of crawl log records. Built once, at class-load time, from the configured organization
     * setting; records whose URL starts with this prefix are accepted by {@link #getFilter()}.
     */
    private static final String SETUP_URL_FORMAT = String.format("metadata://%s/crawl/logs/crawl.log",
            Settings.get(CommonSettings.ORGANIZATION));

    /** The regular expression that a crawl.log line must match in its entirety to be extracted. */
    private final String regexp;

    /**
     * Initialise the batch job.
     *
     * @param regexp The regexp to match in the crawl.log lines.
     * @throws ArgumentNotValid if regexp is null or empty.
     */
    public CrawlLogLinesMatchingRegexp(String regexp) {
        ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");
        this.regexp = regexp;
        // Set the batch job timeout to one week, expressed in milliseconds.
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Does nothing, no initialisation is needed.
     *
     * @param os Not used.
     */
    @Override
    public void initialize(OutputStream os) {
    }

    /**
     * Returns a filter, named "OnlyCrawlLog", that accepts only records whose header URL starts with the
     * crawl log metadata URL prefix. Records without a URL are rejected.
     *
     * @return a new filter instance on every call.
     */
    @Override
    public ArchiveBatchFilter getFilter() {
        return new ArchiveBatchFilter("OnlyCrawlLog") {
            public boolean accept(ArchiveRecordBase record) {
                String url = record.getHeader().getUrl();
                return url != null && url.startsWith(SETUP_URL_FORMAT);
            }
        };
    }

    /**
     * Copies every line of the given crawl log record that fully matches the regular expression to the
     * output stream. Each extracted line is written as UTF-8 followed by a single newline character.
     *
     * @param record The record to process.
     * @param os The output stream for the result.
     * @throws ArgumentNotValid on null parameters
     * @throws IOFailure on trouble processing the record.
     */
    @Override
    public void processRecord(ArchiveRecordBase record, OutputStream os) {
        ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
        ArgumentNotValid.checkNotNull(os, "OutputStream os");
        // Compile once per record: String.matches(regexp) would recompile the expression for every line.
        // The compiled pattern is kept local so that the serialized form of the batch job is unchanged.
        Pattern pattern = Pattern.compile(regexp);
        // Decode explicitly as UTF-8 (the encoding also used for the output) rather than relying on the
        // platform default charset, so the result does not depend on the JVM configuration.
        BufferedReader arcreader = new BufferedReader(
                new InputStreamReader(record.getInputStream(), StandardCharsets.UTF_8));
        try {
            for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
                if (pattern.matcher(line).matches()) {
                    os.write(line.getBytes(StandardCharsets.UTF_8));
                    os.write('\n');
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to process (w)arc record", e);
        } finally {
            // Closing is best effort: a failure here is logged and must not mask the outcome of the processing.
            try {
                arcreader.close();
            } catch (IOException e) {
                log.warn("unable to close arcreader probably", e);
            }
        }
    }

    /**
     * Does nothing, no finishing is needed.
     *
     * @param os Not used.
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * @return the class name followed by the regular expression and the filter used by this batch job.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with arguments: Regexp = " + regexp + ", Filter = " + getFilter();
    }
}