/
IngestableFiles.java
348 lines (310 loc) · 12.5 KB
/
IngestableFiles.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/*
* #%L
* Netarchivesuite - harvester
* %%
* Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
* the National Library of France and the Austrian National Library.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 2.1 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Lesser Public License for more details.
*
* You should have received a copy of the GNU General Lesser Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/lgpl-2.1.html>.
* #L%
*/
package dk.netarkivet.harvester.harvesting;
import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
/**
* Encapsulation of files to be ingested into the archive. These files are presently placed subdirectories under the
* crawldir.
*/
public class IngestableFiles {

    private static final Logger log = LoggerFactory.getLogger(IngestableFiles.class);

    /** Subdir with final metadata file in it. */
    protected static final String METADATA_SUB_DIR = "metadata";

    /** Subdir with temporary metadata file in it. */
    private static final String TMP_SUB_DIR = "tmp-meta";

    /** jobId for present harvestjob. */
    private long jobId;

    /** crawlDir for present harvestjob. */
    private File crawlDir;

    /**
     * Writer to this job's metadata file. This is closed when the metadata is marked as ready.
     * Remains null until {@link #getMetadataWriter()} is first called.
     */
    private MetadataFileWriter writer = null;

    /** Whether we've had an error in metadata generation. */
    private boolean error = false;

    /** Prefix used when naming archive files belonging to this harvest job. */
    private String harvestnamePrefix;

    /** Filename format for metadata files, read once from settings at class load. */
    public static final String METADATA_FILENAME_FORMAT = Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT);

    /** ID of the harvest definition that spawned this job. */
    private Long harvestId;

    /**
     * Constructor for this class. HeritrixFiles contains information about crawlDir, jobId, and harvestnameprefix for a
     * specific finished harvestjob.
     *
     * @param files An instance of HeritrixFiles
     * @throws ArgumentNotValid if null-arguments are given; if jobID &lt; 1; if crawlDir does not exist
     */
    public IngestableFiles(HeritrixFiles files) {
        ArgumentNotValid.checkNotNull(files, "files");
        ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir");
        ArgumentNotValid.checkPositive(files.getJobID(), "jobID");
        ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix");
        this.crawlDir = files.getCrawlDir();
        if (!crawlDir.exists()) {
            throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist");
        }
        this.jobId = files.getJobID();
        this.harvestnamePrefix = files.getArchiveFilePrefix();
        this.harvestId = files.getHarvestID();
        // Create subdir 'metadata' if it does not already exist.
        FileUtils.createDir(getMetadataDir());
        // Create/scratch subdir 'tmp-meta': leftovers from any previous run are discarded.
        FileUtils.removeRecursively(getTmpMetadataDir());
        FileUtils.createDir(getTmpMetadataDir());
    }

    /**
     * Check, if the metadatafile already exists. If this is true, metadata has been successfully generated. If false,
     * either metadata has not finished being generated, or there was an error generating them.
     *
     * @return true, if it does exist; false otherwise.
     */
    public boolean isMetadataReady() {
        return getMetadataFile().isFile();
    }

    /**
     * Return true if the metadata generation process is known to have failed.
     *
     * @return True if metadata generation is finished without success, false if generation is still ongoing or has been
     * successfully done.
     */
    public boolean isMetadataFailed() {
        return error;
    }

    /**
     * Marks generated metadata as final, closes the writer, and moves the temporary metadata file to its final
     * position, if successful.
     *
     * @param success True if metadata was successfully generated, false otherwise.
     * @throws PermissionDenied If the metadata has already been marked as ready, or if no metadata file exists upon
     * success.
     * @throws IOFailure if the temporary metadata file cannot be moved to its final position.
     */
    public void setMetadataGenerationSucceeded(boolean success) {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (success) {
            // Guard against NPE: a writer only exists if getMetadataWriter() was ever called.
            if (writer != null) {
                writer.close(); // close writer down
            }
            if (!getTmpMetadataFile().exists()) {
                String message = "No metadata was generated despite claims that metadata generation was successful.";
                throw new PermissionDenied(message);
            }
            // File.renameTo signals failure via its boolean return, not an exception;
            // ignoring it would silently discard the metadata.
            if (!getTmpMetadataFile().renameTo(getMetadataFile())) {
                throw new IOFailure("Failed to rename tmp metadata file '" + getTmpMetadataFile().getAbsolutePath()
                        + "' to final metadata file '" + getMetadataFile().getAbsolutePath() + "'");
            }
        } else {
            error = true;
        }
    }

    /**
     * Get a MetaDatafileWriter for the temporary metadata file. Successive calls to this method on the same object will
     * return the same writer. Once the metadata have been finalized, calling this method will fail.
     *
     * @return a MetaDatafileWriter for the temporary metadata file.
     * @throws PermissionDenied if metadata generation is already finished.
     */
    public MetadataFileWriter getMetadataWriter() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (isMetadataFailed()) {
            throw new PermissionDenied("Metadata generation of file " + getMetadataFile().getAbsolutePath()
                    + " has already failed.");
        }
        // Lazily create the writer on first request; later calls reuse it.
        if (writer == null) {
            writer = MetadataFileWriter.createWriter(getTmpMetadataFile());
        }
        return writer;
    }

    /**
     * Gets the files containing the metadata.
     *
     * @return the files in the metadata dir
     * @throws PermissionDenied if the metadata file is not ready, either because generation is still going on or there
     * was an error generating the metadata.
     */
    public List<File> getMetadataArcFiles() {
        // Our one known metadata file must exist.
        if (!isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist");
        }
        return Arrays.asList(new File[] {getMetadataFile()});
    }

    /**
     * Constructs the metadata subdir from the crawlDir.
     *
     * @return The metadata subdir as a File
     */
    private File getMetadataDir() {
        return new File(crawlDir, METADATA_SUB_DIR);
    }

    /**
     * Constructs the single metadata arc file from the crawlDir and the jobID.
     *
     * @return metadata arc file as a File
     */
    protected File getMetadataFile() {
        return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
    }

    /**
     * Constructs the TEMPORARY metadata subdir from the crawlDir.
     *
     * @return The tmp-metadata subdir as a File
     */
    public File getTmpMetadataDir() {
        return new File(crawlDir, TMP_SUB_DIR);
    }

    /**
     * Constructs the TEMPORARY metadata arc file from the crawlDir and the jobID.
     *
     * @return tmp-metadata arc file as a File
     */
    private File getTmpMetadataFile() {
        return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
    }

    /**
     * Get a list of all ARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The ARC files that are ready to get ingested.
     * @throws IOFailure if the arcs dir is not a directory or cannot be listed.
     */
    public List<File> getArcFiles() {
        File arcsdir = getArcsDir();
        if (arcsdir.exists()) {
            if (!arcsdir.isDirectory()) {
                throw new IOFailure(arcsdir.getPath() + " is not a directory");
            }
            // listFiles() returns null on I/O error, even for an existing directory;
            // Arrays.asList(null) would throw an uninformative NPE.
            File[] arcFiles = arcsdir.listFiles(FileUtils.ARCS_FILTER);
            if (arcFiles == null) {
                throw new IOFailure("Unable to list files in directory " + arcsdir.getPath());
            }
            return Arrays.asList(arcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the arcs dir in the our crawl directory.
     */
    public File getArcsDir() {
        return new File(crawlDir, Constants.ARCDIRECTORY_NAME);
    }

    /**
     * @return the warcs dir in the our crawl directory.
     */
    public File getWarcsDir() {
        return new File(crawlDir, Constants.WARCDIRECTORY_NAME);
    }

    /**
     * Get a list of all WARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The WARC files that are ready to get ingested.
     * @throws IOFailure if the warcs dir is not a directory or cannot be listed.
     */
    public List<File> getWarcFiles() {
        File warcsdir = getWarcsDir();
        if (warcsdir.exists()) {
            if (!warcsdir.isDirectory()) {
                throw new IOFailure(warcsdir.getPath() + " is not a directory");
            }
            // listFiles() returns null on I/O error, even for an existing directory;
            // Arrays.asList(null) would throw an uninformative NPE.
            File[] warcFiles = warcsdir.listFiles(FileUtils.WARCS_FILTER);
            if (warcFiles == null) {
                throw new IOFailure("Unable to list files in directory " + warcsdir.getPath());
            }
            return Arrays.asList(warcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * Close any ".open" files left by a crashed Heritrix. ARC and/or WARC files ending in .open indicate that Heritrix
     * is still writing to them. If Heritrix has died, we can just rename them before we upload. This must not be done
     * while harvesting is still in progress.
     *
     * @param waitSeconds How many seconds to wait before closing files. This may be done in order to allow Heritrix to
     * finish writing before we close the files.
     */
    public void closeOpenFiles(int waitSeconds) {
        // wait for Heritrix threads to create and close last arc or warc files
        try {
            Thread.sleep(waitSeconds * 1000L);
        } catch (InterruptedException e) {
            log.debug("Thread woken prematurely from sleep.", e);
            // Restore the interrupt status so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER);
        closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER);
    }

    /**
     * Given an archive sub-directory name and a filter to match against this method tries to rename the matched files.
     * Files that can not be renamed generate a log message. The filter should always match files that end with ".open"
     * as a minimum.
     *
     * @param archiveDirName archive directory name, currently "arc" or "warc"
     * @param filter filename filter used to select ".open" files to rename
     */
    protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) {
        File arcsdir = new File(crawlDir, archiveDirName);
        File[] files = arcsdir.listFiles(filter);
        if (files != null) {
            for (File file : files) {
                final String fname = file.getAbsolutePath();
                // Note: Due to regexp we know filename is at least 5 characters,
                // so stripping the 5-character ".open" suffix is safe.
                File tofile = new File(fname.substring(0, fname.length() - 5));
                if (!file.renameTo(tofile)) {
                    log.warn("Failed to rename '{}' to '{}'", file.getAbsolutePath(), tofile.getAbsolutePath());
                }
            }
        }
    }

    /**
     * Remove any temporary files.
     */
    public void cleanup() {
        FileUtils.removeRecursively(getTmpMetadataDir());
        // NOTE(review): the writer is discarded without being closed; closing it here could fail
        // since its backing file was just deleted, but this may leak a file handle — verify
        // MetadataFileWriter's close semantics before changing.
        writer = null;
    }

    /**
     * @return the jobID of the harvest job being processed.
     */
    public long getJobId() {
        return this.jobId;
    }

    /**
     * @return the harvestID of the harvest job being processed.
     */
    public long getHarvestID() {
        // NOTE(review): unboxes the Long field; throws NPE if harvestId was never set — presumably
        // HeritrixFiles always supplies one; confirm against callers.
        return this.harvestId;
    }

    /**
     * @return the harvestnamePrefix of the harvest job being processed.
     */
    public String getHarvestnamePrefix() {
        return this.harvestnamePrefix;
    }

    /**
     * @return the crawlDir of the harvest job being processed.
     */
    public File getCrawlDir() {
        return this.crawlDir;
    }
}