Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added settings for new job and finished last refactoring parts
- Loading branch information
Showing
9 changed files
with
412 additions
and
269 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
43 changes: 43 additions & 0 deletions
43
common/common-core/src/main/java/dk/netarkivet/common/utils/hadoop/HadoopJobStrategy.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
package dk.netarkivet.common.utils.hadoop;

import java.util.UUID;

import org.apache.hadoop.fs.Path;

/**
 * Interface for a HadoopJob's strategy of how to perform the job.
 *
 * <p>A strategy bundles everything job-type-specific: where the job's HDFS input file and
 * output directory live, how the underlying Hadoop tool is launched, and a human-readable
 * job-type label for logging.
 */
public interface HadoopJobStrategy {

    /**
     * Runs a Hadoop job (HadoopJobTool) according to the specification of the used strategy.
     *
     * @param jobInputFile The Path specifying the job's input file.
     * @param jobOutputDir The Path specifying the job's output directory.
     * @return The job's exit code: 0 on success, non-zero on failure.
     */
    int runJob(Path jobInputFile, Path jobOutputDir);

    /**
     * Create the job input file with name from a uuid.
     *
     * @param uuid The UUID to create a unique name from.
     * @return Path specifying where the input file is located.
     */
    Path createJobInputFile(UUID uuid);

    /**
     * Create the job output directory with name from a uuid.
     *
     * @param uuid The UUID to create a unique name from.
     * @return Path specifying where the output directory is located.
     */
    Path createJobOutputDir(UUID uuid);

    /**
     * Return a string specifying which kind of job is being run.
     *
     * @return String specifying the job's type.
     */
    String getJobType();
}
19 changes: 0 additions & 19 deletions
19
common/common-core/src/main/java/dk/netarkivet/common/utils/hadoop/JobType.java
This file was deleted.
Oops, something went wrong.
77 changes: 77 additions & 0 deletions
77
...mmon-core/src/main/java/dk/netarkivet/common/utils/hadoop/MetadataExtractionStrategy.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
package dk.netarkivet.common.utils.hadoop;

import java.util.UUID;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.utils.Settings;

/**
 * Strategy to give a HadoopJob when wanting to extract selected content from metadata files matching specific
 * URL- and MIME-patterns. The mapper expects the used Configuration to have these patterns set before use.
 * Otherwise, it will use all-matching patterns.
 *
 * This type of job is the Hadoop counterpart to running
 * {@link dk.netarkivet.common.utils.archive.GetMetadataArchiveBatchJob}.
 */
public class MetadataExtractionStrategy implements HadoopJobStrategy {
    private final Logger log = LoggerFactory.getLogger(MetadataExtractionStrategy.class);
    private final long jobID;
    private final FileSystem fileSystem;
    private final Configuration hadoopConf;
    private final Pattern urlPattern;
    private final Pattern mimePattern;

    /**
     * Constructor.
     *
     * @param jobID The ID for the job.
     * @param fileSystem The Hadoop FileSystem used.
     */
    public MetadataExtractionStrategy(long jobID, FileSystem fileSystem) {
        this.jobID = jobID;
        this.fileSystem = fileSystem;
        this.hadoopConf = fileSystem.getConf();
        // Fall back to match-everything patterns when the caller did not configure any.
        Pattern matchAll = Pattern.compile(".*");
        this.urlPattern = hadoopConf.getPattern(GetMetadataMapper.URL_PATTERN, matchAll);
        this.mimePattern = hadoopConf.getPattern(GetMetadataMapper.MIME_PATTERN, matchAll);
    }

    /**
     * Launches the HadoopJobTool with a GetMetadataMapper over the given input/output paths.
     * Any failure is logged (with cause) and mapped to exit code 1.
     */
    @Override
    public int runJob(Path jobInputFile, Path jobOutputDir) {
        try {
            log.info("URL/MIME patterns used for metadata extraction job {} are '{}' and '{}'",
                    jobID, urlPattern, mimePattern);
            String[] toolArgs = new String[] {jobInputFile.toString(), jobOutputDir.toString()};
            return ToolRunner.run(new HadoopJobTool(hadoopConf, new GetMetadataMapper()), toolArgs);
        } catch (Exception e) {
            log.warn("Metadata extraction job with ID {} failed to run normally.", jobID, e);
            return 1;
        }
    }

    /** Creates a unique HDFS input file for this job under the configured input directory. */
    @Override
    public Path createJobInputFile(UUID uuid) {
        String inputParentDir = Settings.get(CommonSettings.HADOOP_MAPRED_METADATA_EXTRACTIONJOB_INPUT_DIR);
        Path inputFile = HadoopFileUtils.createUniquePathInDir(fileSystem, inputParentDir, uuid);
        log.info("Input file for metadata extraction job '{}' will be '{}'", jobID, inputFile);
        return inputFile;
    }

    /** Creates a unique HDFS output directory for this job under the configured output directory. */
    @Override
    public Path createJobOutputDir(UUID uuid) {
        String outputParentDir = Settings.get(CommonSettings.HADOOP_MAPRED_METADATA_EXTRACTIONJOB_OUTPUT_DIR);
        Path outputDir = HadoopFileUtils.createUniquePathInDir(fileSystem, outputParentDir, uuid);
        log.info("Output directory for metadata extraction job '{}' is '{}'", jobID, outputDir);
        return outputDir;
    }

    /** @return Human-readable label identifying this job type in logs. */
    @Override
    public String getJobType() {
        return "METADATA EXTRACTION";
    }
}
Oops, something went wrong.