Permalink
Browse files

Fixed up configuration and logging. Tweaked configuration of buffers.…

… Removed Ohcount logic. Tested.
  • Loading branch information...
anjackson committed Apr 18, 2012
1 parent fedaba1 commit 59b11d6f8daf30c9aed3aee2ddebc3cfabfec54e
Binary file not shown.
View
@@ -32,6 +32,18 @@
<artifactId>wap-recordreaders</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
+ <dependency>
+ <groupId>org.archive.wayback</groupId>
+ <artifactId>wayback-core</artifactId>
+ <version>1.6.1</version>
+ <scope>system</scope>
+ <systemPath>${basedir}/lib/wayback-core-1.6.1.jar</systemPath>
+ </dependency>
+ <dependency>
+ <groupId>org.archive.heritrix</groupId>
+ <artifactId>heritrix-commons</artifactId>
+ <version>3.1.0</version>
+ </dependency>
<dependency>
<groupId>eu.scape-project.nanite</groupId>
<artifactId>nanite-droid</artifactId>
@@ -59,4 +71,22 @@
<version>0.0.1-SNAPSHOT</version>
</dependency>
</dependencies>
+ <repositories>
+ <repository>
+ <releases>
+ <enabled>true</enabled>
+ <updatePolicy>daily</updatePolicy>
+ <checksumPolicy>warn</checksumPolicy>
+ </releases>
+ <snapshots>
+ <enabled>true</enabled>
+ <updatePolicy>daily</updatePolicy>
+ <checksumPolicy>fail</checksumPolicy>
+ </snapshots>
+ <id>internetarchive</id>
+ <name>Internet Archive Maven Repository</name>
+ <url>http://builds.archive.org:8080/maven2</url>
+ <layout>default</layout>
+ </repository>
+ </repositories>
</project>
@@ -6,6 +6,7 @@
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
+import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -34,6 +35,7 @@
@SuppressWarnings( { "deprecation" } )
public class FormatProfiler extends Configured implements Tool {
+ private static final String CONFIG = "/hadoop_utils.config";
public int run( String[] args ) throws IOException {
JobConf conf = new JobConf( getConf(), FormatProfiler.class );
@@ -46,13 +48,7 @@ public int run( String[] args ) throws IOException {
FileOutputFormat.setOutputPath( conf, new Path( args[ 1 ] ) );
- // Add the ohcount binary to the distributed cache.
- //File ohcount = Unpack.streamToTemp(FormatProfiler.class, "native/linux_x64/"+Ohcount.OH_300_STATIC_BIN, true);
- //DistributedCache.addCacheFile( ohcount.toURI(), conf );
-
- // Run in local/debug mode: This requires that mapred.local.dir is a valid writeable folder.
- //conf.set("mapred.job.tracker", "local");
-
+ //this.setProperties( conf );
conf.setJobName( args[ 0 ] + "_" + System.currentTimeMillis() );
conf.setInputFormat( ArchiveFileInputFormat.class );
conf.setMapperClass( FormatProfilerMapper.class );
@@ -69,7 +65,27 @@ public int run( String[] args ) throws IOException {
JobClient.runJob( conf );
return 0;
+ }
+
+ /**
+  * Loads the classpath resource named by {@code CONFIG} and copies its
+  * settings into the supplied job configuration under the corresponding
+  * {@code solr.*}, {@code record.*} and {@code tika.*} keys.
+  *
+  * @param conf the job configuration that receives the values
+  * @throws IOException if the resource is not on the classpath, cannot be
+  *         read, or does not define one of the expected keys
+  */
+ private void setProperties( JobConf conf ) throws IOException {
+     Properties properties = new Properties();
+     // getResourceAsStream returns null when the resource is absent; report
+     // that explicitly rather than failing inside Properties.load.
+     java.io.InputStream in = this.getClass().getResourceAsStream( CONFIG );
+     if( in == null ) {
+         throw new IOException( "Configuration resource not found on classpath: " + CONFIG );
+     }
+     try {
+         properties.load( in );
+     } finally {
+         // Properties.load does not close the stream it is given.
+         in.close();
+     }
+
+     copyProperty( properties, "solr_default", conf, "solr.default" );
+     copyProperty( properties, "solr_image", conf, "solr.image" );
+     copyProperty( properties, "solr_media", conf, "solr.media" );
+     copyProperty( properties, "solr_batch_size", conf, "solr.batch.size" );
+     copyProperty( properties, "solr_threads", conf, "solr.threads" );
+     copyProperty( properties, "solr_image_regex", conf, "solr.image.regex" );
+     copyProperty( properties, "solr_media_regex", conf, "solr.media.regex" );
+
+     copyProperty( properties, "mime_exclude", conf, "record.exclude.mime" );
+     copyProperty( properties, "url_exclude", conf, "record.exclude.url" );
+     copyProperty( properties, "max_payload_size", conf, "record.size.max" );
+     copyProperty( properties, "response_include", conf, "record.include.response" );
+     copyProperty( properties, "protocol_include", conf, "record.include.protocol" );
+     // NOTE(review): reads the same "mime_exclude" key as record.exclude.mime
+     // above - confirm that sharing one key is intended.
+     copyProperty( properties, "mime_exclude", conf, "tika.exclude.mime" );
+     copyProperty( properties, "tika_timeout", conf, "tika.timeout" );
+ }
+
+ /**
+  * Copies a single value from the loaded properties into the job configuration.
+  *
+  * @param properties  the loaded properties
+  * @param propertyKey the key to read from {@code properties}
+  * @param conf        the job configuration to write to
+  * @param confKey     the key to write in {@code conf}
+  * @throws IOException if {@code propertyKey} is not defined
+  */
+ private static void copyProperty( Properties properties, String propertyKey, JobConf conf, String confKey ) throws IOException {
+     String value = properties.getProperty( propertyKey );
+     if( value == null ) {
+         // Name the missing key instead of handing null to conf.set.
+         // NOTE(review): Hadoop's Configuration.set appears to reject null
+         // values - confirm for the Hadoop version in use.
+         throw new IOException( "Missing property '" + propertyKey + "' in " + CONFIG );
+     }
+     conf.set( confKey, value );
}
public static void main( String[] args ) throws Exception {
@@ -12,7 +12,6 @@
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
-import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -27,11 +26,8 @@
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.sax.WriteOutContentHandler;
import org.archive.io.ArchiveRecordHeader;
import org.xml.sax.ContentHandler;
@@ -41,7 +37,6 @@
import uk.bl.wap.hadoop.WritableArchiveRecord;
import uk.bl.wap.hadoop.format.Ohcount;
import uk.bl.wap.hadoop.util.Unpack;
-import uk.bl.wap.tika.parser.pdf.PDFParser;
import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResult;
import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResultCollection;
@@ -86,18 +81,6 @@ public void configure( JobConf job ) {
}*/
- // Instanciate Ohcount using the location of the binary:
- try {
- //File ohcount = new File( DistributedCache.getLocalCacheFiles(job)[0].toString() );
- File ohcount = Unpack.streamToTemp(FormatProfiler.class, "native/linux_x64/"+Ohcount.OH_300_STATIC_BIN, true);
- oh = new Ohcount( ohcount );
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
-
-
-
// This returns a hdfs URI:
//this.workingDirectory = job.get( "mapred.work.output.dir" );
this.workingDirectory = job.getWorkingDirectory().toString();
@@ -114,6 +97,7 @@ public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Te
// Get the ID:
String wctID = this.getWctTi( key.toString() );
+ log.debug("Processing wcID: "+wctID);
// Get the server header data:
if( !header.getHeaderFields().isEmpty() ) {
@@ -130,13 +114,15 @@ public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Te
} else {
output.collect( new Text("LOG: Empty header fields. "), new Text(key));
}
+ log.debug("Server Type: "+serverType);
try {
// Type according to Tika:
tikaType = tika.detect( value.getPayload() );
} catch( Throwable e ) {
log.error( e.getMessage() );
+ System.err.println("Failed: "+e.getMessage());
e.printStackTrace();
//output.collect( new Text("LOG:ERROR Tika.detect threw exception: "+e+"\n"+getStackTrace(e)), new Text(key+" "+tmpFile+" "+value));
}
@@ -147,7 +133,7 @@ public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Te
// Now perform the parsing:
try {
// Abort handler, limiting the output size, to avoid OOM:
- ContentHandler ch = new WriteOutContentHandler(BUF_8KB);
+ WriteOutContentHandler ch = new WriteOutContentHandler(MAX_BUF);
// Silent handler:
//ContentHandler ch = new DefaultHandler();
// Set up a parse context:
@@ -159,35 +145,13 @@ public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Te
// One could forcibly limit the size if OOM is still causing problems, like this:
//parser.parse( new ByteArrayInputStream( value.getPayload(), 0, BUF_8KB ), ch, md, ctx );
} catch( Throwable e ) {
- log.error( e.getMessage() );
- e.printStackTrace();
- //output.collect( new Text("LOG:ERROR Tika.parse threw exception: "+e+"\n"+getStackTrace(e)), new Text(key+" "+tmpFile+" "+value));
+ log.debug( "Tika Exception: " + e.getMessage() );
+ //e.printStackTrace();
}
// Use the extended MIME type generated by the PreservationParser:
tikaType = md.get(PreservationParser.EXT_MIME_TYPE);
-
- // Ohcount
- /*
- String ohType = "application/octet-stream";
- if( tikaType.startsWith("text") ) {
- try {
- // This could maybe be made to work, but the real name is often hidden, e.g. javascript downloads as resource.asp?identifier.
- //String name = value.getRecord().getHeader().getUrl();
- //name = name.substring( name.lastIndexOf("/") + 1);
- String name = wctID;
- File contentTmp = this.copyToTempFile(name, value.getPayload());
- ohType = oh.identify(contentTmp);
- contentTmp.delete();
- } catch (Exception e) {
- log.error( e.getMessage() );
- e.printStackTrace();
- output.collect( new Text("LOG:ERROR Analysis threw exception: "+e+"\n"+getStackTrace(e)), new Text(key+" "+tmpFile+" "+value));
- }
- }
- */
-
// Type according to Droid/Nanite:
droidType = "application/octet-stream";
try {
@@ -202,9 +166,8 @@ public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Te
droidType = Nanite.getMimeTypeFromResult(res);
}
} catch( Exception e ) {
- log.error("Exception on Nanite invocation: "+e);
+ log.error("Exception on DroidNanite invocation: "+e);
e.printStackTrace();
- output.collect( new Text("LOG:ERROR Droid threw exception: "+e+"\n"+getStackTrace(e)), new Text(wctID) );
}
// Return the output for collation:
@@ -221,11 +184,11 @@ private File copyToTempFile( String name, byte[] content, int max_bytes ) throws
return tmp;
}
- private static int BUF_8KB = 8*1024;
+ private static int MAX_BUF = 16*1024;
private File copyToTempFile( String name, byte[] content ) throws Exception {
//if( content.length < BUF_8KB )
- return copyToTempFile(name, content, BUF_8KB);
+ return copyToTempFile(name, content, MAX_BUF);
}
private static String getStackTrace(Throwable aThrowable) {
@@ -19,4 +19,9 @@
<level value="error"/>
</logger>
+ <!-- Increase logging from this code -->
+ <logger name="uk.bl.wap">
+ <level value="info"/>
+ </logger>
+
</log4j:configuration>

0 comments on commit 59b11d6

Please sign in to comment.