Commit 7ce483b

#70 add projection operators for predicate and subject, and internal object URIs (filter out if not rdf.basekb.com), and refactor UniqTool out to simplify development of various uniq tools
paulhoule committed Nov 12, 2013
1 parent 0fedc4f commit 7ce483b
Showing 9 changed files with 362 additions and 204 deletions.
1 change: 0 additions & 1 deletion .idea/encodings.xml


293 changes: 176 additions & 117 deletions .idea/workspace.xml

Large diffs are not rendered by default.

119 changes: 119 additions & 0 deletions bakemono/src/main/java/com/ontology2/bakemono/mapmap/UniqTool.java
@@ -0,0 +1,119 @@
package com.ontology2.bakemono.mapmap;

import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import com.ontology2.bakemono.Main;
import com.ontology2.bakemono.uniq.Uniq;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;

/**
 * Base class for uniq tools: configures and runs a map/reduce job whose
 * reducer is Uniq, so the output is the distinct set of keys emitted by
 * the mapper. Subclasses supply only the mapper class and the job name.
 */
public abstract class UniqTool implements Tool {

    protected abstract Class getMapperClass();
    protected abstract String getJobName();

    private Configuration conf;

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(Configuration arg0) {
        this.conf=arg0;
    }

    @Override
    public int run(String[] arg0) throws Exception {
        try {
            PeekingIterator<String> a= Iterators.peekingIterator(Iterators.forArray(arg0));
            Integer reduceTasks = parseRArgument(a);

            if (!a.hasNext())
                usage();

            String input=a.next();

            if (!a.hasNext())
                usage();

            String output=a.next();

            // compress the intermediate map output with gzip
            conf.set("mapred.compress.map.output", "true");
            conf.set("mapred.output.compression.type", "BLOCK");
            conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

            Job job=new Job(conf,getJobName());
            job.setSpeculativeExecution(false);
            job.setJarByClass(this.getClass());
            job.setMapperClass(getMapperClass());
            job.setReducerClass(Uniq.class);

            if(reduceTasks==null) {
                reduceTasks=29; // about right for AWS runs
            }

            job.setNumReduceTasks(reduceTasks);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            FileInputFormat.addInputPath(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

            // Gotcha -- this has to run before the definitions above associated with
            // the output format because this is going to be configured against the
            // job as it stands a moment from now

            job.setOutputFormatClass(TextOutputFormat.class);

            return job.waitForCompletion(true) ? 0 : 1;
        } catch(Main.IncorrectUsageException iue) {
            return 2;
        }
    }

    // consumes an optional leading "-r <count>" flag giving the number of
    // reduce tasks; returns null if the flag is absent
    public static Integer parseRArgument(PeekingIterator<String> a)
            throws Main.IncorrectUsageException {
        Integer reduceTasks=null;
        while(a.hasNext() && a.peek().startsWith("-")) {
            String flagName=a.next().substring(1);
            if (!a.hasNext())
                usage();

            String flagValue=a.next();
            if ("r".equals(flagName)) {
                reduceTasks=Integer.parseInt(flagValue);
            } else {
                usage();
            }
        }
        return reduceTasks;
    }

    private static void usage() throws Main.IncorrectUsageException {
        throw new Main.IncorrectUsageException("incorrect arguments");
    }

}
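
A hedged usage sketch, not part of this commit: run() expects [-r <reduceTasks>] <input> <output>, so a concrete subclass (such as the UniqURIObjectTool refactored below) can be driven through Hadoop's standard ToolRunner. The driver class name and the paths here are invented placeholders.

import com.ontology2.bakemono.mapmap.UniqURIObjectTool;
import org.apache.hadoop.util.ToolRunner;

public class RunUniqURIObjects {
    public static void main(String[] args) throws Exception {
        // ToolRunner supplies a Configuration via setConf(), then calls run()
        // with the remaining arguments: an optional -r flag, then input and output.
        int exitCode = ToolRunner.run(new UniqURIObjectTool(), new String[] {
                "-r", "29",                  // reduce tasks; run() defaults to 29 if omitted
                "input/triples",             // placeholder input path
                "output/uniqURIObjects"});   // placeholder output path
        System.exit(exitCode);               // 0 on success, 1 on job failure, 2 on bad usage
    }
}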
bakemono/src/main/java/com/ontology2/bakemono/mapmap/UniqURIObjectTool.java
@@ -15,92 +15,14 @@
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.Tool;
 
-public class UniqURIObjectTool implements Tool {
-    private Configuration conf;
-
-    @Override
-    public Configuration getConf() {
-        return this.conf;
-    }
-
+public class UniqURIObjectTool extends UniqTool {
     @Override
-    public void setConf(Configuration arg0) {
-        this.conf=arg0;
+    protected Class getMapperClass() {
+        return UniqueURIObjectMapper.class;
     }
 
     @Override
-    public int run(String[] arg0) throws Exception {
-        try {
-            PeekingIterator<String> a= Iterators.peekingIterator(Iterators.forArray(arg0));
-            Integer reduceTasks = parseRArgument(a);
-
-            if (!a.hasNext())
-                usage();
-
-            String input=a.next();
-
-            if (!a.hasNext())
-                usage();
-
-            String output=a.next();
-
-            conf.set("mapred.compress.map.output", "true");
-            conf.set("mapred.output.compression.type", "BLOCK");
-            conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
-
-            Job job=new Job(conf,"uniqURIobjects");
-            job.setSpeculativeExecution(false);
-            job.setJarByClass(UniqURIObjectTool.class);
-            job.setMapperClass(UniqueURIObjectMapper.class);
-            job.setReducerClass(Uniq.class);
-
-            if(reduceTasks==null) {
-                reduceTasks=29; // about right for AWS runs
-            }
-
-            job.setNumReduceTasks(reduceTasks);
-
-            job.setMapOutputKeyClass(Text.class);
-            job.setMapOutputValueClass(LongWritable.class);
-            job.setOutputKeyClass(Text.class);
-            job.setOutputValueClass(LongWritable.class);
-
-            FileInputFormat.addInputPath(job, new Path(input));
-            FileOutputFormat.setOutputPath(job, new Path(output));
-            FileOutputFormat.setCompressOutput(job, true);
-            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
-
-            // Gotcha -- this has to run before the definitions above associated with the output format because
-            // this is going to be configured against the job as it stands a moment from now
-
-            job.setOutputFormatClass(TextOutputFormat.class);
-
-            return job.waitForCompletion(true) ? 0 : 1;
-        } catch(Main.IncorrectUsageException iue) {
-            return 2;
-        }
+    protected String getJobName() {
+        return "uniqURIObjects";
     }
-
-    public static Integer parseRArgument(PeekingIterator<String> a)
-            throws Main.IncorrectUsageException {
-        Integer reduceTasks=null;
-        while(a.hasNext() && a.peek().startsWith("-")) {
-            String flagName=a.next().substring(1).intern();
-            if (!a.hasNext())
-                usage();
-
-            String flagValue=a.next();
-            if (flagName=="r") {
-                reduceTasks=Integer.parseInt(flagValue);
-            } else {
-                usage();
-            };
-        }
-        return reduceTasks;
-    }
-
-    private static void usage() throws Main.IncorrectUsageException {
-        throw new Main.IncorrectUsageException("incorrect arguments");
-    };
-
 }
bakemono/src/main/java/com/ontology2/bakemono/mapmap/UniqueURIObjectMapper.java
@@ -1,13 +1,13 @@
 package com.ontology2.bakemono.mapmap;
 
 import com.google.common.base.Function;
-import com.ontology2.bakemono.primitiveTriples.ExtractURIObject;
+import com.ontology2.bakemono.primitiveTriples.ProjectURIObject;
 import com.ontology2.bakemono.primitiveTriples.PrimitiveTriple;
 import org.apache.hadoop.io.Text;
 
 public class UniqueURIObjectMapper extends PTUniqueMapMapper<Text> {
 
-    private final Function<PrimitiveTriple,Text> primitiveTriple = new ExtractURIObject();
+    private final Function<PrimitiveTriple,Text> primitiveTriple = new ProjectURIObject();
 
     @Override
     Function<PrimitiveTriple, Text> getKeyFunction() {
bakemono/src/main/java/com/ontology2/bakemono/primitiveTriples/ExtractInternalURIObject.java
@@ -0,0 +1,23 @@
package com.ontology2.bakemono.primitiveTriples;

import com.google.common.base.Function;
import org.apache.hadoop.io.Text;

/**
 * Projects the object of a triple, keeping it only if it is an internal
 * URI, i.e. one in the <http://rdf.basekb.com/> namespace; returns null
 * for literals and external URIs so they are filtered out.
 */
public class ExtractInternalURIObject implements Function<PrimitiveTriple,Text> {
    @Override
    public Text apply(PrimitiveTriple t) {
        String o=t.getObject();
        if(o.startsWith("<http://rdf.basekb.com/") && o.endsWith(">")) {
            return new Text(o);
        }

        return null;
    }
}
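
A quick behavioral sketch, again hypothetical: this diff shows only PrimitiveTriple's getters, so the three-argument (subject, predicate, object) constructor used below is an assumption, and the URIs are made-up examples.

// Hypothetical usage; assumes a PrimitiveTriple(subject, predicate, object)
// constructor, which this diff does not show.
Function<PrimitiveTriple,Text> key = new ExtractInternalURIObject();

Text internal = key.apply(new PrimitiveTriple(
        "<http://rdf.basekb.com/ns/m.0abc>",
        "<http://rdf.basekb.com/ns/type.object.type>",
        "<http://rdf.basekb.com/ns/common.topic>"));
// internal holds "<http://rdf.basekb.com/ns/common.topic>" -- kept, rdf.basekb.com namespace

Text external = key.apply(new PrimitiveTriple(
        "<http://rdf.basekb.com/ns/m.0abc>",
        "<http://www.w3.org/2002/07/owl#sameAs>",
        "<http://dbpedia.org/resource/Example>"));
// external is null -- filtered out, the object is not an internal URI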
bakemono/src/main/java/com/ontology2/bakemono/primitiveTriples/ProjectPredicate.java
@@ -0,0 +1,18 @@
package com.ontology2.bakemono.primitiveTriples;

import com.google.common.base.Function;
import org.apache.hadoop.io.Text;

/**
 * Projects the predicate of a triple.
 */
public class ProjectPredicate implements Function<PrimitiveTriple,Text> {
    @Override
    public Text apply(PrimitiveTriple t) {
        return new Text(t.getPredicate());
    }
}
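
To see how the refactor pays off, here is a hypothetical predicate-counting tool built from the pieces this commit introduces; it is not part of the commit, the class and job names are invented, and PTUniqueMapMapper's package-private getKeyFunction() contract is taken from the UniqueURIObjectMapper diff above. In practice each class would live in its own file.

// Hypothetical sketch: a new uniq tool is just a key function plus two overrides.
package com.ontology2.bakemono.mapmap;

import com.google.common.base.Function;
import com.ontology2.bakemono.primitiveTriples.PrimitiveTriple;
import com.ontology2.bakemono.primitiveTriples.ProjectPredicate;
import org.apache.hadoop.io.Text;

class UniqPredicateMapper extends PTUniqueMapMapper<Text> {
    private final Function<PrimitiveTriple,Text> keyFunction = new ProjectPredicate();

    @Override
    Function<PrimitiveTriple, Text> getKeyFunction() {
        return keyFunction;
    }
}

class UniqPredicateTool extends UniqTool {
    @Override
    protected Class getMapperClass() {
        return UniqPredicateMapper.class;
    }

    @Override
    protected String getJobName() {
        return "uniqPredicates";
    }
}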
bakemono/src/main/java/com/ontology2/bakemono/primitiveTriples/ProjectSubject.java
@@ -0,0 +1,18 @@
package com.ontology2.bakemono.primitiveTriples;

import com.google.common.base.Function;
import org.apache.hadoop.io.Text;

/**
 * Projects the subject of a triple.
 */
public class ProjectSubject implements Function<PrimitiveTriple,Text> {
    @Override
    public Text apply(PrimitiveTriple t) {
        return new Text(t.getSubject());
    }
}
bakemono/src/main/java/com/ontology2/bakemono/primitiveTriples/ProjectURIObject.java (renamed from ExtractURIObject.java)
@@ -11,7 +11,7 @@
  * To change this template use File | Settings | File Templates.
  */
 
-public class ExtractURIObject implements Function<PrimitiveTriple,Text> {
+public class ProjectURIObject implements Function<PrimitiveTriple,Text> {
     @Override
     public Text apply(PrimitiveTriple t) {
         String o=t.getObject();
