Permalink
Browse files

Changes since version 0.1: 1) code refactoring, 2) bug fixes, 3) improved error handling, 4) new and improved scorers, 5) uses latest Stanford CoreNLP pipeline, 6) default named entity tagger now supports 7 types, instead of just 3.
  • Loading branch information...
1 parent f243e40 commit 209919b56f4359cd8cd2a1733f1a1262797233b1 Don Metzler committed Jan 20, 2012
Showing with 1,837 additions and 67 deletions.
  1. +2 −2 ivy/ivy.xml
  2. BIN ivy/local-repo/{stanford-corenlp-2011-09-14-models.jar → stanford-corenlp-2011-12-27-models.jar}
  3. BIN ivy/local-repo/{stanford-corenlp-2011-09-16.jar → stanford-corenlp-2012-01-08.jar}
  4. BIN models/all.3class.distsim.crf.ser.gz
  5. BIN models/muc.7class.distsim.crf.ser.gz
  6. +17 −7 src/edu/isi/mavuno/app/ie/HarvestSAPInstances.java
  7. +16 −7 src/edu/isi/mavuno/app/ie/HarvestUDAPInstances.java
  8. +1 −1 src/edu/isi/mavuno/app/nlp/ProcessStanfordNLP.java
  9. +29 −9 src/edu/isi/mavuno/app/nlp/TratzParse.java
  10. +2 −1 src/edu/isi/mavuno/extract/Extract.java
  11. +1 −1 src/edu/isi/mavuno/extract/NAryChunkExtractor.java
  12. +1 −1 src/edu/isi/mavuno/nlp/NLProcTools.java
  13. +2 −1 src/edu/isi/mavuno/score/FMeasureScorer.java
  14. +2 −1 src/edu/isi/mavuno/score/PMIScorer.java
  15. +2 −1 src/edu/isi/mavuno/score/TFIDFScorer.java
  16. +1 −1 src/edu/isi/mavuno/util/ContextPatternWritable.java
  17. +13 −0 src/edu/isi/mavuno/util/ContextPatternWritableScorePair.java
  18. +13 −0 src/edu/isi/mavuno/util/IdWeightPair.java
  19. +6 −1 src/edu/isi/mavuno/util/PairWritable.java
  20. +3 −0 src/edu/isi/mavuno/util/Relation.java
  21. +45 −1 src/edu/isi/mavuno/util/TextLongPairWritable.java
  22. +1 −1 src/edu/isi/mavuno/util/TokenWritable.java
  23. +3 −0 src/edu/isi/mavuno/util/TypedTextSpan.java
  24. +562 −0 src/edu/umd/cloud9/io/Tuple.java
  25. +30 −31 src/edu/{isi/mavuno/util/TextTextPairWritable.java → umd/cloud9/io/TupleException.java}
  26. +148 −0 src/edu/umd/cloud9/io/array/ArrayListOfDoublesWritable.java
  27. +147 −0 src/edu/umd/cloud9/io/array/ArrayListOfFloatsWritable.java
  28. +159 −0 src/edu/umd/cloud9/io/array/ArrayListOfIntsWritable.java
  29. +157 −0 src/edu/umd/cloud9/io/array/ArrayListOfLongsWritable.java
  30. +156 −0 src/edu/umd/cloud9/io/array/ArrayListOfShortsWritable.java
  31. +126 −0 src/edu/umd/cloud9/io/array/ArrayListWritable.java
  32. +192 −0 src/edu/umd/cloud9/io/array/ArrayListWritableComparable.java
View
@@ -17,8 +17,8 @@
<dependency org="log4j" name="log4j" rev="1.2.16" />
<!-- Stanford Core NLP -->
- <dependency org="edu.stanford" name="stanford-corenlp" rev="2011-09-16" />
- <dependency org="edu.stanford" name="stanford-corenlp" rev="2011-09-14-models" />
+ <dependency org="edu.stanford" name="stanford-corenlp" rev="2012-01-08" />
+ <dependency org="edu.stanford" name="stanford-corenlp" rev="2011-12-27-models" />
<dependency org="joda-time" name="joda-time" rev="2.0" />
<dependency org="xom" name="xom" rev="1.2.5" />
Binary file not shown.
Binary file not shown.
@@ -69,18 +69,28 @@ public HarvestSAPInstances(Configuration conf) {
@Override
public void setup(Mapper<Writable, Indexable, Text, LongWritable>.Context context) throws IOException {
+ // initialize WordNet (needed by POS tagger)
try {
- // initialize POS tagger
- mTextUtils.initializePOSTagger();
-
- // initialize WordNet (needed by POS tagger)
mTextUtils.initializeWordNet();
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing WordNet instance -- " + e);
+ }
+
+ // initialize POS tagger
+ try {
+ mTextUtils.initializePOSTagger();
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing POS tagger -- " + e);
+ }
- // initialize named entity tagger
+ // initialize named entity tagger
+ try {
mTextUtils.initializeNETagger();
}
- catch(ClassNotFoundException e) {
- throw new RuntimeException(e);
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing named entity tagger -- " + e);
}
}
@@ -78,20 +78,29 @@ public HarvestUDAPInstances(Configuration conf) {
@Override
public void setup(Mapper<Writable, Indexable, Text, Text>.Context context) throws IOException {
+ // initialize WordNet (needed by POS tagger)
try {
- // initialize WordNet (needed by POS tagger)
mTextUtils.initializeWordNet();
-
- // initialize POS tagger
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing WordNet instance -- " + e);
+ }
+
+ // initialize POS tagger
+ try {
mTextUtils.initializePOSTagger();
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing POS tagger -- " + e);
+ }
- // initialize named entity tagger
+ // initialize named entity tagger
+ try {
mTextUtils.initializeNETagger();
}
- catch(ClassNotFoundException e) {
- throw new RuntimeException(e);
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing named entity tagger -- " + e);
}
-
}
@Override
@@ -61,7 +61,7 @@
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
-import edu.stanford.nlp.ling.CorefCoreAnnotations.CorefClusterIdAnnotation;
+import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefClusterIdAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
@@ -89,24 +89,44 @@ public TratzParse(Configuration conf) {
@Override
public void setup(Mapper<Writable, Indexable, Text, TratzParsedDocument>.Context context) throws IOException {
+ // initialize WordNet (needed by POS tagger)
try {
- // initialize POS tagger
- mTextUtils.initializePOSTagger();
-
- // initialize WordNet (needed by POS tagger)
mTextUtils.initializeWordNet();
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing WordNet instance -- " + e);
+ }
+
+ // initialize POS tagger
+ try {
+ mTextUtils.initializePOSTagger();
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing POS tagger -- " + e);
+ }
- // initialize chunker
+ // initialize chunker
+ try {
mTextUtils.initializeChunker();
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing chunker -- " + e);
+ }
- // initialize named entity tagger
+ // initialize named entity tagger
+ try {
mTextUtils.initializeNETagger();
+ }
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing named entity tagger -- " + e);
+ }
- // initialize parser
+ // initialize parser
+ try {
mTextUtils.initializeTratzParser();
}
- catch(ClassNotFoundException e) {
- throw new RuntimeException(e);
+ catch(Exception e) {
+ throw new RuntimeException("Error initializing Tratz parser -- " + e);
}
}
@@ -351,7 +351,6 @@ public int run() throws ClassNotFoundException, InterruptedException, IOExceptio
FileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
- job.setMapOutputKeyClass(ContextPatternWritable.class);
if("pattern".equals(extractorTarget)) {
job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
@@ -363,6 +362,8 @@ else if("context".equals(extractorTarget)) {
else {
throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget);
}
+
+ job.setMapOutputKeyClass(ContextPatternWritable.class);
job.setMapOutputValueClass(ContextPatternStatsWritable.class);
job.setOutputKeyClass(ContextPatternWritable.class);
@@ -321,7 +321,7 @@ private Chunk createChunk(List<TratzParsedTokenWritable> terms) {
return chunk;
}
- public class Chunk {
+ public static class Chunk {
public final Text text = new Text();
public final Text type = new Text();
}
@@ -71,7 +71,7 @@
public static final int MAX_SENTENCE_LENGTH = 100;
public static final String DEFAULT_CHUNKER_MODEL = "models/en-chunker.bin";
- public static final String DEFAULT_NER_MODEL = "models/all.3class.distsim.crf.ser.gz";
+ public static final String DEFAULT_NER_MODEL = "models/muc.7class.distsim.crf.ser.gz";
public static final String DEFAULT_SENTENCE_MODEL = "models/en-sent.bin";
public static final String DEFAULT_TRATZ_POS_MODEL = "models/tratzPosTaggingModel.gz";
public static final String DEFAULT_TRATZ_PARSER_MODEL = "models/tratzParseModel.gz";
@@ -17,6 +17,7 @@
package edu.isi.mavuno.score;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
@@ -44,7 +45,7 @@ public void setup(Configuration conf) throws IOException {
String [] params = paramSpec.split(":");
if(params.length != 1) {
- throw new RuntimeException("Invalid FMeasureScorer arguments --" + params);
+ throw new RuntimeException("Invalid FMeasureScorer arguments --" + Arrays.toString(params));
}
mLambda = Float.parseFloat(params[0]);
@@ -17,6 +17,7 @@
package edu.isi.mavuno.score;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
@@ -46,7 +47,7 @@ public void setup(Configuration conf) throws IOException {
String [] params = paramSpec.split(":");
if(params.length == 0 || params.length > 2) {
- throw new RuntimeException("Invalid PMIScorer arguments --" + params);
+ throw new RuntimeException("Invalid PMIScorer arguments --" + Arrays.toString(params));
}
String weightType = params[0].toLowerCase().trim();
@@ -17,6 +17,7 @@
package edu.isi.mavuno.score;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
@@ -83,7 +84,7 @@ else if("okapi".equals(idfType)) {
}
}
else {
- throw new RuntimeException("Invalid TFIDFScorer arguments --" + params);
+ throw new RuntimeException("Invalid TFIDFScorer arguments --" + Arrays.toString(params));
}
}
}
@@ -41,7 +41,7 @@
public static final int EOW_BYTES_LENGTH = EOW.getLength();
public static final Text ASTERISK = new Text("\u0003");
- public static final String ASTERISK_STRING = new String("\u0003");
+ public static final String ASTERISK_STRING = "\u0003";
public static final Text TAB = new Text("\t");
public static final byte [] TAB_BYTES = TAB.getBytes();
@@ -44,4 +44,17 @@ else if(this.score.get() < score) {
return 0;
}
}
+
+ @Override
+ public boolean equals(Object o) {
+ if(o == null || !(o instanceof ContextPatternWritableScorePair)) {
+ return false;
+ }
+ return compareTo((ContextPatternWritableScorePair)o) == 0;
+ }
+
+ @Override
+ public int hashCode() {
+ return Double.valueOf(this.score.get()).hashCode();
+ }
}
@@ -46,6 +46,19 @@ else if(this.weight < w) {
}
@Override
+ public boolean equals(Object o) {
+ if(o == null || !(o instanceof IdWeightPair)) {
+ return false;
+ }
+ return compareTo((IdWeightPair)o) == 0;
+ }
+
+ @Override
+ public int hashCode() {
+ return Float.valueOf(weight).hashCode();
+ }
+
+ @Override
public String toString() {
return "[id=" + id + ", weight=" + weight + "]";
}
@@ -21,17 +21,19 @@
import java.io.IOException;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
/**
* @author metzler
*
*/
-public class PairWritable<A extends Writable, B extends Writable> implements Writable {
+public abstract class PairWritable<A extends Writable, B extends Writable> implements WritableComparable<PairWritable<A,B>> {
public A left;
public B right;
public PairWritable() {
+ super();
}
public PairWritable(A left, B right) {
@@ -61,4 +63,7 @@ public void write(DataOutput out) throws IOException {
public String toString() {
return left.toString() + "\t" + right.toString();
}
+
+ @Override
+ public abstract int compareTo(PairWritable<A, B> obj);
}
@@ -85,6 +85,9 @@ public int hashCode() {
@Override
public boolean equals(Object o) {
+ if(o == null) {
+ return false;
+ }
return toString().compareTo(o.toString()) == 0;
}
@@ -16,17 +16,61 @@
package edu.isi.mavuno.util;
+import java.io.IOException;
+
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.io.WritableUtils;
/**
* @author metzler
*
*/
public class TextLongPairWritable extends PairWritable<Text, LongWritable> {
-
+
public TextLongPairWritable() {
super(new Text(), new LongWritable());
}
+
+ /**
+  * Orders pairs by the left (Text) component first and breaks ties on the
+  * right (long) component.
+  *
+  * @param obj the pair to compare against
+  * @return a negative value, zero, or a positive value as this pair sorts
+  *         before, equal to, or after {@code obj}
+  */
+ @Override
+ public int compareTo(PairWritable<Text, LongWritable> obj) {
+     int cmp = this.left.compareTo(obj.left);
+     if(cmp != 0) {
+         return cmp;
+     }
+     // Compare the longs explicitly: casting their difference to int can
+     // overflow or truncate, yielding the wrong sign or a false tie.
+     long lhs = this.right.get();
+     long rhs = obj.right.get();
+     return lhs < rhs ? -1 : (lhs == rhs ? 0 : 1);
+ }
+
+ public static final class Comparator extends WritableComparator {
+
+ public Comparator() {
+ super(TextLongPairWritable.class);
+ }
+
+ /**
+  * Compares two serialized records without deserializing them: first the
+  * length-prefixed leading bytes, then, on a tie, the long that follows.
+  *
+  * @param b1 buffer holding the first record
+  * @param s1 offset of the first record within {@code b1}
+  * @param l1 length of the first record (unused)
+  * @param b2 buffer holding the second record
+  * @param s2 offset of the second record within {@code b2}
+  * @param l2 length of the second record (unused)
+  * @return a negative value, zero, or a positive value as the first record
+  *         sorts before, equal to, or after the second
+  * @throws RuntimeException if the vint length prefix cannot be read
+  */
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+     try {
+         // each record begins with a vint length followed by that many bytes
+         int n1 = WritableUtils.decodeVIntSize(b1[s1]);
+         int n2 = WritableUtils.decodeVIntSize(b2[s2]);
+         int len1 = WritableComparator.readVInt(b1, s1);
+         int len2 = WritableComparator.readVInt(b2, s2);
+         int cmp = WritableComparator.compareBytes(b1, s1+n1, len1, b2, s2+n2, len2);
+         if(cmp != 0) {
+             return cmp;
+         }
+         // NOTE(review): the long is read one byte past the end of the leading
+         // bytes -- confirm this offset matches what PairWritable.write emits.
+         long x1 = WritableComparator.readLong(b1, s1+n1+len1+1);
+         long x2 = WritableComparator.readLong(b2, s2+n2+len2+1);
+         // explicit three-way comparison; (int)(x1 - x2) can overflow or
+         // truncate and report the wrong ordering
+         return x1 < x2 ? -1 : (x1 == x2 ? 0 : 1);
+     }
+     catch(IOException e) {
+         throw new RuntimeException(e);
+     }
+ }
+
+ }
+ static {
+ WritableComparator.define(TextLongPairWritable.class, new Comparator());
+ }
}
@@ -29,7 +29,7 @@
*/
public class TokenWritable implements Writable {
- public static final String SEP = new String("\t");
+ public static final String SEP = "\t";
// token
private final Text mToken = new Text();
Oops, something went wrong.

0 comments on commit 209919b

Please sign in to comment.