Permalink
Browse files

1) Updated build process (now relies on Maven/Ivy for resolving and d…

…ownloading dependencies), 2) Eclipse project now makes use of Apache IvyDE plugin for managed dependencies, 3) Added FMeasureScorer, 4) Misc. code refactoring.
  • Loading branch information...
Don Metzler
Don Metzler committed Nov 13, 2011
1 parent 4c82acf commit f243e404250412d75af93e1c30180b3d47268dd7
Showing with 13,724 additions and 108 deletions.
  1. +3 −10 .classpath
  2. +2 −11 .settings/org.eclipse.jdt.core.prefs
  3. +5 −0 .settings/org.eclipse.m2e.core.prefs
  4. +127 −39 build.xml
  5. +32 −0 ivy/ivy.xml
  6. +15 −0 ivy/ivysettings.xml
  7. BIN {lib → ivy/local-repo}/fanseparser-0.2.2.jar
  8. BIN {lib → ivy/local-repo}/joda-time.jar
  9. BIN {lib → ivy/local-repo}/stanford-corenlp-2011-09-14-models.jar
  10. BIN {lib → ivy/local-repo}/stanford-corenlp-2011-09-16.jar
  11. BIN lib/cloud9-1.2.5.jar
  12. BIN lib/commons-cli-1.2.jar
  13. BIN lib/commons-logging-1.1.jar
  14. BIN lib/hadoop-0.20.2-core.jar
  15. BIN lib/log4j-1.2.13.jar
  16. BIN lib/opennlp-maxent-3.0.1-incubating.jar
  17. BIN lib/opennlp-tools-1.5.1-incubating.jar
  18. BIN lib/xom.jar
  19. +18 −0 pom.xml
  20. +18 −4 src/edu/isi/mavuno/app/ie/ExtractRelations.java
  21. +1 −1 src/edu/isi/mavuno/app/ie/HarvestSAPInstances.java
  22. +1 −1 src/edu/isi/mavuno/app/ie/HarvestUDAPInstances.java
  23. +1 −1 src/edu/isi/mavuno/app/mine/HarvestContextPatternPairs.java
  24. +1 −1 src/edu/isi/mavuno/app/mine/HarvestSentences.java
  25. +1 −1 src/edu/isi/mavuno/app/nlp/HarvestParseGraph.java
  26. +1 −1 src/edu/isi/mavuno/app/nlp/ProcessStanfordNLP.java
  27. +1 −1 src/edu/isi/mavuno/app/nlp/TratzParse.java
  28. +1 −1 src/edu/isi/mavuno/extract/ChunkExtractor.java
  29. +1 −1 src/edu/isi/mavuno/extract/CooccurExtractor.java
  30. +1 −1 src/edu/isi/mavuno/extract/DIRTExtractor.java
  31. +27 −3 src/edu/isi/mavuno/extract/Extract.java
  32. +1 −1 src/edu/isi/mavuno/extract/ExtractGlobalStats.java
  33. +1 −1 src/edu/isi/mavuno/extract/Extractor.java
  34. +1 −1 src/edu/isi/mavuno/extract/MultiExtractor.java
  35. +1 −1 src/edu/isi/mavuno/extract/NAryChunkExtractor.java
  36. +1 −1 src/edu/isi/mavuno/extract/NGramExtractor.java
  37. +1 −1 src/edu/isi/mavuno/extract/PassageExtractor.java
  38. +1 −1 src/edu/isi/mavuno/extract/TwitterCooccurExtractor.java
  39. +1 −1 src/edu/isi/mavuno/extract/TwitterGeoTemporalExtractor.java
  40. +0 −1 src/edu/isi/mavuno/input/ClueWarcInputFormat.java
  41. +643 −0 src/edu/isi/mavuno/input/ClueWarcRecord.java
  42. +58 −0 src/edu/isi/mavuno/input/Indexable.java
  43. +0 −1 src/edu/isi/mavuno/input/IndexableFileInputFormat.java
  44. +0 −1 src/edu/isi/mavuno/input/Passagifiable.java
  45. +0 −1 src/edu/isi/mavuno/input/SentenceSegmentedDocument.java
  46. +0 −1 src/edu/isi/mavuno/input/TextDocument.java
  47. +0 −1 src/edu/isi/mavuno/input/TrecDocument.java
  48. +0 −1 src/edu/isi/mavuno/input/TwitterDocument.java
  49. +72 −0 src/edu/isi/mavuno/score/FMeasureScorer.java
  50. +1 −1 src/edu/isi/mavuno/score/PMIScorer.java
  51. +2 −2 src/edu/isi/mavuno/score/ScoreContexts.java
  52. +9 −9 src/edu/isi/mavuno/score/ScorePatterns.java
  53. +10 −0 src/edu/isi/mavuno/score/Scorer.java
  54. +6 −4 src/edu/isi/mavuno/util/SentenceWritable.java
  55. +305 −0 src/edu/umd/cloud9/util/array/ArrayListOfDoubles.java
  56. +305 −0 src/edu/umd/cloud9/util/array/ArrayListOfFloats.java
  57. +400 −0 src/edu/umd/cloud9/util/array/ArrayListOfInts.java
  58. +389 −0 src/edu/umd/cloud9/util/array/ArrayListOfLongs.java
  59. +390 −0 src/edu/umd/cloud9/util/array/ArrayListOfShorts.java
  60. +974 −0 src/edu/umd/cloud9/util/map/HMapIF.java
  61. +963 −0 src/edu/umd/cloud9/util/map/HMapII.java
  62. +970 −0 src/edu/umd/cloud9/util/map/HMapIL.java
  63. +970 −0 src/edu/umd/cloud9/util/map/HMapIS.java
  64. +896 −0 src/edu/umd/cloud9/util/map/HMapIV.java
  65. +1,033 −0 src/edu/umd/cloud9/util/map/HMapKF.java
  66. +1,073 −0 src/edu/umd/cloud9/util/map/HMapKI.java
  67. +905 −0 src/edu/umd/cloud9/util/map/HMapKL.java
  68. +905 −0 src/edu/umd/cloud9/util/map/HMapKS.java
  69. +247 −0 src/edu/umd/cloud9/util/map/MapIF.java
  70. +247 −0 src/edu/umd/cloud9/util/map/MapII.java
  71. +231 −0 src/edu/umd/cloud9/util/map/MapIL.java
  72. +231 −0 src/edu/umd/cloud9/util/map/MapIS.java
  73. +243 −0 src/edu/umd/cloud9/util/map/MapIV.java
  74. +247 −0 src/edu/umd/cloud9/util/map/MapKF.java
  75. +247 −0 src/edu/umd/cloud9/util/map/MapKI.java
  76. +247 −0 src/edu/umd/cloud9/util/map/MapKL.java
  77. +240 −0 src/edu/umd/cloud9/util/map/MapKS.java
View
@@ -1,14 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
- <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
- <classpathentry kind="lib" path="lib/cloud9-1.2.5.jar"/>
- <classpathentry kind="lib" path="lib/commons-logging-1.1.jar"/>
- <classpathentry kind="lib" path="lib/fanseparser-0.2.2.jar"/>
- <classpathentry kind="lib" path="lib/hadoop-0.20.2-core.jar"/>
- <classpathentry kind="lib" path="lib/log4j-1.2.13.jar"/>
- <classpathentry kind="lib" path="lib/opennlp-maxent-3.0.1-incubating.jar"/>
- <classpathentry kind="lib" path="lib/opennlp-tools-1.5.1-incubating.jar"/>
- <classpathentry kind="lib" path="lib/stanford-corenlp-2011-09-16.jar"/>
- <classpathentry kind="output" path="bin"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5"/>
+ <classpathentry kind="con" path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?ivyXmlPath=ivy%2Fivy.xml&amp;confs=*&amp;ivySettingsPath=%24%7Bworkspace_loc%3Amavuno%2Fivy%2Fivysettings.xml%7D&amp;loadSettingsOnDemand=false&amp;propertyFiles="/>
+ <classpathentry kind="output" path="build"/>
</classpath>
@@ -1,12 +1,3 @@
-#Fri Sep 03 11:40:04 PDT 2010
+#Sat Nov 12 17:22:21 PST 2011
eclipse.preferences.version=1
-org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
-org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
-org.eclipse.jdt.core.compiler.compliance=1.6
-org.eclipse.jdt.core.compiler.debug.lineNumber=generate
-org.eclipse.jdt.core.compiler.debug.localVariable=generate
-org.eclipse.jdt.core.compiler.debug.sourceFile=generate
-org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
-org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
-org.eclipse.jdt.core.compiler.source=1.6
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
@@ -0,0 +1,5 @@
+#Tue Nov 08 10:55:37 PST 2011
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
View
166 build.xml
@@ -1,55 +1,141 @@
-<project name="mavuno" default="jar" basedir=".">
+<project name="mavuno" default="jar" xmlns:ivy="antlib:org.apache.ivy.ant"
+ xmlns:artifact="antlib:org.apache.maven.artifact.ant" basedir=".">
<description>Mavuno build file.</description>
-
- <!-- set global properties for this build -->
+
+ <property name="lib.dir" value="lib" />
+ <property name="build.dir" value="build" />
+ <property name="src.dir" value="src" />
+ <property name="dist.dir" value="dist" />
+ <property name="javadoc.dir" location="docs/api/" />
+
<property name="version" value="0.1" />
- <property name="src" location="src"/>
- <property name="build" location="build"/>
- <property name="models" location="models" />
- <property name="lib" location="lib" />
- <property name="docs" location="docs" />
-
- <path id="classpath">
- <fileset dir="lib">
- <include name="**/*.jar"/>
- </fileset>
+
+ <!-- paths used for compilation and run -->
+ <path id="lib.path.id">
+ <fileset dir="${lib.dir}" />
</path>
+ <path id="run.path.id">
+ <path refid="lib.path.id" />
+ <path location="${build.dir}" />
+ </path>
+
+ <property name="maven.ant.task.version" value="2.1.2" />
+ <property name="maven.jar.dir" value="${basedir}/${lib.dir}" />
+ <property name="maven.jar.file" value="${maven.jar.dir}/maven.jar" />
+
+ <property name="ivy.install.version" value="2.2.0" />
+ <property name="ivy.jar.dir" value="${basedir}/ivy" />
+ <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+ <property name="ivy.settings.file" value="${basedir}/ivy/ivysettings.xml" />
+ <property name="ivy.dep.file" value="${basedir}/ivy/ivy.xml" />
+
+ <!-- define Maven coordinates, repository url and artifacts name etc -->
+ <property name="groupId" value="edu.isi" />
+ <property name="artifactId" value="mavuno" />
+ <property name="version" value="${version}" />
<target name="init">
- <tstamp/>
- <mkdir dir="${build}"/>
+ <tstamp />
+ <condition property="platform" value="unix">
+ <os family="unix" />
+ </condition>
+ <condition property="platform" value="unix">
+ <os family="mac" />
+ </condition>
+ <condition property="platform" value="windows">
+ <os family="windows" />
+ </condition>
+ <mkdir dir="${build.dir}" />
+ <mkdir dir="${lib.dir}" />
+ <mkdir dir="${dist.dir}" />
+ </target>
+
+ <!-- download Ivy from web site so that it can be used even without any
+ special installation -->
+ <target name="download-ivy" unless="skip.download">
+ <echo message="installing ivy..." />
+ <get
+ src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+ dest="${ivy.jar.file}" usetimestamp="true" />
+ </target>
+
+ <!-- try to load ivy here from local ivy dir, in case the user has not already
+ dropped it into ant's lib dir (note that the latter copy will always take
+ precedence). We will not fail as long as local lib dir exists (it may be
+ empty) and ivy is in at least one of ant's lib dir or the local lib dir. -->
+ <target name="install-ivy" depends="download-ivy" description="--> install ivy">
+ <path id="ivy.lib.path">
+ <fileset dir="${ivy.jar.dir}" includes="*.jar" />
+ </path>
+ <taskdef resource="org/apache/ivy/ant/antlib.xml" uri="antlib:org.apache.ivy.ant"
+ classpathref="ivy.lib.path" />
+ </target>
+
+ <!-- download maven from web site so that it can be used even without any
+ special installation -->
+ <target name="download-maven" depends="init" unless="skip.download">
+ <mkdir dir="${maven.jar.dir}" />
+ <echo message="installing maven ant tasks..." />
+ <get
+ src="http://repo1.maven.org/maven2/org/apache/maven/maven-ant-tasks/${maven.ant.task.version}/maven-ant-tasks-${maven.ant.task.version}.jar"
+ dest="${maven.jar.file}" usetimestamp="true" />
+ </target>
+
+ <!-- try to load the maven ant tasks here from the local lib dir, in case the user has
+ not already dropped them into ant's lib dir (note that the latter copy will always take
+ precedence). We will not fail as long as local lib dir exists (it may be empty) and the
+ maven ant tasks are in at least one of ant's lib dir or the local lib dir. -->
+ <target name="install-maven" depends="download-maven"
+ description="--> install ivy">
+ <path id="maven.lib.path">
+ <fileset dir="${maven.jar.dir}" includes="*.jar" />
+ </path>
+ <taskdef uri="antlib:org.apache.maven.artifact.ant" classpathref="maven.lib.path" />
+ </target>
+
+ <target name="resolve" depends="install-ivy"
+ description="--> retreive dependencies with ivy" unless="skip.resolve">
+ <ivy:retrieve />
+ </target>
+
+ <target name="report" depends="resolve"
+ description="--> generates a report of dependencies">
+ <ivy:report todir="${build.dir}" />
</target>
- <target name="compile" depends="init" description="compile the source " >
- <javac classpathref="classpath" srcdir="${src}" destdir="${build}"/>
+ <target name="compile" depends="init,resolve" description="compile the source ">
+ <javac classpathref="lib.path.id" srcdir="${src.dir}" destdir="${build.dir}" />
</target>
<target name="jar" depends="compile" description="builds the jar">
- <jar jarfile="mavuno-${version}.jar" basedir="${build}" />
+ <jar jarfile="mavuno-${version}.jar" basedir="${build.dir}" />
<jar jarfile="mavuno-${version}-models.jar">
<fileset dir=".">
<include name="models/**" />
<include name="wordnet/**" />
</fileset>
</jar>
</target>
-
- <target name="javadoc">
- <javadoc destdir="${docs}" access="public" use="false" notree="false" nonavbar="false" noindex="false" splitindex="no" author="true" version="true" nodeprecatedlist="true" nodeprecated="true" classpathref="classpath">
- <fileset dir="${src}">
- <include name="**/*.java" />
- <exclude name="ivory/**/*.java" />
- <exclude name="org/**/*.java" />
- </fileset>
- <link href="http://download.oracle.com/javase/6/docs/api/" />
- <link href="http://hadoop.apache.org/common/docs/current/api/" />
- <link href="http://www.umiacs.umd.edu/~jimmylin/Cloud9/docs/api/" />
- <link href="http://nlp.stanford.edu/nlp/javadoc/javanlp/" />
- </javadoc>
- </target>
-
- <target name="dist" depends="clean,jar,javadoc" description="generate the distribution" >
+
+ <target name="javadoc">
+ <javadoc destdir="${javadoc.dir}" access="public" use="false"
+ notree="false" nonavbar="false" noindex="false" splitindex="no"
+ author="true" version="true" nodeprecatedlist="true" nodeprecated="true"
+ classpathref="lib.path.id">
+ <fileset dir="${src.dir}">
+ <include name="**/*.java" />
+ <exclude name="ivory/**/*.java" />
+ <exclude name="org/**/*.java" />
+ </fileset>
+ <link href="http://download.oracle.com/javase/6/docs/api/" />
+ <link href="http://hadoop.apache.org/common/docs/current/api/" />
+ <link href="http://www.umiacs.umd.edu/~jimmylin/Cloud9/docs/api/" />
+ <link href="http://nlp.stanford.edu/nlp/javadoc/javanlp/" />
+ </javadoc>
+ </target>
+
+ <target name="dist" depends="clean,jar,javadoc" description="generate the distribution">
<tar destfile="mavuno-${version}.tar.gz">
<fileset dir=".">
<exclude name="build/**" />
@@ -64,14 +150,16 @@
</fileset>
</zip>
</target>
-
- <target name="clean" description="clean up" >
- <delete dir="${build}"/>
- <delete dir="${docs}"/>
+
+ <target name="clean" description="clean up">
+ <delete dir="${lib.dir}" />
+ <delete dir="${build.dir}" />
+ <delete dir="${dist.dir}" />
+ <delete dir="${javadoc.dir}" />
<delete file="mavuno-${version}.jar" />
<delete file="mavuno-${version}-models.jar" />
<delete file="mavuno-${version}.zip" />
- <delete file="mavuno-${version}.tar.gz" />
+ <delete file="mavuno-${version}.tar.gz" />
</target>
</project>
View
@@ -0,0 +1,32 @@
+<ivy-module version="2.0">
+ <info organisation="edu.isi" module="mavuno" revision="${version}"/>
+ <publications>
+ <artifact type="pom" ext="pom" conf="default"/>
+ <artifact type="jar" ext="jar" conf="default"/>
+ </publications>
+
+ <dependencies>
+
+ <!-- Hadoop -->
+ <dependency org="org.apache.hadoop" name="hadoop-core" rev="0.20.2" />
+
+ <!-- Google Guava -->
+ <dependency org="com.google.guava" name="guava" rev="r09" />
+
+ <!-- Log4j -->
+ <dependency org="log4j" name="log4j" rev="1.2.16" />
+
+ <!-- Stanford Core NLP -->
+ <dependency org="edu.stanford" name="stanford-corenlp" rev="2011-09-16" />
+ <dependency org="edu.stanford" name="stanford-corenlp" rev="2011-09-14-models" />
+ <dependency org="joda-time" name="joda-time" rev="2.0" />
+ <dependency org="xom" name="xom" rev="1.2.5" />
+
+ <!-- FANSE Parser -->
+ <dependency org="edu.isi" name="fanseparser" rev="0.2.2" />
+
+ <!-- OpenNLP -->
+ <dependency org="org.apache.opennlp" name="opennlp-tools" rev="1.5.1-incubating" />
+
+ </dependencies>
+</ivy-module>
View
@@ -0,0 +1,15 @@
+<ivysettings>
+ <settings defaultResolver="chain-resolver"/>
+ <resolvers>
+ <chain name="chain-resolver" returnFirst="true">
+ <filesystem name="libraries">
+ <artifact pattern="${basedir}/ivy/local-repo/[artifact]-[revision].[ext]" />
+ </filesystem>
+
+ <ibiblio name="ibiblio" m2compatible="true" />
+
+ <!-- jwnl -->
+ <ibiblio name="opennlp" m2compatible="true" root="http://opennlp.sourceforge.net/maven2/" />
+ </chain>
+ </resolvers>
+</ivysettings>
File renamed without changes.
File renamed without changes.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
Binary file not shown.
Binary file not shown.
View
Binary file not shown.
View
18 pom.xml
@@ -0,0 +1,18 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>edu.isi</groupId>
+ <artifactId>mavuno</artifactId>
+ <version>0.1</version>
+ <name>Mavuno</name>
+ <description>Mavuno: A Hadoop-Based Text Mining Toolkit</description>
+ <url>http://mavuno.isi.edu</url>
+ <organization>
+ <name>University of Southern California - Information Sciences Institute</name>
+ <url>http://www.isi.edu</url>
+ </organization>
+ <scm>
+ <url>scm:git@github.com:metzlerd/mavuno.git</url>
+ <connection>scm:git@github.com:metzlerd/mavuno.git</connection>
+ <developerConnection>scm:git@github.com:metzlerd/mavuno.git</developerConnection>
+ </scm>
+</project>
@@ -360,6 +360,7 @@ public void map(Writable key, SentenceSegmentedDocument<TratzParsedTokenWritable
// skip if we can't find an alignment for some reason
if(argOffset == -1) {
+ sLogger.warn("Can't find alignment for: " + arg + " in sentence: " + sentenceText);
continue;
}
@@ -406,14 +407,15 @@ public void map(Writable key, SentenceSegmentedDocument<TratzParsedTokenWritable
Text [] expectedNames = mArgNames.get(pair.id);
Text [] expectedTypes = mArgTypes.get(pair.id);
Text [] expectedClasses = mArgClasses.get(pair.id);
-
- // skip this pair if we're missing name and/or type information
+
+ // uh oh, we're missing name and/or type information
if(expectedNames == null || expectedTypes == null || expectedClasses == null) {
- continue;
+ throw new RuntimeException("Missing name, type, and/or class information for: " + pair);
}
// perform length count checking
if(expectedClasses.length != expectedNames.length || expectedNames.length != expectedTypes.length || expectedTypes.length != allArgClasses.size()) {
+ sLogger.warn("Argument length mismatch for: " + pair + " -- skipping!");
continue;
}
@@ -694,6 +696,7 @@ private IntPair getChunkSpan(SentenceWritable<TratzParsedTokenWritable> tokens,
return new IntPair(beginPos, tokens.getNumTokens() - beginPos);
}
+ // TODO: there has to be a better way...
private int getOffset(Text text, String sentence) {
String paddedText = " " + text + " ";
@@ -702,6 +705,17 @@ private int getOffset(Text text, String sentence) {
if(sentence.charAt(i) == ' ') {
offset++;
}
+
+ if(i == 0) {
+ paddedText = text + " ";
+ }
+ else if(i == 1) {
+ paddedText = " " + text + " ";
+ }
+ else if(i == sentence.length() - text.getLength() - 1) {
+ paddedText = " " + text;
+ }
+
if(sentence.regionMatches(i, paddedText, 0, paddedText.length())) {
return offset;
}
@@ -775,7 +789,7 @@ private int getLength(Text text) {
for(int i = offset; i < offset + length; i++) {
tokens[i-offset] = new Text(sentence.getTokenAt(i).getToken().toString());
}
-
+
Set<IdWeightPair> matches = new HashSet<IdWeightPair>();
Text pattern = new Text();
@@ -41,11 +41,11 @@
import org.apache.log4j.Logger;
import tratz.parse.types.Token;
+import edu.isi.mavuno.input.Indexable;
import edu.isi.mavuno.nlp.NLProcTools;
import edu.isi.mavuno.util.MavunoUtils;
import edu.isi.mavuno.util.TratzParsedTokenWritable;
import edu.stanford.nlp.ling.Word;
-import edu.umd.cloud9.collection.Indexable;
/**
* @author metzler
@@ -40,12 +40,12 @@
import org.apache.log4j.Logger;
import tratz.parse.types.Token;
+import edu.isi.mavuno.input.Indexable;
import edu.isi.mavuno.nlp.NLProcTools;
import edu.isi.mavuno.util.ContextPatternWritable;
import edu.isi.mavuno.util.MavunoUtils;
import edu.isi.mavuno.util.TratzParsedTokenWritable;
import edu.stanford.nlp.ling.Word;
-import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.util.map.HMapKF;
import edu.umd.cloud9.util.map.HMapKL;
import edu.umd.cloud9.util.map.MapKF;
Oops, something went wrong.

0 comments on commit f243e40

Please sign in to comment.