Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

initial import of mavuno-0.1

  • Loading branch information...
commit 53e1e9fc0f4cb48b83ebf619b222207b8b73e2fe 0 parents
Don Metzler authored
Showing with 590,346 additions and 0 deletions.
  1. +14 −0 .classpath
  2. +17 −0 .project
  3. +12 −0 .settings/org.eclipse.jdt.core.prefs
  4. +71 −0 LIBRARY-LICENSES
  5. +202 −0 LICENSE-2.0.txt
  6. +83 −0 README
  7. +77 −0 build.xml
  8. +3 −0  data/examples.txt
  9. +4 −0 data/patterns.txt
  10. +4,721 −0 data/wizard-of-oz.txt
  11. BIN  lib/cloud9-1.2.5.jar
  12. BIN  lib/commons-cli-1.2.jar
  13. BIN  lib/commons-logging-1.1.jar
  14. BIN  lib/fanseparser-0.2.2.jar
  15. BIN  lib/hadoop-0.20.2-core.jar
  16. BIN  lib/joda-time.jar
  17. BIN  lib/log4j-1.2.13.jar
  18. BIN  lib/opennlp-maxent-3.0.1-incubating.jar
  19. BIN  lib/opennlp-tools-1.5.1-incubating.jar
  20. BIN  lib/stanford-corenlp-2011-09-14-models.jar
  21. BIN  lib/stanford-corenlp-2011-09-16.jar
  22. BIN  lib/xom.jar
  23. BIN  models/all.3class.distsim.crf.ser.gz
  24. BIN  models/en-chunker.bin
  25. BIN  models/en-sent.bin
  26. BIN  models/tratzParseModel.gz
  27. BIN  models/tratzPosTaggingModel.gz
  28. +121 −0 src/edu/isi/mavuno/app/distsim/ComputeContextScores.java
  29. +121 −0 src/edu/isi/mavuno/app/distsim/ComputePatternScores.java
  30. +108 −0 src/edu/isi/mavuno/app/distsim/ContextToContext.java
  31. +146 −0 src/edu/isi/mavuno/app/distsim/ContextToPattern.java
  32. +146 −0 src/edu/isi/mavuno/app/distsim/PatternToContext.java
  33. +108 −0 src/edu/isi/mavuno/app/distsim/PatternToPattern.java
  34. +891 −0 src/edu/isi/mavuno/app/ie/ExtractRelations.java
  35. +177 −0 src/edu/isi/mavuno/app/ie/HarvestEspressoContexts.java
  36. +177 −0 src/edu/isi/mavuno/app/ie/HarvestEspressoPatterns.java
  37. +279 −0 src/edu/isi/mavuno/app/ie/HarvestSAPInstances.java
  38. +339 −0 src/edu/isi/mavuno/app/ie/HarvestUDAPInstances.java
  39. +199 −0 src/edu/isi/mavuno/app/mine/HarvestContextPatternPairs.java
  40. +204 −0 src/edu/isi/mavuno/app/mine/HarvestParaphraseCandidates.java
  41. +200 −0 src/edu/isi/mavuno/app/mine/HarvestSentences.java
  42. +182 −0 src/edu/isi/mavuno/app/nlp/HarvestParseGraph.java
  43. +370 −0 src/edu/isi/mavuno/app/nlp/ProcessStanfordNLP.java
  44. +336 −0 src/edu/isi/mavuno/app/nlp/TratzParse.java
  45. +124 −0 src/edu/isi/mavuno/app/util/ExamplesToSequenceFile.java
  46. +115 −0 src/edu/isi/mavuno/app/util/SequenceFileToText.java
  47. +246 −0 src/edu/isi/mavuno/extract/ChunkExtractor.java
  48. +123 −0 src/edu/isi/mavuno/extract/CombineGlobalStats.java
  49. +203 −0 src/edu/isi/mavuno/extract/CombineSplits.java
  50. +129 −0 src/edu/isi/mavuno/extract/CooccurExtractor.java
  51. +378 −0 src/edu/isi/mavuno/extract/DIRTExtractor.java
  52. +355 −0 src/edu/isi/mavuno/extract/Extract.java
  53. +351 −0 src/edu/isi/mavuno/extract/ExtractGlobalStats.java
  54. +40 −0 src/edu/isi/mavuno/extract/Extractor.java
  55. +91 −0 src/edu/isi/mavuno/extract/MultiExtractor.java
  56. +326 −0 src/edu/isi/mavuno/extract/NAryChunkExtractor.java
  57. +251 −0 src/edu/isi/mavuno/extract/NGramExtractor.java
  58. +132 −0 src/edu/isi/mavuno/extract/PassageExtractor.java
  59. +211 −0 src/edu/isi/mavuno/extract/Split.java
  60. +137 −0 src/edu/isi/mavuno/extract/TwitterCooccurExtractor.java
  61. +325 −0 src/edu/isi/mavuno/extract/TwitterGeoTemporalExtractor.java
  62. +132 −0 src/edu/isi/mavuno/input/ClueWarcInputFormat.java
  63. +29 −0 src/edu/isi/mavuno/input/IndexableFileInputFormat.java
  64. +109 −0 src/edu/isi/mavuno/input/LineInputFormat.java
  65. +26 −0 src/edu/isi/mavuno/input/Passagifiable.java
  66. +183 −0 src/edu/isi/mavuno/input/SentenceSegmentedDocument.java
  67. +32 −0 src/edu/isi/mavuno/input/SimpleTokenizer.java
  68. +33 −0 src/edu/isi/mavuno/input/StanfordParsedDocument.java
  69. +713 −0 src/edu/isi/mavuno/input/TagTokenizer.java
  70. +73 −0 src/edu/isi/mavuno/input/TextDocument.java
  71. +157 −0 src/edu/isi/mavuno/input/TextFileInputFormat.java
  72. +26 −0 src/edu/isi/mavuno/input/Tokenizer.java
  73. +33 −0 src/edu/isi/mavuno/input/TratzParsedDocument.java
  74. +125 −0 src/edu/isi/mavuno/input/TrecDocument.java
  75. +102 −0 src/edu/isi/mavuno/input/TrecInputFormat.java
  76. +199 −0 src/edu/isi/mavuno/input/TwitterDocument.java
  77. +94 −0 src/edu/isi/mavuno/input/TwitterInputFormat.java
  78. +175 −0 src/edu/isi/mavuno/input/XMLInputFormat.java
  79. +555 −0 src/edu/isi/mavuno/nlp/NLProcTools.java
  80. +136 −0 src/edu/isi/mavuno/score/CombineScores.java
  81. +212 −0 src/edu/isi/mavuno/score/GetTopResults.java
  82. +67 −0 src/edu/isi/mavuno/score/LikelihoodScorer.java
  83. +124 −0 src/edu/isi/mavuno/score/PMIScorer.java
  84. +233 −0 src/edu/isi/mavuno/score/ScoreContexts.java
  85. +233 −0 src/edu/isi/mavuno/score/ScorePatterns.java
  86. +58 −0 src/edu/isi/mavuno/score/Scorer.java
  87. +150 −0 src/edu/isi/mavuno/score/TFIDFScorer.java
  88. +186 −0 src/edu/isi/mavuno/score/UpdateWeights.java
  89. +110 −0 src/edu/isi/mavuno/util/ContextPatternStatsWritable.java
  90. +347 −0 src/edu/isi/mavuno/util/ContextPatternWritable.java
  91. +47 −0 src/edu/isi/mavuno/util/ContextPatternWritableScorePair.java
  92. +52 −0 src/edu/isi/mavuno/util/IdWeightPair.java
  93. +67 −0 src/edu/isi/mavuno/util/Individual.java
  94. +242 −0 src/edu/isi/mavuno/util/MavunoUtils.java
  95. +64 −0 src/edu/isi/mavuno/util/PairWritable.java
  96. +145 −0 src/edu/isi/mavuno/util/PermutationGenerator.java
  97. +116 −0 src/edu/isi/mavuno/util/Relation.java
  98. +79 −0 src/edu/isi/mavuno/util/ScoreWritable.java
  99. +121 −0 src/edu/isi/mavuno/util/SentenceWritable.java
  100. +82 −0 src/edu/isi/mavuno/util/StanfordParsedTokenWritable.java
  101. +32 −0 src/edu/isi/mavuno/util/TextLongPairWritable.java
  102. +31 −0 src/edu/isi/mavuno/util/TextTextPairWritable.java
  103. +23 −0 src/edu/isi/mavuno/util/TokenFactory.java
  104. +91 −0 src/edu/isi/mavuno/util/TokenWritable.java
  105. +210 −0 src/edu/isi/mavuno/util/TratzParsedTokenWritable.java
  106. +73 −0 src/edu/isi/mavuno/util/TypedTextSpan.java
  107. +279 −0 src/org/json/CDL.java
  108. +169 −0 src/org/json/Cookie.java
  109. +90 −0 src/org/json/CookieList.java
  110. +163 −0 src/org/json/HTTP.java
  111. +77 −0 src/org/json/HTTPTokener.java
  112. +918 −0 src/org/json/JSONArray.java
  113. +31 −0 src/org/json/JSONException.java
  114. +455 −0 src/org/json/JSONML.java
  115. +1,584 −0 src/org/json/JSONObject.java
  116. +18 −0 src/org/json/JSONString.java
  117. +78 −0 src/org/json/JSONStringer.java
  118. +435 −0 src/org/json/JSONTokener.java
  119. +323 −0 src/org/json/JSONWriter.java
  120. +678 −0 src/org/json/Test.java
  121. +441 −0 src/org/json/XML.java
  122. +365 −0 src/org/json/XMLTokener.java
  123. +314 −0 wordnet/Makefile
  124. +2 −0  wordnet/Makefile.am
  125. +314 −0 wordnet/Makefile.in
  126. +1,490 −0 wordnet/adj.exc
  127. +7 −0 wordnet/adv.exc
  128. +37,387 −0 wordnet/cntlist
  129. +37,387 −0 wordnet/cntlist.rev
  130. +18,185 −0 wordnet/data.adj
  131. +3,650 −0 wordnet/data.adv
  132. +82,144 −0 wordnet/data.noun
  133. +13,796 −0 wordnet/data.verb
  134. +35 −0 wordnet/frames.vrb
  135. +21,508 −0 wordnet/index.adj
  136. +4,510 −0 wordnet/index.adv
  137. +117,827 −0 wordnet/index.noun
  138. +206,941 −0 wordnet/index.sense
  139. +11,558 −0 wordnet/index.verb
  140. +45 −0 wordnet/lexnames
  141. +89 −0 wordnet/log.grind.3.0
  142. +2,054 −0 wordnet/noun.exc
  143. +3,421 −0 wordnet/sentidx.vrb
  144. +170 −0 wordnet/sents.vrb
  145. +35 −0 wordnet/verb.Framestext
  146. +2,401 −0 wordnet/verb.exc
14 .classpath
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" path="src"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
+ <classpathentry kind="lib" path="lib/cloud9-1.2.5.jar"/>
+ <classpathentry kind="lib" path="lib/commons-logging-1.1.jar"/>
+ <classpathentry kind="lib" path="lib/fanseparser-0.2.2.jar"/>
+ <classpathentry kind="lib" path="lib/hadoop-0.20.2-core.jar"/>
+ <classpathentry kind="lib" path="lib/log4j-1.2.13.jar"/>
+ <classpathentry kind="lib" path="lib/opennlp-maxent-3.0.1-incubating.jar"/>
+ <classpathentry kind="lib" path="lib/opennlp-tools-1.5.1-incubating.jar"/>
+ <classpathentry kind="lib" path="lib/stanford-corenlp-2011-09-16.jar"/>
+ <classpathentry kind="output" path="bin"/>
+</classpath>
17 .project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>mavuno</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ </natures>
+</projectDescription>
12 .settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,12 @@
+#Fri Sep 03 11:40:04 PDT 2010
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.6
71 LIBRARY-LICENSES
@@ -0,0 +1,71 @@
+The following are libraries that Mavuno either directly or indirectly
+dynamically links to. For each library, we provide URL and software license
+information.
+
+-----------------------------------------
+
+cloud9-1.2.5.jar
+
+URL: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/
+License: Apache License 2.0
+
+-----------------------------------------
+
+commons-cli-1.2.jar
+commons-logging-1.1.jar
+
+URL: http://commons.apache.org/
+License: Apache License 2.0
+
+-----------------------------------------
+
+fanseparser-0.2.2.jar
+
+URL: http://www.isi.edu/publications/licensed-sw/fanseparser/index.html
+License: Apache License 2.0
+
+-----------------------------------------
+
+hadoop-0.20.2-core.jar
+
+URL: http://hadoop.apache.org/
+License: Apache License 2.0
+
+-----------------------------------------
+
+joda-time-2.0.jar
+
+URL: http://joda-time.sourceforge.net/
+License: Apache License 2.0
+
+-----------------------------------------
+
+log4j-1.2.13.jar
+
+URL: http://logging.apache.org/log4j/
+License: Apache License 2.0
+
+-----------------------------------------
+
+opennlp-maxent-3.0.1-incubating.jar
+opennlp-tools-1.5.1-incubating.jar
+
+URL: http://incubator.apache.org/opennlp/
+License: Apache License 2.0
+
+-----------------------------------------
+
+stanford-corenlp-2011-09-14-models.jar
+stanford-corenlp-2011-09-16.jar
+
+URL: http://nlp.stanford.edu/software/corenlp.shtml
+License: GNU Public License (GPL)
+
+-----------------------------------------
+
+xom.jar
+
+URL: http://www.xom.nu/
+License: GNU Lesser General Public License (LGPL)
+
+-----------------------------------------
202 LICENSE-2.0.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
83 README
@@ -0,0 +1,83 @@
+==============================================
+* Mavuno: A Hadoop-Based Text Mining Toolkit *
+==============================================
+
+Mavuno is a Hadoop-based open-source research toolkit that supports a variety of
+large-scale distributed text mining operations, including:
+
+* Pattern mining
+* NLP Processing (POS Tagging, Chunking, Parsing, Named Entity Recognition)
+* Distributional similarity
+* Random walks over bi-partite graphs
+* Paraphrase harvesting
+* Class instance mining
+* Semantic relation learning
+* Information extraction
+
+=======================
+* System Requirements *
+=======================
+
+Mavuno requires the following software:
+* Java 1.6 (or greater)
+* Hadoop 0.20.2
+
+======================
+* Configuring Mavuno *
+======================
+
+The only configuration necessary is to ensure that the jar files in the
+"lib" directory of the Mavuno distributions can be found in the Hadoop
+classpath. The simplest way to achieve this is to copy the jars to the "lib"
+directory of your Hadoop installation.
+
+=================
+* Documentation *
+=================
+
+Please see http://mavuno.isi.edu for the most recent documentation and
+examples.
+
+===========
+* Contact *
+===========
+
+Direct all Mavuno-related questions, comments, bugs, and requests to:
+mavuno@isi.edu.
+
+=========
+* About *
+=========
+
+Mavuno was developed at the University of Southern California’s Information
+Sciences Institute by Donald Metzler during 2010-2011. In October 2011, Mavuno
+was released as an open source project for use by the broader research
+community.
+
+Mavuno is available under an Apache License, Version 2.0 (see LICENSE-2.0.txt
+in the root directory of the distribution for more details).
+
+We kindly ask that you use the following reference when citing Mavuno:
+
+Metzler, D., and Hovy, E. "Mavuno: A Scalable and Effective Hadoop-Based
+Paraphrase Acquisition System," to appear in the KDD Workshop on Large-scale
+Data Mining: Theory and Applications (LDMTA 2011), 2011.
+
+@inproceedings{Metzler:2011:MSE:2002945.2002948,
+ author = {Metzler, Donald and Hovy, Eduard},
+ title = {Mavuno: a scalable and effective Hadoop-based paraphrase acquisition system},
+ booktitle = {Proceedings of the Third Workshop on Large Scale Data Mining: Theory and Applications},
+ series = {LDMTA '11},
+ year = {2011},
+ isbn = {978-1-4503-0844-1},
+ location = {San Diego, California},
+ pages = {3:1--3:8},
+ articleno = {3},
+ numpages = {8},
+ url = {http://doi.acm.org/10.1145/2002945.2002948},
+ doi = {http://doi.acm.org/10.1145/2002945.2002948},
+ acmid = {2002948},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {Hadoop, large-scale text mining, paraphrase acquisition},
+}
77 build.xml
@@ -0,0 +1,77 @@
+<project name="mavuno" default="jar" basedir=".">
+
+ <description>Mavuno build file.</description>
+
+ <!-- set global properties for this build -->
+ <property name="version" value="0.1" />
+ <property name="src" location="src"/>
+ <property name="build" location="build"/>
+ <property name="models" location="models" />
+ <property name="lib" location="lib" />
+ <property name="docs" location="docs" />
+
+ <path id="classpath">
+ <fileset dir="lib">
+ <include name="**/*.jar"/>
+ </fileset>
+ </path>
+
+ <target name="init">
+ <tstamp/>
+ <mkdir dir="${build}"/>
+ </target>
+
+ <target name="compile" depends="init" description="compile the source " >
+ <javac classpathref="classpath" srcdir="${src}" destdir="${build}"/>
+ </target>
+
+ <target name="jar" depends="compile" description="builds the jar">
+ <jar jarfile="mavuno-${version}.jar" basedir="${build}" />
+ <jar jarfile="mavuno-${version}-models.jar">
+ <fileset dir=".">
+ <include name="models/**" />
+ <include name="wordnet/**" />
+ </fileset>
+ </jar>
+ </target>
+
+ <target name="javadoc">
+ <javadoc destdir="${docs}" access="public" use="false" notree="false" nonavbar="false" noindex="false" splitindex="no" author="true" version="true" nodeprecatedlist="true" nodeprecated="true" classpathref="classpath">
+ <fileset dir="${src}">
+ <include name="**/*.java" />
+ <exclude name="ivory/**/*.java" />
+ <exclude name="org/**/*.java" />
+ </fileset>
+ <link href="http://download.oracle.com/javase/6/docs/api/" />
+ <link href="http://hadoop.apache.org/common/docs/current/api/" />
+ <link href="http://www.umiacs.umd.edu/~jimmylin/Cloud9/docs/api/" />
+ <link href="http://nlp.stanford.edu/nlp/javadoc/javanlp/" />
+ </javadoc>
+ </target>
+
+ <target name="dist" depends="clean,jar,javadoc" description="generate the distribution" >
+ <tar destfile="mavuno-${version}.tar.gz">
+ <fileset dir=".">
+ <exclude name="build/**" />
+ <exclude name="models/**" />
+ </fileset>
+ </tar>
+ <zip destfile="mavuno-${version}.zip">
+ <fileset dir=".">
+ <exclude name="mavuno-${version}.tar.gz" />
+ <exclude name="build/**" />
+ <exclude name="models/**" />
+ </fileset>
+ </zip>
+ </target>
+
+ <target name="clean" description="clean up" >
+ <delete dir="${build}"/>
+ <delete dir="${docs}"/>
+ <delete file="mavuno-${version}.jar" />
+ <delete file="mavuno-${version}-models.jar" />
+ <delete file="mavuno-${version}.zip" />
+ <delete file="mavuno-${version}.tar.gz" />
+ </target>
+
+</project>
3  data/examples.txt
@@ -0,0 +1,3 @@
+1 * dorothy 1
+2 * tin man 1
+3 * said 1
4 data/patterns.txt
@@ -0,0 +1,4 @@
+yellow brick
+kill(ed)?.*Witch.*(West|East)
+<sentence>\t1\tDorothy\t
+\tPERSON\t.*\tLOCATION\t
4,721 data/wizard-of-oz.txt
4,721 additions, 0 deletions not shown
BIN  lib/cloud9-1.2.5.jar
Binary file not shown
BIN  lib/commons-cli-1.2.jar
Binary file not shown
BIN  lib/commons-logging-1.1.jar
Binary file not shown
BIN  lib/fanseparser-0.2.2.jar
Binary file not shown
BIN  lib/hadoop-0.20.2-core.jar
Binary file not shown
BIN  lib/joda-time.jar
Binary file not shown
BIN  lib/log4j-1.2.13.jar
Binary file not shown
BIN  lib/opennlp-maxent-3.0.1-incubating.jar
Binary file not shown
BIN  lib/opennlp-tools-1.5.1-incubating.jar
Binary file not shown
BIN  lib/stanford-corenlp-2011-09-14-models.jar
Binary file not shown
BIN  lib/stanford-corenlp-2011-09-16.jar
Binary file not shown
BIN  lib/xom.jar
Binary file not shown
BIN  models/all.3class.distsim.crf.ser.gz
Binary file not shown
BIN  models/en-chunker.bin
Binary file not shown
BIN  models/en-sent.bin
Binary file not shown
BIN  models/tratzParseModel.gz
Binary file not shown
BIN  models/tratzPosTaggingModel.gz
Binary file not shown
121 src/edu/isi/mavuno/app/distsim/ComputeContextScores.java
@@ -0,0 +1,121 @@
+/*
+ * Mavuno: A Hadoop-Based Text Mining Toolkit
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package edu.isi.mavuno.app.distsim;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+
+import edu.isi.mavuno.app.util.SequenceFileToText;
+import edu.isi.mavuno.score.ScoreContexts;
+import edu.isi.mavuno.score.ScorePatterns;
+import edu.isi.mavuno.score.UpdateWeights;
+import edu.isi.mavuno.util.MavunoUtils;
+
+/**
+ * @author metzler
+ *
+ */
+public class ComputeContextScores extends Configured implements Tool {
+ private static final Logger sLogger = Logger.getLogger(ComputeContextScores.class);
+
+ public ComputeContextScores(Configuration conf) {
+ super(conf);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
+ */
+ @Override
+ public int run(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
+ MavunoUtils.readParameters(args, "Mavuno.ComputeContextScores", getConf());
+ return run();
+ }
+
+ public int run() throws ClassNotFoundException, InterruptedException, IOException {
+ Configuration conf = getConf();
+
+ String inputPath = MavunoUtils.getRequiredParam("Mavuno.ComputeContextScores.InputPath", conf);
+ String patternScorerClass = MavunoUtils.getOptionalParam("Mavuno.ComputeContextScores.PatternScorerClass", conf);
+ String patternScorerArgs = MavunoUtils.getOptionalParam("Mavuno.ComputeContextScores.PatternScorerArgs", conf);
+ String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.ComputeContextScores.ContextScorerClass", conf);
+ String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.ComputeContextScores.ContextScorerArgs", conf);
+ String outputPath = MavunoUtils.getRequiredParam("Mavuno.ComputeContextScores.OutputPath", conf);
+
+ MavunoUtils.createDirectory(conf, outputPath);
+
+ sLogger.info("Tool name: ComputeContextScores");
+ sLogger.info(" - Input path: " + inputPath);
+ sLogger.info(" - Pattern scorer class: " + patternScorerClass);
+ sLogger.info(" - Pattern scorer args: " + patternScorerArgs);
+ sLogger.info(" - Context scorer class: " + contextScorerClass);
+ sLogger.info(" - Context scorer args: " + contextScorerArgs);
+ sLogger.info(" - Output path: " + outputPath);
+
+ // set total terms path
+ conf.set("Mavuno.TotalTermsPath", inputPath + "/totalTerms");
+
+ if(patternScorerClass != null) {
+ // score patterns
+ conf.set("Mavuno.ScorePatterns.InputPath", inputPath + "/pattern-stats");
+ conf.set("Mavuno.ScorePatterns.OutputPath", outputPath + "/scored-patterns-raw");
+ conf.set("Mavuno.Scorer.Class", patternScorerClass);
+ conf.set("Mavuno.Scorer.Args", patternScorerArgs);
+ new ScorePatterns(conf).run();
+
+ // update pattern weights
+ conf.set("Mavuno.UpdateWeights.StatsPath", inputPath + "/context-stats");
+ conf.set("Mavuno.UpdateWeights.ScoresPath", outputPath + "/scored-patterns-raw");
+ conf.set("Mavuno.UpdateWeights.ExampleType", "pattern");
+ conf.set("Mavuno.UpdateWeights.OutputPath", outputPath + "/context-stats");
+ new UpdateWeights(conf).run();
+
+ conf.set("Mavuno.ScoreContexts.InputPath", outputPath + "/context-stats");
+ }
+ else {
+ conf.set("Mavuno.ScoreContexts.InputPath", inputPath + "/context-stats");
+ }
+
+ // score contexts
+ conf.set("Mavuno.ScoreContexts.OutputPath", outputPath + "/scored-contexts-raw");
+ conf.set("Mavuno.Scorer.Class", contextScorerClass);
+ conf.set("Mavuno.Scorer.Args", contextScorerArgs);
+ new ScoreContexts(conf).run();
+
+ // convert sequence file to text
+ conf.set("Mavuno.SequenceFileToText.InputPath", outputPath + "/scored-contexts-raw");
+ conf.set("Mavuno.SequenceFileToText.OutputPath", outputPath + "/scored-contexts");
+ new SequenceFileToText(conf).run();
+
+ return 0;
+ }
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+ int res = ToolRunner.run(new ComputeContextScores(conf), args);
+ System.exit(res);
+ }
+
+}
121 src/edu/isi/mavuno/app/distsim/ComputePatternScores.java
@@ -0,0 +1,121 @@
+/*
+ * Mavuno: A Hadoop-Based Text Mining Toolkit
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package edu.isi.mavuno.app.distsim;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+
+import edu.isi.mavuno.app.util.SequenceFileToText;
+import edu.isi.mavuno.score.ScoreContexts;
+import edu.isi.mavuno.score.ScorePatterns;
+import edu.isi.mavuno.score.UpdateWeights;
+import edu.isi.mavuno.util.MavunoUtils;
+
+/**
+ * @author metzler
+ *
+ */
+public class ComputePatternScores extends Configured implements Tool {
+ private static final Logger sLogger = Logger.getLogger(ComputePatternScores.class);
+
+ public ComputePatternScores(Configuration conf) {
+ super(conf);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
+ */
+ @Override
+ public int run(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
+ MavunoUtils.readParameters(args, "Mavuno.ComputePatternScores", getConf());
+ return run();
+ }
+
+ public int run() throws ClassNotFoundException, InterruptedException, IOException {
+ Configuration conf = getConf();
+
+ String inputPath = MavunoUtils.getRequiredParam("Mavuno.ComputePatternScores.InputPath", conf);
+ String patternScorerClass = MavunoUtils.getRequiredParam("Mavuno.ComputePatternScores.PatternScorerClass", conf);
+ String patternScorerArgs = MavunoUtils.getRequiredParam("Mavuno.ComputePatternScores.PatternScorerArgs", conf);
+ String contextScorerClass = MavunoUtils.getOptionalParam("Mavuno.ComputePatternScores.ContextScorerClass", conf);
+ String contextScorerArgs = MavunoUtils.getOptionalParam("Mavuno.ComputePatternScores.ContextScorerArgs", conf);
+ String outputPath = MavunoUtils.getRequiredParam("Mavuno.ComputePatternScores.OutputPath", conf);
+
+ MavunoUtils.createDirectory(conf, outputPath);
+
+ sLogger.info("Tool name: ComputePatternScores");
+ sLogger.info(" - Input path: " + inputPath);
+ sLogger.info(" - Pattern scorer class: " + patternScorerClass);
+ sLogger.info(" - Pattern scorer args: " + patternScorerArgs);
+ sLogger.info(" - Context scorer class: " + contextScorerClass);
+ sLogger.info(" - Context scorer args: " + contextScorerArgs);
+ sLogger.info(" - Output path: " + outputPath);
+
+ // set total terms path
+ conf.set("Mavuno.TotalTermsPath", inputPath + "/totalTerms");
+
+ if(contextScorerClass != null) {
+ // score contexts
+ conf.set("Mavuno.ScoreContexts.InputPath", inputPath + "/context-stats");
+ conf.set("Mavuno.ScoreContexts.OutputPath", outputPath + "/scored-contexts-raw");
+ conf.set("Mavuno.Scorer.Class", contextScorerClass);
+ conf.set("Mavuno.Scorer.Args", contextScorerArgs);
+ new ScoreContexts(conf).run();
+
+ // update context weights
+ conf.set("Mavuno.UpdateWeights.StatsPath", inputPath + "/pattern-stats");
+ conf.set("Mavuno.UpdateWeights.ScoresPath", outputPath + "/scored-contexts-raw");
+ conf.set("Mavuno.UpdateWeights.ExampleType", "context");
+ conf.set("Mavuno.UpdateWeights.OutputPath", outputPath + "/pattern-stats");
+ new UpdateWeights(conf).run();
+
+ conf.set("Mavuno.ScorePatterns.InputPath", outputPath + "/pattern-stats");
+ }
+ else {
+ conf.set("Mavuno.ScorePatterns.InputPath", inputPath + "/pattern-stats");
+ }
+
+ // score patterns
+ conf.set("Mavuno.ScorePatterns.OutputPath", outputPath + "/scored-patterns-raw");
+ conf.set("Mavuno.Scorer.Class", patternScorerClass);
+ conf.set("Mavuno.Scorer.Args", patternScorerArgs);
+ new ScorePatterns(conf).run();
+
+ // convert sequencefile to text
+ conf.set("Mavuno.SequenceFileToText.InputPath", outputPath + "/scored-patterns-raw");
+ conf.set("Mavuno.SequenceFileToText.OutputPath", outputPath + "/scored-patterns");
+ new SequenceFileToText(conf).run();
+
+ return 0;
+ }
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+ int res = ToolRunner.run(new ComputePatternScores(conf), args);
+ System.exit(res);
+ }
+
+}
108 src/edu/isi/mavuno/app/distsim/ContextToContext.java
@@ -0,0 +1,108 @@
+/*
+ * Mavuno: A Hadoop-Based Text Mining Toolkit
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package edu.isi.mavuno.app.distsim;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+
+import edu.isi.mavuno.util.MavunoUtils;
+
+/**
+ * @author metzler
+ *
+ */
+public class ContextToContext extends Configured implements Tool {
+ private static final Logger sLogger = Logger.getLogger(ContextToContext.class);
+
+ public ContextToContext(Configuration conf) {
+ super(conf);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
+ */
+ @Override
+ public int run(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
+ MavunoUtils.readParameters(args, "Mavuno.ContextToContext", getConf());
+ return run();
+ }
+
+ public int run() throws ClassNotFoundException, InterruptedException, IOException {
+ Configuration conf = getConf();
+
+ String contextPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ContextPath", conf);
+ String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusClass", conf);
+ String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusPath", conf);
+ String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorClass", conf);
+ String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorArgs", conf);
+ int minMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.MinMatches", conf));
+ boolean harvestGlobalStats = Boolean.parseBoolean(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.GlobalStats", conf));
+ String outputPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.OutputPath", conf);
+
+ MavunoUtils.createDirectory(conf, outputPath);
+
+ sLogger.info("Tool name: ContextToContext");
+ sLogger.info(" - Context path: " + contextPath);
+ sLogger.info(" - Corpus class: " + corpusClass);
+ sLogger.info(" - Corpus path: " + corpusPath);
+ sLogger.info(" - Output path: " + outputPath);
+ sLogger.info(" - Extractor class: " + extractorClass);
+ sLogger.info(" - Extractor arguments: " + extractorArgs);
+ sLogger.info(" - Min matches: " + minMatches);
+ sLogger.info(" - Harvest global stats: " + harvestGlobalStats);
+
+ // context to pattern
+ conf.set("Mavuno.ContextToPattern.ContextPath", contextPath);
+ conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
+ conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
+ conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
+ conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
+ conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
+ conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", harvestGlobalStats);
+ conf.set("Mavuno.ContextToPattern.OutputPath", outputPath);
+ new ContextToPattern(conf).run();
+
+ // pattern to context
+ conf.set("Mavuno.PatternToContext.PatternPath", outputPath + "/pattern-stats");
+ conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
+ conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
+ conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
+ conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
+ conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
+ conf.setBoolean("Mavuno.PatternToContext.GlobalStats", harvestGlobalStats);
+ conf.set("Mavuno.PatternToContext.OutputPath", outputPath);
+ new PatternToContext(conf).run();
+
+ return 0;
+ }
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+ int res = ToolRunner.run(new ContextToContext(conf), args);
+ System.exit(res);
+ }
+
+}
146 src/edu/isi/mavuno/app/distsim/ContextToPattern.java
@@ -0,0 +1,146 @@
+/*
+ * Mavuno: A Hadoop-Based Text Mining Toolkit
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package edu.isi.mavuno.app.distsim;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+
+import edu.isi.mavuno.extract.CombineSplits;
+import edu.isi.mavuno.extract.ExtractGlobalStats;
+import edu.isi.mavuno.extract.Extract;
+import edu.isi.mavuno.extract.Split;
+import edu.isi.mavuno.util.MavunoUtils;
+
+/**
+ * @author metzler
+ *
+ */
+public class ContextToPattern extends Configured implements Tool {
+ private static final Logger sLogger = Logger.getLogger(ContextToPattern.class);
+
+ public ContextToPattern(Configuration conf) {
+ super(conf);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
+ */
+ @Override
+ public int run(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
+ MavunoUtils.readParameters(args, "Mavuno.ContextToPattern", getConf());
+ return run();
+ }
+
+ public int run() throws ClassNotFoundException, InterruptedException, IOException {
+ Configuration conf = getConf();
+
+ String contextPath = MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.ContextPath", conf);
+ String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.CorpusPath", conf);
+ String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.CorpusClass", conf);
+ String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.ExtractorClass", conf);
+ String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.ExtractorArgs", conf);
+ String minMatches = MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.MinMatches", conf);
+ boolean harvestGlobalStats = Boolean.parseBoolean(MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.GlobalStats", conf));
+ String outputPath = MavunoUtils.getRequiredParam("Mavuno.ContextToPattern.OutputPath", conf);
+
+ MavunoUtils.createDirectory(conf, outputPath);
+
+ sLogger.info("Tool name: ContextToPattern");
+ sLogger.info(" - Context path: " + contextPath);
+ sLogger.info(" - Corpus path: " + corpusPath);
+ sLogger.info(" - Corpus class: " + corpusClass);
+ sLogger.info(" - Extractor class: " + extractorClass);
+ sLogger.info(" - Extractor args: " + extractorArgs);
+ sLogger.info(" - Min matches: " + minMatches);
+ sLogger.info(" - Harvest global stats: " + harvestGlobalStats);
+ sLogger.info(" - Output path: " + outputPath);
+
+ // set total terms path
+ conf.set("Mavuno.TotalTermsPath", outputPath + "/totalTerms");
+
+ // split contexts into manageable chunks
+ conf.set("Mavuno.Split.InputPath", contextPath);
+ conf.set("Mavuno.Split.OutputPath", outputPath + "/contexts-split");
+ conf.set("Mavuno.Split.SplitKey", "context");
+ new Split(conf).run();
+
+ // get context splits
+ FileStatus [] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/contexts-split");
+ int split = 0;
+ for(FileStatus file : files) {
+ if(!file.getPath().getName().endsWith(".examples")) {
+ continue;
+ }
+
+ // extract patterns
+ conf.set("Mavuno.Extract.InputPath", file.getPath().toString());
+ conf.set("Mavuno.Extract.CorpusPath", corpusPath);
+ conf.set("Mavuno.Extract.CorpusClass", corpusClass);
+ conf.set("Mavuno.Extract.ExtractorClass", extractorClass);
+ conf.set("Mavuno.Extract.ExtractorArgs", extractorArgs);
+ conf.set("Mavuno.Extract.ExtractorTarget", "pattern");
+ conf.set("Mavuno.Extract.MinMatches", minMatches);
+ conf.set("Mavuno.Extract.OutputPath", outputPath + "/contexts-split/patterns/" + split);
+ new Extract(conf).run();
+
+ // increment split
+ split++;
+ }
+
+ // extract global pattern statistics if necessary
+ if(harvestGlobalStats) {
+ conf.set("Mavuno.ExtractGlobalStats.InputPath", outputPath + "/contexts-split/patterns/");
+ conf.set("Mavuno.ExtractGlobalStats.CorpusPath", corpusPath);
+ conf.set("Mavuno.ExtractGlobalStats.CorpusClass", corpusClass);
+ conf.set("Mavuno.ExtractGlobalStats.ExtractorClass", extractorClass);
+ conf.set("Mavuno.ExtractGlobalStats.ExtractorArgs", extractorArgs);
+ conf.set("Mavuno.ExtractGlobalStats.ExtractorTarget", "pattern");
+ conf.set("Mavuno.ExtractGlobalStats.OutputPath", outputPath + "/contexts-split/pattern-stats/");
+ new ExtractGlobalStats(conf).run();
+ }
+
+ // combine context splits
+ conf.set("Mavuno.CombineSplits.ExamplesPath", outputPath + "/contexts-split/patterns");
+ conf.set("Mavuno.CombineSplits.ExampleStatsPath", outputPath + "/contexts-split/pattern-stats");
+ conf.set("Mavuno.CombineSplits.SplitKey", "context");
+ conf.setInt("Mavuno.CombineSplits.TotalSplits", split);
+ conf.set("Mavuno.CombineSplits.OutputPath", outputPath + "/pattern-stats");
+ new CombineSplits(conf).run();
+
+ // delete context splits
+ MavunoUtils.removeDirectory(conf, outputPath + "/contexts-split");
+
+ return 0;
+ }
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+ int res = ToolRunner.run(new ContextToPattern(conf), args);
+ System.exit(res);
+ }
+
+}
146 src/edu/isi/mavuno/app/distsim/PatternToContext.java
@@ -0,0 +1,146 @@
+/*
+ * Mavuno: A Hadoop-Based Text Mining Toolkit
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package edu.isi.mavuno.app.distsim;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+
+import edu.isi.mavuno.extract.CombineSplits;
+import edu.isi.mavuno.extract.Extract;
+import edu.isi.mavuno.extract.ExtractGlobalStats;
+import edu.isi.mavuno.extract.Split;
+import edu.isi.mavuno.util.MavunoUtils;
+
+/**
+ * @author metzler
+ *
+ */
+public class PatternToContext extends Configured implements Tool {
+ private static final Logger sLogger = Logger.getLogger(PatternToContext.class);
+
+ public PatternToContext(Configuration conf) {
+ super(conf);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
+ */
+ @Override
+ public int run(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
+ MavunoUtils.readParameters(args, "Mavuno.PatternToContext", getConf());
+ return run();
+ }
+
+ public int run() throws ClassNotFoundException, InterruptedException, IOException {
+ Configuration conf = getConf();
+
+ String patternPath = MavunoUtils.getRequiredParam("Mavuno.PatternToContext.PatternPath", conf);
+ String corpusPath = MavunoUtils.getRequiredParam("Mavuno.PatternToContext.CorpusPath", conf);
+ String corpusClass = MavunoUtils.getRequiredParam("Mavuno.PatternToContext.CorpusClass", conf);
+ String extractorClass = MavunoUtils.getRequiredParam("Mavuno.PatternToContext.ExtractorClass", conf);
+ String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.PatternToContext.ExtractorArgs", conf);
+ String minMatches = MavunoUtils.getRequiredParam("Mavuno.PatternToContext.MinMatches", conf);
+ boolean harvestGlobalStats = Boolean.parseBoolean(MavunoUtils.getRequiredParam("Mavuno.PatternToContext.GlobalStats", conf));
+ String outputPath = MavunoUtils.getRequiredParam("Mavuno.PatternToContext.OutputPath", conf);
+
+ MavunoUtils.createDirectory(conf, outputPath);
+
+ sLogger.info("Tool name: PatternToContext");
+ sLogger.info(" - Pattern path: " + patternPath);
+ sLogger.info(" - Corpus path: " + corpusPath);
+ sLogger.info(" - Corpus class: " + corpusClass);
+ sLogger.info(" - Extractor class: " + extractorClass);
+ sLogger.info(" - Extractor args: " + extractorArgs);
+ sLogger.info(" - Min matches: " + minMatches);
+ sLogger.info(" - Harvest global stats: " + harvestGlobalStats);
+ sLogger.info(" - Output path: " + outputPath);
+
+ // set total terms path
+ conf.set("Mavuno.TotalTermsPath", outputPath + "/totalTerms");
+
+ // split patterns into manageable chunks
+ conf.set("Mavuno.Split.InputPath", patternPath);
+ conf.set("Mavuno.Split.OutputPath", outputPath + "/patterns-split");
+ conf.set("Mavuno.Split.SplitKey", "pattern");
+ new Split(conf).run();
+
+ // get pattern splits
+ FileStatus [] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/patterns-split");
+ int split = 0;
+ for(FileStatus file : files) {
+ if(!file.getPath().getName().endsWith(".examples")) {
+ continue;
+ }
+
+ // extract contexts
+ conf.set("Mavuno.Extract.InputPath", file.getPath().toString());
+ conf.set("Mavuno.Extract.CorpusPath", corpusPath);
+ conf.set("Mavuno.Extract.CorpusClass", corpusClass);
+ conf.set("Mavuno.Extract.ExtractorClass", extractorClass);
+ conf.set("Mavuno.Extract.ExtractorArgs", extractorArgs);
+ conf.set("Mavuno.Extract.ExtractorTarget", "context");
+ conf.set("Mavuno.Extract.MinMatches", minMatches);
+ conf.set("Mavuno.Extract.OutputPath", outputPath + "/patterns-split/contexts/" + split);
+ new Extract(conf).run();
+
+ // increment split
+ split++;
+ }
+
+ // extract global context statistics if necessary
+ if(harvestGlobalStats) {
+ conf.set("Mavuno.ExtractGlobalStats.InputPath", outputPath + "/patterns-split/contexts/");
+ conf.set("Mavuno.ExtractGlobalStats.CorpusPath", corpusPath);
+ conf.set("Mavuno.ExtractGlobalStats.CorpusClass", corpusClass);
+ conf.set("Mavuno.ExtractGlobalStats.ExtractorClass", extractorClass);
+ conf.set("Mavuno.ExtractGlobalStats.ExtractorArgs", extractorArgs);
+ conf.set("Mavuno.ExtractGlobalStats.ExtractorTarget", "context");
+ conf.set("Mavuno.ExtractGlobalStats.OutputPath", outputPath + "/patterns-split/context-stats/");
+ new ExtractGlobalStats(conf).run();
+ }
+
+ // combine pattern splits
+ conf.set("Mavuno.CombineSplits.ExamplesPath", outputPath + "/patterns-split/contexts");
+ conf.set("Mavuno.CombineSplits.ExampleStatsPath", outputPath + "/patterns-split/context-stats");
+ conf.set("Mavuno.CombineSplits.SplitKey", "pattern");
+ conf.setInt("Mavuno.CombineSplits.TotalSplits", split);
+ conf.set("Mavuno.CombineSplits.OutputPath", outputPath + "/context-stats");
+ new CombineSplits(conf).run();
+
+ // delete pattern splits
+ MavunoUtils.removeDirectory(conf, outputPath + "/patterns-split");
+
+ return 0;
+ }
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+ int res = ToolRunner.run(new PatternToContext(conf), args);
+ System.exit(res);
+ }
+
+}
108 src/edu/isi/mavuno/app/distsim/PatternToPattern.java
@@ -0,0 +1,108 @@
+/*
+ * Mavuno: A Hadoop-Based Text Mining Toolkit
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package edu.isi.mavuno.app.distsim;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+
+import edu.isi.mavuno.util.MavunoUtils;
+
+/**
+ * @author metzler
+ *
+ */
+public class PatternToPattern extends Configured implements Tool {
+ private static final Logger sLogger = Logger.getLogger(PatternToPattern.class);
+
+ public PatternToPattern(Configuration conf) {
+ super(conf);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
+ */
+ @Override
+ public int run(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
+ MavunoUtils.readParameters(args, "Mavuno.PatternToPattern", getConf());
+ return run();
+ }
+
+ public int run() throws ClassNotFoundException, InterruptedException, IOException {
+ Configuration conf = getConf();
+
+ String patternPath = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.PatternPath", conf);
+ String corpusClass = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.CorpusClass", conf);
+ String corpusPath = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.CorpusPath", conf);
+ String extractorClass = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.ExtractorClass", conf);
+ String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.ExtractorArgs", conf);
+ int minMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.MinMatches", conf));
+ boolean harvestGlobalStats = Boolean.parseBoolean(MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.GlobalStats", conf));
+ String outputPath = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.OutputPath", conf);
+
+ MavunoUtils.createDirectory(conf, outputPath);
+
+ sLogger.info("Tool name: PatternToPattern");
+ sLogger.info(" - Pattern path: " + patternPath);
+ sLogger.info(" - Corpus class: " + corpusClass);
+ sLogger.info(" - Corpus path: " + corpusPath);
+ sLogger.info(" - Output path: " + outputPath);
+ sLogger.info(" - Context class: " + extractorClass);
+ sLogger.info(" - Context arguments: " + extractorArgs);
+ sLogger.info(" - Min matches: " + minMatches);
+ sLogger.info(" - Harvest global stats: " + harvestGlobalStats);
+
+ // pattern to context
+ conf.set("Mavuno.PatternToContext.PatternPath", patternPath);
+ conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
+ conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
+ conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
+ conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
+ conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
+ conf.setBoolean("Mavuno.PatternToContext.GlobalStats", harvestGlobalStats);
+ conf.set("Mavuno.PatternToContext.OutputPath", outputPath);
+ new PatternToContext(conf).run();
+
+ // context to pattern
+ conf.set("Mavuno.ContextToPattern.ContextPath", outputPath + "/context-stats");
+ conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
+ conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
+ conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
+ conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
+ conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
+ conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", harvestGlobalStats);
+ conf.set("Mavuno.ContextToPattern.OutputPath", outputPath);
+ new ContextToPattern(conf).run();
+
+ return 0;
+ }
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+ int res = ToolRunner.run(new PatternToPattern(conf), args);
+ System.exit(res);
+ }
+
+}
891 src/edu/isi/mavuno/app/ie/ExtractRelations.java
@@ -0,0 +1,891 @@
+/*
+ * Mavuno: A Hadoop-Based Text Mining Toolkit
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package edu.isi.mavuno.app.ie;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+import edu.isi.mavuno.extract.Extractor;
+import edu.isi.mavuno.input.SentenceSegmentedDocument;
+import edu.isi.mavuno.util.ContextPatternWritable;
+import edu.isi.mavuno.util.IdWeightPair;
+import edu.isi.mavuno.util.Individual;
+import edu.isi.mavuno.util.MavunoUtils;
+import edu.isi.mavuno.util.Relation;
+import edu.isi.mavuno.util.SentenceWritable;
+import edu.isi.mavuno.util.TokenFactory;
+import edu.isi.mavuno.util.TratzParsedTokenWritable;
+import edu.isi.mavuno.util.TypedTextSpan;
+import edu.stanford.nlp.util.IntPair;
+import edu.stanford.nlp.util.Pair;
+
+/**
+ * @author metzler
+ *
+ */
+public class ExtractRelations extends Configured implements Tool {
	private static final Logger sLogger = Logger.getLogger(ExtractRelations.class);

	// factory used to materialize parsed tokens when reading sentence-segmented documents
	private static final TokenFactory<TratzParsedTokenWritable> TOKEN_FACTORY = new TratzParsedTokenWritable.ParsedTokenFactory();

	/**
	 * @param conf job configuration, passed through to {@link Configured}
	 */
	public ExtractRelations(Configuration conf) {
		super(conf);
	}
+
+ private static class MyMapper extends Mapper<Writable, SentenceSegmentedDocument<TratzParsedTokenWritable>, Text, Text> {
+
+ private static final Text MAVUNO_SOURCE = new Text("isi:mavuno");
+
+ private static final Text DEFAULT_TYPE = new Text("ANY");
+ private static final Text O = new Text("O");
+
+ private static byte [] BUFFER = new byte[1024*1024];
+
+ private String mPlaintextPath = null;
+
+ private Extractor mExtractor = null;
+
+ private final Map<Text, Text> mRelationNameLookup = new HashMap<Text, Text>();
+
+ private final Map<Text, Text []> mArgNames = new HashMap<Text, Text []>();
+ private final Map<Text, Text []> mArgTypes = new HashMap<Text, Text []>();
+ private final Map<Text, Text []> mArgClasses = new HashMap<Text, Text []>();
+
+ private final Map<Text, Text> mPrimaryClasses = new HashMap<Text, Text>();
+ private final Map<Text, List<IdWeightPair>> mPatterns = new HashMap<Text, List<IdWeightPair>>();
+ private final Map<Text, List<IdWeightPair>> mInstances = new HashMap<Text, List<IdWeightPair>>();
+
+ // maps relations to their confidence scores
+ private final Map<Relation,Double> mRelations = new HashMap<Relation,Double>();
+
+ // maps individual ids to their representations
+ private final Map<Text,Individual> mIndividuals = new HashMap<Text,Individual>();
+
+ private final SentenceSegmentedDocument<TratzParsedTokenWritable> mDoc = new SentenceSegmentedDocument<TratzParsedTokenWritable>(TOKEN_FACTORY);
+
+ private final Text mKey = new Text();
+ private final Text mValue = new Text();
+
+ private final ContextPatternWritable mPair = new ContextPatternWritable();
+
+ // private final Map<IntPair, Integer> mCorefForwardLookup = new HashMap<IntPair, Integer>();
+ // private final Map<Integer, List<IntPair>> mCorefReverseLookup = new HashMap<Integer, List<IntPair>>();
+
+ private void loadTypes(String typesPath, Configuration conf) throws IOException {
+ // reset relation name lookup
+ mRelationNameLookup.clear();
+
+ // clear argument names
+ mArgNames.clear();
+
+ // clear argument types
+ mArgTypes.clear();
+
+ // clear argument classes
+ mArgClasses.clear();
+
+ BufferedReader reader = MavunoUtils.getBufferedReader(conf, typesPath);
+
+ // read types
+ String input;
+ while((input = reader.readLine()) != null) {
+ String [] cols = input.split("\t");
+
+ if(cols.length < 5 || (cols.length - 2) % 3 != 0) {
+ throw new RuntimeException("Ill-formed line in types file -- " + input);
+ }
+
+ Text relationId = new Text(cols[0]);
+ Text relationName = new Text(cols[1]);
+
+ mRelationNameLookup.put(relationId, relationName);
+
+ Text [] argNames = new Text[(cols.length-2)/3];
+ Text [] argTypes = new Text[(cols.length-2)/3];
+ Text [] argClasses = new Text[(cols.length-2)/3];
+
+ for(int i = 2; i < cols.length; i+=3) {
+ argNames[(i-2)/3] = new Text(cols[i]);
+ argTypes[(i-2)/3] = new Text(cols[i+1]);
+ argClasses[(i-2)/3] = new Text(cols[i+2]);
+ }
+
+ mArgNames.put(relationId, argNames);
+ mArgTypes.put(relationId, argTypes);
+ mArgClasses.put(relationId, argClasses);
+ }
+
+ // close current reader
+ reader.close();
+ }
+
+ private static void loadPatterns(Map<Text, List<IdWeightPair>> patterns, String patternsPath, Configuration conf) throws IOException {
+ // clear example lookup
+ patterns.clear();
+
+ BufferedReader reader = MavunoUtils.getBufferedReader(conf, patternsPath);
+
+ // read patterns
+ String input;
+ while((input = reader.readLine()) != null) {
+ String [] cols = input.split("\t");
+
+ if(cols.length < 2 || cols.length > 3) {
+ throw new RuntimeException("Ill-formed line in pattern file -- " + input);
+ }
+
+ Text relationName = new Text(cols[0]);
+ Text pattern = new Text(cols[1]);
+
+ float weight = 1.0f;
+ if(cols.length == 3) {
+ weight = Float.parseFloat(cols[2]);
+ }
+
+ IdWeightPair pair = new IdWeightPair(relationName, weight);
+
+ updatePatternMap(patterns, pattern, pair);
+ }
+
+ // close current reader
+ reader.close();
+ }
+
+ private static void updatePatternMap(Map<Text, List<IdWeightPair>> patterns, Text pattern, IdWeightPair pair) {
+ // populate pattern lookup
+ List<IdWeightPair> contextList = null;
+ contextList = patterns.get(pattern);
+ if(contextList == null) {
+ contextList = new ArrayList<IdWeightPair>(1);
+ contextList.add(pair);
+ patterns.put(pattern, contextList);
+ }
+ else {
+ contextList.add(pair);
+ }
+ }
+
+ @Override
+ public void setup(Mapper<Writable, SentenceSegmentedDocument<TratzParsedTokenWritable>, Text, Text>.Context context) throws IOException {
+ Configuration conf = context.getConfiguration();
+
+ try {
+ // initialize extractor
+ mExtractor = (Extractor)Class.forName(conf.get("Mavuno.ExtractRelations.ExtractorClass")).newInstance();
+ String contextArgs = conf.get("Mavuno.ExtractRelations.ExtractorArgs", null);
+ mExtractor.initialize(contextArgs, conf);
+
+ // load types
+ String typesPath = conf.get("Mavuno.ExtractRelations.TypesPath", null);
+ loadTypes(typesPath, conf);
+
+ // get primary types
+ String [] primaryTypes = conf.get("Mavuno.ExtractRelations.PrimaryTypes", "").split(",");
+ mPrimaryClasses.clear();
+ for(int i = 0; i < primaryTypes.length; i++) {
+ String [] pair = primaryTypes[i].split("\\|");
+ if(pair.length != 2) {
+ throw new RuntimeException("Illegal primary type specification -- " + primaryTypes[i]);
+ }
+ mPrimaryClasses.put(new Text(pair[0]), new Text(pair[1]));
+ }
+
+ // load extraction patterns
+ String patternsPath = conf.get("Mavuno.ExtractRelations.PatternsPath", null);
+ loadPatterns(mPatterns, patternsPath, conf);
+
+ // load instances (if provided)
+ String instancesPath = conf.get("Mavuno.ExtractRelations.InstancesPath", null);
+ if(instancesPath != null) {
+ loadPatterns(mInstances, instancesPath, conf);
+ }
+
+ // set plaintext corpus path location (if set)
+ mPlaintextPath = conf.get("Mavuno.ExtractRelations.PlaintextPath", null);
+ }
+ catch(Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
/**
 * Processes one parsed, sentence-segmented document: runs the extractor over
 * each sentence, matches extracted (context, pattern) pairs against the loaded
 * relation patterns, accumulates relations and individuals for the whole
 * document, and emits a single (docid, XML) record describing them.
 *
 * @param key ignored; the emitted key is the document id from the document itself
 * @param doc the parsed input document
 * @param context used to write the (docid, XML) output pair
 */
@Override
public void map(Writable key, SentenceSegmentedDocument<TratzParsedTokenWritable> doc, Mapper<Writable, SentenceSegmentedDocument<TratzParsedTokenWritable>, Text, Text>.Context context) throws IOException, InterruptedException {
	// key = document id
	mKey.set(doc.getDocid());

	sLogger.info("Processing document: " + doc.getDocid());

	// get sentences
	List<SentenceWritable<TratzParsedTokenWritable>> sentences = doc.getSentences();

	int sentId = 0;
	//int tokenId = 0;

	// coref-cluster construction -- currently disabled
	// // get coref clusters
	// mCorefForwardLookup.clear();
	// mCorefReverseLookup.clear();
	//
	// for(SentenceWritable<TratzParsedTokenWritable> s : sentences) {
	// // reset token id
	// tokenId = 0;
	//
	// for(TratzParsedTokenWritable t : s.getTokens()) {
	// int id = t.getCorefId();
	// if(id != -1) {
	// // position within the document
	// IntPair pos = new IntPair(sentId, tokenId);
	//
	// // forward lookup
	// mCorefForwardLookup.put(pos, id);
	//
	// // reverse lookup
	// List<IntPair> pairs = mCorefReverseLookup.get(id);
	// if(pairs == null) {
	// pairs = new ArrayList<IntPair>();
	// pairs.add(pos);
	// mCorefReverseLookup.put(id, pairs);
	// }
	// else {
	// pairs.add(pos);
	// }
	// }
	// tokenId++;
	// }
	// sentId++;
	// }

	// clear relations accumulated from the previous document
	mRelations.clear();

	// clear individuals accumulated from the previous document
	mIndividuals.clear();

	// extract separately from each sentence
	sentId = 0;
	//tokenId = 0;
	for(SentenceWritable<TratzParsedTokenWritable> s : sentences) {
		// construct new document that only contains this sentence
		mDoc.clear();
		mDoc.addSentence(s);

		// set current document
		mExtractor.setDocument(mDoc);

		//sLogger.info("SENTENCE = " + s);

		String sentenceText = s.toStringOfTokens();

		// skip empty sentences
		if(sentenceText.length() == 0) {
			continue;
		}

		// character span of the whole sentence (used as the relation span)
		int sentenceCharOffsetBegin = s.getTokenAt(0).getCharOffsetBegin();
		int sentenceCharOffsetEnd = s.getTokenAt(s.getNumTokens()-1).getCharOffsetEnd();

		List<Text> arguments = new ArrayList<Text>();

		// reset token id
		//tokenId = 0;

		// main extract loop -- extracts instances and relations
		List<IdWeightPair> contextList = null;
		while(mExtractor.getNextPair(mPair)) {
			// the context encodes the argument instances, '|'-separated
			arguments.clear();
			String [] args = mPair.getContext().toString().split("\\|");
			for(int i = 0; i < args.length; i++) {
				arguments.add(new Text(args[i]));
			}

			// per-argument classes and character spans, parallel to `arguments`
			List<Set<Text>> allArgClasses = new ArrayList<Set<Text>>();
			List<Integer> allArgCharOffsetBegin = new ArrayList<Integer>();
			List<Integer> allArgCharOffsetEnd = new ArrayList<Integer>();

			// process each argument instance
			for(Text arg : arguments) {
				// get offset within sentence
				int argOffset = getOffset(arg, sentenceText);

				// skip if we can't find an alignment for some reason
				// NOTE(review): this `continue` leaves allArgClasses /
				// allArgCharOffset* shorter than `arguments`, so the
				// relation-building loop below (which indexes by
				// arguments.size()) could misalign or go out of bounds if a
				// pattern still matches -- confirm getOffset can return -1 here
				if(argOffset == -1) {
					continue;
				}

				// argument length
				int argLength = getLength(arg);

				// argument char offsets
				int argCharOffsetBegin = s.getTokenAt(argOffset).getCharOffsetBegin();
				int argCharOffsetEnd = s.getTokenAt(argOffset+argLength-1).getCharOffsetEnd();

				allArgCharOffsetBegin.add(argCharOffsetBegin);
				allArgCharOffsetEnd.add(argCharOffsetEnd);

				// get arg classes
				Set<Text> argClasses = getTypes(sentences, s, sentId, argOffset, argLength, true);

				allArgClasses.add(argClasses);

				// is this a mention of a primary class? if so, then process it
				for(Text argClass : argClasses) {
					Text argType = mPrimaryClasses.get(argClass);
					if(argType != null) {
						// resolve the mention to a canonical span and record it
						// as an occurrence of the corresponding individual
						Pair<Text,IntPair> individualSpec = resolveCoref(mPair.getContext(), argClass, sentences, s, argOffset, argLength);
						Individual individual = mIndividuals.get(individualSpec.first);
						if(individual == null) {
							individual = new Individual(individualSpec.first, individualSpec.second.getSource(), individualSpec.second.getTarget());
							mIndividuals.put(new Text(individualSpec.first), individual);
						}
						individual.addOccurrence(new TypedTextSpan(argType, individualSpec.first, individualSpec.second.getSource(), individualSpec.second.getTarget()));
					}
				}
			}

			// check if this pattern matches any of the relation patterns
			contextList = mPatterns.get(mPair.getPattern());

			// if this pattern doesn't match any of the relation patterns then we're done
			if(contextList == null) {
				continue;
			}

			// if found, then process each candidate relation for this pattern
			for(IdWeightPair pair : contextList) {
				Text [] expectedNames = mArgNames.get(pair.id);
				Text [] expectedTypes = mArgTypes.get(pair.id);
				Text [] expectedClasses = mArgClasses.get(pair.id);

				// skip this pair if we're missing name and/or type information
				if(expectedNames == null || expectedTypes == null || expectedClasses == null) {
					continue;
				}

				// perform length count checking -- the extracted argument count
				// must equal the relation's declared argument count
				if(expectedClasses.length != expectedNames.length || expectedNames.length != expectedTypes.length || expectedTypes.length != allArgClasses.size()) {
					continue;
				}

				// perform class type checking
				boolean matches = true;
				for(int i = 0; i < expectedClasses.length; i++) {
					if(!allArgClasses.get(i).contains(expectedClasses[i])) {
						matches = false;
						break;
					}
				}

				if(matches) {
					// build relation spanning the whole sentence
					Relation r = new Relation(mRelationNameLookup.get(pair.id), MAVUNO_SOURCE, sentenceCharOffsetBegin, sentenceCharOffsetEnd);
					for(int i = 0; i < arguments.size(); i++) {
						// argument text
						Text argText = arguments.get(i);

						// argument name
						Text argName = expectedNames[i];

						// argument type
						Text argType = expectedTypes[i];

						// beginning and end offset for this argument
						int argBegin = allArgCharOffsetBegin.get(i);
						int argEnd = allArgCharOffsetEnd.get(i);

						// find the individual for this argument
						Individual individual = mIndividuals.get(argText);
						if(individual == null) { // create new individual
							individual = new Individual(argText, argBegin, argEnd);
							individual.addOccurrence(new TypedTextSpan(argType, argText, argBegin, argEnd));
							mIndividuals.put(new Text(argText), individual);
						}
						else {
							individual.addOccurrence(new TypedTextSpan(argType, argText, argBegin, argEnd));
						}

						// add argument to relation
						r.addArgument(argName, argText, individual, argBegin, argEnd);
					}

					// accumulate confidence: repeated matches of the same
					// relation sum their pattern weights
					Double confidence = mRelations.get(r);
					if(confidence == null) {
						mRelations.put(r, new Double(pair.weight));
					}
					else {
						mRelations.put(r, confidence + pair.weight);
					}
				}
			}

			//tokenId++;
		}

		sentId++;
	}

	try {
		// read plain text version of document, if necessary
		String documentText = null;
		if(mPlaintextPath != null) {
			documentText = loadDocumentText(context.getConfiguration(), mPlaintextPath, doc.getDocid());
		}

		// generate XML output
		String xml = getXMLOutput(doc.getDocid(), documentText, mRelations, mIndividuals);
		//System.out.println(xml);
		mValue.set(xml);
		context.write(mKey, mValue);
	}
	catch(Exception e) {
		throw new RuntimeException(e);
	}
}
+
+ private static String loadDocumentText(Configuration conf, String path, String docid) throws IOException {
+ Text text = new Text();
+
+ FSDataInputStream reader = MavunoUtils.getFSDataInputStream(conf, path + "/" + docid);
+
+ int n;
+ while((n = reader.read(BUFFER, 0, BUFFER.length)) != -1) {
+ text.append(BUFFER, 0, n);
+ }
+
+ reader.close();
+
+ return text.toString();
+ }
+
// creates an XML representation of the relations and individuals
/**
 * Serializes the extracted relations and individuals for one document into an
 * ELF-style XML string.
 *
 * Individuals are emitted first and assigned sequential ids (starting at 1);
 * an individual is only emitted -- and only gets an id -- if it has at least
 * one non-"xsd:string" typed occurrence. Relations are emitted afterwards and
 * reference individual ids for their non-literal arguments.
 *
 * @param docid document id for the root element's "id" attribute
 * @param docText full plaintext of the document, or null; when present, the
 *        raw text of each span is included as element text content
 * @param relations map from relation to accumulated confidence
 * @param individuals map from argument text to individual
 * @return the indented XML document as a string
 * @throws ParserConfigurationException if a DOM builder cannot be created
 * @throws TransformerException if serialization fails
 */
private static String getXMLOutput(String docid, String docText, Map<Relation,Double> relations, Map<Text,Individual> individuals) throws ParserConfigurationException, TransformerException {
	DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
	DocumentBuilder domBuilder = domFactory.newDocumentBuilder();

	// root <doc> element with ELF metadata attributes
	Document doc = domBuilder.newDocument();
	Element rootElement = doc.createElement("doc");
	rootElement.setAttribute("xmlns", "http://www.bbn.com/MR/ELF");
	rootElement.setAttribute("id", docid);
	rootElement.setAttribute("elf-version", "2.2");
	rootElement.setAttribute("source", "Mavuno Reader");
	rootElement.setAttribute("contents", "S-ELF");
	doc.appendChild(rootElement);

	// ids assigned to emitted individuals, for cross-referencing from relations
	Map<Individual,Integer> individualIds = new HashMap<Individual,Integer>();

	int id = 1;
	for(Map.Entry<Text,Individual> indEntry: individuals.entrySet()) {
		Individual ind = indEntry.getValue();
		TypedTextSpan indSpan = ind.getSpan();

		Element indElement = doc.createElement("individual");
		indElement.setAttribute("id", Integer.toString(id));

		// <name> child with the individual's canonical span
		Element nameElement = doc.createElement("name");
		if(docText != null) {
			nameElement.setTextContent(docText.substring(indSpan.start, indSpan.end+1));
		}
		nameElement.setAttribute("name", indSpan.text.toString());
		nameElement.setAttribute("start", Integer.toString(indSpan.start));
		nameElement.setAttribute("end", Integer.toString(indSpan.end));
		indElement.appendChild(nameElement);

		// one <type> child per typed occurrence
		for(TypedTextSpan occurrence : ind.getOccurrences()) {
			// handle special case
			// TODO: make this more modular
			if(occurrence.type.toString().equals("xsd:string")) {
				continue;
			}

			Element occurrenceElement = doc.createElement("type");
			if(docText != null) {
				occurrenceElement.setTextContent(docText.substring(occurrence.start, occurrence.end+1));
			}
			occurrenceElement.setAttribute("type", occurrence.type.toString());
			occurrenceElement.setAttribute("start", Integer.toString(occurrence.start));
			occurrenceElement.setAttribute("end", Integer.toString(occurrence.end));
			indElement.appendChild(occurrenceElement);
		}

		// only emit individuals that have at least one <type> child beyond
		// the <name> element; others get no id
		if(indElement.getChildNodes().getLength() > 1) {
			rootElement.appendChild(indElement);
			individualIds.put(ind, id);
			id++;
		}
	}

	for(Map.Entry<Relation,Double> relEntry: relations.entrySet()) {
		Relation rel = relEntry.getKey();
		double confidence = relEntry.getValue();

		// TODO: fix this -- clamps accumulated pattern weights into [0, 1]
		if(confidence > 1.0) { confidence = 1.0; }

		Element relationElement = doc.createElement("relation");
		if(docText != null) {
			Element textElement = doc.createElement("text");
			textElement.setTextContent(docText.substring(rel.getStartOffset(), rel.getEndOffset()+1));
			relationElement.appendChild(textElement);
		}
		relationElement.setAttribute("name", rel.getName().toString());
		relationElement.setAttribute("source", rel.getSource().toString());
		relationElement.setAttribute("start", Integer.toString(rel.getStartOffset()));
		relationElement.setAttribute("end", Integer.toString(rel.getEndOffset()));
		relationElement.setAttribute("p", Double.toString(confidence));

		for(Map.Entry<TypedTextSpan, Individual> argEntry : rel.getArguments().entrySet()) {
			TypedTextSpan argSpan = argEntry.getKey();
			Individual argInd = argEntry.getValue();

			Element argumentElement = doc.createElement("arg");
			if(docText != null) {
				argumentElement.setTextContent(docText.substring(argSpan.start, argSpan.end+1));
			}
			argumentElement.setAttribute("role", argSpan.type.toString());
			argumentElement.setAttribute("start", Integer.toString(argSpan.start));
			argumentElement.setAttribute("end", Integer.toString(argSpan.end));

			// handle special case
			// TODO: make this more modular
			if(argSpan.type.toString().startsWith("t:")) {
				argumentElement.setAttribute("type", "xsd:string");
				argumentElement.setAttribute("value", argSpan.text.toString());
			}
			else {
				// NOTE(review): if argInd was filtered out above (no typed
				// occurrences), individualIds.get(argInd) is null and this
				// unboxing throws NullPointerException -- confirm whether
				// relation arguments always reference emitted individuals
				int argId = individualIds.get(argInd);
				argumentElement.setAttribute("id", Integer.toString(argId));
			}

			relationElement.appendChild(argumentElement);
		}

		rootElement.appendChild(relationElement);
	}

	// serialize the DOM with 2-space indentation
	TransformerFactory transFactory = TransformerFactory.newInstance();
	Transformer trans = transFactory.newTransformer();
	trans.setOutputProperty(OutputKeys.INDENT, "yes");
	trans.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");

	StringWriter sw = new StringWriter();
	StreamResult result = new StreamResult(sw);
	DOMSource source = new DOMSource(doc);
	trans.transform(source, result);

	return sw.toString();
}
+
+ private Pair<Text, IntPair> resolveCoref(Text arg, Text expectedType, List<SentenceWritable<TratzParsedTokenWritable>> sentences, SentenceWritable<TratzParsedTokenWritable> s, int offset, int length) {
+ Pair<Text,IntPair> bestPair = null;
+ // // resolve co-ref to best individual
+ // for(int i = offset; i < offset + length; i++) {
+ // int corefId = s.getTokenAt(i).getCorefId();
+ // List<IntPair> pairs = mCorefReverseLookup.get(corefId);
+ // if(pairs != null) {
+ // for(IntPair p : pairs) {
+ // IntPair chunkSpan = getChunkSpan(sentences.get(p.getSource()), p.getTarget());
+ // Set<Text> chunkTypes = getTypes(sentences, sentences.get(p.getSource()), p.getSource(), chunkSpan.getSource(), chunkSpan.getTarget(), false);
+ // if(chunkTypes.contains(expectedType)) {
+ // Text chunkText = getSpan(sentences.get(p.getSource()), chunkSpan.getSource(), chunkSpan.getTarget());
+ // int begin = sentences.get(p.getSource()).getTokenAt(chunkSpan.getSource()).getCharOffsetBegin();
+ // int end = sentences.get(p.getSource()).getTokenAt(chunkSpan.getSource()+chunkSpan.getTarget()-1).getCharOffsetEnd();
+ // //System.out.println(arg + " RESOLVES TO " + chunkText + "\t" + begin + "\t" + end);
+ // if(bestPair == null || chunkText.getLength() > bestPair.first.getLength()) {
+ // bestPair = new Pair<Text,IntPair>(chunkText, new IntPair(begin, end));
+ // }
+ // }
+ // }
+ // }
+ // }
+ //
+ if(bestPair == null) {
+ IntPair chunkSpan = getChunkSpan(s, offset);
+ Text chunkText = getSpan(s, chunkSpan.getSource(), chunkSpan.getTarget());
+ int begin = s.getTokenAt(chunkSpan.getSource()).getCharOffsetBegin();
+ int end = s.getTokenAt(chunkSpan.getSource()+chunkSpan.getTarget()-1).getCharOffsetEnd();
+ bestPair = new Pair<Text,IntPair>(chunkText, new IntPair(begin, end));
+ }
+
+ return bestPair;
+ }