-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
OpenNLP parser wrapper now fully working.
Beginnings of coref wrapper.
- Loading branch information
Mark Granroth-Wilding
committed
Apr 8, 2016
1 parent
bd3548b
commit 019ca52
Showing
12 changed files
with
663 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
package pimlico.opennlp; | ||
|
||
import com.google.common.base.Joiner; | ||
import opennlp.tools.cmdline.parser.ParserTool; | ||
import opennlp.tools.coref.*; | ||
import opennlp.tools.coref.mention.DefaultParse; | ||
import opennlp.tools.coref.mention.Mention; | ||
import opennlp.tools.parser.Parse; | ||
import opennlp.tools.parser.Parser; | ||
import opennlp.tools.parser.ParserFactory; | ||
import opennlp.tools.parser.ParserModel; | ||
|
||
import java.io.*; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
/** | ||
* Created by mtw29 on 23/04/14. | ||
*/ | ||
public class CoreferenceResolver { | ||
private Parser parser; | ||
private Linker corefLinker; | ||
|
||
public CoreferenceResolver(File parsingModel, File modelsDir) throws ModelLoadError { | ||
InputStream modelIn = null; | ||
try { | ||
// Load the parsing model | ||
modelIn = new FileInputStream(parsingModel); | ||
ParserModel model = new ParserModel(modelIn); | ||
// Prepare a parser using the model | ||
parser = ParserFactory.create(model); | ||
} catch (IOException e) { | ||
throw new ModelLoadError("could not load parser model: " + e.getMessage()); | ||
} finally { | ||
if (modelIn != null) { | ||
try { | ||
modelIn.close(); | ||
} catch (IOException e) {} | ||
} | ||
} | ||
|
||
// Load a coref model | ||
try { | ||
//corefLinker = new DefaultLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST); | ||
corefLinker = new TreebankLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST); | ||
} catch (IOException e) { | ||
throw new ModelLoadError("could not load coref model: " + e.getMessage()); | ||
} | ||
} | ||
|
||
public DiscourseEntity[] resolveCoreferenceParsed(List<ParsedSentence> sentences) { | ||
List<Mention> mentions = new ArrayList<Mention>(); | ||
|
||
// Collect mentions from each sentence | ||
int sentenceNum = 0; | ||
for (ParsedSentence sentence : sentences) { | ||
// Get mentions from this sentence | ||
mentions.addAll(Arrays.asList(getMentions(sentence.parse, sentenceNum))); | ||
sentenceNum++; | ||
} | ||
|
||
Mention[] mentionArray = mentions.toArray(new Mention[mentions.size()]); | ||
try { | ||
// Run coreference resolution | ||
return corefLinker.getEntities(mentionArray); | ||
} catch (RuntimeException e) { | ||
// Sadly this happens occasionally: handle it nicely | ||
return new DiscourseEntity[0]; | ||
} | ||
} | ||
|
||
public Parse parse(String[] words) { | ||
// Run the parser to get max 1 parse | ||
Parse[] parses = ParserTool.parseLine(Joiner.on(' ').join(words), parser, 1); | ||
// Return the parse if we got one | ||
if (parses.length > 0) { | ||
return parses[0]; | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
public Mention[] getMentions(String[] words, int sentenceNum) { | ||
// Run the parser on the sentence | ||
Parse parse = parse(words); | ||
return getMentions(parse, sentenceNum); | ||
} | ||
|
||
public Mention[] getMentions(Parse parse, int sentenceNum) { | ||
// If there was an error reading parse data, we might have no parse: give no mentions | ||
if (parse == null) | ||
return new Mention[0]; | ||
|
||
// Wrap up the parse | ||
opennlp.tools.coref.mention.Parse mentionParse = new DefaultParse(parse, sentenceNum); | ||
// Extract mentions | ||
Mention[] mentions = corefLinker.getMentionFinder().getMentions(mentionParse); | ||
|
||
// This is taken from CoreferencerTool... | ||
//construct new parses for mentions which don't have constituents. | ||
for (Mention mention : mentions) { | ||
if (mention.getParse() == null) { | ||
//not sure how to get head index, but its not used at this point. | ||
Parse snp = new Parse(parse.getText(), mention.getSpan(), "NML", 1.0, 0); | ||
parse.insert(snp); | ||
mention.setParse(new DefaultParse(snp, sentenceNum)); | ||
} | ||
} | ||
|
||
return mentions; | ||
} | ||
|
||
public DiscourseEntity[] resolveCoreference(List<String[]> sentences) { | ||
List<Mention> mentions = new ArrayList<Mention>(); | ||
|
||
// Collect mentions from each sentence | ||
int sentenceNum = 0; | ||
for (String[] sentence : sentences) { | ||
// Get mentions from this sentence | ||
mentions.addAll(Arrays.asList(getMentions(sentence, sentenceNum))); | ||
sentenceNum++; | ||
} | ||
|
||
Mention[] mentionArray = mentions.toArray(new Mention[mentions.size()]); | ||
// Run coreference resolution | ||
return corefLinker.getEntities(mentionArray); | ||
} | ||
|
||
public static class ModelLoadError extends Exception { | ||
public ModelLoadError(String message) { | ||
super(message); | ||
} | ||
} | ||
} |
118 changes: 118 additions & 0 deletions
118
src/java/pimlico/opennlp/CoreferenceResolverGateway.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package pimlico.opennlp; | ||
|
||
import net.sourceforge.argparse4j.ArgumentParsers; | ||
import net.sourceforge.argparse4j.inf.ArgumentParser; | ||
import net.sourceforge.argparse4j.inf.ArgumentParserException; | ||
import net.sourceforge.argparse4j.inf.Namespace; | ||
import opennlp.tools.coref.DiscourseEntity; | ||
import opennlp.tools.coref.Linker; | ||
import opennlp.tools.coref.LinkerMode; | ||
import opennlp.tools.coref.TreebankLinker; | ||
import opennlp.tools.coref.mention.DefaultParse; | ||
import opennlp.tools.coref.mention.Mention; | ||
import opennlp.tools.parser.Parse; | ||
import pimlico.core.Py4JGatewayStarter; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
/** | ||
* Wrapper around OpenNLP's coreference resolution tool to provide access to it via Py4J for Pimlico module. | ||
*/ | ||
public class CoreferenceResolverGateway { | ||
private Linker corefLinker; | ||
|
||
public CoreferenceResolverGateway(File modelsDir) throws ModelLoadError { | ||
// Load a coref model | ||
try { | ||
//corefLinker = new DefaultLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST); | ||
corefLinker = new TreebankLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST); | ||
} catch (IOException e) { | ||
throw new ModelLoadError("could not load coref model: " + e.getMessage()); | ||
} | ||
} | ||
|
||
public DiscourseEntity[] resolveCoreference(List<Parse> sentences) { | ||
List<Mention> mentions = new ArrayList<Mention>(); | ||
|
||
// Collect mentions from each sentence | ||
int sentenceNum = 0; | ||
for (Parse parsedSentence : sentences) { | ||
// Get mentions from this sentence | ||
mentions.addAll(Arrays.asList(getMentions(parsedSentence, sentenceNum))); | ||
sentenceNum++; | ||
} | ||
|
||
Mention[] mentionArray = mentions.toArray(new Mention[mentions.size()]); | ||
try { | ||
// Run coreference resolution | ||
return corefLinker.getEntities(mentionArray); | ||
} catch (RuntimeException e) { | ||
// Sadly this happens occasionally: handle it nicely | ||
return new DiscourseEntity[0]; | ||
} | ||
} | ||
|
||
private Mention[] getMentions(Parse parse, int sentenceNum) { | ||
// If there was an error reading parse data, we might have no parse: give no mentions | ||
if (parse == null) | ||
return new Mention[0]; | ||
|
||
// Wrap up the parse | ||
opennlp.tools.coref.mention.Parse mentionParse = new DefaultParse(parse, sentenceNum); | ||
// Extract mentions | ||
Mention[] mentions = corefLinker.getMentionFinder().getMentions(mentionParse); | ||
|
||
// This is taken from CoreferencerTool... | ||
//construct new parses for mentions which don't have constituents. | ||
for (Mention mention : mentions) { | ||
if (mention.getParse() == null) { | ||
//not sure how to get head index, but its not used at this point. | ||
Parse snp = new Parse(parse.getText(), mention.getSpan(), "NML", 1.0, 0); | ||
parse.insert(snp); | ||
mention.setParse(new DefaultParse(snp, sentenceNum)); | ||
} | ||
} | ||
|
||
return mentions; | ||
} | ||
|
||
public static class ModelLoadError extends Exception { | ||
public ModelLoadError(String message) { | ||
super(message); | ||
} | ||
} | ||
|
||
public static void main(String[] args) { | ||
ArgumentParser argParser = ArgumentParsers.newArgumentParser("Coreference"); | ||
argParser.description("Run the OpenNLP coreference resolver, providing access to it via Py4J"); | ||
argParser.addArgument("coref_model_dir").help("Path to coref model dir"); | ||
argParser.addArgument("--port").type(Integer.class).help("Specify a port for gateway server to run on").setDefault(0); | ||
argParser.addArgument("--python-port").type(Integer.class).help("Specify a port for gateway server to use " + | ||
"to response to Python").setDefault(0); | ||
|
||
Namespace opts = null; | ||
try { | ||
opts = argParser.parseArgs(args); | ||
} catch (ArgumentParserException e) { | ||
System.err.println("Error in command-line arguments: " + e); | ||
System.exit(1); | ||
} | ||
|
||
String corefModelDir = opts.getString("coref_model_dir"); | ||
|
||
// Load the gateway instance | ||
CoreferenceResolverGateway entryPoint = null; | ||
try { | ||
entryPoint = new CoreferenceResolverGateway(new File(corefModelDir)); | ||
} catch (ModelLoadError modelLoadError) { | ||
modelLoadError.printStackTrace(); | ||
System.exit(1); | ||
} | ||
// Create a gateway server, using this as an entry point | ||
Py4JGatewayStarter.startGateway(entryPoint, opts.getInt("port"), opts.getInt("python_port")); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package pimlico.opennlp; | ||
|
||
import net.sourceforge.argparse4j.ArgumentParsers; | ||
import net.sourceforge.argparse4j.inf.ArgumentParser; | ||
import net.sourceforge.argparse4j.inf.ArgumentParserException; | ||
import net.sourceforge.argparse4j.inf.Namespace; | ||
import opennlp.tools.cmdline.parser.ParserTool; | ||
import opennlp.tools.parser.Parse; | ||
import opennlp.tools.parser.Parser; | ||
import opennlp.tools.parser.ParserFactory; | ||
import opennlp.tools.parser.ParserModel; | ||
import pimlico.core.Py4JGatewayStarter; | ||
|
||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* Wrapper around OpenNLP's parser tool to provide access to it via Py4J for Pimlico module. | ||
*/ | ||
public class ParserGateway { | ||
private Parser parser; | ||
|
||
public ParserGateway(File modelPath) throws ModelLoadError { | ||
InputStream modelIn = null; | ||
try { | ||
// Load the parsing model | ||
modelIn = new FileInputStream(modelPath); | ||
ParserModel model = new ParserModel(modelIn); | ||
// Prepare a parser using the model | ||
parser = ParserFactory.create(model); | ||
} catch (IOException e) { | ||
throw new ModelLoadError("could not load parser model: " + e.getMessage()); | ||
} finally { | ||
if (modelIn != null) { | ||
try { | ||
modelIn.close(); | ||
} catch (IOException e) {} | ||
} | ||
} | ||
} | ||
|
||
public static class ModelLoadError extends Exception { | ||
public ModelLoadError(String message) { | ||
super(message); | ||
} | ||
} | ||
|
||
public Parse parse(String sentence) { | ||
// Run the parser to get max 1 parse | ||
Parse[] parses = ParserTool.parseLine(sentence, parser, 1); | ||
// Return the parse if we got one | ||
if (parses.length > 0) { | ||
return parses[0]; | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
public String parseTree(String sentence) { | ||
Parse parserOutput = parse(sentence); | ||
// Format the parser output as a PTB tree | ||
StringBuffer sb = new StringBuffer(); | ||
parserOutput.show(sb); | ||
return sb.toString(); | ||
} | ||
|
||
public List<Parse> parse(List<String> sentences) { | ||
ArrayList<Parse> results = new ArrayList<Parse>(); | ||
for (String sentence : sentences) | ||
results.add(parse(sentence)); | ||
return results; | ||
} | ||
|
||
public List<String> parseTrees(List<String> sentences) { | ||
ArrayList<String> results = new ArrayList<String>(); | ||
for (String sentence : sentences) | ||
results.add(parseTree(sentence)); | ||
return results; | ||
} | ||
|
||
public static void main(String[] args) { | ||
ArgumentParser argParser = ArgumentParsers.newArgumentParser("Parser"); | ||
argParser.description("Run the OpenNLP parser, providing access to it via Py4J"); | ||
argParser.addArgument("model_path").help("Path to path model file"); | ||
argParser.addArgument("--port").type(Integer.class).help("Specify a port for gateway server to run on").setDefault(0); | ||
argParser.addArgument("--python-port").type(Integer.class).help("Specify a port for gateway server to use " + | ||
"to response to Python").setDefault(0); | ||
|
||
Namespace opts = null; | ||
try { | ||
opts = argParser.parseArgs(args); | ||
} catch (ArgumentParserException e) { | ||
System.err.println("Error in command-line arguments: " + e); | ||
System.exit(1); | ||
} | ||
|
||
String parserModelPath = opts.getString("model_path"); | ||
|
||
// Load the gateway instance | ||
ParserGateway entryPoint = null; | ||
try { | ||
entryPoint = new ParserGateway(new File(parserModelPath)); | ||
} catch (ModelLoadError modelLoadError) { | ||
modelLoadError.printStackTrace(); | ||
System.exit(1); | ||
} | ||
// Create a gateway server, using this as an entry point | ||
Py4JGatewayStarter.startGateway(entryPoint, opts.getInt("port"), opts.getInt("python_port")); | ||
} | ||
} |
Oops, something went wrong.