Skip to content

Commit

Permalink
OpenNLP parser wrapper now fully working.
Browse files Browse the repository at this point in the history
Beginnings of coref wrapper.
  • Loading branch information
Mark Granroth-Wilding committed Apr 8, 2016
1 parent bd3548b commit 019ca52
Show file tree
Hide file tree
Showing 12 changed files with 663 additions and 10 deletions.
11 changes: 3 additions & 8 deletions src/java/pimlico/core/Py4JGatewayStarter.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,9 @@ public static void startGateway(Object entryPoint) {
public static void startGateway(Object entryPoint, int port, int pythonPort) {
try {
// Create a gateway server, using this as an entry point
GatewayServer gatewayServer;
if (port != 0) {
if (pythonPort == 0)
pythonPort = port + 1;

gatewayServer = new GatewayServer(entryPoint, port, pythonPort, 0, 0, (List) null);
} else
gatewayServer = new GatewayServer(entryPoint);
// GatewayServer has a constructor with no ports, which sets them to defaults.
// If ports aren't given, we instead set them to 0 so that ports are allocated automatically.
GatewayServer gatewayServer = new GatewayServer(entryPoint, port, pythonPort, 0, 0, (List) null);

try {
// Set the server running
Expand Down
135 changes: 135 additions & 0 deletions src/java/pimlico/opennlp/CoreferenceResolver.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package pimlico.opennlp;

import com.google.common.base.Joiner;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.coref.*;
import opennlp.tools.coref.mention.DefaultParse;
import opennlp.tools.coref.mention.Mention;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
* Created by mtw29 on 23/04/14.
*/
public class CoreferenceResolver {
private Parser parser;
private Linker corefLinker;

public CoreferenceResolver(File parsingModel, File modelsDir) throws ModelLoadError {
InputStream modelIn = null;
try {
// Load the parsing model
modelIn = new FileInputStream(parsingModel);
ParserModel model = new ParserModel(modelIn);
// Prepare a parser using the model
parser = ParserFactory.create(model);
} catch (IOException e) {
throw new ModelLoadError("could not load parser model: " + e.getMessage());
} finally {
if (modelIn != null) {
try {
modelIn.close();
} catch (IOException e) {}
}
}

// Load a coref model
try {
//corefLinker = new DefaultLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST);
corefLinker = new TreebankLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST);
} catch (IOException e) {
throw new ModelLoadError("could not load coref model: " + e.getMessage());
}
}

public DiscourseEntity[] resolveCoreferenceParsed(List<ParsedSentence> sentences) {
List<Mention> mentions = new ArrayList<Mention>();

// Collect mentions from each sentence
int sentenceNum = 0;
for (ParsedSentence sentence : sentences) {
// Get mentions from this sentence
mentions.addAll(Arrays.asList(getMentions(sentence.parse, sentenceNum)));
sentenceNum++;
}

Mention[] mentionArray = mentions.toArray(new Mention[mentions.size()]);
try {
// Run coreference resolution
return corefLinker.getEntities(mentionArray);
} catch (RuntimeException e) {
// Sadly this happens occasionally: handle it nicely
return new DiscourseEntity[0];
}
}

public Parse parse(String[] words) {
// Run the parser to get max 1 parse
Parse[] parses = ParserTool.parseLine(Joiner.on(' ').join(words), parser, 1);
// Return the parse if we got one
if (parses.length > 0) {
return parses[0];
} else {
return null;
}
}

public Mention[] getMentions(String[] words, int sentenceNum) {
// Run the parser on the sentence
Parse parse = parse(words);
return getMentions(parse, sentenceNum);
}

public Mention[] getMentions(Parse parse, int sentenceNum) {
// If there was an error reading parse data, we might have no parse: give no mentions
if (parse == null)
return new Mention[0];

// Wrap up the parse
opennlp.tools.coref.mention.Parse mentionParse = new DefaultParse(parse, sentenceNum);
// Extract mentions
Mention[] mentions = corefLinker.getMentionFinder().getMentions(mentionParse);

// This is taken from CoreferencerTool...
//construct new parses for mentions which don't have constituents.
for (Mention mention : mentions) {
if (mention.getParse() == null) {
//not sure how to get head index, but its not used at this point.
Parse snp = new Parse(parse.getText(), mention.getSpan(), "NML", 1.0, 0);
parse.insert(snp);
mention.setParse(new DefaultParse(snp, sentenceNum));
}
}

return mentions;
}

public DiscourseEntity[] resolveCoreference(List<String[]> sentences) {
List<Mention> mentions = new ArrayList<Mention>();

// Collect mentions from each sentence
int sentenceNum = 0;
for (String[] sentence : sentences) {
// Get mentions from this sentence
mentions.addAll(Arrays.asList(getMentions(sentence, sentenceNum)));
sentenceNum++;
}

Mention[] mentionArray = mentions.toArray(new Mention[mentions.size()]);
// Run coreference resolution
return corefLinker.getEntities(mentionArray);
}

public static class ModelLoadError extends Exception {
public ModelLoadError(String message) {
super(message);
}
}
}
118 changes: 118 additions & 0 deletions src/java/pimlico/opennlp/CoreferenceResolverGateway.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package pimlico.opennlp;

import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.Namespace;
import opennlp.tools.coref.DiscourseEntity;
import opennlp.tools.coref.Linker;
import opennlp.tools.coref.LinkerMode;
import opennlp.tools.coref.TreebankLinker;
import opennlp.tools.coref.mention.DefaultParse;
import opennlp.tools.coref.mention.Mention;
import opennlp.tools.parser.Parse;
import pimlico.core.Py4JGatewayStarter;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Wrapper around OpenNLP's coreference resolution tool to provide access to it via Py4J for Pimlico module.
 */
public class CoreferenceResolverGateway {
    private Linker corefLinker;

    /**
     * Loads the coreference models.
     *
     * @param modelsDir directory containing the coreference models
     * @throws ModelLoadError if the models cannot be read; the underlying {@link IOException}
     *         is attached as the cause
     */
    public CoreferenceResolverGateway(File modelsDir) throws ModelLoadError {
        // Load a coref model
        try {
            //corefLinker = new DefaultLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST);
            corefLinker = new TreebankLinker(modelsDir.getAbsolutePath(), LinkerMode.TEST);
        } catch (IOException e) {
            throw new ModelLoadError("could not load coref model: " + e.getMessage(), e);
        }
    }

    /**
     * Resolves coreference over the parsed sentences of one document.
     *
     * <p>Note that the given parses are modified: a new node is inserted for every mention that
     * has no constituent of its own.
     *
     * @param sentences the parse of each sentence, in document order; a {@code null} entry
     *        contributes no mentions
     * @return the discourse entities found, or an empty array if the linker fails
     */
    public DiscourseEntity[] resolveCoreference(List<Parse> sentences) {
        List<Mention> mentions = new ArrayList<Mention>();

        // Collect mentions from each sentence, numbering sentences from 0
        int sentenceNum = 0;
        for (Parse parsedSentence : sentences) {
            // Get mentions from this sentence
            mentions.addAll(Arrays.asList(getMentions(parsedSentence, sentenceNum)));
            sentenceNum++;
        }

        Mention[] mentionArray = mentions.toArray(new Mention[mentions.size()]);
        try {
            // Run coreference resolution
            return corefLinker.getEntities(mentionArray);
        } catch (RuntimeException e) {
            // Sadly this happens occasionally: handle it nicely
            return new DiscourseEntity[0];
        }
    }

    /**
     * Extracts the mentions from a parsed sentence, inserting a new node into {@code parse} for
     * every mention without a constituent.
     *
     * @param parse the parse of the sentence, or {@code null} if none is available
     * @param sentenceNum index of the sentence within the document
     * @return the mentions found; empty if {@code parse} is {@code null}
     */
    private Mention[] getMentions(Parse parse, int sentenceNum) {
        // If there was an error reading parse data, we might have no parse: give no mentions
        if (parse == null)
            return new Mention[0];

        // Wrap up the parse
        opennlp.tools.coref.mention.Parse mentionParse = new DefaultParse(parse, sentenceNum);
        // Extract mentions
        Mention[] mentions = corefLinker.getMentionFinder().getMentions(mentionParse);

        // This is taken from CoreferencerTool...
        // Construct new parses for mentions which don't have constituents.
        for (Mention mention : mentions) {
            if (mention.getParse() == null) {
                // Not sure how to get head index, but it's not used at this point.
                Parse snp = new Parse(parse.getText(), mention.getSpan(), "NML", 1.0, 0);
                parse.insert(snp);
                mention.setParse(new DefaultParse(snp, sentenceNum));
            }
        }

        return mentions;
    }

    /** Signals that the coreference models could not be loaded. */
    public static class ModelLoadError extends Exception {
        /**
         * @param message description of what failed to load
         */
        public ModelLoadError(String message) {
            super(message);
        }

        /**
         * @param message description of what failed to load
         * @param cause the exception that made loading fail
         */
        public ModelLoadError(String message, Throwable cause) {
            super(message, cause);
        }
    }

    /**
     * Command-line entry point: loads the coreference models and exposes a
     * {@link CoreferenceResolverGateway} via {@link Py4JGatewayStarter}.
     *
     * <p>Exits with status 1 if the arguments are invalid or the models cannot be loaded.
     *
     * @param args {@code coref_model_dir}, plus optional {@code --port} and {@code --python-port}
     *        (both default to 0)
     */
    public static void main(String[] args) {
        ArgumentParser argParser = ArgumentParsers.newArgumentParser("Coreference");
        argParser.description("Run the OpenNLP coreference resolver, providing access to it via Py4J");
        argParser.addArgument("coref_model_dir").help("Path to coref model dir");
        argParser.addArgument("--port").type(Integer.class).help("Specify a port for gateway server to run on").setDefault(0);
        argParser.addArgument("--python-port").type(Integer.class).help("Specify a port for gateway server to use " +
                "to response to Python").setDefault(0);

        Namespace opts = null;
        try {
            opts = argParser.parseArgs(args);
        } catch (ArgumentParserException e) {
            System.err.println("Error in command-line arguments: " + e);
            System.exit(1);
        }

        String corefModelDir = opts.getString("coref_model_dir");

        // Load the gateway instance
        CoreferenceResolverGateway entryPoint = null;
        try {
            entryPoint = new CoreferenceResolverGateway(new File(corefModelDir));
        } catch (ModelLoadError modelLoadError) {
            // The stack trace includes the underlying I/O failure as its cause
            modelLoadError.printStackTrace();
            System.exit(1);
        }
        // Create a gateway server, using this as an entry point
        Py4JGatewayStarter.startGateway(entryPoint, opts.getInt("port"), opts.getInt("python_port"));
    }
}
114 changes: 114 additions & 0 deletions src/java/pimlico/opennlp/ParserGateway.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package pimlico.opennlp;

import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.Namespace;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import pimlico.core.Py4JGatewayStarter;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Wrapper around OpenNLP's parser tool to provide access to it via Py4J for Pimlico module.
 */
public class ParserGateway {
    private Parser parser;

    /**
     * Loads the parser model and prepares a parser from it.
     *
     * @param modelPath file containing the OpenNLP parser model
     * @throws ModelLoadError if the model cannot be read; the underlying {@link IOException}
     *         is attached as the cause
     */
    public ParserGateway(File modelPath) throws ModelLoadError {
        InputStream modelIn = null;
        try {
            // Load the parsing model
            modelIn = new FileInputStream(modelPath);
            ParserModel model = new ParserModel(modelIn);
            // Prepare a parser using the model
            parser = ParserFactory.create(model);
        } catch (IOException e) {
            throw new ModelLoadError("could not load parser model: " + e.getMessage(), e);
        } finally {
            if (modelIn != null) {
                try {
                    modelIn.close();
                } catch (IOException ignored) {
                    // Best effort: a failure to close must not mask the outcome of the load itself
                }
            }
        }
    }

    /** Signals that the parser model could not be loaded. */
    public static class ModelLoadError extends Exception {
        /**
         * @param message description of what failed to load
         */
        public ModelLoadError(String message) {
            super(message);
        }

        /**
         * @param message description of what failed to load
         * @param cause the exception that made loading fail
         */
        public ModelLoadError(String message, Throwable cause) {
            super(message, cause);
        }
    }

    /**
     * Parses a single sentence.
     *
     * @param sentence the sentence text, passed unchanged to {@code ParserTool.parseLine}
     * @return the parse, or {@code null} if the parser produced none
     */
    public Parse parse(String sentence) {
        // Run the parser to get max 1 parse
        Parse[] parses = ParserTool.parseLine(sentence, parser, 1);
        // Return the parse if we got one
        if (parses.length > 0) {
            return parses[0];
        } else {
            return null;
        }
    }

    /**
     * Parses a single sentence and formats the result as a PTB-style tree string.
     *
     * @param sentence the sentence text
     * @return the bracketed tree, or {@code null} if the parser produced no parse
     */
    public String parseTree(String sentence) {
        Parse parserOutput = parse(sentence);
        // parse() gives null when there is no parse: pass that on rather than dereferencing it
        if (parserOutput == null) {
            return null;
        }
        // Format the parser output as a PTB tree
        StringBuffer sb = new StringBuffer();
        parserOutput.show(sb);
        return sb.toString();
    }

    /**
     * Parses each sentence in turn.
     *
     * @param sentences the sentence texts
     * @return one entry per input sentence, in the same order; an entry is {@code null} where
     *         the parser produced no parse
     */
    public List<Parse> parse(List<String> sentences) {
        ArrayList<Parse> results = new ArrayList<Parse>();
        for (String sentence : sentences)
            results.add(parse(sentence));
        return results;
    }

    /**
     * Parses each sentence in turn and formats each result as a PTB-style tree string.
     *
     * @param sentences the sentence texts
     * @return one entry per input sentence, in the same order; an entry is {@code null} where
     *         the parser produced no parse
     */
    public List<String> parseTrees(List<String> sentences) {
        ArrayList<String> results = new ArrayList<String>();
        for (String sentence : sentences)
            results.add(parseTree(sentence));
        return results;
    }

    /**
     * Command-line entry point: loads the parser model and exposes a {@link ParserGateway} via
     * {@link Py4JGatewayStarter}.
     *
     * <p>Exits with status 1 if the arguments are invalid or the model cannot be loaded.
     *
     * @param args {@code model_path}, plus optional {@code --port} and {@code --python-port}
     *        (both default to 0)
     */
    public static void main(String[] args) {
        ArgumentParser argParser = ArgumentParsers.newArgumentParser("Parser");
        argParser.description("Run the OpenNLP parser, providing access to it via Py4J");
        argParser.addArgument("model_path").help("Path to path model file");
        argParser.addArgument("--port").type(Integer.class).help("Specify a port for gateway server to run on").setDefault(0);
        argParser.addArgument("--python-port").type(Integer.class).help("Specify a port for gateway server to use " +
                "to response to Python").setDefault(0);

        Namespace opts = null;
        try {
            opts = argParser.parseArgs(args);
        } catch (ArgumentParserException e) {
            System.err.println("Error in command-line arguments: " + e);
            System.exit(1);
        }

        String parserModelPath = opts.getString("model_path");

        // Load the gateway instance
        ParserGateway entryPoint = null;
        try {
            entryPoint = new ParserGateway(new File(parserModelPath));
        } catch (ModelLoadError modelLoadError) {
            // The stack trace includes the underlying I/O failure as its cause
            modelLoadError.printStackTrace();
            System.exit(1);
        }
        // Create a gateway server, using this as an entry point
        Py4JGatewayStarter.startGateway(entryPoint, opts.getInt("port"), opts.getInt("python_port"));
    }
}

0 comments on commit 019ca52

Please sign in to comment.