
Commit 59f5e80

Finally got Stanford wrapper to work nicely for word-by-word annotations.

Mark Granroth-Wilding committed Mar 29, 2016
1 parent ceb0a02 commit 59f5e80
Showing 15 changed files with 346 additions and 65 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ lib/python/*
 models/*
 !models/Makefile
 docs/_build/*
+log/
11 changes: 5 additions & 6 deletions lib/python/Makefile
@@ -57,11 +57,10 @@ bs4 :
 	rm beautifulsoup4-4.3.2.tar.gz

 ################
-corenlp : stanford_corenlp_pywrapper
+stanford : requests

-stanford_corenlp_pywrapper :
-	$(FETCH) https://github.com/brendano/stanford_corenlp_pywrapper/archive/master.zip
+requests :
+	$(FETCH) https://github.com/kennethreitz/requests/archive/master.zip
 	unzip master.zip
-	mv stanford_corenlp_pywrapper-master/stanford_corenlp_pywrapper .
-	rm -rf stanford_corenlp_pywrapper-master master.zip
-	# Installed Python wrapper: remember to install the Java libraries as well!
+	mv requests-master/requests .
+	rm -rf requests-master/ master.zip
1 change: 1 addition & 0 deletions src/python/pimlico/__init__.py
@@ -6,3 +6,4 @@
 JAVA_LIB_DIR = os.path.join(LIB_DIR, "java")
 JAVA_BUILD_DIR = os.path.join(PIMLICO_ROOT, "build")
 MODEL_DIR = os.path.join(PIMLICO_ROOT, "models")
+LOG_DIR = os.path.join(PIMLICO_ROOT, "log")
17 changes: 9 additions & 8 deletions src/python/pimlico/core/external/java.py
@@ -1,13 +1,11 @@
 import os
 import time
 from subprocess import Popen, PIPE, check_output, STDOUT, CalledProcessError

-from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR, PIMLICO_ROOT
-
+from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR
+from pimlico.core.logs import get_log_file
 from pimlico.core.modules.base import DependencyError
 from pimlico.utils.communicate import timeout_process


 CLASSPATH = ":".join(["%s/*" % JAVA_LIB_DIR, JAVA_BUILD_DIR])

@@ -18,10 +16,12 @@ def call_java(class_name, args=[]):
     return stdout_data, stderr_data, process.returncode


-def start_java_process(class_name, args=[], wait=0.1):
+def start_java_process(class_name, args=[], java_args=[], wait=0.1):
     # May in future want to allow the path to the java executable to be specified in local config
-    cmd = ["java", "-cp", CLASSPATH, class_name] + args
+    cmd = ["java", "-cp", CLASSPATH] + java_args + [class_name] + args
     process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=False)
+    # Attach the command to the Popen object so it's easy to read what was run for debugging
+    process.command_run = " ".join(cmd)

     # Wait a mo for it to get started
     time.sleep(wait)
@@ -48,7 +48,8 @@ def check_java_dependency(class_name):

     out, err, code = call_java("pimlico.core.DependencyChecker", [class_name])
     if code != 0:
-        raise DependencyError("could not load Java class %s. Have you compiled the relevant Java module?" % class_name)
+        raise DependencyError("could not load Java class %s. Have you compiled the relevant Java module?" % class_name,
+                              stderr=err, stdout=out)


 def check_java():
@@ -189,7 +190,7 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],


 def output_p4j_error_info(command, returncode, stdout, stderr):
-    file_path = os.path.abspath(os.path.join(PIMLICO_ROOT, "py4j.err"))
+    file_path = get_log_file("py4j")
     with open(file_path, "w") as f:
         print >>f, "Command:"
         print >>f, " ".join(command)
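For illustration, a minimal sketch of a call using the new java_args parameter and command_run attribute; the argument values below are made up, not part of this commit:

    process = start_java_process(
        "pimlico.core.DependencyChecker",   # class to run (one that appears elsewhere in this diff)
        args=["some.target.Class"],          # program arguments, placed after the class name
        java_args=["-Xmx2g"],                # JVM options, placed before the class name
    )
    # The new command_run attribute records exactly what was executed
    print process.command_run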
16 changes: 16 additions & 0 deletions src/python/pimlico/core/logs.py
@@ -0,0 +1,16 @@
+import os
+from pimlico import LOG_DIR
+
+
+def get_log_file(name):
+    """
+    Returns the path to a log file that may be used to output helpful logging info. Typically used
+    to output verbose error information if something goes wrong. The file can be found in the Pimlico
+    log dir.
+
+    :param name: identifier to distinguish from other logs
+    :return: path
+    """
+    if not os.path.exists(LOG_DIR):
+        os.makedirs(LOG_DIR)
+    return os.path.abspath(os.path.join(LOG_DIR, "%s.log" % name))
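Typical use of the new helper, as output_p4j_error_info above now does (a sketch; the log name is arbitrary):

    from pimlico.core.logs import get_log_file

    log_path = get_log_file("corenlp")   # resolves to <PIMLICO_ROOT>/log/corenlp.log
    with open(log_path, "w") as log_file:
        print >>log_file, "Verbose error output goes here"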
5 changes: 4 additions & 1 deletion src/python/pimlico/core/modules/base.py
@@ -372,7 +372,10 @@ class DependencyError(Exception):
    make target in the lib directory.
    """
-    pass
+    def __init__(self, message, stderr=None, stdout=None):
+        super(DependencyError, self).__init__(message)
+        self.stdout = stdout
+        self.stderr = stderr


 def load_module_executor(path_or_info):
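A sketch of how a caller can now surface the captured Java output when a dependency check fails, mirroring the pattern check_corenlp_dependencies uses later in this commit:

    try:
        check_java_dependency("edu.stanford.nlp.pipeline.StanfordCoreNLPServer")
    except DependencyError, e:
        print "Dependency missing: %s" % e
        if e.stderr is not None:
            print "Java stderr began: %s" % e.stderr.splitlines()[0]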
8 changes: 8 additions & 0 deletions src/python/pimlico/modules/stanford/__init__.py
@@ -1 +1,9 @@
 __author__ = 'mark'
+
+
+class CoreNLPClientError(Exception):
+    pass
+
+
+class CoreNLPProcessingError(Exception):
+    pass
65 changes: 37 additions & 28 deletions src/python/pimlico/modules/stanford/annotate/exec.py
@@ -1,40 +1,26 @@
-import os
-from pimlico import JAVA_LIB_DIR
-from pimlico.core.modules.execute import ModuleExecutionError
 from pimlico.core.modules.map import DocumentMapModuleExecutor
-from stanford_corenlp_pywrapper import CoreNLP
 from pimlico.datatypes.tar import TarredCorpus
 from pimlico.datatypes.word_annotations import WordAnnotationCorpus, SimpleWordAnnotationCorpusWriter
 from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
+from pimlico.modules.stanford import CoreNLPProcessingError
+from pimlico.modules.stanford.wrapper import CoreNLP


 def _word_annotation_preproc(doc):
-    return "\n".join(
-        " ".join(word["word"] for word in sentence)
-        for sentence in doc
-    )
+    return "\n".join(" ".join(word["word"] for word in sentence) for sentence in doc)


 class ModuleExecutor(DocumentMapModuleExecutor):
     def get_writer(self, info):
-        output_datatype = info.get_output("documents")
-        return SimpleWordAnnotationCorpusWriter(info.get_output_dir("documents"),
-                                                output_datatype.read_annotation_fields())
+        output_name, output_datatype = info.get_output_datatype("documents")
+        return SimpleWordAnnotationCorpusWriter(info.get_output_dir("documents"), output_datatype.annotation_fields)

     def preprocess(self, info):
         annotators = []
-        if type(self.input_corpora[0]) is TarredCorpus:
+        if not isinstance(self.input_corpora[0], (WordAnnotationCorpus, TokenizedCorpus)):
             # Data not already run through a tokenizer: include tokenization and sentence splitting
             annotators.extend(["tokenize", "ssplit"])
         annotators.extend(info.options["annotators"].split(","))

-        # Prepare a CoreNLP background process to do the processing
-        self.corenlp = CoreNLP(
-            configdict={"annotators": annotators},
-            output_types=[annotators],
-            corenlp_jars=[os.path.join(JAVA_LIB_DIR, "*")]
-        )
-
         # By default, for a TarredCorpus or TokenizedCorpus, just pass in the document text
         self._doc_preproc = lambda doc: doc
         if type(self.input_corpora[0]) is TokenizedCorpus:
@@ -44,15 +30,38 @@ def preprocess(self, info):
             # For a word annotation corpus, we need to pull out the words
             self._doc_preproc = _word_annotation_preproc

+        # Prepare the list of attributes to extract from the output and send to the writer
+        output_name, output_datatype = info.get_output_datatype("documents")
+        self.output_fields = output_datatype.annotation_fields
+
+        # Prepare a CoreNLP background process to do the processing
+        self.corenlp = CoreNLP(info.pipeline)
+        self.corenlp.start()
+        self.log.info("CoreNLP server started on %s" % self.corenlp.server_url)
+        self.properties = {
+            "annotators": ",".join(annotators),
+            "outputFormat": "json",
+        }
+
     def process_document(self, archive, filename, doc):
         doc = self._doc_preproc(doc)
-        # Call CoreNLP on the doc
-        # TODO Not working -- not sure why
-        json_result = self.corenlp.parse_doc(doc.encode("utf-8"))
-        print json_result
-        # TODO Do something with the result
-        # Output one sentence per line
-        #return u"\n".join(tokenized_sents)
+
+        if doc.strip():
+            # Call CoreNLP on the doc
+            try:
+                json_result = self.corenlp.annotate(doc.encode("utf-8"), self.properties)
+            except CoreNLPProcessingError, e:
+                # TODO: do something other than re-raise
+                raise
+
+            return [
+                [
+                    [word_data[field_name] for field_name in self.output_fields]
+                    for word_data in sentence["tokens"]
+                ] for sentence in json_result["sentences"]
+            ]
+        else:
+            return []

     def postprocess(self, info, error=False):
-        pass
+        self.corenlp.shutdown()
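For reference, roughly the shape of the JSON that process_document consumes, assuming the standard CoreNLP server output format; the token values below are invented:

    json_result = {
        "sentences": [
            {"tokens": [
                {"word": "Pimlico", "pos": "NNP", "lemma": "Pimlico"},
                {"word": "works", "pos": "VBZ", "lemma": "work"},
            ]},
        ],
    }
    # With self.output_fields = ["word", "pos"], process_document returns:
    #     [[["Pimlico", "NNP"], ["works", "VBZ"]]]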
24 changes: 2 additions & 22 deletions src/python/pimlico/modules/stanford/annotate/info.py
@@ -1,9 +1,8 @@
-from pimlico.core.external.java import check_java_dependency, DependencyCheckerError
-from pimlico.core.modules.base import DependencyError
 from pimlico.core.modules.map import DocumentMapModuleInfo
 from pimlico.datatypes.tar import TarredCorpus
 from pimlico.datatypes.word_annotations import WordAnnotationCorpus
 from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
+from pimlico.modules.stanford.dependencies import check_corenlp_dependencies


 def annotation_fields_from_options(module_info):
@@ -57,26 +56,7 @@ def __init__(self, *args, **kwargs):
         #self.token_model_path = abs_path_or_model_dir_path(self.options["token_model"], "opennlp")

     def check_runtime_dependencies(self):
-        missing_dependencies = []
-        # We need the CoreNLP python wrapper available
-        try:
-            import stanford_corenlp_pywrapper
-        except ImportError:
-            missing_dependencies.append(("CoreNLP wrapper", self.module_name,
-                                         "Install in lib/python/ dir using 'make corenlp'"))
-
-        # Check whether the OpenNLP tokenizer is available
-        try:
-            class_name = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
-            try:
-                check_java_dependency(class_name)
-            except DependencyError:
-                missing_dependencies.append(("CoreNLP", self.module_name,
-                                             "Couldn't load %s. Install Stanford CoreNLP in lib/java/ dir using "
-                                             "'make corenlp'" % class_name))
-            except DependencyCheckerError, e:
-                missing_dependencies.append(("Java dependency checker", self.module_name, str(e)))
-
+        missing_dependencies = check_corenlp_dependencies(self.module_name)
         # TODO Check models are available here, when you've added the model path option
         missing_dependencies.extend(super(ModuleInfo, self).check_runtime_dependencies())
         return missing_dependencies
34 changes: 34 additions & 0 deletions src/python/pimlico/modules/stanford/dependencies.py
@@ -0,0 +1,34 @@
+from pimlico.core.external.java import check_java_dependency, DependencyCheckerError
+from pimlico.core.modules.base import DependencyError
+
+
+def check_corenlp_dependencies(module_name):
+    """
+    Check dependencies in the style of module dependency checkers and return a list of the missing
+    dependencies in the form they use. Designed to make it easy for all modules that use CoreNLP to
+    check the basic deps.
+    """
+    missing_dependencies = []
+    try:
+        class_name = "edu.stanford.nlp.pipeline.StanfordCoreNLPServer"
+        try:
+            check_java_dependency(class_name)
+        except DependencyError, e:
+            if e.stderr is not None:
+                extra_err = ". (Error: %s)" % e.stderr.splitlines()[0]
+            else:
+                extra_err = ""
+            missing_dependencies.append(("CoreNLP", module_name,
+                                         "Couldn't load %s. Install Stanford CoreNLP libraries in Java lib dir using "
+                                         "'make corenlp'%s" % (class_name, extra_err)))
+    except DependencyCheckerError, e:
+        missing_dependencies.append(("Java dependency checker", module_name, str(e)))
+
+    # We depend on the requests library
+    try:
+        import requests
+    except ImportError:
+        missing_dependencies.append(("Python requests library", module_name,
+                                     "Install together with all CoreNLP python deps in Python lib dir using "
+                                     "'make stanford'"))
+    return missing_dependencies
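A sketch of calling the new helper from a module info's check_runtime_dependencies, as annotate/info.py above now does; the module name here is illustrative:

    missing_dependencies = check_corenlp_dependencies("stanford-annotate")
    for dep, module, instructions in missing_dependencies:
        print "%s (needed by %s): %s" % (dep, module, instructions)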
