
Commit 59f5e80

Finally got Stanford wrapper to work nicely for word-by-word annotations.

Mark Granroth-Wilding committed Mar 29, 2016
1 parent ceb0a02 commit 59f5e80
Showing 15 changed files with 346 additions and 65 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ lib/python/*
 models/*
 !models/Makefile
 docs/_build/*
+log/
11 changes: 5 additions & 6 deletions lib/python/Makefile
@@ -57,11 +57,10 @@ bs4 :
 	rm beautifulsoup4-4.3.2.tar.gz

 ################
-corenlp : stanford_corenlp_pywrapper
+stanford : requests

-stanford_corenlp_pywrapper :
-	$(FETCH) https://github.com/brendano/stanford_corenlp_pywrapper/archive/master.zip
+requests :
+	$(FETCH) https://github.com/kennethreitz/requests/archive/master.zip
 	unzip master.zip
-	mv stanford_corenlp_pywrapper-master/stanford_corenlp_pywrapper .
-	rm -rf stanford_corenlp_pywrapper-master master.zip
-	# Installed Python wrapper: remember to install the Java libraries as well!
+	mv requests-master/requests .
+	rm -rf requests-master/ master.zip
1 change: 1 addition & 0 deletions src/python/pimlico/__init__.py
@@ -6,3 +6,4 @@
 JAVA_LIB_DIR = os.path.join(LIB_DIR, "java")
 JAVA_BUILD_DIR = os.path.join(PIMLICO_ROOT, "build")
 MODEL_DIR = os.path.join(PIMLICO_ROOT, "models")
+LOG_DIR = os.path.join(PIMLICO_ROOT, "log")
17 changes: 9 additions & 8 deletions src/python/pimlico/core/external/java.py
@@ -1,13 +1,11 @@
 import os
 import time
 from subprocess import Popen, PIPE, check_output, STDOUT, CalledProcessError

-from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR, PIMLICO_ROOT
-
+from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR
+from pimlico.core.logs import get_log_file
 from pimlico.core.modules.base import DependencyError
 from pimlico.utils.communicate import timeout_process


 CLASSPATH = ":".join(["%s/*" % JAVA_LIB_DIR, JAVA_BUILD_DIR])

@@ -18,10 +16,12 @@ def call_java(class_name, args=[]):
     return stdout_data, stderr_data, process.returncode


-def start_java_process(class_name, args=[], wait=0.1):
+def start_java_process(class_name, args=[], java_args=[], wait=0.1):
     # May in future want to allow the path to the java executable to be specified in local config
-    cmd = ["java", "-cp", CLASSPATH, class_name] + args
+    cmd = ["java", "-cp", CLASSPATH] + java_args + [class_name] + args
     process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=False)
+    # Attach the command to the Popen object so it's easy to read what was run for debugging
+    process.command_run = " ".join(cmd)

     # Wait a mo for it to get started
     time.sleep(wait)
@@ -48,7 +48,8 @@ def check_java_dependency(class_name):

     out, err, code = call_java("pimlico.core.DependencyChecker", [class_name])
     if code != 0:
-        raise DependencyError("could not load Java class %s. Have you compiled the relevant Java module?" % class_name)
+        raise DependencyError("could not load Java class %s. Have you compiled the relevant Java module?" % class_name,
+                              stderr=err, stdout=out)


 def check_java():
@@ -189,7 +190,7 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],


 def output_p4j_error_info(command, returncode, stdout, stderr):
-    file_path = os.path.abspath(os.path.join(PIMLICO_ROOT, "py4j.err"))
+    file_path = get_log_file("py4j")
     with open(file_path, "w") as f:
         print >>f, "Command:"
         print >>f, " ".join(command)
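For illustration, a minimal sketch of a call using the new java_args parameter and command_run attribute; the argument values below are made up, not part of this commit:

    process = start_java_process(
        "pimlico.core.DependencyChecker",   # class to run (one that appears elsewhere in this diff)
        args=["some.target.Class"],          # program arguments, placed after the class name
        java_args=["-Xmx2g"],                # JVM options, placed before the class name
    )
    # The new command_run attribute records exactly what was executed
    print process.command_run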
16 changes: 16 additions & 0 deletions src/python/pimlico/core/logs.py
@@ -0,0 +1,16 @@
+import os
+from pimlico import LOG_DIR
+
+
+def get_log_file(name):
+    """
+    Returns the path to a log file that may be used to output helpful logging info. Typically used
+    to output verbose error information if something goes wrong. The file can be found in the Pimlico
+    log dir.
+
+    :param name: identifier to distinguish from other logs
+    :return: path
+    """
+    if not os.path.exists(LOG_DIR):
+        os.makedirs(LOG_DIR)
+    return os.path.abspath(os.path.join(LOG_DIR, "%s.log" % name))
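Typical use of the new helper, as output_p4j_error_info above now does (a sketch; the log name is arbitrary):

    from pimlico.core.logs import get_log_file

    log_path = get_log_file("corenlp")   # resolves to <PIMLICO_ROOT>/log/corenlp.log
    with open(log_path, "w") as log_file:
        print >>log_file, "Verbose error output goes here"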
5 changes: 4 additions & 1 deletion src/python/pimlico/core/modules/base.py
@@ -372,7 +372,10 @@ class DependencyError(Exception):
    make target in the lib directory.
    """
-    pass
+    def __init__(self, message, stderr=None, stdout=None):
+        super(DependencyError, self).__init__(message)
+        self.stdout = stdout
+        self.stderr = stderr


 def load_module_executor(path_or_info):
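A sketch of how a caller can now surface the captured Java output when a dependency check fails, mirroring the pattern check_corenlp_dependencies uses later in this commit:

    try:
        check_java_dependency("edu.stanford.nlp.pipeline.StanfordCoreNLPServer")
    except DependencyError, e:
        print "Dependency missing: %s" % e
        if e.stderr is not None:
            print "Java stderr began: %s" % e.stderr.splitlines()[0]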
8 changes: 8 additions & 0 deletions src/python/pimlico/modules/stanford/__init__.py
@@ -1 +1,9 @@
 __author__ = 'mark'
+
+
+class CoreNLPClientError(Exception):
+    pass
+
+
+class CoreNLPProcessingError(Exception):
+    pass
65 changes: 37 additions & 28 deletions src/python/pimlico/modules/stanford/annotate/exec.py
@@ -1,40 +1,26 @@
-import os
-from pimlico import JAVA_LIB_DIR
-from pimlico.core.modules.execute import ModuleExecutionError
 from pimlico.core.modules.map import DocumentMapModuleExecutor
-from stanford_corenlp_pywrapper import CoreNLP
 from pimlico.datatypes.tar import TarredCorpus
 from pimlico.datatypes.word_annotations import WordAnnotationCorpus, SimpleWordAnnotationCorpusWriter
 from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
+from pimlico.modules.stanford import CoreNLPProcessingError
+from pimlico.modules.stanford.wrapper import CoreNLP


 def _word_annotation_preproc(doc):
-    return "\n".join(
-        " ".join(word["word"] for word in sentence)
-        for sentence in doc
-    )
+    return "\n".join(" ".join(word["word"] for word in sentence) for sentence in doc)


 class ModuleExecutor(DocumentMapModuleExecutor):
     def get_writer(self, info):
-        output_datatype = info.get_output("documents")
-        return SimpleWordAnnotationCorpusWriter(info.get_output_dir("documents"),
-                                                output_datatype.read_annotation_fields())
+        output_name, output_datatype = info.get_output_datatype("documents")
+        return SimpleWordAnnotationCorpusWriter(info.get_output_dir("documents"), output_datatype.annotation_fields)

     def preprocess(self, info):
         annotators = []
-        if type(self.input_corpora[0]) is TarredCorpus:
+        if not isinstance(self.input_corpora[0], (WordAnnotationCorpus, TokenizedCorpus)):
             # Data not already run through a tokenizer: include tokenization and sentence splitting
             annotators.extend(["tokenize", "ssplit"])
         annotators.extend(info.options["annotators"].split(","))

-        # Prepare a CoreNLP background process to do the processing
-        self.corenlp = CoreNLP(
-            configdict={"annotators": annotators},
-            output_types=[annotators],
-            corenlp_jars=[os.path.join(JAVA_LIB_DIR, "*")]
-        )
-
         # By default, for a TarredCorpus or TokenizedCorpus, just pass in the document text
         self._doc_preproc = lambda doc: doc
         if type(self.input_corpora[0]) is TokenizedCorpus:
@@ -44,15 +30,38 @@ def preprocess(self, info):
             # For a word annotation corpus, we need to pull out the words
             self._doc_preproc = _word_annotation_preproc

+        # Prepare the list of attributes to extract from the output and send to the writer
+        output_name, output_datatype = info.get_output_datatype("documents")
+        self.output_fields = output_datatype.annotation_fields
+
+        # Prepare a CoreNLP background process to do the processing
+        self.corenlp = CoreNLP(info.pipeline)
+        self.corenlp.start()
+        self.log.info("CoreNLP server started on %s" % self.corenlp.server_url)
+        self.properties = {
+            "annotators": ",".join(annotators),
+            "outputFormat": "json",
+        }
+
     def process_document(self, archive, filename, doc):
         doc = self._doc_preproc(doc)
-        # Call CoreNLP on the doc
-        # TODO Not working -- not sure why
-        json_result = self.corenlp.parse_doc(doc.encode("utf-8"))
-        print json_result
-        # TODO Do something with the result
-        # Output one sentence per line
-        #return u"\n".join(tokenized_sents)
+
+        if doc.strip():
+            # Call CoreNLP on the doc
+            try:
+                json_result = self.corenlp.annotate(doc.encode("utf-8"), self.properties)
+            except CoreNLPProcessingError, e:
+                # TODO: do something other than re-raise
+                raise
+
+            return [
+                [
+                    [word_data[field_name] for field_name in self.output_fields]
+                    for word_data in sentence["tokens"]
+                ] for sentence in json_result["sentences"]
+            ]
+        else:
+            return []

     def postprocess(self, info, error=False):
-        pass
+        self.corenlp.shutdown()
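For reference, roughly the shape of the JSON that process_document consumes, assuming the standard CoreNLP server output format; the token values below are invented:

    json_result = {
        "sentences": [
            {"tokens": [
                {"word": "Pimlico", "pos": "NNP", "lemma": "Pimlico"},
                {"word": "works", "pos": "VBZ", "lemma": "work"},
            ]},
        ],
    }
    # With self.output_fields = ["word", "pos"], process_document returns:
    #     [[["Pimlico", "NNP"], ["works", "VBZ"]]]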
24 changes: 2 additions & 22 deletions src/python/pimlico/modules/stanford/annotate/info.py
@@ -1,9 +1,8 @@
-from pimlico.core.external.java import check_java_dependency, DependencyCheckerError
-from pimlico.core.modules.base import DependencyError
 from pimlico.core.modules.map import DocumentMapModuleInfo
 from pimlico.datatypes.tar import TarredCorpus
 from pimlico.datatypes.word_annotations import WordAnnotationCorpus
 from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
+from pimlico.modules.stanford.dependencies import check_corenlp_dependencies


 def annotation_fields_from_options(module_info):
@@ -57,26 +56,7 @@ def __init__(self, *args, **kwargs):
         #self.token_model_path = abs_path_or_model_dir_path(self.options["token_model"], "opennlp")

     def check_runtime_dependencies(self):
-        missing_dependencies = []
-        # We need the CoreNLP python wrapper available
-        try:
-            import stanford_corenlp_pywrapper
-        except ImportError:
-            missing_dependencies.append(("CoreNLP wrapper", self.module_name,
-                                         "Install in lib/python/ dir using 'make corenlp'"))
-
-        # Check whether the OpenNLP tokenizer is available
-        try:
-            class_name = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
-            try:
-                check_java_dependency(class_name)
-            except DependencyError:
-                missing_dependencies.append(("CoreNLP", self.module_name,
-                                             "Couldn't load %s. Install Stanford CoreNLP in lib/java/ dir using "
-                                             "'make corenlp'" % class_name))
-            except DependencyCheckerError, e:
-                missing_dependencies.append(("Java dependency checker", self.module_name, str(e)))
-
+        missing_dependencies = check_corenlp_dependencies(self.module_name)
         # TODO Check models are available here, when you've added the model path option
         missing_dependencies.extend(super(ModuleInfo, self).check_runtime_dependencies())
         return missing_dependencies
34 changes: 34 additions & 0 deletions src/python/pimlico/modules/stanford/dependencies.py
@@ -0,0 +1,34 @@
+from pimlico.core.external.java import check_java_dependency, DependencyCheckerError
+from pimlico.core.modules.base import DependencyError
+
+
+def check_corenlp_dependencies(module_name):
+    """
+    Check dependencies in the style of module dependency checkers and return a list of the missing
+    dependencies in the form they use. Designed to make it easy for all modules that use CoreNLP to
+    check the basic deps.
+    """
+    missing_dependencies = []
+    try:
+        class_name = "edu.stanford.nlp.pipeline.StanfordCoreNLPServer"
+        try:
+            check_java_dependency(class_name)
+        except DependencyError, e:
+            if e.stderr is not None:
+                extra_err = ". (Error: %s)" % e.stderr.splitlines()[0]
+            else:
+                extra_err = ""
+            missing_dependencies.append(("CoreNLP", module_name,
+                                         "Couldn't load %s. Install Stanford CoreNLP libraries in Java lib dir using "
+                                         "'make corenlp'%s" % (class_name, extra_err)))
+    except DependencyCheckerError, e:
+        missing_dependencies.append(("Java dependency checker", module_name, str(e)))
+
+    # We depend on the requests library
+    try:
+        import requests
+    except ImportError:
+        missing_dependencies.append(("Python requests library", module_name,
+                                     "Install together with all CoreNLP python deps in Python lib dir using "
+                                     "'make stanford'"))
+    return missing_dependencies
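A sketch of calling the new helper from a module info's check_runtime_dependencies, as annotate/info.py above now does; the module name here is illustrative:

    missing_dependencies = check_corenlp_dependencies("stanford-annotate")
    for dep, module, instructions in missing_dependencies:
        print "%s (needed by %s): %s" % (dep, module, instructions)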
