Commit
Further work on OpenNLP modules. Started on Stanford CoreNLP wrapper. Tried using a Python wrapper, but it doesn't work well, so going to try a different one.
Mark Granroth-Wilding committed Mar 29, 2016
1 parent 6dcb008 commit 7647aff
Showing 18 changed files with 316 additions and 55 deletions.
28 changes: 9 additions & 19 deletions lib/java/Makefile
@@ -70,25 +70,15 @@ py4j0.9.2.jar :

###################################

stanford-parser :
$(MAKE) stanford-parser-full-2014-01-04.zip
unzip stanford-parser-full-2014-01-04.zip
mv stanford-parser-full-2014-01-04/ stanford-parser
rm stanford-parser-full-2014-01-04.zip

stanford-parser-full-2014-01-04.zip :
$(FETCH) http://nlp.stanford.edu/software/stanford-parser-full-2014-01-04.zip

stanford-tagger :
$(MAKE) stanford-postagger-2014-01-04.zip
unzip stanford-postagger-2014-01-04.zip
mv stanford-postagger-2014-01-04 stanford-tagger
rm stanford-postagger-2014-01-04.zip

stanford-postagger-2014-01-04.zip :
$(FETCH) http://nlp.stanford.edu/downloads/stanford-postagger-2014-01-04.zip

####################################
corenlp : stanford-corenlp-3.6.0.jar

stanford-corenlp-3.6.0.jar :
$(FETCH) http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip
unzip stanford-corenlp-full-2015-12-09.zip
mv stanford-corenlp-full-2015-12-09/*.jar .
rm -rf stanford-corenlp-full-2015-12-09 stanford-corenlp-full-2015-12-09.zip
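
# Usage sketch (an assumption: GNU make, with $(FETCH) defined near the top of
# this Makefile as a download command such as wget or curl). Running
#     make corenlp
# fetches the CoreNLP 3.6.0 distribution and leaves its jars in this directory.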

##################################

guava : guava.jar

10 changes: 10 additions & 0 deletions lib/python/Makefile
@@ -55,3 +55,13 @@ bs4 :
mv beautifulsoup4-4.3.2/bs4 .
rmdir beautifulsoup4-4.3.2
rm beautifulsoup4-4.3.2.tar.gz

################
corenlp : stanford_corenlp_pywrapper

stanford_corenlp_pywrapper :
$(FETCH) https://github.com/brendano/stanford_corenlp_pywrapper/archive/master.zip
unzip master.zip
mv stanford_corenlp_pywrapper-master/stanford_corenlp_pywrapper .
rm -rf stanford_corenlp_pywrapper-master master.zip
# Installed Python wrapper: remember to install the Java libraries as well!
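
For reference, the wrapper fetched above is driven roughly as follows. This is a sketch based on the stanford_corenlp_pywrapper README at the time; the exact constructor arguments are an assumption, and the jar path is hypothetical and must point at the jars fetched by the Java Makefile's corenlp target.

from stanford_corenlp_pywrapper import CoreNLP

# Start a CoreNLP subprocess configured for POS tagging
# (jar path hypothetical: point it at the jars from `make corenlp`)
proc = CoreNLP("pos", corenlp_jars=["lib/java/*"])
# Returns a dict with one entry per sentence: tokens, POS tags, etc.
print proc.parse_doc("Pimlico wraps CoreNLP. This is a test.")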
6 changes: 5 additions & 1 deletion src/python/pimlico/cli/check.py
@@ -6,6 +6,10 @@


def check_cmd(pipeline, opts):
# Output what variants are available and say which we're checking
print "Available pipeline variants: %s" % ", ".join(["main"] + pipeline.available_variants)
print "Checking variant '%s'\n" % pipeline.variant

# Load all the modules' metadata
# This makes sure none of the modules have trouble loading
for name in pipeline.modules:
@@ -48,7 +52,7 @@ def check_cmd(pipeline, opts):

if len(missing_dependencies):
print "\nRuntime dependencies not satisfied:\n%s" % \
multiline_tablate(missing_dependencies, [30, 30, 150],
multiline_tablate(missing_dependencies, [30, 30, 60],
tablefmt="orgtbl", headers=["Dependency", "Module", "Description"])
else:
print "\nRuntime dependencies all satisfied"
14 changes: 13 additions & 1 deletion src/python/pimlico/cli/run.py
@@ -30,10 +30,19 @@ def run_cmd(pipeline, opts):
raise


def list_variants(pipeline, opts):
# "main" is the default variant and is always available (but not included in pipeline.available_variants)
variants = ["main"] + pipeline.available_variants
print "Available pipeline variants: %s" % ", ".join(variants)
print "Select one using the --variant option"


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Main command line interface to PiMLiCo")
parser.add_argument("pipeline_config", help="Config file to load a pipeline from")
parser.add_argument("--debug", "-d", help="Output verbose debugging info", action="store_true")
parser.add_argument("--variant", "-v", help="Load a particular variant of a pipeline. For a list of available "
"variants, use the 'variants' command", default="main")
subparsers = parser.add_subparsers(help="Select a sub-command")

check = subparsers.add_parser("check",
@@ -53,11 +62,14 @@ def run_cmd(pipeline, opts):
run.add_argument("--force-rerun", "-f", action="store_true",
help="Force running the module, even if it's already been run to completion")

variants = subparsers.add_parser("variants", help="List the available variants of a pipeline config")
variants.set_defaults(func=list_variants)

opts = parser.parse_args()

# Read in the pipeline config from the given file
try:
pipeline = PipelineConfig.load(opts.pipeline_config)
pipeline = PipelineConfig.load(opts.pipeline_config, variant=opts.variant)
except PipelineConfigParseError, e:
print >>sys.stderr, "Error reading pipeline config: %s" % e
sys.exit(1)
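
Putting the new variant support together, a session might look like this (the config filename is hypothetical, and run.py is invoked directly as a script here):

python src/python/pimlico/cli/run.py mypipeline.conf variants
python src/python/pimlico/cli/run.py --variant small mypipeline.conf check

The first command lists the variants declared in the config file (plus "main"); the second loads the "small" variant and runs the usual config and dependency checks against it.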
76 changes: 67 additions & 9 deletions src/python/pimlico/core/config.py
@@ -19,6 +19,7 @@ class PipelineConfig(object):
Special sections
================
- vars:
May contain any variable definitions, to be used later on in the pipeline. Further down, expressions like
`%(varname)s` will be expanded into the value assigned to `varname` in the vars section.
@@ -34,9 +35,28 @@ class PipelineConfig(object):
modules/packages used by the pipeline can be found. Typically, a config file is distributed with a
directory of Python code providing extra modules, datatypes, etc. Multiple paths are separated by colons (`:`).
Directives
==========
Certain special directives are processed when reading config files. They are lines that begin with "%%", followed
by the directive name and any arguments.
- `variant`:
Allows a line to be included only when loading a particular variant of a pipeline. The variant name is
specified as part of the directive in the form: `variant:variant_name`. You may include the line in more
than one variant by specifying multiple names, separated by commas (and no spaces). You can use the default
variant "main", so that the line will be left out of other variants. The rest of the line, after the directive
and variant name(s) is the content that will be included in those variants.
- `novariant`:
A line to be included only when not loading a variant of the pipeline. Equivalent to `variant:main`.
- `include`:
Include the entire contents of another file. The filename, specified relative to the config file in which the
directive is found, is given after a space.
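
For example, a hypothetical config fragment using all three directives (the
var name, the paths and the included filename are made up):

[pipeline]
name=example
release=0.1

%% novariant data_path=%(home)s/data/full_corpus
%% variant:quick data_path=%(home)s/data/small_sample
%% include preprocessing_modules.conf

Loading the main pipeline includes the first data_path line; loading the
"quick" variant includes the second instead. The include directive splices in
the named file in both cases.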
"""
def __init__(self, pipeline_config, local_config, raw_module_configs, module_order, filename=None):
def __init__(self, pipeline_config, local_config, raw_module_configs, module_order, filename=None,
variant="main", available_variants=[]):
self.available_variants = available_variants
self.variant = variant
# Stores the module names in the order they were specified in the config file
self.module_order = module_order
self.local_config = local_config
@@ -54,8 +74,8 @@ def __init__(self, pipeline_config, local_config, raw_module_configs, module_ord
"pipeline section")
check_release(self.pipeline_config["release"])

self.long_term_store = os.path.join(self.local_config["long_term_store"], self.name)
self.short_term_store = os.path.join(self.local_config["short_term_store"], self.name)
self.long_term_store = os.path.join(self.local_config["long_term_store"], self.name, self.variant)
self.short_term_store = os.path.join(self.local_config["short_term_store"], self.name, self.variant)

# Get paths to add to the python path for the pipeline
# Used so that a project can specify custom module types and other python code outside the pimlico source tree
@@ -155,7 +175,7 @@ def path_relative_to_config(self, path):
return os.path.abspath(os.path.join(config_dir, path))

@staticmethod
def load(filename, local_config=None):
def load(filename, local_config=None, variant="main"):
if local_config is None:
# Use the default locations for local config file
# May want to add other locations here...
@@ -183,10 +203,12 @@ def load(filename, local_config=None):
if attr not in local_config_data:
raise PipelineConfigParseError("required attribute '%s' is not specified in local config" % attr)

# TODO Perform pre-processing of config file to replace includes, etc
with open(filename, "r") as f:
# ConfigParser can read directly from a file, but we need to pre-process the text
config_text = f.read()
# Perform pre-processing of config file to replace includes, etc
config_text, available_variants = preprocess_config_file(os.path.abspath(filename), variant=variant)
# If we were asked to load a particular variant, check it's in the list of available variants
if variant is not None and variant != "main" and variant not in available_variants:
raise PipelineConfigParseError("could not load pipeline variant '%s': it is not declared anywhere in the "
"config file")
text_buffer = StringIO(config_text)

# Parse the config file text
@@ -220,7 +242,8 @@ def load(filename, local_config=None):
except ConfigParser.Error, e:
raise PipelineConfigParseError("could not parse config file. %s" % e)
# Do no further checking or processing at this stage: just keep raw dictionaries for the time being
return PipelineConfig(pipeline_config, local_config_data, raw_module_options, module_order, filename=filename)
return PipelineConfig(pipeline_config, local_config_data, raw_module_options, module_order,
filename=filename, variant=variant, available_variants=list(sorted(available_variants)))


def var_substitute(option_val, vars):
@@ -240,6 +263,41 @@ class PipelineStructureError(Exception):
pass


def preprocess_config_file(filename, variant="main"):
# Read in the file
config_lines = []
available_variants = set([])
with open(filename, "r") as f:
# ConfigParser can read directly from a file, but we need to pre-process the text
for line in f:
if line.startswith("%% "):
# Directive: process this now
directive, __, rest = line[3:].partition(" ")
if directive.lower() == "novariant":
# Include this line only if loading the main variant
if variant == "main":
config_lines.append(rest.lstrip())
elif directive.lower().startswith("variant:"):
variant_cond = directive[8:]
# Line conditional on a specific variant: include only if we're loading that variant
if variant_cond == variant:
config_lines.append(rest.lstrip())
# Keep a list of all available variants
available_variants.add(variant_cond)
elif directive == "include":
# Include another file, given relative to this one
include_filename = os.path.abspath(os.path.join(os.path.dirname(filename), rest.strip("\n ")))
# Run preprocessing over that file too, so we can have embedded includes, etc
incl_text, incl_variants = preprocess_config_file(include_filename, variant=variant)
config_lines.append(incl_text)
available_variants.update(incl_variants)
else:
raise PipelineConfigParseError("unknown directive '%s' used in config file" % directive)
else:
config_lines.append(line)
return "".join(config_lines), available_variants


def check_for_cycles(pipeline):
# Build a mapping representing module dependencies
dep_map = dict(
44 changes: 35 additions & 9 deletions src/python/pimlico/core/external/java.py
@@ -1,13 +1,13 @@
import os
import time
from subprocess import Popen, PIPE, check_output, STDOUT, CalledProcessError
import fcntl
import sys

from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR
from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR, PIMLICO_ROOT

from pimlico.core.modules.base import DependencyError
from pimlico.utils.communicate import timeout_process


CLASSPATH = ":".join(["%s/*" % JAVA_LIB_DIR, JAVA_BUILD_DIR])


@@ -140,14 +140,23 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],

# Determine which port the server started on (needed to support ephemeral ports)
# Don't hang on an error running the gateway launcher
output = None
try:
with timeout_process(proc, 1.0):
with timeout_process(proc, 3.0):
output = proc.stdout.readline()
except Exception, e:
# Try reading stderr to see if there's any info there
error_output = proc.stderr.read().strip("\n ")
raise JavaProcessError("error reading first line from gateway process: %s. Error output: %s" %
(e, error_output))
err_path = output_p4j_error_info(command, "?", "could not read", error_output)

raise JavaProcessError("error reading first line from gateway process: %s. Error output: %s (see %s for "
"more details)" % (e, error_output, err_path))

if output is None:
error_output = proc.stderr.read().strip("\n ")
err_path = output_p4j_error_info(command, "(timed out)", "", error_output)
raise JavaProcessError("timed out starting gateway server (for details see %s)" % err_path)

# Check whether there was an error reported
output = output.strip("\n ")
if output == "ERROR":
@@ -159,11 +168,16 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],
port_used = int(output)
except ValueError:
returncode = proc.poll()

stderr_output = proc.stderr.read().strip("\n ")
err_path = output_p4j_error_info(command, returncode, output, stderr_output)

if returncode is not None:
raise JavaProcessError("Py4J server process returned with return code %s: %s" % (returncode,
proc.stderr.read()))
raise JavaProcessError("Py4J server process returned with return code %s: %s (see %s for details)" %
(returncode, stderr_output, err_path))
else:
raise JavaProcessError("invalid output from Py4J server when started: '%s'" % output)
raise JavaProcessError("invalid output from Py4J server when started: '%s' (see %s for details)" %
(output, err_path))

# Start consumer threads so the process does not deadlock or hang
OutputConsumer(redirect_stdout, proc.stdout, daemon=daemonize_redirect).start()
@@ -174,6 +188,18 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],
return port_used, proc
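
A caller would typically connect py4j to the returned port along these lines (a sketch: the gateway class name is hypothetical, and GatewayParameters is the py4j 0.9 API for connecting on a non-default port):

from py4j.java_gateway import JavaGateway, GatewayParameters

port, proc = launch_gateway("pimlico.SomeGatewayServer")
gateway = JavaGateway(gateway_parameters=GatewayParameters(port=port))
# ... call methods on gateway.entry_point ...
gateway.shutdown()
proc.terminate()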


def output_p4j_error_info(command, returncode, stdout, stderr):
file_path = os.path.abspath(os.path.join(PIMLICO_ROOT, "py4j.err"))
with open(file_path, "w") as f:
print >>f, "Command:"
print >>f, " ".join(command)
print >>f, "Return code: %s" % returncode
print >>f, "Read from stdout:"
print >>f, stdout
print >>f, "Read from stderr:"
print >>f, stderr
return file_path


class DependencyCheckerError(Exception):
pass
4 changes: 4 additions & 0 deletions src/python/pimlico/core/modules/execute.py
@@ -8,6 +8,10 @@ def execute_module(pipeline, module_name, force_rerun=False, debug=False):
# Prepare a logger
log = get_console_logger("Pimlico", debug=debug)

pipeline_name = "'%s'" % pipeline.name if pipeline.variant == "main" else \
"'%s' (variant '%s')" % (pipeline.name, pipeline.variant)
log.info("Loaded pipeline %s" % pipeline_name)

# Load the module instance
module = pipeline[module_name]
log.info("Checking module config")
4 changes: 4 additions & 0 deletions src/python/pimlico/core/modules/inputs.py
@@ -11,6 +11,10 @@ class InputModuleInfo(BaseModuleInfo):
Base class for input modules. These don't generally get executed; they just provide a way to iterate over
input data.
You probably don't want to subclass this. It's usually simplest to define a datatype for reading the input
data and then just specify its class as the module's type. This results in a subclass of this module info
being created dynamically to read that data.
Note that module_executable is typically set to False and the base class does this. However, some input
modules need to be executed before the input is usable, for example to collect stats about the input
data.
2 changes: 1 addition & 1 deletion src/python/pimlico/core/modules/map.py
@@ -68,7 +68,7 @@ def execute(self, module_instance_info):
with self.get_writer(module_instance_info) as writer:
for archive, filename, docs in pbar(input_iterator.archive_iter()):
# Get the subclass to process the doc
result = self.process_document(filename, *docs)
result = self.process_document(archive, filename, *docs)
# Write the result to the output corpus
writer.add_document(archive, filename, result)
complete = True
11 changes: 7 additions & 4 deletions src/python/pimlico/datatypes/word_annotations.py
@@ -79,7 +79,7 @@ def word_re(self):
self._word_re = re.compile(word_re)
return self._word_re

def parse_annotations(self, doc):
def process_document(self, doc):
sentences = []
while len(doc):
# Find the next sentence boundary
@@ -103,8 +103,6 @@ def parse_annotations(self, doc):
sentences.append(word_dicts)
return sentences

document_preprocessors = [parse_annotations]

def data_ready(self):
if not super(WordAnnotationCorpus, self).data_ready():
return False
@@ -165,7 +163,7 @@ def add_document(self, archive_name, doc_name, data):
super(SimpleWordAnnotationCorpusWriter, self).add_document(archive_name, doc_name, doc_string)


def add_annotation_field(input_name, add_fields):
def AddAnnotationField(input_name, add_fields):
"""
Dynamic type constructor that can be used in place of a module's output type. When called
(when the output type is needed), dynamically creates a new type that is a WordAnnotationCorpus
@@ -200,6 +198,11 @@ def _builder(module_info):
(input_name, input_datatype.__name__))
base_annotation_fields = input_datatype.annotation_fields

for field in add_fields:
if field in base_annotation_fields:
raise ModuleInfoLoadError("trying to add a field '%s' to data that already has a field with "
"that name" % field)

class ExtendedWordAnnotationCorpus(WordAnnotationCorpus):
annotation_fields = base_annotation_fields + add_fields

