Commit
Further work on OpenNLP modules. Started on Stanford CoreNLP wrapper. Tried using a Python wrapper, but it doesn't work well, so going to try a different one.
Mark Granroth-Wilding committed Mar 29, 2016
1 parent 6dcb008 commit 7647aff
Showing 18 changed files with 316 additions and 55 deletions.
28 changes: 9 additions & 19 deletions lib/java/Makefile
@@ -70,25 +70,15 @@ py4j0.9.2.jar :

###################################

stanford-parser :
$(MAKE) stanford-parser-full-2014-01-04.zip
unzip stanford-parser-full-2014-01-04.zip
mv stanford-parser-full-2014-01-04/ stanford-parser
rm stanford-parser-full-2014-01-04.zip

stanford-parser-full-2014-01-04.zip :
$(FETCH) http://nlp.stanford.edu/software/stanford-parser-full-2014-01-04.zip

stanford-tagger :
$(MAKE) stanford-postagger-2014-01-04.zip
unzip stanford-postagger-2014-01-04.zip
mv stanford-postagger-2014-01-04 stanford-tagger
rm stanford-postagger-2014-01-04.zip

stanford-postagger-2014-01-04.zip :
$(FETCH) http://nlp.stanford.edu/downloads/stanford-postagger-2014-01-04.zip

####################################
corenlp : stanford-corenlp-3.6.0.jar

stanford-corenlp-3.6.0.jar :
$(FETCH) http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip
unzip stanford-corenlp-full-2015-12-09.zip
mv stanford-corenlp-full-2015-12-09/*.jar .
rm -rf stanford-corenlp-full-2015-12-09 stanford-corenlp-full-2015-12-09.zip
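
# Usage sketch (an assumption: GNU make, with $(FETCH) defined near the top of
# this Makefile as a download command such as wget or curl). Running
#     make corenlp
# fetches the CoreNLP 3.6.0 distribution and leaves its jars in this directory.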

##################################

guava : guava.jar

10 changes: 10 additions & 0 deletions lib/python/Makefile
@@ -55,3 +55,13 @@ bs4 :
mv beautifulsoup4-4.3.2/bs4 .
rmdir beautifulsoup4-4.3.2
rm beautifulsoup4-4.3.2.tar.gz

################
corenlp : stanford_corenlp_pywrapper

stanford_corenlp_pywrapper :
$(FETCH) https://github.com/brendano/stanford_corenlp_pywrapper/archive/master.zip
unzip master.zip
mv stanford_corenlp_pywrapper-master/stanford_corenlp_pywrapper .
rm -rf stanford_corenlp_pywrapper-master master.zip
# Installed Python wrapper: remember to install the Java libraries as well!
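
For reference, the wrapper fetched above is driven roughly as follows. This is a sketch based on the stanford_corenlp_pywrapper README at the time; the exact constructor arguments are an assumption, and the jar path is hypothetical and must point at the jars fetched by the Java Makefile's corenlp target.

from stanford_corenlp_pywrapper import CoreNLP

# Start a CoreNLP subprocess configured for POS tagging
# (jar path hypothetical: point it at the jars from `make corenlp`)
proc = CoreNLP("pos", corenlp_jars=["lib/java/*"])
# Returns a dict with one entry per sentence: tokens, POS tags, etc.
print proc.parse_doc("Pimlico wraps CoreNLP. This is a test.")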
6 changes: 5 additions & 1 deletion src/python/pimlico/cli/check.py
@@ -6,6 +6,10 @@


def check_cmd(pipeline, opts):
# Output what variants are available and say which we're checking
print "Available pipeline variants: %s" % ", ".join(["main"] + pipeline.available_variants)
print "Checking variant '%s'\n" % pipeline.variant

# Load all the modules' metadata
# This makes sure none of the modules have trouble loading
for name in pipeline.modules:
@@ -48,7 +52,7 @@ def check_cmd(pipeline, opts):

if len(missing_dependencies):
print "\nRuntime dependencies not satisfied:\n%s" % \
multiline_tablate(missing_dependencies, [30, 30, 150],
multiline_tablate(missing_dependencies, [30, 30, 60],
tablefmt="orgtbl", headers=["Dependency", "Module", "Description"])
else:
print "\nRuntime dependencies all satisfied"
14 changes: 13 additions & 1 deletion src/python/pimlico/cli/run.py
@@ -30,10 +30,19 @@ def run_cmd(pipeline, opts):
raise


def list_variants(pipeline, opts):
# "main" is the default variant and is always available (but not included in pipeline.available_variants)
variants = ["main"] + pipeline.available_variants
print "Available pipeline variants: %s" % ", ".join(variants)
print "Select one using the --variant option"


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Main command line interface to PiMLiCo")
parser.add_argument("pipeline_config", help="Config file to load a pipeline from")
parser.add_argument("--debug", "-d", help="Output verbose debugging info", action="store_true")
parser.add_argument("--variant", "-v", help="Load a particular variant of a pipeline. For a list of available "
"variants, use the 'variants' command", default="main")
subparsers = parser.add_subparsers(help="Select a sub-command")

check = subparsers.add_parser("check",
@@ -53,11 +62,14 @@ def run_cmd(pipeline, opts):
run.add_argument("--force-rerun", "-f", action="store_true",
help="Force running the module, even if it's already been run to completion")

variants = subparsers.add_parser("variants", help="List the available variants of a pipeline config")
variants.set_defaults(func=list_variants)

opts = parser.parse_args()

# Read in the pipeline config from the given file
try:
pipeline = PipelineConfig.load(opts.pipeline_config)
pipeline = PipelineConfig.load(opts.pipeline_config, variant=opts.variant)
except PipelineConfigParseError, e:
print >>sys.stderr, "Error reading pipeline config: %s" % e
sys.exit(1)
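
Putting the new variant support together, a session might look like this (the config filename is hypothetical, and run.py is invoked directly as a script here):

python src/python/pimlico/cli/run.py mypipeline.conf variants
python src/python/pimlico/cli/run.py --variant small mypipeline.conf check

The first command lists the variants declared in the config file (plus "main"); the second loads the "small" variant and runs the usual config and dependency checks against it.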
76 changes: 67 additions & 9 deletions src/python/pimlico/core/config.py
@@ -19,6 +19,7 @@ class PipelineConfig(object):
Special sections
================
- vars:
May contain any variable definitions, to be used later on in the pipeline. Further down, expressions like
`%(varname)s` will be expanded into the value assigned to `varname` in the vars section.
@@ -34,9 +35,28 @@ class PipelineConfig(object):
modules/packages used by the pipeline can be found. Typically, a config file is distributed with a
directory of Python code providing extra modules, datatypes, etc. Multiple paths are separated by colons (`:`).
Directives
==========
Certain special directives are processed when reading config files. They are lines that begin with "%%", followed
by the directive name and any arguments.
- `variant`:
Allows a line to be included only when loading a particular variant of a pipeline. The variant name is
specified as part of the directive in the form: `variant:variant_name`. You may include the line in more
than one variant by specifying multiple names, separated by commas (and no spaces). You can use the default
variant "main", so that the line will be left out of other variants. The rest of the line, after the directive
and variant name(s) is the content that will be included in those variants.
- `novariant`:
A line to be included only when not loading a variant of the pipeline. Equivalent to `variant:main`.
- `include`:
Include the entire contents of another file. The filename, specified relative to the config file in which the
directive is found, is given after a space.
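
For example, a hypothetical config fragment using all three directives (the
var name, the paths and the included filename are made up):

[pipeline]
name=example
release=0.1

%% novariant data_path=%(home)s/data/full_corpus
%% variant:quick data_path=%(home)s/data/small_sample
%% include preprocessing_modules.conf

Loading the main pipeline includes the first data_path line; loading the
"quick" variant includes the second instead. The include directive splices in
the named file in both cases.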
"""
def __init__(self, pipeline_config, local_config, raw_module_configs, module_order, filename=None):
def __init__(self, pipeline_config, local_config, raw_module_configs, module_order, filename=None,
variant="main", available_variants=[]):
self.available_variants = available_variants
self.variant = variant
# Stores the module names in the order they were specified in the config file
self.module_order = module_order
self.local_config = local_config
@@ -54,8 +74,8 @@ def __init__(self, pipeline_config, local_config, raw_module_configs, module_ord
"pipeline section")
check_release(self.pipeline_config["release"])

self.long_term_store = os.path.join(self.local_config["long_term_store"], self.name)
self.short_term_store = os.path.join(self.local_config["short_term_store"], self.name)
self.long_term_store = os.path.join(self.local_config["long_term_store"], self.name, self.variant)
self.short_term_store = os.path.join(self.local_config["short_term_store"], self.name, self.variant)

# Get paths to add to the python path for the pipeline
# Used so that a project can specify custom module types and other python code outside the pimlico source tree
@@ -155,7 +175,7 @@ def path_relative_to_config(self, path):
return os.path.abspath(os.path.join(config_dir, path))

@staticmethod
def load(filename, local_config=None):
def load(filename, local_config=None, variant="main"):
if local_config is None:
# Use the default locations for local config file
# May want to add other locations here...
@@ -183,10 +203,12 @@ def load(filename, local_config=None):
if attr not in local_config_data:
raise PipelineConfigParseError("required attribute '%s' is not specified in local config" % attr)

# TODO Perform pre-processing of config file to replace includes, etc
with open(filename, "r") as f:
# ConfigParser can read directly from a file, but we need to pre-process the text
config_text = f.read()
# Perform pre-processing of config file to replace includes, etc
config_text, available_variants = preprocess_config_file(os.path.abspath(filename), variant=variant)
# If we were asked to load a particular variant, check it's in the list of available variants
if variant is not None and variant != "main" and variant not in available_variants:
raise PipelineConfigParseError("could not load pipeline variant '%s': it is not declared anywhere in the "
"config file")
text_buffer = StringIO(config_text)

# Parse the config file text
@@ -220,7 +242,8 @@ def load(filename, local_config=None):
except ConfigParser.Error, e:
raise PipelineConfigParseError("could not parse config file. %s" % e)
# Do no further checking or processing at this stage: just keep raw dictionaries for the time being
return PipelineConfig(pipeline_config, local_config_data, raw_module_options, module_order, filename=filename)
return PipelineConfig(pipeline_config, local_config_data, raw_module_options, module_order,
filename=filename, variant=variant, available_variants=list(sorted(available_variants)))


def var_substitute(option_val, vars):
@@ -240,6 +263,41 @@ class PipelineStructureError(Exception):
pass


def preprocess_config_file(filename, variant="main"):
# Read in the file
config_lines = []
available_variants = set([])
with open(filename, "r") as f:
# ConfigParser can read directly from a file, but we need to pre-process the text
for line in f:
if line.startswith("%% "):
# Directive: process this now
directive, __, rest = line[3:].partition(" ")
if directive.lower() == "novariant":
# Include this line only if loading the main variant
if variant == "main":
config_lines.append(rest.lstrip())
elif directive.lower().startswith("variant:"):
variant_cond = directive[8:]
# Line conditional on a specific variant: include only if we're loading that variant
if variant_cond == variant:
config_lines.append(rest.lstrip())
# Keep a list of all available variants
available_variants.add(variant_cond)
elif directive == "include":
# Include another file, given relative to this one
include_filename = os.path.abspath(os.path.join(os.path.dirname(filename), rest.strip("\n ")))
# Run preprocessing over that file too, so we can have embedded includes, etc
incl_text, incl_variants = preprocess_config_file(include_filename, variant=variant)
config_lines.append(incl_text)
available_variants.update(incl_variants)
else:
raise PipelineConfigParseError("unknown directive '%s' used in config file" % directive)
else:
config_lines.append(line)
return "".join(config_lines), available_variants


def check_for_cycles(pipeline):
# Build a mapping representing module dependencies
dep_map = dict(
44 changes: 35 additions & 9 deletions src/python/pimlico/core/external/java.py
@@ -1,13 +1,13 @@
import os
import time
from subprocess import Popen, PIPE, check_output, STDOUT, CalledProcessError
import fcntl
import sys

from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR
from pimlico import JAVA_LIB_DIR, JAVA_BUILD_DIR, PIMLICO_ROOT

from pimlico.core.modules.base import DependencyError
from pimlico.utils.communicate import timeout_process


CLASSPATH = ":".join(["%s/*" % JAVA_LIB_DIR, JAVA_BUILD_DIR])


@@ -140,14 +140,23 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],

# Determine which port the server started on (needed to support ephemeral ports)
# Don't hang on an error running the gateway launcher
output = None
try:
with timeout_process(proc, 1.0):
with timeout_process(proc, 3.0):
output = proc.stdout.readline()
except Exception, e:
# Try reading stderr to see if there's any info there
error_output = proc.stderr.read().strip("\n ")
raise JavaProcessError("error reading first line from gateway process: %s. Error output: %s" %
(e, error_output))
err_path = output_p4j_error_info(command, "?", "could not read", error_output)

raise JavaProcessError("error reading first line from gateway process: %s. Error output: %s (see %s for "
"more details)" % (e, error_output, err_path))

if output is None:
error_output = proc.stderr.read().strip("\n ")
err_path = output_p4j_error_info(command, "(timed out)", "", error_output)
raise JavaProcessError("timed out starting gateway server (for details see %s)" % err_path)

# Check whether there was an error reported
output = output.strip("\n ")
if output == "ERROR":
@@ -159,11 +168,16 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],
port_used = int(output)
except ValueError:
returncode = proc.poll()

stderr_output = proc.stderr.read().strip("\n ")
err_path = output_p4j_error_info(command, returncode, output, stderr_output)

if returncode is not None:
raise JavaProcessError("Py4J server process returned with return code %s: %s" % (returncode,
proc.stderr.read()))
raise JavaProcessError("Py4J server process returned with return code %s: %s (see %s for details)" %
(returncode, stderr_output, err_path))
else:
raise JavaProcessError("invalid output from Py4J server when started: '%s'" % output)
raise JavaProcessError("invalid output from Py4J server when started: '%s' (see %s for details)" %
(output, err_path))

# Start consumer threads so the process does not deadlock or hang
OutputConsumer(redirect_stdout, proc.stdout, daemon=daemonize_redirect).start()
@@ -174,6 +188,18 @@ def launch_gateway(gateway_class="py4j.GatewayServer", args=[],
return port_used, proc
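
A caller would typically connect py4j to the returned port along these lines (a sketch: the gateway class name is hypothetical, and GatewayParameters is the py4j 0.9 API for connecting on a non-default port):

from py4j.java_gateway import JavaGateway, GatewayParameters

port, proc = launch_gateway("pimlico.SomeGatewayServer")
gateway = JavaGateway(gateway_parameters=GatewayParameters(port=port))
# ... call methods on gateway.entry_point ...
gateway.shutdown()
proc.terminate()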


def output_p4j_error_info(command, returncode, stdout, stderr):
file_path = os.path.abspath(os.path.join(PIMLICO_ROOT, "py4j.err"))
with open(file_path, "w") as f:
print >>f, "Command:"
print >>f, " ".join(command)
print >>f, "Return code: %s" % returncode
print >>f, "Read from stdout:"
print >>f, stdout
print >>f, "Read from stderr:"
print >>f, stderr
return file_path


class DependencyCheckerError(Exception):
pass
4 changes: 4 additions & 0 deletions src/python/pimlico/core/modules/execute.py
@@ -8,6 +8,10 @@ def execute_module(pipeline, module_name, force_rerun=False, debug=False):
# Prepare a logger
log = get_console_logger("Pimlico", debug=debug)

pipeline_name = "'%s'" % pipeline.name if pipeline.variant == "main" else \
"'%s' (variant '%s')" % (pipeline.name, pipeline.variant)
log.info("Loaded pipeline %s" % pipeline_name)

# Load the module instance
module = pipeline[module_name]
log.info("Checking module config")
4 changes: 4 additions & 0 deletions src/python/pimlico/core/modules/inputs.py
@@ -11,6 +11,10 @@ class InputModuleInfo(BaseModuleInfo):
Base class for input modules. These don't generally get executed; they just provide a way to iterate over
input data.
You probably don't want to subclass this. It's usually simplest to define a datatype for reading the input
data and then just specify its class as the module's type. This results in a subclass of this module info
being created dynamically to read that data.
Note that module_executable is typically set to False and the base class does this. However, some input
modules need to be executed before the input is usable, for example to collect stats about the input
data.
2 changes: 1 addition & 1 deletion src/python/pimlico/core/modules/map.py
@@ -68,7 +68,7 @@ def execute(self, module_instance_info):
with self.get_writer(module_instance_info) as writer:
for archive, filename, docs in pbar(input_iterator.archive_iter()):
# Get the subclass to process the doc
result = self.process_document(filename, *docs)
result = self.process_document(archive, filename, *docs)
# Write the result to the output corpus
writer.add_document(archive, filename, result)
complete = True
11 changes: 7 additions & 4 deletions src/python/pimlico/datatypes/word_annotations.py
@@ -79,7 +79,7 @@ def word_re(self):
self._word_re = re.compile(word_re)
return self._word_re

def parse_annotations(self, doc):
def process_document(self, doc):
sentences = []
while len(doc):
# Find the next sentence boundary
@@ -103,8 +103,6 @@ def parse_annotations(self, doc):
sentences.append(word_dicts)
return sentences

document_preprocessors = [parse_annotations]

def data_ready(self):
if not super(WordAnnotationCorpus, self).data_ready():
return False
@@ -165,7 +163,7 @@ def add_document(self, archive_name, doc_name, data):
super(SimpleWordAnnotationCorpusWriter, self).add_document(archive_name, doc_name, doc_string)


def add_annotation_field(input_name, add_fields):
def AddAnnotationField(input_name, add_fields):
"""
Dynamic type constructor that can be used in place of a module's output type. When called
(when the output type is needed), dynamically creates a new type that is a WordAnnotationCorpus
@@ -200,6 +198,11 @@ def _builder(module_info):
(input_name, input_datatype.__name__))
base_annotation_fields = input_datatype.annotation_fields

for field in add_fields:
if field in base_annotation_fields:
raise ModuleInfoLoadError("trying to add a field '%s' to data that already has a field with "
"that name" % field)

class ExtendedWordAnnotationCorpus(WordAnnotationCorpus):
annotation_fields = base_annotation_fields + add_fields

