Major changes to add further standard modules, including major extension of CoreNLP wrappers. Some new docs detailing future plans.
Mark Granroth-Wilding committed Apr 1, 2016
1 parent 2492d27, commit 34f9523
Showing 26 changed files with 866 additions and 148 deletions.
@@ -0,0 +1,10 @@
===============
Berkeley Parser
===============

https://github.com/slavpetrov/berkeleyparser

Java constituency parser. Pre-trained models are also provided in the Github repo.

Probably no need for a Java wrapper here. The parser itself accepts input on stdin and outputs to stdout,
so just use a subprocess with pipes.
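
Something along these lines should do it (a rough sketch only; the jar and grammar
filenames are assumptions, to be checked against the Github repo)::

    import subprocess

    # Start the parser as a long-running subprocess, reading one sentence
    # per line on stdin and writing one parse tree per line on stdout
    parser = subprocess.Popen(
        ["java", "-jar", "BerkeleyParser.jar", "-gr", "eng_sm6.gr"],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    parser.stdin.write("The cat sat on the mat .\n")
    parser.stdin.flush()
    tree = parser.stdout.readline()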
@@ -0,0 +1,14 @@
=========
C&C Tools
=========

http://svn.ask.it.usyd.edu.au/trac/candc/

Can't fetch via the make script, as you need to sign up for a login (free). Need to get the user to do this.
Pain in the arse.

Have written wrappers around C&C before, using subprocesses (with the REST server). Most of that code can be
reused as is (see the sketch below).

Consider also whether it's even worth wrapping this. Might be better off just using the new Java
implementation, which is faster and just as accurate:

https://www.cl.cam.ac.uk/~sc609/java-candc.html
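
The subprocess + server approach would look roughly like this (a sketch only; the server
command, port and URL scheme are placeholders, not C&C's actual interface)::

    import subprocess
    import urllib
    import urllib2

    # Start the C&C server as a subprocess (command line is a placeholder)
    server = subprocess.Popen(["candc-server", "--port", "9000"])

    # POST raw text to the server and read back the parser output
    response = urllib2.urlopen("http://localhost:9000/parse",
                               urllib.urlencode({"text": "The cat sat on the mat ."}))
    parsed = response.read()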
@@ -0,0 +1,13 @@
=============
Cherry Picker
=============

**Coreference resolver**

http://www.hlt.utdallas.edu/~altaf/cherrypicker/

Requires NER, POS tagging and constituency parsing to be done first. Tools for all of these are included in the
Cherry Picker codebase, but we just need a wrapper around the Cherry Picker tool itself to be able to feed these
annotations in from other modules and perform coref.

Write a Java wrapper and interface with it using Py4J, as with OpenNLP.
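
On the Python side, that looks roughly like this (a sketch assuming a Java-side GatewayServer
wrapping Cherry Picker is already running; the entry-point method name is made up)::

    from py4j.java_gateway import JavaGateway

    # Connect to the Java gateway process started alongside the pipeline
    gateway = JavaGateway()
    annotated_doc = "..."  # document with NER/POS/parse annotations attached
    # resolveCoref is a hypothetical method exposed by the Java wrapper's
    # entry point: it takes the pre-annotated text and returns coref chains
    chains = gateway.entry_point.resolveCoref(annotated_doc)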
@@ -0,0 +1,166 @@
""" | ||
Document map modules can in general be fairly easily parallelized. | ||
""" | ||
from multiprocessing import Queue | ||
|
||
from itertools import islice | ||
|
||
from pimlico.core.modules.execute import ModuleExecutionError | ||
from pimlico.core.modules.map import DocumentMapModuleExecutor | ||
from pimlico.datatypes.base import InvalidDocument | ||
from pimlico.datatypes.tar import AlignedTarredCorpora | ||
from pimlico.utils.core import multiwith | ||
from pimlico.utils.progress import get_progress_bar | ||
|
||
|
||
class DocumentProcessorPool(object): | ||
""" | ||
Base class for pools that provide an easy implementation of parallelization for document map modules. | ||
""" | ||
def __init__(self, processes): | ||
self.queue = self.create_queue() | ||
self.processes = processes | ||
|
||
@staticmethod | ||
def create_queue(): | ||
""" | ||
May be overridden by subclasses to provide different implementations of a Queue. By default, uses the | ||
multiprocessing queue type. Whatever is returned, it should implement the interface of Queue.Queue. | ||
""" | ||
return Queue() | ||
|
||
def process_document(self, archive, filename, *docs): | ||
""" | ||
Lightweight method to be called from the main process to fire off the processing in a worker | ||
process. The worker should place its output, wrapped in a ProcessOutput, on the output_queue when | ||
finished. This method should return without blocking. | ||
""" | ||
raise NotImplementedError | ||
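

# Illustrative sketch, not part of the original commit: one way a concrete
# pool might implement process_document(), spawning a multiprocessing.Process
# per document. A real pool would probably keep long-lived workers instead,
# and the upper-casing "processing" is a stand-in for real per-document work.
from multiprocessing import Process


def _example_worker(queue, archive, filename, docs):
    # Do the real work in the worker process, then hand the result back on
    # the queue, wrapped in a ProcessOutput as the contract requires
    result = tuple(doc.upper() for doc in docs)
    queue.put(ProcessOutput(archive, filename, result))


class ExampleProcessorPool(DocumentProcessorPool):
    def process_document(self, archive, filename, *docs):
        # Fire off a worker and return immediately, without blocking
        Process(target=_example_worker,
                args=(self.queue, archive, filename, docs)).start()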


class ProcessOutput(object):
    """
    Wrapper for all result data coming out from a worker.
    """
    def __init__(self, archive, filename, data):
        self.data = data
        self.filename = filename
        self.archive = archive


class DocumentMapModuleParallelExecutor(DocumentMapModuleExecutor):
    """
    Inherit from this class, instead of DocumentMapModuleExecutor, to provide parallelization.
    """
    def preprocess_parallel(self):
        """ Defaults to calling preprocess() """
        self.preprocess()

    def postprocess_parallel(self, error=False):
        """ Defaults to calling postprocess() """
        self.postprocess(error=error)

    def create_pool(self, processes):
        """
        Should return an instance of the pool to be used for document processing. Should generally be a
        subclass of DocumentProcessorPool.

        Always called after preprocess_parallel().
        """
        raise NotImplementedError

    def execute(self):
        """
        Overrides the single-core execute(). If the config asks for multiple processes to be used, runs
        the parallel implementation; otherwise falls back to the superclass's single-core version.
        """
        processes = self.info.pipeline.processes
        if processes < 2:
            # Not multiprocessing: just use the single-core version
            super(DocumentMapModuleParallelExecutor, self).execute()
        else:
            # We may have multiple inputs, which should be aligned tarred corpora
            # If there's only one, this also works
            self.input_corpora = [self.info.get_input(input_name)
                                  for input_name in self.info.input_names]
            input_iterator = AlignedTarredCorpora(self.input_corpora)

            # Call the set-up routine, if one's been defined
            self.log.info("Preparing parallel document map execution for %d documents with %d processes" %
                          (len(input_iterator), processes))
            self.preprocess_parallel()

            # Start up a pool
            pool = self.create_pool(processes)
            output_queue = pool.queue
            self.log.info("Process pool created for processing %d documents in parallel" % processes)

            pbar = get_progress_bar(len(input_iterator),
                                    title="%s map" % self.info.module_type_name.replace("_", " ").capitalize())
            complete = False
            docs_processing = []
            result_buffer = {}
            docs_complete = 0
            try:
                # Prepare a corpus writer for the output
                with multiwith(*self.get_writers()) as writers:
                    # Inputs will be taken from this as they're needed
                    input_iter = iter(input_iterator.archive_iter())
                    # Push the first inputs into the pool
                    for archive, filename, docs in islice(input_iter, processes):
                        pool.process_document(archive, filename, *docs)
                        # Keep track of the order we need the results in
                        docs_processing.append((archive, filename))

                    # Collect results as they come off the queue
                    while len(docs_processing):
                        result = output_queue.get()
                        docs_complete += 1
                        pbar.update(docs_complete)
                        # We've got some result, but it might not be the one we're looking for
                        # Add it to a buffer, so we can keep it and only output it when its turn comes up
                        result_buffer[(result.archive, result.filename)] = result.data
                        # Before doing anything else, set the next job going
                        try:
                            archive, filename, docs = next(input_iter)
                        except StopIteration:
                            # No more documents to send: just receive the remaining results until we've got
                            # all the ones we've already sent
                            pass
                        else:
                            pool.process_document(archive, filename, *docs)
                            docs_processing.append((archive, filename))

                        # Write out as many of the sent docs as have their output available,
                        # maintaining the order in which they were put in
                        while len(docs_processing) and docs_processing[0] in result_buffer:
                            archive, filename = docs_processing.pop(0)
                            next_output = result_buffer.pop((archive, filename))
                            # Next document processed: output the result precisely as in the single-core case
                            if type(next_output) is InvalidDocument:
                                # Just got a single invalid document out: write it out to every output
                                next_output = [next_output] * len(writers)
                            elif type(next_output) is not tuple:
                                # If the processor produces a single result and there's only one output, fine
                                next_output = (next_output,)
                            if len(next_output) != len(writers):
                                raise ModuleExecutionError(
                                    "%s executor's process_document() returned %d results for a document, but the "
                                    "module has %d outputs" % (type(self).__name__, len(next_output), len(writers))
                                )
                            # Write the result to the output corpora
                            for result, writer in zip(next_output, writers):
                                writer.add_document(archive, filename, result)

                pbar.finish()
                complete = True
            finally:
                # Call the finishing-off routine, if one's been defined
                if complete:
                    self.log.info("Document mapping complete. Finishing off")
                else:
                    self.log.info("Document mapping failed. Finishing off")
                self.postprocess_parallel(error=not complete)
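

# Illustrative sketch, not part of the original commit: a module's executor
# ties the pieces together roughly like this, using the hypothetical
# ExampleProcessorPool sketched above.
class ExampleParallelExecutor(DocumentMapModuleParallelExecutor):
    def create_pool(self, processes):
        # The framework calls this after preprocess_parallel()
        return ExampleProcessorPool(processes)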