Skip to content

Commit

Permalink
Added new regex matching module, for scanning a corpus for occurrence…
Browse files Browse the repository at this point in the history
…s of expressions including POS tags.
  • Loading branch information
Mark Granroth-Wilding committed Mar 23, 2016
1 parent 477271b commit 2eee27b
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 2 deletions.
5 changes: 3 additions & 2 deletions src/python/pimlico/core/modules/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@ def postprocess(self, info, error=False):
def execute(self, module_instance_info):
# We may have multiple inputs, which should be aligned tarred corpora
# If there's only one, this also works
input_iterator = AlignedTarredCorpora([module_instance_info.get_input(input_name)
for input_name in module_instance_info.input_names])
self.input_corpora = [module_instance_info.get_input(input_name)
for input_name in module_instance_info.input_names]
input_iterator = AlignedTarredCorpora(self.input_corpora)

# Call the set-up routine, if one's been defined
self.log.info("Preparing document map execution for %s documents" % len(input_iterator))
Expand Down
4 changes: 4 additions & 0 deletions src/python/pimlico/datatypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ def load_datatype(path):
raise DatatypeLoadError("could not load datatype class %s in module %s" % (cls_name, mod_path))
cls = getattr(mod, cls_name)

if type(cls) is not type(object):
raise DatatypeLoadError("tried to load datatype %s.%s, but result was not a class, it was a %s" %
(mod, cls_name, type(cls).__name__))

if not issubclass(cls, PimlicoDatatype):
raise DatatypeLoadError("%s is not a Pimlico datatype" % path)
return cls
2 changes: 2 additions & 0 deletions src/python/pimlico/modules/opennlp/pos/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
from pimlico.core.modules.map import DocumentMapModuleInfo
from pimlico.core.paths import abs_path_or_model_dir_path
from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
from .datatypes import PosTaggedCorpus


class ModuleInfo(DocumentMapModuleInfo):
module_type_name = "opennlp_pos_tagger"
module_inputs = [("text", TokenizedCorpus)]
module_outputs = [("documents", PosTaggedCorpus)]
module_options = {
"model": {
"help": "POS tagger model, full path or filename. If a filename is given, it is expected to be in the "
Expand Down
1 change: 1 addition & 0 deletions src/python/pimlico/modules/regex/pos_text/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__author__ = 'mtw29'
31 changes: 31 additions & 0 deletions src/python/pimlico/modules/regex/pos_text/exec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pimlico.core.modules.map import DocumentMapModuleExecutor


class ModuleExecutor(DocumentMapModuleExecutor):
    """
    Executor that runs the module's pre-compiled regex over each input
    document and writes one output line per match, made up of the match's
    named groups in the form ``name=value``.

    """
    def preprocess(self, info):
        # Ask the first input corpus for raw text rather than processed
        # documents, so the regex can be run directly over the stored text
        self.input_corpora[0].raw_input = True
        # The regex was compiled when the module info was loaded
        self.regex = info.regex
        self.log.info("Matching regex %s" % self.regex.pattern)
        # Running totals, reported in postprocess()
        self.match_count = self.matched_docs = 0

    def process_document(self, filename, doc):
        # Pad with a space on each side so that expressions can also match
        # the very first and very last token of the document
        padded = " %s " % doc
        # One output line per regex match, listing its named groups
        lines = [
            " ".join("%s=%s" % pair for pair in found.groupdict().items())
            for found in self.regex.finditer(padded)
        ]

        hits = len(lines)
        self.match_count += hits
        if hits > 0:
            self.matched_docs += 1
        return "\n".join(lines)

    def postprocess(self, info, error=False):
        totals = (self.match_count, self.matched_docs)
        self.log.info("Regex matched a total of %d times in %d documents" % totals)
97 changes: 97 additions & 0 deletions src/python/pimlico/modules/regex/pos_text/info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import re
from pimlico.core.modules.base import ModuleInfoLoadError
from pimlico.core.modules.map import DocumentMapModuleInfo
from pimlico.datatypes.tar import TarredCorpus
from pimlico.modules.opennlp.pos.datatypes import PosTaggedCorpus


class ModuleInfo(DocumentMapModuleInfo):
    """
    Module info for the POS-text matcher: takes a POS-tagged corpus and
    prepares a compiled regex (``self.regex``) that is either given directly
    through the ``regex`` option or built from the simpler ``expr`` option.

    Tokens of an ``expr`` are handled as follows:

    - ``var=TAG``: match a POS tag and bind the tagged word to ``var``
    - ``/re/``: apply the regex ``re`` to the word
    - upper-case token: match a POS tag (a trailing ``*`` matches a prefix)
    - anything else: match the word itself

    Raises ModuleInfoLoadError if neither option is given, if a variable is
    bound to something that is not a POS tag, or if the resulting regex
    cannot be compiled.

    """
    module_type_name = "pos_text_matcher"
    module_inputs = [("text", PosTaggedCorpus)]
    module_outputs = [("documents", TarredCorpus)]
    module_options = {
        "expr": {
            "help": "An expression to determine what to search for in sentences. Consists of a sequence of tokens, "
                    "each a word or POS tag. Words are lower case (matching is case insensitive), POS "
                    "tags are upper case. A token of the form 'x=TAG' matches the tag TAG and assigns it to a "
                    "variable extracted in the output. POS tags ending with a * match tag prefixes. "
                    "E.g. 'my mything=NN* is very myadj=JJ' will matches phrases like 'my foot is very sore', "
                    "producing 'mything=foot' and 'myadj=sore'",
        },
        "regex": {
            "help": "Instead of matching a regex based on a simple expression given in 'expr', specify a regex "
                    "directly. The regex will be matching against POS tagged text, where each word is followed "
                    "by a POS tag separated by '|' and words are separated by spaces. Use named groups to specify "
                    "the attributes that are extracted",
        },
    }

    def __init__(self, *args, **kwargs):
        super(ModuleInfo, self).__init__(*args, **kwargs)

        # Process the regex so we've got it ready for matching
        if self.options["regex"] is not None:
            # We've been given a regex directly: just compile this
            try:
                self.regex = re.compile(self.options["regex"])
            except Exception as e:
                # Any errors here should be reported as errors preparing the module info
                raise ModuleInfoLoadError("could not parse regex '%s': %s" % (self.options["regex"], e))
        elif self.options["expr"] is not None:
            # Parse the expression into a regex we can use for matching
            expression = self.options["expr"]
            pos_re = re.compile(r"[A-Z]+\*?")
            regex = ""

            for token in expression.split():
                if "=" in token:
                    # This is a variable assignment
                    var_name, __, pos = token.partition("=")
                    # The right-hand side must be a POS tag. Check the part after
                    # the '=', not the whole token: the token begins with the
                    # variable name, so testing it would reject every binding
                    if not pos_re.match(pos):
                        raise ModuleInfoLoadError("error in pos-text expression: variable binding must be on a "
                                                  "POS, can't match '%s' (for variable '%s')" % (pos, var_name))
                    regex += _pos_regex(pos, var_name)
                elif token.startswith("/") and token.endswith("/"):
                    # This is a regex to be applied to a word
                    regex += _regex_word_regex(token[1:-1])
                elif pos_re.match(token):
                    # This is a POS
                    regex += _pos_regex(token)
                else:
                    # Just a word
                    regex += _word_regex(token)
            # Every token's regex begins with a space, so finish with one
            # after the last token: the pattern starts and ends with a space
            regex += " "
            # Now try compiling this regex
            try:
                self.regex = re.compile(regex)
            except Exception as e:
                raise ModuleInfoLoadError("build regex from expression (%s), but couldn't compile it: %s" %
                                          (regex, e))
        else:
            raise ModuleInfoLoadError("pos text matcher must have either an expr or a regex option")


def _pos_regex(text, name=None):
    """
    Build the regex fragment for one ``word|TAG`` token, preceded by a
    space, where the tag is constrained and the word is free.

    :param text: the POS tag to match. If it ends with ``*``, the rest is
        treated as a tag prefix
    :param name: if given, the word is captured in a named group of this name
    :return: regex source string for the token

    """
    # The word is any run of characters that are neither the '|' separator
    # nor whitespace. This needs a negated character class, [^...]
    word = r"[^\|\s]*"
    if name is not None:
        # Match the word in a named group
        word = r"(?P<%s>%s)" % (name, word)

    # Escape the tag so characters like the '$' of 'PRP$' are matched literally
    if text.endswith("*"):
        # Match POS prefix
        tag = re.escape(text[:-1]) + r"\S*"
    else:
        # Match exact POS
        tag = re.escape(text)
    return r" %s\|%s" % (word, tag)

def _word_regex(text):
    """
    Build the regex fragment for one ``word|TAG`` token, preceded by a
    space, that matches the given word case-insensitively and places no
    constraint on the POS tag.

    :param text: the word to match
    :return: regex source string for the token

    """
    pieces = []
    for letter in text:
        lower, upper = letter.lower(), letter.upper()
        if lower == upper:
            # No case variants (digits, punctuation): match the character
            # literally, escaped so that e.g. '^' or '.' is not special
            pieces.append(re.escape(letter))
        else:
            pieces.append("[%s%s]" % (lower, upper))
    # The tag is any run of non-space, non-'|' characters, so tags that are
    # not purely alphabetic (e.g. 'PRP$', '.') are accepted as well
    return r" %s\|[^\s|]+" % "".join(pieces)

def _regex_word_regex(text):
    """
    Build the regex fragment for one ``word|TAG`` token, preceded by a
    space, where the word must match the given regex and the POS tag is
    unconstrained.

    :param text: regex source to apply to the word
    :return: regex source string for the token

    """
    # Wrap the word regex in a non-capturing group so that an alternation
    # inside it cannot split the surrounding pattern, then eat up the POS
    # tag: any run of non-space, non-'|' characters
    return r" (?:%s)\|[^\s|]+" % text

0 comments on commit 2eee27b

Please sign in to comment.