Added new regex matching module, for scanning a corpus for occurrences of expressions including POS tags.
markgw · Mar 23, 2016 · 2eee27b · 2eee27b
1 parent 477271b
commit 2eee27b
Show file tree

Hide file tree

Showing 6 changed files with 138 additions and 2 deletions.
diff --git a/src/python/pimlico/core/modules/map.py b/src/python/pimlico/core/modules/map.py
@@ -43,8 +43,9 @@ def postprocess(self, info, error=False):
     def execute(self, module_instance_info):
         # We may have multiple inputs, which should be aligned tarred corpora
         # If there's only one, this also works
-        input_iterator = AlignedTarredCorpora([module_instance_info.get_input(input_name)
-                                               for input_name in module_instance_info.input_names])
+        self.input_corpora = [module_instance_info.get_input(input_name)
+                              for input_name in module_instance_info.input_names]
+        input_iterator = AlignedTarredCorpora(self.input_corpora)
 
         # Call the set-up routine, if one's been defined
         self.log.info("Preparing document map execution for %s documents" % len(input_iterator))

diff --git a/src/python/pimlico/datatypes/base.py b/src/python/pimlico/datatypes/base.py
@@ -157,6 +157,10 @@ def load_datatype(path):
         raise DatatypeLoadError("could not load datatype class %s in module %s" % (cls_name, mod_path))
     cls = getattr(mod, cls_name)
 
+    if type(cls) is not type(object):
+        raise DatatypeLoadError("tried to load datatype %s.%s, but result was not a class, it was a %s" %
+                                (mod, cls_name, type(cls).__name__))
+
     if not issubclass(cls, PimlicoDatatype):
         raise DatatypeLoadError("%s is not a Pimlico datatype" % path)
     return cls
diff --git a/src/python/pimlico/modules/opennlp/pos/info.py b/src/python/pimlico/modules/opennlp/pos/info.py
@@ -5,11 +5,13 @@
 from pimlico.core.modules.map import DocumentMapModuleInfo
 from pimlico.core.paths import abs_path_or_model_dir_path
 from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
+from .datatypes import PosTaggedCorpus
 
 
 class ModuleInfo(DocumentMapModuleInfo):
     module_type_name = "opennlp_pos_tagger"
     module_inputs = [("text", TokenizedCorpus)]
+    module_outputs = [("documents", PosTaggedCorpus)]
     module_options = {
         "model": {
             "help": "POS tagger model, full path or filename. If a filename is given, it is expected to be in the "

diff --git a/src/python/pimlico/modules/regex/pos_text/__init__.py b/src/python/pimlico/modules/regex/pos_text/__init__.py
@@ -0,0 +1 @@
+__author__ = 'mtw29'
diff --git a/src/python/pimlico/modules/regex/pos_text/exec.py b/src/python/pimlico/modules/regex/pos_text/exec.py
@@ -0,0 +1,31 @@
+from pimlico.core.modules.map import DocumentMapModuleExecutor
+
+
+class ModuleExecutor(DocumentMapModuleExecutor):
+    """
+    Document map executor that scans the raw text of each document with the regex
+    prepared on the module info, producing one output line per match.
+
+    """
+    def preprocess(self, info):
+        # Running totals, reported once processing is finished
+        self.match_count = 0
+        self.matched_docs = 0
+        # Switch off document processing on the first input corpus, so that each
+        # document is handed over as raw text for the regex to be applied to
+        self.input_corpora[0].raw_input = True
+        self.regex = info.regex
+        self.log.info("Matching regex %s" % self.regex.pattern)
+
+    def process_document(self, filename, doc):
+        # Pad with a space at each end so the pattern can match at the very start and end
+        padded = " %s " % doc
+        # One output line per regex match, listing the named groups as var=value pairs
+        output_lines = [
+            " ".join("%s=%s" % binding for binding in found.groupdict().items())
+            for found in self.regex.finditer(padded)
+        ]
+
+        hits = len(output_lines)
+        self.match_count += hits
+        if hits > 0:
+            self.matched_docs += 1
+        return "\n".join(output_lines)
+
+    def postprocess(self, info, error=False):
+        totals = (self.match_count, self.matched_docs)
+        self.log.info("Regex matched a total of %d times in %d documents" % totals)
diff --git a/src/python/pimlico/modules/regex/pos_text/info.py b/src/python/pimlico/modules/regex/pos_text/info.py
@@ -0,0 +1,97 @@
+import re
+from pimlico.core.modules.base import ModuleInfoLoadError
+from pimlico.core.modules.map import DocumentMapModuleInfo
+from pimlico.datatypes.tar import TarredCorpus
+from pimlico.modules.opennlp.pos.datatypes import PosTaggedCorpus
+
+
+class ModuleInfo(DocumentMapModuleInfo):
+    """
+    Module info for the POS-text matcher, which searches POS-tagged text for a pattern.
+
+    The pattern is given either directly as a regex (``regex`` option) or as a simple
+    token expression (``expr`` option) that is translated into a regex here. If both are
+    given, ``regex`` takes precedence. The compiled pattern is stored in ``self.regex``.
+
+    Raises ModuleInfoLoadError if neither option is given, if the expression binds a
+    variable to something that is not a POS tag, or if the regex cannot be compiled.
+
+    """
+    module_type_name = "pos_text_matcher"
+    module_inputs = [("text", PosTaggedCorpus)]
+    module_outputs = [("documents", TarredCorpus)]
+    module_options = {
+        "expr": {
+            "help": "An expression to determine what to search for in sentences. Consists of a sequence of tokens, "
+                    "each a word or POS tag. Words are lower case (matching is case insensitive), POS "
+                    "tags are upper case. A token of the form 'x=TAG' matches the tag TAG and assigns it to a "
+                    "variable extracted in the output. POS tags ending with a * match tag prefixes. "
+                    "E.g. 'my mything=NN* is very myadj=JJ' will matches phrases like 'my foot is very sore', "
+                    "producing 'mything=foot' and 'myadj=sore'",
+        },
+        "regex": {
+            "help": "Instead of matching a regex based on a simple expression given in 'expr', specify a regex "
+                    "directly. The regex will be matching against POS tagged text, where each word is followed "
+                    "by a POS tag separated by '|' and words are separated by spaces. Use named groups to specify "
+                    "the attributes that are extracted",
+        },
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(ModuleInfo, self).__init__(*args, **kwargs)
+
+        # Process the regex so we've got it ready for matching
+        if self.options["regex"] is not None:
+            # We've been given a regex directly: just compile this
+            try:
+                self.regex = re.compile(self.options["regex"])
+            except Exception as e:
+                # Any errors here should be reported as errors preparing the module info
+                raise ModuleInfoLoadError("could not parse regex '%s': %s" % (self.options["regex"], e))
+        elif self.options["expr"] is not None:
+            # Parse the expression into a regex we can use for matching
+            expression = self.options["expr"]
+            pos_re = re.compile(r"[A-Z]+\*?")
+            regex_parts = []
+
+            for token in expression.split():
+                if "=" in token:
+                    # This is a variable assignment
+                    var_name, __, pos = token.partition("=")
+                    # The bound part (after the '=') must be a POS. Check that part only:
+                    # checking the whole token would reject every lower-case variable name
+                    if not pos_re.match(pos):
+                        raise ModuleInfoLoadError("error in pos-text expression: variable binding must be on a "
+                                                  "POS, can't match '%s' (for variable '%s')" % (pos, var_name))
+                    regex_parts.append(_pos_regex(pos, var_name))
+                elif token.startswith("/") and token.endswith("/"):
+                    # This is a regex to be applied to a word
+                    regex_parts.append(_regex_word_regex(token[1:-1]))
+                elif pos_re.match(token):
+                    # This is a POS
+                    regex_parts.append(_pos_regex(token))
+                else:
+                    # Just a word
+                    regex_parts.append(_word_regex(token))
+            # Every token's regex starts with a space: end with one too, so that the
+            # final token is delimited in the same way as the others
+            regex_parts.append(" ")
+            regex = "".join(regex_parts)
+            # Now try compiling this regex
+            try:
+                self.regex = re.compile(regex)
+            except Exception as e:
+                raise ModuleInfoLoadError("build regex from expression (%s), but couldn't compile it: %s" %
+                                          (regex, e))
+        else:
+            raise ModuleInfoLoadError("pos text matcher must have either an expr or a regex option")
+
+
+def _pos_regex(text, name=None):
+    """
+    Build the regex fragment that matches a single "word|TAG" token by its POS tag.
+
+    :param text: POS tag to match; a trailing "*" makes it match any tag with that prefix
+    :param name: if given, the matched word is captured in a named group with this name
+    :return: regex string, beginning with the space that precedes the token
+    """
+    # The word is everything before the '|' separator, so it contains neither '|' nor
+    # whitespace. (A leading '!' does not negate a character class in re: '^' does.)
+    word = r"[^\s|]*"
+    if name is not None:
+        # Match the word in a named group
+        word = r"(?P<%s>%s)" % (name, word)
+
+    if text.endswith("*"):
+        # Match POS prefix: the given characters followed by the rest of the tag
+        tag = re.escape(text[:-1]) + r"\S*"
+    else:
+        # Match exact POS. Escaped so that tag characters such as '$' are taken literally
+        tag = re.escape(text)
+    return r" %s\|%s" % (word, tag)
+
+def _word_regex(text):
+    """
+    Build the regex fragment that matches a "word|TAG" token by its word, ignoring case
+    and accepting any POS tag.
+
+    :param text: the word to match
+    :return: regex string, beginning with the space that precedes the token
+    """
+    def _char_regex(letter):
+        lower, upper = letter.lower(), letter.upper()
+        if lower == upper:
+            # No case distinction (digits, punctuation): match literally, escaped so that
+            # characters like '^' or '.' keep no special meaning
+            return re.escape(letter)
+        return "[%s%s]" % (lower, upper)
+
+    # Case-insensitively match the word, with no constraint on the POS tag. The tag is any
+    # run of non-space characters, so that tags like 'PRP$' are consumed in full
+    return r" %s\|\S+" % "".join(_char_regex(letter) for letter in text)
+
+def _regex_word_regex(text):
+    """
+    Build the regex fragment that matches a "word|TAG" token whose word matches a
+    user-supplied regex, accepting any POS tag.
+
+    :param text: regex to apply to the word
+    :return: regex string, beginning with the space that precedes the token
+    """
+    # Include this regex in one that also eats up the POS tag. It is wrapped in a
+    # non-capturing group so that an alternation in the user's regex (e.g. 'cat|dog')
+    # stays confined to the word instead of splitting the whole fragment in two
+    return r" (?:%s)\|\S+" % text