Added specialized datatypes for tokenized text and POS tagged text.
Mark Granroth-Wilding committed Mar 23, 2016
1 parent 9fbf42c commit 477271b
Showing 6 changed files with 61 additions and 3 deletions.
21 changes: 20 additions & 1 deletion src/python/pimlico/datatypes/tar.py
@@ -13,7 +13,11 @@
 class TarredCorpus(IterableDocumentCorpus):
     datatype_name = "tar"
 
-    def __init__(self, base_dir):
+    def __init__(self, base_dir, raw_data=False):
+        """
+        If raw_data=True, post-processing of documents (as defined by subclasses) is not applied. Each
+        document's text is just returned as read in from the file.
+        """
         super(TarredCorpus, self).__init__(base_dir)
         self.tar_filenames = [f for f in
                               [os.path.join(root, filename) for root, dirs, files in os.walk(self.data_dir)
@@ -22,6 +26,7 @@ def __init__(self, base_dir):
         self.tar_filenames.sort()
 
         self.tarballs = [os.path.splitext(os.path.basename(f))[0] for f in self.tar_filenames]
+        self.raw_data = raw_data
 
     def extract_file(self, archive_name, filename):
         """
@@ -59,13 +64,27 @@ def archive_iter(self, subsample=None, start=0):
                     # Read in the data
                     with open(os.path.join(tmp_dir, filename), "r") as f:
                         document = f.read()
+                    # Apply subclass-specific post-processing if we've not been asked to yield just the raw data
+                    if not self.raw_data:
+                        document = self.process_document(document)
                     yield tar_name, filename, document
                     # Remove the file once we're done with it (when we request another)
                     os.remove(os.path.join(tmp_dir, filename))
         finally:
             # Remove the temp dir
             shutil.rmtree(tmp_dir)
 
+    def process_document(self, data):
+        """
+        Process the data read in for a single document. Allows easy implementation of datatypes using
+        TarredCorpus to do all the archive handling, etc, just specifying a particular way of handling
+        the data within documents.
+
+        By default, just returns the data string.
+        """
+        return data
+
     def list_archive_iter(self):
         for tar_name, tarball_filename in zip(self.tarballs, self.tar_filenames):
             tarball = tarfile.open(os.path.join(self.data_dir, tarball_filename), 'r')
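
The process_document hook is the extension point the two new datatypes in this commit rely on: TarredCorpus does all the archive handling, and a subclass only specifies how one document's raw text is interpreted. A minimal sketch of a custom subclass (LowercasedCorpus is an illustrative name, not part of this commit):

from pimlico.datatypes.tar import TarredCorpus

class LowercasedCorpus(TarredCorpus):
    # Hypothetical example datatype: documents are post-processed to lowercased text
    datatype_name = "lowercased"

    def process_document(self, data):
        # Called once per document by archive_iter(), unless raw_data=True was given
        return data.lower()

Constructing any TarredCorpus with raw_data=True bypasses process_document in archive_iter, so the same stored data can be read back either as processed documents or as the raw text.
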
20 changes: 20 additions & 0 deletions src/python/pimlico/modules/opennlp/pos/datatypes.py
@@ -0,0 +1,20 @@
+from pimlico.datatypes.tar import TarredCorpus
+
+
+class PosTaggedCorpus(TarredCorpus):
+    """
+    Specialized datatype for a tarred corpus that's had POS tagging applied.
+
+    Each document is a list of sentences. Each sentence is a list of (word, POS tag) pairs.
+    """
+    def process_document(self, data):
+        return [
+            [_word_tag_pair(word) for word in sentence.split(" ")] for sentence in data.split("\n")
+        ]
+
+
+def _word_tag_pair(text):
+    word, __, tag = text.rpartition("|")
+    return word, tag
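
The storage format this implies (inferred from the parsing code above): words separated by single spaces, sentences by newlines, each token carrying its tag after a "|". Since rpartition splits on the last pipe, a word that itself contains "|" still keeps its tag intact. For a PosTaggedCorpus instance corpus, something like:

>>> corpus.process_document("The|DT dog|NN barks|VBZ\nIt|PRP runs|VBZ")
[[('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')], [('It', 'PRP'), ('runs', 'VBZ')]]
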
4 changes: 2 additions & 2 deletions src/python/pimlico/modules/opennlp/pos/info.py
@@ -4,12 +4,12 @@
 from pimlico.core.modules.base import DependencyError
 from pimlico.core.modules.map import DocumentMapModuleInfo
 from pimlico.core.paths import abs_path_or_model_dir_path
-from pimlico.datatypes.tar import TarredCorpus
+from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
 
 
 class ModuleInfo(DocumentMapModuleInfo):
     module_type_name = "opennlp_pos_tagger"
-    module_inputs = [("text", TarredCorpus)]
+    module_inputs = [("text", TokenizedCorpus)]
     module_options = {
         "model": {
             "help": "POS tagger model, full path or filename. If a filename is given, it is expected to be in the "
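
Tightening the input type from TarredCorpus to TokenizedCorpus means the tagger no longer accepts arbitrary tarred text, only corpora declared tokenized. Because TokenizedCorpus subclasses TarredCorpus, the compatibility question reduces to an ordinary subclass check; a rough sketch of the idea (not Pimlico's actual validation code):

from pimlico.datatypes.tar import TarredCorpus
from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus

def compatible(output_type, input_type):
    # An output satisfies an input if its datatype is the required type or a subclass of it
    return issubclass(output_type, input_type)

compatible(TokenizedCorpus, TarredCorpus)   # True: a tokenized corpus is still a tarred corpus
compatible(TarredCorpus, TokenizedCorpus)   # False: plain tarred text cannot feed the tagger
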
16 changes: 16 additions & 0 deletions src/python/pimlico/modules/opennlp/tokenize/datatypes.py
@@ -0,0 +1,16 @@
+from pimlico.datatypes.tar import TarredCorpus
+
+
+class TokenizedCorpus(TarredCorpus):
+    """
+    Specialized datatype for a tarred corpus that's had tokenization applied. The datatype does very little -
+    the main reason for its existence is to allow modules to require that a corpus has been tokenized before
+    it's given as input.
+
+    Each document is a list of sentences. Each sentence is a list of words.
+    """
+    def process_document(self, data):
+        return [
+            sentence.split(" ") for sentence in data.split("\n")
+        ]
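
As with the POS-tagged datatype, the stored format is implied by the parser: space-separated words, newline-separated sentences. For a TokenizedCorpus instance corpus, something like:

>>> corpus.process_document("The dog barks\nIt runs")
[['The', 'dog', 'barks'], ['It', 'runs']]
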
2 changes: 2 additions & 0 deletions src/python/pimlico/modules/opennlp/tokenize/info.py
@@ -4,12 +4,14 @@
 from pimlico.core.modules.base import DependencyError
 from pimlico.core.modules.map import DocumentMapModuleInfo
 from pimlico.core.paths import abs_path_or_model_dir_path
+from .datatypes import TokenizedCorpus
 from pimlico.datatypes.tar import TarredCorpus
 
 
 class ModuleInfo(DocumentMapModuleInfo):
     module_type_name = "opennlp_tokenizer"
     module_inputs = [("text", TarredCorpus)]
+    module_outputs = [("documents", TokenizedCorpus)]
     module_options = {
         "sentence_model": {
             "help": "Sentence segmentation model. Specify a full path, or just a filename. If a filename is given "
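
With the output declared as TokenizedCorpus, the two OpenNLP modules now chain: the tokenizer's "documents" output satisfies the POS tagger's "text" input above. Downstream code reading the output gets documents already parsed by process_document; a sketch using the archive_iter interface from tar.py (the path is hypothetical):

from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus

corpus = TokenizedCorpus("/path/to/tokenizer/output")
for tar_name, filename, doc in corpus.archive_iter():
    # doc is a list of sentences, each a list of words (pass raw_data=True to skip parsing)
    first_sentence = doc[0]
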
1 change: 1 addition & 0 deletions src/python/pimlico/modules/regex/__init__.py
@@ -0,0 +1 @@
+__author__ = 'mtw29'
