Added specialized datatypes for tokenized text and POS tagged text.
Mark Granroth-Wilding committed Mar 23, 2016
1 parent 9fbf42c commit 477271b
Showing 6 changed files with 61 additions and 3 deletions.
21 changes: 20 additions & 1 deletion src/python/pimlico/datatypes/tar.py
@@ -13,7 +13,11 @@
 class TarredCorpus(IterableDocumentCorpus):
     datatype_name = "tar"
 
-    def __init__(self, base_dir):
+    def __init__(self, base_dir, raw_data=False):
+        """
+        If raw_data=True, post-processing of documents (as defined by subclasses) is not applied. Each
+        document's text is just returned as read in from the file.
+        """
         super(TarredCorpus, self).__init__(base_dir)
         self.tar_filenames = [f for f in
                               [os.path.join(root, filename) for root, dirs, files in os.walk(self.data_dir)
@@ -22,6 +26,7 @@ def __init__(self, base_dir):
         self.tar_filenames.sort()
 
         self.tarballs = [os.path.splitext(os.path.basename(f))[0] for f in self.tar_filenames]
+        self.raw_data = raw_data
 
     def extract_file(self, archive_name, filename):
         """
@@ -59,13 +64,27 @@ def archive_iter(self, subsample=None, start=0):
                     # Read in the data
                     with open(os.path.join(tmp_dir, filename), "r") as f:
                         document = f.read()
+                    # Apply subclass-specific post-processing if we've not been asked to yield just the raw data
+                    if not self.raw_data:
+                        document = self.process_document(document)
                     yield tar_name, filename, document
                     # Remove the file once we're done with it (when we request another)
                     os.remove(os.path.join(tmp_dir, filename))
         finally:
             # Remove the temp dir
             shutil.rmtree(tmp_dir)
 
+    def process_document(self, data):
+        """
+        Process the data read in for a single document. Allows easy implementation of datatypes using
+        TarredCorpus to do all the archive handling, etc, just specifying a particular way of handling
+        the data within documents.
+
+        By default, just returns the data string.
+        """
+        return data
+
     def list_archive_iter(self):
         for tar_name, tarball_filename in zip(self.tarballs, self.tar_filenames):
             tarball = tarfile.open(os.path.join(self.data_dir, tarball_filename), 'r')
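
The process_document hook is the extension point the two new datatypes in this commit rely on: TarredCorpus does all the archive handling, and a subclass only specifies how one document's raw text is interpreted. A minimal sketch of a custom subclass (LowercasedCorpus is an illustrative name, not part of this commit):

from pimlico.datatypes.tar import TarredCorpus

class LowercasedCorpus(TarredCorpus):
    # Hypothetical example datatype: documents are post-processed to lowercased text
    datatype_name = "lowercased"

    def process_document(self, data):
        # Called once per document by archive_iter(), unless raw_data=True was given
        return data.lower()

Constructing any TarredCorpus with raw_data=True bypasses process_document in archive_iter, so the same stored data can be read back either as processed documents or as the raw text.
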
20 changes: 20 additions & 0 deletions src/python/pimlico/modules/opennlp/pos/datatypes.py
@@ -0,0 +1,20 @@
+from pimlico.datatypes.tar import TarredCorpus
+
+
+class PosTaggedCorpus(TarredCorpus):
+    """
+    Specialized datatype for a tarred corpus that's had POS tagging applied.
+
+    Each document is a list of sentences. Each sentence is a list of (word, POS tag) pairs.
+    """
+    def process_document(self, data):
+        return [
+            [_word_tag_pair(word) for word in sentence.split(" ")] for sentence in data.split("\n")
+        ]
+
+
+def _word_tag_pair(text):
+    word, __, tag = text.rpartition("|")
+    return word, tag
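
The storage format this implies (inferred from the parsing code above): words separated by single spaces, sentences by newlines, each token carrying its tag after a "|". Since rpartition splits on the last pipe, a word that itself contains "|" still keeps its tag intact. For a PosTaggedCorpus instance corpus, something like:

>>> corpus.process_document("The|DT dog|NN barks|VBZ\nIt|PRP runs|VBZ")
[[('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')], [('It', 'PRP'), ('runs', 'VBZ')]]
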
4 changes: 2 additions & 2 deletions src/python/pimlico/modules/opennlp/pos/info.py
@@ -4,12 +4,12 @@
 from pimlico.core.modules.base import DependencyError
 from pimlico.core.modules.map import DocumentMapModuleInfo
 from pimlico.core.paths import abs_path_or_model_dir_path
-from pimlico.datatypes.tar import TarredCorpus
+from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus
 
 
 class ModuleInfo(DocumentMapModuleInfo):
     module_type_name = "opennlp_pos_tagger"
-    module_inputs = [("text", TarredCorpus)]
+    module_inputs = [("text", TokenizedCorpus)]
     module_options = {
         "model": {
             "help": "POS tagger model, full path or filename. If a filename is given, it is expected to be in the "
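
Tightening the input type from TarredCorpus to TokenizedCorpus means the tagger no longer accepts arbitrary tarred text, only corpora declared tokenized. Because TokenizedCorpus subclasses TarredCorpus, the compatibility question reduces to an ordinary subclass check; a rough sketch of the idea (not Pimlico's actual validation code):

from pimlico.datatypes.tar import TarredCorpus
from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus

def compatible(output_type, input_type):
    # An output satisfies an input if its datatype is the required type or a subclass of it
    return issubclass(output_type, input_type)

compatible(TokenizedCorpus, TarredCorpus)   # True: a tokenized corpus is still a tarred corpus
compatible(TarredCorpus, TokenizedCorpus)   # False: plain tarred text cannot feed the tagger
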
16 changes: 16 additions & 0 deletions src/python/pimlico/modules/opennlp/tokenize/datatypes.py
@@ -0,0 +1,16 @@
+from pimlico.datatypes.tar import TarredCorpus
+
+
+class TokenizedCorpus(TarredCorpus):
+    """
+    Specialized datatype for a tarred corpus that's had tokenization applied. The datatype does very little -
+    the main reason for its existence is to allow modules to require that a corpus has been tokenized before
+    it's given as input.
+
+    Each document is a list of sentences. Each sentence is a list of words.
+    """
+    def process_document(self, data):
+        return [
+            sentence.split(" ") for sentence in data.split("\n")
+        ]
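
As with the POS-tagged datatype, the stored format is implied by the parser: space-separated words, newline-separated sentences. For a TokenizedCorpus instance corpus, something like:

>>> corpus.process_document("The dog barks\nIt runs")
[['The', 'dog', 'barks'], ['It', 'runs']]
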
2 changes: 2 additions & 0 deletions src/python/pimlico/modules/opennlp/tokenize/info.py
@@ -4,12 +4,14 @@
 from pimlico.core.modules.base import DependencyError
 from pimlico.core.modules.map import DocumentMapModuleInfo
 from pimlico.core.paths import abs_path_or_model_dir_path
+from .datatypes import TokenizedCorpus
 from pimlico.datatypes.tar import TarredCorpus
 
 
 class ModuleInfo(DocumentMapModuleInfo):
     module_type_name = "opennlp_tokenizer"
     module_inputs = [("text", TarredCorpus)]
+    module_outputs = [("documents", TokenizedCorpus)]
     module_options = {
         "sentence_model": {
             "help": "Sentence segmentation model. Specify a full path, or just a filename. If a filename is given "
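
With the output declared as TokenizedCorpus, the two OpenNLP modules now chain: the tokenizer's "documents" output satisfies the POS tagger's "text" input above. Downstream code reading the output gets documents already parsed by process_document; a sketch using the archive_iter interface from tar.py (the path is hypothetical):

from pimlico.modules.opennlp.tokenize.datatypes import TokenizedCorpus

corpus = TokenizedCorpus("/path/to/tokenizer/output")
for tar_name, filename, doc in corpus.archive_iter():
    # doc is a list of sentences, each a list of words (pass raw_data=True to skip parsing)
    first_sentence = doc[0]
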
1 change: 1 addition & 0 deletions src/python/pimlico/modules/regex/__init__.py
@@ -0,0 +1 @@
+__author__ = 'mtw29'
