-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added new modules for extracting features from documents. Some improvements to CoreNLP and regex modules.
- Loading branch information
Mark Granroth-Wilding
committed
Apr 4, 2016
1 parent
f90ac5b
commit 603fc99
Showing
15 changed files
with
255 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
from operator import itemgetter | ||
|
||
from pimlico.core.modules.map import skip_invalid, invalid_doc_on_error | ||
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid | ||
|
||
|
||
class KeyValueListCorpus(TarredCorpus):
    """
    Corpus in which each document is a list of data points, each data point a list of
    (feature name, value) pairs.

    The item separator and feature-value separator are read from the corpus metadata,
    where KeyValueListCorpusWriter stored them, so documents are split up exactly as
    they were written.
    """
    def __init__(self, base_dir, pipeline):
        super(KeyValueListCorpus, self).__init__(base_dir, pipeline)
        # Separators were stored in the metadata by the writer; fall back to the
        # writer's defaults if they're absent
        self.separator = self.metadata.get("separator", " ")
        self.fv_separator = self.metadata.get("fv_separator", "=")

    @skip_invalid
    def process_document(self, data):
        """
        Parse the raw document text into a list of data points, one per non-blank
        line, each a list of (feature name, value) pairs.
        """
        # Read a set of feature-value pairs from each line
        data_points = []
        for line in data.splitlines():
            # Skip blank lines
            if line.strip():
                # Split up the various feature assignments
                fvs = line.strip().split(self.separator)
                # Now we've split on sep, unescape any instances that were escaped
                fvs = [unescape_sep(self.separator, "ITEMSEP", fv) for fv in fvs]
                # Split each assignment into a (feature, value) pair. The writer
                # escapes the fv separator within names and values, so each
                # assignment contains exactly one unescaped separator: split on the
                # first occurrence and keep both halves.
                # (Previously this used itemgetter(0, 2), which indexed past the end
                # of the two-element split result and raised IndexError.)
                fvs = [tuple(fv.split(self.fv_separator, 1)) for fv in fvs]
                # Unescape the fv sep within feature names and feature values
                fvs = [
                    (unescape_sep(self.fv_separator, "FVSEP", feature),
                     unescape_sep(self.fv_separator, "FVSEP", value))
                    for (feature, value) in fvs
                ]
                data_points.append(fvs)
        return data_points
|
||
|
||
class KeyValueListCorpusWriter(TarredCorpusWriter):
    """
    Writer for corpora of key-value pair lists. Records the separators it uses in the
    corpus metadata so that the corresponding datatype can split the data up again.
    """
    def __init__(self, base_dir, separator=" ", fv_separator="="):
        super(KeyValueListCorpusWriter, self).__init__(base_dir)
        self.fv_separator = fv_separator
        self.separator = separator
        # Put the separators in the metadata, so we know how to read the data in again
        self.metadata["separator"] = separator
        self.metadata["fv_separator"] = fv_separator

    @pass_up_invalid
    def add_document(self, archive_name, doc_name, data):
        """
        Add a document given as a list of data points, where each data point is a
        list of (feature name, feature value) pairs. Serialized one data point per
        line, with the separators escaped inside names and values.
        """
        def _encode_pair(feature_name, feature_value):
            # Feature and value are joined by fv_separator, so neither may contain
            # it unescaped
            pair_text = "%s%s%s" % (
                escape_sep(self.fv_separator, "FVSEP", feature_name),
                self.fv_separator,
                escape_sep(self.fv_separator, "FVSEP", feature_value),
            )
            # The whole assignment may not contain the item separator either
            return escape_sep(self.separator, "ITEMSEP", pair_text)

        lines = []
        for data_point in data:
            # Feature assignments within a data point are joined by the item separator
            lines.append(self.separator.join(
                _encode_pair(name, value) for (name, value) in data_point
            ))
        # One data point per line
        data = "\n".join(lines)
        super(KeyValueListCorpusWriter, self).add_document(archive_name, doc_name, data)
|
||
|
||
class TermFeatureListCorpus(KeyValueListCorpus):
    """
    Special case of KeyValueListCorpus, where one special feature "term" is always present and the other
    feature types are counts of the occurrence of a particular feature with this term in each data point.
    """
    def __init__(self, base_dir, pipeline):
        super(TermFeatureListCorpus, self).__init__(base_dir, pipeline)

    @skip_invalid
    @invalid_doc_on_error
    def process_document(self, data):
        """
        Parse each data point into a (term, feature counts) pair, where feature
        counts is a dict mapping feature name -> int count.

        Raises ValueError if a data point has no "term" feature; the
        invalid_doc_on_error decorator is relied on to turn that into an invalid doc.
        """
        data = super(TermFeatureListCorpus, self).process_document(data)

        data_points = []
        for data_point in data:
            # Pull out the special "term" feature (usually at the beginning)
            try:
                # Use the next() builtin rather than the generator's .next() method:
                # same behaviour, but works on Python 3 as well as Python 2
                term = next(value for (feature, value) in data_point if feature == "term")
            except StopIteration:
                # No "term" feature found -- uh-oh! Catch as invalid doc
                raise ValueError("data point has no 'term' feature: %s" % data_point)
            # The rest of the features are feature counts
            features = dict((feature, int(value)) for (feature, value) in data_point if feature != "term")
            data_points.append((term, features))
        return data_points
|
||
|
||
class TermFeatureListCorpusWriter(KeyValueListCorpusWriter):
    """
    Writer for TermFeatureListCorpus data: each data point is a (term, feature counts)
    pair, stored as a key-value list with the special "term" feature.
    """
    def __init__(self, base_dir, separator=" ", fv_separator="="):
        # Pass the given separators through to the superclass. Previously the
        # defaults " " and "=" were hard-coded here, silently ignoring any custom
        # separators passed to this constructor
        super(TermFeatureListCorpusWriter, self).__init__(
            base_dir, separator=separator, fv_separator=fv_separator
        )

    @pass_up_invalid
    def add_document(self, archive_name, doc_name, data):
        """
        Add a document given as a list of (term, feature counts) pairs, where feature
        counts is a dict of feature name -> count. Each pair is converted to a
        key-value list with the "term" feature first and counts stringified, then
        handed to the superclass for serialization.
        """
        data = [
            [("term", term)] + [(feature, str(count)) for (feature, count) in feature_counts.items()]
            for (term, feature_counts) in data
        ]
        super(TermFeatureListCorpusWriter, self).add_document(archive_name, doc_name, data)
|
||
|
||
def escape_sep(sep, sep_type, text):
    """Replace every occurrence of *sep* in *text* with the placeholder ~~<sep_type>~~."""
    placeholder = "~~%s~~" % sep_type
    return text.replace(sep, placeholder)
|
||
|
||
def unescape_sep(sep, sep_type, text):
    """Restore *sep* in *text* wherever the placeholder ~~<sep_type>~~ appears (inverse of escape_sep)."""
    placeholder = "~~%s~~" % sep_type
    return text.replace(placeholder, sep)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
58 changes: 58 additions & 0 deletions
58
src/python/pimlico/modules/features/term_feature_compiler/info.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import random | ||
|
||
from pimlico.core.modules.map import DocumentMapModuleInfo | ||
from pimlico.datatypes.features import TermFeatureListCorpus, KeyValueListCorpus | ||
from pimlico.datatypes.tar import TarredCorpus | ||
|
||
|
||
# Subclass TermFeatureListCorpus so that inputs expecting one can accept this | ||
# TODO There should be a better superclass for doc-doc filters like this | ||
class TermFeatureListCorpusFilter(TermFeatureListCorpus):
    """
    Filter datatype that presents an input corpus as a TermFeatureListCorpus without
    writing anything to disk: documents are streamed straight from the input datatype.
    """
    def __init__(self, input_datatype, pipeline, **kwargs):
        # Skip the intermediate corpus constructors: there's no base_dir to read
        # from, since this filter takes its documents from the input datatype
        TarredCorpus.__init__(self, None, pipeline, **kwargs)
        self.input_datatype = input_datatype

    def __len__(self):
        # Same number of documents as the input corpus
        return len(self.input_datatype)

    def archive_iter(self, subsample=None, start=0):
        # TODO Implement this, which does the key
        tarballs = self.tarballs

        current_archive = 0
        current_archive_count = 0

        for file_num, (doc_name, doc) in enumerate(self.input_datatype):
            # Allow the first portion of the corpus to be skipped
            if file_num < start:
                continue
            # If subsampling, decide whether to extract this file
            if subsample is not None and random.random() > subsample:
                # Reject this file
                continue

            # Check whether we've put enough files in the current archive to move onto the next
            if current_archive_count == self.archive_size:
                current_archive += 1
                current_archive_count = 0
            # Count this doc towards the current archive. Previously the counter was
            # never incremented, so the rollover condition above could never fire and
            # every document was attributed to the first archive
            current_archive_count += 1

            yield tarballs[current_archive], doc_name, doc

    def data_ready(self):
        # Ready as soon as the input is ready -- this filter has no data of its own
        return self.input_datatype.data_ready()
|
||
|
||
class ModuleInfo(DocumentMapModuleInfo):
    """
    Non-executable filter module that presents a key-value list corpus as a
    term-feature list corpus, via TermFeatureListCorpusFilter.
    """
    module_type_name = "term_feature_list_filter"
    module_inputs = [("key_values", KeyValueListCorpus)]
    module_outputs = [("term_features", TermFeatureListCorpusFilter)]
    module_options = [
        # TODO Add some options
    ]
    module_executable = False

    def instantiate_output_datatype(self, output_name, output_datatype):
        """Build the filter datatype for the "term_features" output; defer to the superclass otherwise."""
        if output_name == "term_features":
            # TermFeatureListCorpusFilter's signature is (input_datatype, pipeline):
            # the two arguments were previously passed the wrong way round
            return TermFeatureListCorpusFilter(self.get_input("key_values"), self.pipeline)
        else:
            return super(ModuleInfo, self).instantiate_output_datatype(output_name, output_datatype)
File renamed without changes.
Oops, something went wrong.