-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added new module for mapping a term-feature-count corpus to indices using a dictionary and storing in a super-efficient format. Several bug fixes.
- Loading branch information
Mark Granroth-Wilding
committed
Apr 6, 2016
1 parent
9105e61
commit eae8b00
Showing
11 changed files
with
242 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from pimlico.core.modules.base import BaseModuleExecutor | ||
from pimlico.datatypes.base import InvalidDocument | ||
from pimlico.datatypes.features import IndexedTermFeatureListCorpusWriter | ||
from pimlico.utils.progress import get_progress_bar | ||
|
||
|
||
# Module executor that hands every (term, feature counts) pair of the input corpus to an | ||
# IndexedTermFeatureListCorpusWriter constructed with the term and feature vocabularies. | ||
# NOTE(review): leading indentation was lost in this rendering of the diff, so block nesting | ||
# has to be read from the syntax rather than from column position. | ||
class ModuleExecutor(BaseModuleExecutor): | ||
# Reads the "data", "term_vocab" and "feature_vocab" inputs and writes the "data" output. | ||
def execute(self): | ||
input_data = self.info.get_input("data") | ||
self.log.info("Loading dictionaries") | ||
# Both vocabularies are fetched up front via get_data(), before any document is read | ||
term_vocab = self.info.get_input("term_vocab").get_data() | ||
feature_vocab = self.info.get_input("feature_vocab").get_data() | ||
|
||
# The progress bar is sized by len(input_data) and wraps the iteration over input_data below | ||
pbar = get_progress_bar(len(input_data), title="Mapping") | ||
|
||
# Prepare a writer for the output data | ||
with IndexedTermFeatureListCorpusWriter(self.info.get_output_dir("data"), term_vocab, feature_vocab) as writer: | ||
# Input is given for every document in a corpus | ||
# A single generator expression is passed, so the (term, fcs) pairs are produced lazily | ||
writer.add_data_points( | ||
# Doc data consists of (term, feature count dict) pairs which we can pass straight to writer | ||
(term, fcs) | ||
# Documents whose data is an InvalidDocument instance are skipped; doc_name is not used | ||
for doc_name, document_data in pbar(input_data) if not isinstance(document_data, InvalidDocument) | ||
for (term, fcs) in document_data | ||
) | ||
# Report the value the writer stored under metadata["length"] as the number of data points | ||
# NOTE(review): it cannot be told from this rendering whether this call sits inside or after | ||
# the `with` block - confirm against the real file. | ||
self.log.info("Mapper produced dataset with %d data points" % writer.metadata["length"]) |
Oops, something went wrong.