-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added matrix building module to build feature matrices from extracted features.
- Loading branch information
Mark Granroth-Wilding
committed
Apr 6, 2016
1 parent
eae8b00
commit 7fe67ed
Showing
7 changed files
with
154 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
""" | ||
Wrappers around Numpy arrays and Scipy sparse matrices. | ||
""" | ||
import os | ||
|
||
from pimlico.datatypes.base import PimlicoDatatype, PimlicoDatatypeWriter | ||
|
||
|
||
class NumpyArray(PimlicoDatatype):
    """
    Datatype wrapping a single Numpy array stored as "array.npy" in the data directory.

    The array is loaded lazily the first time the `array` property is accessed and is
    cached on the instance after that.
    """
    def __init__(self, base_dir, pipeline, **kwargs):
        super(NumpyArray, self).__init__(base_dir, pipeline, **kwargs)
        # Cache for the loaded array: None until `array` is first accessed
        self._array = None

    @property
    def array(self):
        """
        The stored array, read from "array.npy" in the data dir on first access.
        """
        if self._array is None:
            import numpy
            # .npy is a binary format: it must be opened in binary mode. Text mode breaks
            # on Python 3 and can corrupt the data on platforms that translate newlines
            with open(os.path.join(self.data_dir, "array.npy"), "rb") as f:
                self._array = numpy.load(f)
        return self._array

    def data_ready(self):
        """
        Ready only if the superclass reports ready and the "array.npy" file exists.
        """
        return super(NumpyArray, self).data_ready() and os.path.exists(os.path.join(self.data_dir, "array.npy"))

    def check_runtime_dependencies(self):
        """
        Return a list of (name, description) pairs for missing dependencies: Numpy, if it
        cannot be imported, followed by anything reported by the superclass.
        """
        missing_dependencies = []
        try:
            import numpy
        except ImportError:
            missing_dependencies.append(("Numpy", "install Numpy systemwide (e.g. package 'python-numpy' on Ubuntu)"))
        missing_dependencies.extend(super(NumpyArray, self).check_runtime_dependencies())
        return missing_dependencies
|
||
|
||
class NumpyArrayWriter(PimlicoDatatypeWriter):
    """
    Writer that stores a Numpy array as "array.npy" in the output data directory.
    """
    def set_array(self, array):
        """
        Save the given array to "array.npy" in the data dir using numpy.save.
        """
        from numpy import save

        target_path = os.path.join(self.data_dir, "array.npy")
        save(target_path, array)
|
||
|
||
class ScipySparseMatrix(PimlicoDatatype):
    """
    Datatype for a Scipy sparse matrix stored in Matrix Market format ("array.mtx").

    Whatever was written, the matrix comes back in COO format when loaded, so you will
    most likely want to convert it to another sparse format before doing anything with it.
    The scipy documentation describes the available sparse matrix conversions.
    """
    def __init__(self, base_dir, pipeline, **kwargs):
        super(ScipySparseMatrix, self).__init__(base_dir, pipeline, **kwargs)
        # Filled in lazily by the `array` property
        self._array = None

    @property
    def array(self):
        """
        The stored matrix, read from "array.mtx" on first access and cached afterwards.
        """
        if self._array is not None:
            return self._array

        from scipy import io
        matrix_path = os.path.join(self.data_dir, "array.mtx")
        self._array = io.mmread(matrix_path)
        return self._array

    def data_ready(self):
        """
        Ready only if the superclass reports ready and the "array.mtx" file exists.
        """
        ready = super(ScipySparseMatrix, self).data_ready()
        if ready:
            ready = os.path.exists(os.path.join(self.data_dir, "array.mtx"))
        return ready

    def check_runtime_dependencies(self):
        """
        Return a list of (name, description) pairs for missing dependencies: Numpy and
        Scipy, where they cannot be imported, followed by anything the superclass reports.
        """
        missing = []

        try:
            import numpy
        except ImportError:
            missing.append(("Numpy", "install Numpy systemwide (e.g. package 'python-numpy' on Ubuntu)"))

        try:
            import scipy
        except ImportError:
            missing.append(("Scipy", "install Scipy systemwide (e.g. package 'python-scipy' on Ubuntu)"))

        inherited = super(ScipySparseMatrix, self).check_runtime_dependencies()
        missing.extend(inherited)
        return missing
|
||
|
||
class ScipySparseMatrixWriter(PimlicoDatatypeWriter):
    """
    Writer that stores a matrix in Matrix Market format as "array.mtx" in the output
    data directory.
    """
    def set_matrix(self, mat):
        """
        Write the given matrix to "array.mtx" in the data dir.

        :param mat: matrix to store. Anything that is not already a COO matrix is passed
            through the `coo_matrix` constructor first.
        """
        from scipy.sparse import coo_matrix
        from scipy.io import mmwrite

        # Use isinstance rather than an exact type comparison, so that instances of COO
        # subclasses are written as they are instead of being needlessly rebuilt
        if not isinstance(mat, coo_matrix):
            # If this isn't a COO matrix, try converting it
            # Other scipy sparse matrix types and numpy dense arrays can all be converted in this way
            mat = coo_matrix(mat)

        mmwrite(os.path.join(self.data_dir, "array.mtx"), mat)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
27 changes: 27 additions & 0 deletions
27
src/python/pimlico/modules/features/term_feature_matrix_builder/exec.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import numpy | ||
from scipy.sparse.dok import dok_matrix | ||
|
||
from pimlico.core.modules.base import BaseModuleExecutor | ||
from pimlico.datatypes.arrays import ScipySparseMatrixWriter | ||
from pimlico.utils.progress import get_progress_bar | ||
|
||
|
||
class ModuleExecutor(BaseModuleExecutor):
    """
    Executor that accumulates the per-term feature counts of the "data" input into a
    single sparse matrix and writes it to the "matrix" output.
    """
    def execute(self):
        corpus = self.info.get_input("data")

        # Matrix dimensions come from the sizes of the input's two dictionaries
        num_terms = len(corpus.term_dictionary)
        num_features = len(corpus.feature_dictionary)
        num_data_points = len(corpus)

        self.log.info("Collecting features into a %d x %d sparse matrix from %d data points" %
                      (num_terms, num_features, num_data_points))
        progress = get_progress_bar(num_data_points, title="Collecting")

        # DOK format, filled in by incrementing individual cells
        counts = dok_matrix((num_terms, num_features), dtype=numpy.int32)
        # Sum the counts over every data point: the same (term, feature) cell may be hit repeatedly
        for term, feature_counts in progress(corpus):
            for feature, count in feature_counts.items():
                counts[term, feature] += count

        self.log.info("Built matrix: writing to disk")
        output_dir = self.info.get_output_dir("matrix")
        with ScipySparseMatrixWriter(output_dir) as writer:
            # The writer takes care of converting to COO format before storing
            writer.set_matrix(counts)
28 changes: 28 additions & 0 deletions
28
src/python/pimlico/modules/features/term_feature_matrix_builder/info.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from pimlico.core.modules.base import BaseModuleInfo | ||
from pimlico.datatypes.arrays import ScipySparseMatrix | ||
from pimlico.datatypes.features import IndexedTermFeatureListCorpus | ||
|
||
|
||
class ModuleInfo(BaseModuleInfo):
    """
    Module definition for the term-feature matrix builder: takes an
    IndexedTermFeatureListCorpus as its "data" input and produces a ScipySparseMatrix
    as its "matrix" output. The module has no options.
    """
    module_type_name = "term_feature_matrix_builder"
    module_inputs = [
        ("data", IndexedTermFeatureListCorpus)
    ]
    module_outputs = [("matrix", ScipySparseMatrix)]
    module_options = {}

    def check_runtime_dependencies(self):
        """
        Return a list of (name, module name, description) triples for missing dependencies:
        Numpy and Scipy, where they cannot be imported, followed by anything reported by
        the superclass.
        """
        missing = []

        try:
            import numpy
        except ImportError:
            missing.append(("Numpy", self.module_name,
                            "install Numpy systemwide (e.g. package 'python-numpy' on Ubuntu)"))

        try:
            import scipy
        except ImportError:
            missing.append(("Scipy", self.module_name,
                            "install Scipy systemwide (e.g. package 'python-scipy' on Ubuntu)"))

        inherited = super(ModuleInfo, self).check_runtime_dependencies()
        missing.extend(inherited)
        return missing