
ENH Ducktyping to allow for alternative Memory implementations (sciki…

thechargedneutron authored and maskani-moh committed Aug 30, 2017
1 parent 337fc9f commit 2dc223856ca35082868f1c8b0e33f5eef23c83a2
@@ -43,6 +43,11 @@ should be used when applicable.
be sliced or indexed using safe_index. This is used to validate input for
cross-validation.
- :func:`validation.check_memory` checks that input is ``joblib.Memory``-like,
which means that it can be converted into a
``sklearn.externals.joblib.Memory`` instance (typically a str denoting
the ``cachedir``) or has the same interface.
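To illustrate the bullet above, here is a minimal sketch of an alternative, duck-typed memory. The ``DictMemory`` name and its behaviour are purely illustrative, not part of this commit: any object exposing a ``cache`` method satisfies the interface and passes ``check_memory``.

```python
from sklearn.utils.validation import check_memory

class DictMemory(object):
    """Illustrative Memory-like object: memoizes results in a plain dict."""
    def __init__(self):
        self._store = {}

    def cache(self, func):
        # Sketch only: assumes hashable arguments; a real implementation
        # would hash arrays the way joblib.Memory does.
        def cached(*args):
            key = (func.__name__, args)
            if key not in self._store:
                self._store[key] = func(*args)
            return self._store[key]
        return cached

memory = check_memory(DictMemory())  # accepted: the object has a ``cache`` method
```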
If your code relies on a random number generator, it should never use
functions like ``numpy.random.random`` or ``numpy.random.normal``. This
approach can lead to repeatability issues in unit tests. Instead, a
@@ -1378,6 +1378,7 @@ Low-level methods
utils.sparsefuncs.inplace_swap_column
utils.sparsefuncs.mean_variance_axis
utils.validation.check_is_fitted
utils.validation.check_memory
utils.validation.check_symmetric
utils.validation.column_or_1d
utils.validation.has_fit_parameter
@@ -15,10 +15,10 @@
from scipy.sparse.csgraph import connected_components
from ..base import BaseEstimator, ClusterMixin
from ..externals.joblib import Memory
from ..externals import six
from ..metrics.pairwise import paired_distances, pairwise_distances
from ..utils import check_array
from ..utils.validation import check_memory
from . import _hierarchical
from ._feature_agglomeration import AgglomerationTransform
@@ -609,8 +609,7 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin):
"manhattan", "cosine", or 'precomputed'.
If linkage is "ward", only "euclidean" is accepted.
memory : Instance of sklearn.externals.joblib.Memory or string, optional \
(default=None)
memory : joblib.Memory-like or string, optional
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
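As a usage sketch of the parameter documented above (the data and the cache path are made up for the example), caching the tree computation now only requires a directory string or any Memory-like object:

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])

# The string is converted internally via check_memory into a joblib.Memory
# that caches the tree computation in that directory (path is illustrative).
model = AgglomerativeClustering(n_clusters=2, memory='/tmp/agglo_cache')
labels = model.fit_predict(X)
```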
@@ -693,16 +692,7 @@ def fit(self, X, y=None):
self
"""
X = check_array(X, ensure_min_samples=2, estimator=self)
memory = self.memory
if memory is None:
memory = Memory(cachedir=None, verbose=0)
elif isinstance(memory, six.string_types):
memory = Memory(cachedir=memory, verbose=0)
elif not isinstance(memory, Memory):
raise ValueError("'memory' should either be a string or"
" a sklearn.externals.joblib.Memory"
" instance, got 'memory={!r}' instead.".format(
type(memory)))
memory = check_memory(self.memory)
if self.n_clusters <= 0:
raise ValueError("n_clusters should be an integer greater than 0."
@@ -779,8 +769,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
"manhattan", "cosine", or 'precomputed'.
If linkage is "ward", only "euclidean" is accepted.
memory : Instance of sklearn.externals.joblib.Memory or string, optional \
(default=None)
memory : joblib.Memory-like or string, optional
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
@@ -19,6 +19,7 @@
from .externals import six
from .utils.metaestimators import if_delegate_has_method
from .utils import Bunch
from .utils.validation import check_memory
from .utils.metaestimators import _BaseComposition
@@ -51,8 +52,7 @@ class Pipeline(_BaseComposition):
chained, in the order in which they are chained, with the last object
an estimator.
memory : Instance of sklearn.external.joblib.Memory or string, optional \
(default=None)
memory : joblib.Memory-like or string, optional
Used to cache the fitted transformers of the pipeline. By default,
no caching is performed. If a string is given, it is the path to
the caching directory. Enabling caching triggers a clone of
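A hedged sketch of the behaviour documented above (estimator choice and cache directory are arbitrary): passing a string enables caching of fitted transformers, while ``None`` keeps the previous uncached behaviour.

```python
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# The string is turned into a joblib.Memory by check_memory; the fitted
# transformer (PCA here) is cached in that directory across fits.
pipe = Pipeline([('reduce', PCA(n_components=2)), ('clf', SVC())],
                memory='/tmp/pipeline_cache')
pipe.fit(X, y)
```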
@@ -186,16 +186,7 @@ def _final_estimator(self):
def _fit(self, X, y=None, **fit_params):
self._validate_steps()
# Setup the memory
memory = self.memory
if memory is None:
memory = Memory(cachedir=None, verbose=0)
elif isinstance(memory, six.string_types):
memory = Memory(cachedir=memory, verbose=0)
elif not isinstance(memory, Memory):
raise ValueError("'memory' should either be a string or"
" a sklearn.externals.joblib.Memory"
" instance, got 'memory={!r}' instead.".format(
type(memory)))
memory = check_memory(self.memory)
fit_transform_one_cached = memory.cache(_fit_transform_one)
@@ -209,7 +200,7 @@ def _fit(self, X, y=None, **fit_params):
if transformer is None:
pass
else:
if memory.cachedir is None:
if hasattr(memory, 'cachedir') and memory.cachedir is None:
# we do not clone when caching is disabled to preserve
# backward compatibility
cloned_transformer = transformer
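The ``hasattr`` guard above exists because a duck-typed memory may provide ``cache`` without exposing a ``cachedir`` attribute; in that case the backward-compatibility branch is simply skipped and the transformer is cloned as for any enabled cache. A minimal sketch (the ``PassThroughMemory`` name is illustrative, mirroring the ``DummyMemory`` used in the tests further down):

```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

class PassThroughMemory(object):
    """Memory-like object exposing ``cache`` but no ``cachedir`` attribute."""
    def cache(self, func):
        # No real caching; just satisfies the joblib.Memory-like interface.
        return func

X = [[0., 0.], [1., 1.], [2., 0.], [3., 1.]]
y = [0, 0, 1, 1]

pipe = Pipeline([('scale', StandardScaler()), ('clf', SVC())],
                memory=PassThroughMemory())
# check_memory accepts the object; since it has no ``cachedir``, the hasattr
# guard evaluates to False and the transformer is cloned before fitting.
pipe.fit(X, y)
```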
@@ -537,8 +528,7 @@ def make_pipeline(*steps, **kwargs):
----------
*steps : list of estimators,
memory : Instance of sklearn.externals.joblib.Memory or string, optional \
(default=None)
memory : joblib.Memory-like or string, optional
Used to cache the fitted transformers of the pipeline. By default,
no caching is performed. If a string is given, it is the path to
the caching directory. Enabling caching triggers a clone of
@@ -868,9 +868,33 @@ def test_pipeline_wrong_memory():
memory = 1
cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())],
memory=memory)
assert_raises_regex(ValueError, "'memory' should either be a string or a"
" sklearn.externals.joblib.Memory instance, got",
cached_pipe.fit, X, y)
assert_raises_regex(ValueError, "'memory' should be None, a string or"
" have the same interface as "
"sklearn.externals.joblib.Memory."
" Got memory='1' instead.", cached_pipe.fit, X, y)
class DummyMemory(object):
def cache(self, func):
return func
class WrongDummyMemory(object):
pass
def test_pipeline_with_cache_attribute():
X = np.array([[1, 2]])
pipe = Pipeline([('transf', Transf()), ('clf', Mult())],
memory=DummyMemory())
pipe.fit(X, y=None)
dummy = WrongDummyMemory()
pipe = Pipeline([('transf', Transf()), ('clf', Mult())],
memory=dummy)
assert_raises_regex(ValueError, "'memory' should be None, a string or"
" have the same interface as "
"sklearn.externals.joblib.Memory."
" Got memory='{}' instead.".format(dummy), pipe.fit, X)
def test_pipeline_memory():
@@ -1,6 +1,7 @@
"""Tests for input validation functions"""
import warnings
import os
from tempfile import NamedTemporaryFile
from itertools import product
@@ -10,7 +11,8 @@
import scipy.sparse as sp
from sklearn.utils.testing import assert_true, assert_false, assert_equal
from sklearn.utils.testing import assert_raises, assert_raises_regexp
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_warns
@@ -31,6 +33,7 @@
check_is_fitted,
check_consistent_length,
assert_all_finite,
check_memory
)
import sklearn
@@ -39,6 +42,7 @@
from sklearn.utils.testing import assert_raise_message
def test_as_float_array():
# Test function for as_float_array
X = np.ones((3, 10), dtype=np.int32)
@@ -506,17 +510,17 @@ def test_check_consistent_length():
check_consistent_length([1], [2], [3], [4], [5])
check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))
assert_raises_regexp(ValueError, 'inconsistent numbers of samples',
check_consistent_length, [1, 2], [1])
assert_raises_regexp(TypeError, 'got <\w+ \'int\'>',
check_consistent_length, [1, 2], 1)
assert_raises_regexp(TypeError, 'got <\w+ \'object\'>',
check_consistent_length, [1, 2], object())
assert_raises_regex(ValueError, 'inconsistent numbers of samples',
check_consistent_length, [1, 2], [1])
assert_raises_regex(TypeError, 'got <\w+ \'int\'>',
check_consistent_length, [1, 2], 1)
assert_raises_regex(TypeError, 'got <\w+ \'object\'>',
check_consistent_length, [1, 2], object())
assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))
# Despite ensembles having __len__ they must raise TypeError
assert_raises_regexp(TypeError, 'estimator', check_consistent_length,
[1, 2], RandomForestRegressor())
assert_raises_regex(TypeError, 'estimator', check_consistent_length,
[1, 2], RandomForestRegressor())
# XXX: We should have a test with a string, but what is correct behaviour?
@@ -539,3 +543,31 @@ def test_suppress_validation():
assert_all_finite(X)
sklearn.set_config(assume_finite=False)
assert_raises(ValueError, assert_all_finite, X)
class DummyMemory(object):
def cache(self, func):
return func
class WrongDummyMemory(object):
pass
def test_check_memory():
memory = check_memory("cache_directory")
assert_equal(memory.cachedir, os.path.join('cache_directory', 'joblib'))
memory = check_memory(None)
assert_equal(memory.cachedir, None)
dummy = DummyMemory()
memory = check_memory(dummy)
assert memory is dummy
assert_raises_regex(ValueError, "'memory' should be None, a string or"
" have the same interface as "
"sklearn.externals.joblib.Memory."
" Got memory='1' instead.", check_memory, 1)
dummy = WrongDummyMemory()
assert_raises_regex(ValueError, "'memory' should be None, a string or"
" have the same interface as "
"sklearn.externals.joblib.Memory. Got memory='{}' "
"instead.".format(dummy), check_memory, dummy)
@@ -20,6 +20,7 @@
from ..exceptions import NonBLASDotWarning
from ..exceptions import NotFittedError
from ..exceptions import DataConversionWarning
from ..externals.joblib import Memory
FLOAT_DTYPES = (np.float64, np.float32, np.float16)
@@ -155,6 +156,36 @@ def _shape_repr(shape):
return "(%s)" % joined
def check_memory(memory):
"""Check that ``memory`` is joblib.Memory-like.
joblib.Memory-like means that ``memory`` can be converted into a
sklearn.externals.joblib.Memory instance (typically a str denoting the
``cachedir``) or has the same interface (has a ``cache`` method).
Parameters
----------
memory : joblib.Memory-like or string or None
Returns
-------
memory : object with the joblib.Memory interface
Raises
------
ValueError
If ``memory`` is not joblib.Memory-like.
"""
if memory is None or isinstance(memory, six.string_types):
memory = Memory(cachedir=memory, verbose=0)
elif not hasattr(memory, 'cache'):
raise ValueError("'memory' should be None, a string or have the same"
" interface as sklearn.externals.joblib.Memory."
" Got memory='{}' instead.".format(memory))
return memory
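A short, illustrative rundown of the three accepted inputs and the failure mode of the function above (the cache path and the ``MyMemory`` class are arbitrary examples):

```python
from sklearn.externals.joblib import Memory
from sklearn.utils.validation import check_memory

mem = check_memory('/tmp/my_cache')   # str -> Memory caching under that path
assert isinstance(mem, Memory)

mem = check_memory(None)              # Memory instance with caching disabled
assert mem.cachedir is None

class MyMemory(object):               # duck-typed: only ``cache`` is required
    def cache(self, func):
        return func

mem = check_memory(MyMemory())        # returned unchanged
# check_memory(1) would raise ValueError, since 1 has no ``cache`` method
```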
def check_consistent_length(*arrays):
"""Check that all arrays have consistent first dimensions.
