From ccb12ac890c697e03c123c03797424bd9de84717 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 5 Jan 2019 17:55:26 -0800 Subject: [PATCH] Use chunks="auto" in array creation routines This uses the chunks="auto" option in array creation routines like ones, zeros, random, arange, and so on. ```python >>> import dask.array as da >>> x = da.ones((10000, 10000)) >>> x.chunks ((2500, 2500, 2500, 2500), (2500, 2500, 2500, 2500)) ``` --- dask/array/creation.py | 24 +++------ dask/array/fft.py | 4 +- dask/array/random.py | 80 ++++++++++++++--------------- dask/array/tests/test_array_core.py | 2 - dask/array/tests/test_creation.py | 12 +++-- dask/array/tests/test_random.py | 6 +++ dask/array/wrap.py | 2 +- 7 files changed, 66 insertions(+), 64 deletions(-) diff --git a/dask/array/creation.py b/dask/array/creation.py index 4e179e860ca..814fd80eb5e 100644 --- a/dask/array/creation.py +++ b/dask/array/creation.py @@ -175,7 +175,7 @@ def full_like(a, fill_value, dtype=None, chunks=None): ) -def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks=None, +def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks='auto', dtype=None): """ Return `num` evenly spaced values over the closed interval [`start`, @@ -215,9 +215,6 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks=None, """ num = int(num) - if chunks is None: - raise ValueError("Must supply a chunks= keyword argument") - chunks = normalize_chunks(chunks, (num,)) range_ = stop - start @@ -296,17 +293,16 @@ def arange(*args, **kwargs): arange takes 3 positional arguments: arange([start], stop, [step]) ''') - try: - chunks = kwargs.pop('chunks') - except KeyError: - raise TypeError("Required argument 'chunks' not found") + chunks = kwargs.pop('chunks', 'auto') num = int(max(np.ceil((stop - start) / step), 0)) - chunks = normalize_chunks(chunks, (num,)) dtype = kwargs.pop('dtype', None) if dtype is None: dtype = np.arange(start, stop, step * num if num else step).dtype + + chunks = normalize_chunks(chunks, (num,), dtype=dtype) + if kwargs: raise TypeError("Unexpected keyword argument(s): %s" % ",".join(kwargs.keys())) @@ -364,7 +360,7 @@ def meshgrid(*xi, **kwargs): return grid -def indices(dimensions, dtype=int, chunks=None): +def indices(dimensions, dtype=int, chunks='auto'): """ Implements NumPy's ``indices`` for Dask Arrays. @@ -388,9 +384,6 @@ def indices(dimensions, dtype=int, chunks=None): ------- grid : dask array """ - if chunks is None: - raise ValueError("Must supply a chunks= keyword argument") - dimensions = tuple(dimensions) dtype = np.dtype(dtype) chunks = tuple(chunks) @@ -613,9 +606,8 @@ def offset_func(*args, **kwargs): @wraps(np.fromfunction) -def fromfunction(func, chunks=None, shape=None, dtype=None, **kwargs): - if chunks: - chunks = normalize_chunks(chunks, shape) +def fromfunction(func, chunks='auto', shape=None, dtype=None, **kwargs): + chunks = normalize_chunks(chunks, shape) name = 'fromfunction-' + tokenize(func, chunks, shape, dtype, kwargs) keys = list(product([name], *[range(len(bd)) for bd in chunks])) aggdims = [list(accumulate(add, (0,) + bd[:-1])) for bd in chunks] diff --git a/dask/array/fft.py b/dask/array/fft.py index 7b5ab266f4c..e5611c981fd 100644 --- a/dask/array/fft.py +++ b/dask/array/fft.py @@ -237,7 +237,7 @@ def _fftfreq_block(i, n, d): @wraps(np.fft.fftfreq) -def fftfreq(n, d=1.0, chunks=None): +def fftfreq(n, d=1.0, chunks='auto'): n = int(n) d = float(d) @@ -247,7 +247,7 @@ def fftfreq(n, d=1.0, chunks=None): @wraps(np.fft.rfftfreq) -def rfftfreq(n, d=1.0, chunks=None): +def rfftfreq(n, d=1.0, chunks='auto'): n = int(n) d = float(d) diff --git a/dask/array/random.py b/dask/array/random.py index e3dbd41a157..5ca3b257fa6 100644 --- a/dask/array/random.py +++ b/dask/array/random.py @@ -67,7 +67,7 @@ def _wrap(self, funcname, *args, **kwargs): extra_chunks should be a chunks tuple to append to the end of chunks """ size = kwargs.pop('size', None) - chunks = kwargs.pop('chunks') + chunks = kwargs.pop('chunks', 'auto') extra_chunks = kwargs.pop('extra_chunks', ()) if size is not None and not isinstance(size, (tuple, list)): @@ -177,20 +177,20 @@ def _broadcast_any(ar, shape, chunks): return Array(graph, name, chunks + extra_chunks, dtype=dtype) @doc_wraps(np.random.RandomState.beta) - def beta(self, a, b, size=None, chunks=None): + def beta(self, a, b, size=None, chunks="auto"): return self._wrap('beta', a, b, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.binomial) - def binomial(self, n, p, size=None, chunks=None): + def binomial(self, n, p, size=None, chunks="auto"): return self._wrap('binomial', n, p, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.chisquare) - def chisquare(self, df, size=None, chunks=None): + def chisquare(self, df, size=None, chunks="auto"): return self._wrap('chisquare', df, size=size, chunks=chunks) with ignoring(AttributeError): @doc_wraps(np.random.RandomState.choice) - def choice(self, a, size=None, replace=True, p=None, chunks=None): + def choice(self, a, size=None, replace=True, p=None, chunks="auto"): dependencies = [] # Normalize and validate `a` if isinstance(a, Integral): @@ -254,146 +254,146 @@ def choice(self, a, size=None, replace=True, p=None, chunks=None): return Array(graph, name, chunks, dtype=dtype) # @doc_wraps(np.random.RandomState.dirichlet) - # def dirichlet(self, alpha, size=None, chunks=None): + # def dirichlet(self, alpha, size=None, chunks="auto"): @doc_wraps(np.random.RandomState.exponential) - def exponential(self, scale=1.0, size=None, chunks=None): + def exponential(self, scale=1.0, size=None, chunks="auto"): return self._wrap('exponential', scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.f) - def f(self, dfnum, dfden, size=None, chunks=None): + def f(self, dfnum, dfden, size=None, chunks="auto"): return self._wrap('f', dfnum, dfden, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.gamma) - def gamma(self, shape, scale=1.0, size=None, chunks=None): + def gamma(self, shape, scale=1.0, size=None, chunks="auto"): return self._wrap('gamma', shape, scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.geometric) - def geometric(self, p, size=None, chunks=None): + def geometric(self, p, size=None, chunks="auto"): return self._wrap('geometric', p, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.gumbel) - def gumbel(self, loc=0.0, scale=1.0, size=None, chunks=None): + def gumbel(self, loc=0.0, scale=1.0, size=None, chunks="auto"): return self._wrap('gumbel', loc, scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.hypergeometric) - def hypergeometric(self, ngood, nbad, nsample, size=None, chunks=None): + def hypergeometric(self, ngood, nbad, nsample, size=None, chunks="auto"): return self._wrap('hypergeometric', ngood, nbad, nsample, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.laplace) - def laplace(self, loc=0.0, scale=1.0, size=None, chunks=None): + def laplace(self, loc=0.0, scale=1.0, size=None, chunks="auto"): return self._wrap('laplace', loc, scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.logistic) - def logistic(self, loc=0.0, scale=1.0, size=None, chunks=None): + def logistic(self, loc=0.0, scale=1.0, size=None, chunks="auto"): return self._wrap('logistic', loc, scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.lognormal) - def lognormal(self, mean=0.0, sigma=1.0, size=None, chunks=None): + def lognormal(self, mean=0.0, sigma=1.0, size=None, chunks="auto"): return self._wrap('lognormal', mean, sigma, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.logseries) - def logseries(self, p, size=None, chunks=None): + def logseries(self, p, size=None, chunks="auto"): return self._wrap('logseries', p, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.multinomial) - def multinomial(self, n, pvals, size=None, chunks=None): + def multinomial(self, n, pvals, size=None, chunks="auto"): return self._wrap('multinomial', n, pvals, size=size, chunks=chunks, extra_chunks=((len(pvals),),)) @doc_wraps(np.random.RandomState.negative_binomial) - def negative_binomial(self, n, p, size=None, chunks=None): + def negative_binomial(self, n, p, size=None, chunks="auto"): return self._wrap('negative_binomial', n, p, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.noncentral_chisquare) - def noncentral_chisquare(self, df, nonc, size=None, chunks=None): + def noncentral_chisquare(self, df, nonc, size=None, chunks="auto"): return self._wrap('noncentral_chisquare', df, nonc, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.noncentral_f) - def noncentral_f(self, dfnum, dfden, nonc, size=None, chunks=None): + def noncentral_f(self, dfnum, dfden, nonc, size=None, chunks="auto"): return self._wrap('noncentral_f', dfnum, dfden, nonc, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.normal) - def normal(self, loc=0.0, scale=1.0, size=None, chunks=None): + def normal(self, loc=0.0, scale=1.0, size=None, chunks="auto"): return self._wrap('normal', loc, scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.pareto) - def pareto(self, a, size=None, chunks=None): + def pareto(self, a, size=None, chunks="auto"): return self._wrap('pareto', a, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.poisson) - def poisson(self, lam=1.0, size=None, chunks=None): + def poisson(self, lam=1.0, size=None, chunks="auto"): return self._wrap('poisson', lam, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.power) - def power(self, a, size=None, chunks=None): + def power(self, a, size=None, chunks="auto"): return self._wrap('power', a, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.randint) - def randint(self, low, high=None, size=None, chunks=None): + def randint(self, low, high=None, size=None, chunks="auto"): return self._wrap('randint', low, high, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.random_integers) - def random_integers(self, low, high=None, size=None, chunks=None): + def random_integers(self, low, high=None, size=None, chunks="auto"): return self._wrap('random_integers', low, high, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.random_sample) - def random_sample(self, size=None, chunks=None): + def random_sample(self, size=None, chunks="auto"): return self._wrap('random_sample', size=size, chunks=chunks) random = random_sample @doc_wraps(np.random.RandomState.rayleigh) - def rayleigh(self, scale=1.0, size=None, chunks=None): + def rayleigh(self, scale=1.0, size=None, chunks="auto"): return self._wrap('rayleigh', scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.standard_cauchy) - def standard_cauchy(self, size=None, chunks=None): + def standard_cauchy(self, size=None, chunks="auto"): return self._wrap('standard_cauchy', size=size, chunks=chunks) @doc_wraps(np.random.RandomState.standard_exponential) - def standard_exponential(self, size=None, chunks=None): + def standard_exponential(self, size=None, chunks="auto"): return self._wrap('standard_exponential', size=size, chunks=chunks) @doc_wraps(np.random.RandomState.standard_gamma) - def standard_gamma(self, shape, size=None, chunks=None): + def standard_gamma(self, shape, size=None, chunks="auto"): return self._wrap('standard_gamma', shape, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.standard_normal) - def standard_normal(self, size=None, chunks=None): + def standard_normal(self, size=None, chunks="auto"): return self._wrap('standard_normal', size=size, chunks=chunks) @doc_wraps(np.random.RandomState.standard_t) - def standard_t(self, df, size=None, chunks=None): + def standard_t(self, df, size=None, chunks="auto"): return self._wrap('standard_t', df, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.tomaxint) - def tomaxint(self, size=None, chunks=None): + def tomaxint(self, size=None, chunks="auto"): return self._wrap('tomaxint', size=size, chunks=chunks) @doc_wraps(np.random.RandomState.triangular) - def triangular(self, left, mode, right, size=None, chunks=None): + def triangular(self, left, mode, right, size=None, chunks="auto"): return self._wrap('triangular', left, mode, right, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.uniform) - def uniform(self, low=0.0, high=1.0, size=None, chunks=None): + def uniform(self, low=0.0, high=1.0, size=None, chunks="auto"): return self._wrap('uniform', low, high, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.vonmises) - def vonmises(self, mu, kappa, size=None, chunks=None): + def vonmises(self, mu, kappa, size=None, chunks="auto"): return self._wrap('vonmises', mu, kappa, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.wald) - def wald(self, mean, scale, size=None, chunks=None): + def wald(self, mean, scale, size=None, chunks="auto"): return self._wrap('wald', mean, scale, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.weibull) - def weibull(self, a, size=None, chunks=None): + def weibull(self, a, size=None, chunks="auto"): return self._wrap('weibull', a, size=size, chunks=chunks) @doc_wraps(np.random.RandomState.zipf) - def zipf(self, a, size=None, chunks=None): + def zipf(self, a, size=None, chunks="auto"): return self._wrap('zipf', a, size=size, chunks=chunks) diff --git a/dask/array/tests/test_array_core.py b/dask/array/tests/test_array_core.py index 5ff859c5c3a..37602c8b8b9 100644 --- a/dask/array/tests/test_array_core.py +++ b/dask/array/tests/test_array_core.py @@ -2321,8 +2321,6 @@ def test_raise_on_no_chunks(): assert "dask" in str(e) assert ".org" in str(e) - pytest.raises(ValueError, lambda: da.ones(6)) - def test_chunks_is_immutable(): x = da.ones(6, chunks=3) diff --git a/dask/array/tests/test_creation.py b/dask/array/tests/test_creation.py index e172de8603d..e1269f1d93b 100644 --- a/dask/array/tests/test_creation.py +++ b/dask/array/tests/test_creation.py @@ -5,6 +5,7 @@ import pytest from toolz import concat +import dask import dask.array as da from dask.array.utils import assert_eq, same_keys @@ -131,9 +132,8 @@ def test_arange(): with pytest.raises(TypeError) as exc: da.arange(10, chunks=-1, whatsthis=1) assert 'whatsthis' in str(exc) - with pytest.raises(TypeError) as exc: - da.arange(10) - assert 'chunks' in str(exc) + + assert da.arange(10).chunks == ((10,),) @pytest.mark.parametrize("start,stop,step,dtype", [ @@ -529,3 +529,9 @@ def udf_pad(vector, pad_width, iaxis, kwargs): da_r = da.pad(da_a, pad_width, udf_pad, kwargs=kwargs) assert_eq(np_r, da_r) + + +def test_auto_chunks(): + with dask.config.set({'array.chunk-size': '50 MiB'}): + x = da.ones((10000, 10000)) + assert 4 < x.npartitions < 32 diff --git a/dask/array/tests/test_random.py b/dask/array/tests/test_random.py index 5a0b53bf4cf..5fb608fd886 100644 --- a/dask/array/tests/test_random.py +++ b/dask/array/tests/test_random.py @@ -312,3 +312,9 @@ def test_external_randomstate_class(): b = rs.normal(0, 1, size=(10), chunks=(5,)) assert a.name == b.name assert_eq(a, b) + + +def test_auto_chunks(): + with dask.config.set({'array.chunk-size': '50 MiB'}): + x = da.random.random((10000, 10000)) + assert 4 < x.npartitions < 32 diff --git a/dask/array/wrap.py b/dask/array/wrap.py index a7e51bdda08..209db5ffba2 100644 --- a/dask/array/wrap.py +++ b/dask/array/wrap.py @@ -30,7 +30,7 @@ def wrap_func_shape_as_first_arg(func, *args, **kwargs): if not isinstance(shape, (tuple, list)): shape = (shape,) - chunks = kwargs.pop('chunks', None) + chunks = kwargs.pop('chunks', 'auto') dtype = kwargs.pop('dtype', None) if dtype is None: