Use chunks="auto" in array creation routines

This uses the chunks="auto" option in array creation routines like ones, zeros, random, arange, and so on.

```python
>>> import dask.array as da
>>> x = da.ones((10000, 10000))
>>> x.chunks
((2500, 2500, 2500, 2500), (2500, 2500, 2500, 2500))
```
mrocklin · Jan 6, 2019 · ccb12ac · ccb12ac
1 parent 2ca205b
commit ccb12ac
Show file tree

Hide file tree

Showing 7 changed files with 66 additions and 64 deletions.
diff --git a/dask/array/creation.py b/dask/array/creation.py
@@ -175,7 +175,7 @@ def full_like(a, fill_value, dtype=None, chunks=None):
     )
 
 
-def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks=None,
+def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks='auto',
              dtype=None):
     """
     Return `num` evenly spaced values over the closed interval [`start`,
@@ -215,9 +215,6 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks=None,
     """
     num = int(num)
 
-    if chunks is None:
-        raise ValueError("Must supply a chunks= keyword argument")
-
     chunks = normalize_chunks(chunks, (num,))
 
     range_ = stop - start
@@ -296,17 +293,16 @@ def arange(*args, **kwargs):
         arange takes 3 positional arguments: arange([start], stop, [step])
         ''')
 
-    try:
-        chunks = kwargs.pop('chunks')
-    except KeyError:
-        raise TypeError("Required argument 'chunks' not found")
+    chunks = kwargs.pop('chunks', 'auto')
 
     num = int(max(np.ceil((stop - start) / step), 0))
-    chunks = normalize_chunks(chunks, (num,))
 
     dtype = kwargs.pop('dtype', None)
     if dtype is None:
         dtype = np.arange(start, stop, step * num if num else step).dtype
+
+    chunks = normalize_chunks(chunks, (num,), dtype=dtype)
+
     if kwargs:
         raise TypeError("Unexpected keyword argument(s): %s" %
                         ",".join(kwargs.keys()))
@@ -364,7 +360,7 @@ def meshgrid(*xi, **kwargs):
     return grid
 
 
-def indices(dimensions, dtype=int, chunks=None):
+def indices(dimensions, dtype=int, chunks='auto'):
     """
     Implements NumPy's ``indices`` for Dask Arrays.
 
@@ -388,9 +384,6 @@ def indices(dimensions, dtype=int, chunks=None):
     -------
     grid : dask array
     """
-    if chunks is None:
-        raise ValueError("Must supply a chunks= keyword argument")
-
     dimensions = tuple(dimensions)
     dtype = np.dtype(dtype)
     chunks = tuple(chunks)
@@ -613,9 +606,8 @@ def offset_func(*args, **kwargs):
 
 
 @wraps(np.fromfunction)
-def fromfunction(func, chunks=None, shape=None, dtype=None, **kwargs):
-    if chunks:
-        chunks = normalize_chunks(chunks, shape)
+def fromfunction(func, chunks='auto', shape=None, dtype=None, **kwargs):
+    chunks = normalize_chunks(chunks, shape)
     name = 'fromfunction-' + tokenize(func, chunks, shape, dtype, kwargs)
     keys = list(product([name], *[range(len(bd)) for bd in chunks]))
     aggdims = [list(accumulate(add, (0,) + bd[:-1])) for bd in chunks]

diff --git a/dask/array/fft.py b/dask/array/fft.py
@@ -237,7 +237,7 @@ def _fftfreq_block(i, n, d):
 
 
 @wraps(np.fft.fftfreq)
-def fftfreq(n, d=1.0, chunks=None):
+def fftfreq(n, d=1.0, chunks='auto'):
     n = int(n)
     d = float(d)
 
@@ -247,7 +247,7 @@ def fftfreq(n, d=1.0, chunks=None):
 
 
 @wraps(np.fft.rfftfreq)
-def rfftfreq(n, d=1.0, chunks=None):
+def rfftfreq(n, d=1.0, chunks='auto'):
     n = int(n)
     d = float(d)
 

diff --git a/dask/array/random.py b/dask/array/random.py
@@ -67,7 +67,7 @@ def _wrap(self, funcname, *args, **kwargs):
         extra_chunks should be a chunks tuple to append to the end of chunks
         """
         size = kwargs.pop('size', None)
-        chunks = kwargs.pop('chunks')
+        chunks = kwargs.pop('chunks', 'auto')
         extra_chunks = kwargs.pop('extra_chunks', ())
 
         if size is not None and not isinstance(size, (tuple, list)):
@@ -177,20 +177,20 @@ def _broadcast_any(ar, shape, chunks):
         return Array(graph, name, chunks + extra_chunks, dtype=dtype)
 
     @doc_wraps(np.random.RandomState.beta)
-    def beta(self, a, b, size=None, chunks=None):
+    def beta(self, a, b, size=None, chunks="auto"):
         return self._wrap('beta', a, b, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.binomial)
-    def binomial(self, n, p, size=None, chunks=None):
+    def binomial(self, n, p, size=None, chunks="auto"):
         return self._wrap('binomial', n, p, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.chisquare)
-    def chisquare(self, df, size=None, chunks=None):
+    def chisquare(self, df, size=None, chunks="auto"):
         return self._wrap('chisquare', df, size=size, chunks=chunks)
 
     with ignoring(AttributeError):
         @doc_wraps(np.random.RandomState.choice)
-        def choice(self, a, size=None, replace=True, p=None, chunks=None):
+        def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
             dependencies = []
             # Normalize and validate `a`
             if isinstance(a, Integral):
@@ -254,146 +254,146 @@ def choice(self, a, size=None, replace=True, p=None, chunks=None):
             return Array(graph, name, chunks, dtype=dtype)
 
     # @doc_wraps(np.random.RandomState.dirichlet)
-    # def dirichlet(self, alpha, size=None, chunks=None):
+    # def dirichlet(self, alpha, size=None, chunks="auto"):
 
     @doc_wraps(np.random.RandomState.exponential)
-    def exponential(self, scale=1.0, size=None, chunks=None):
+    def exponential(self, scale=1.0, size=None, chunks="auto"):
         return self._wrap('exponential', scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.f)
-    def f(self, dfnum, dfden, size=None, chunks=None):
+    def f(self, dfnum, dfden, size=None, chunks="auto"):
         return self._wrap('f', dfnum, dfden, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.gamma)
-    def gamma(self, shape, scale=1.0, size=None, chunks=None):
+    def gamma(self, shape, scale=1.0, size=None, chunks="auto"):
         return self._wrap('gamma', shape, scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.geometric)
-    def geometric(self, p, size=None, chunks=None):
+    def geometric(self, p, size=None, chunks="auto"):
         return self._wrap('geometric', p, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.gumbel)
-    def gumbel(self, loc=0.0, scale=1.0, size=None, chunks=None):
+    def gumbel(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
         return self._wrap('gumbel', loc, scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.hypergeometric)
-    def hypergeometric(self, ngood, nbad, nsample, size=None, chunks=None):
+    def hypergeometric(self, ngood, nbad, nsample, size=None, chunks="auto"):
         return self._wrap('hypergeometric', ngood, nbad, nsample,
                           size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.laplace)
-    def laplace(self, loc=0.0, scale=1.0, size=None, chunks=None):
+    def laplace(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
         return self._wrap('laplace', loc, scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.logistic)
-    def logistic(self, loc=0.0, scale=1.0, size=None, chunks=None):
+    def logistic(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
         return self._wrap('logistic', loc, scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.lognormal)
-    def lognormal(self, mean=0.0, sigma=1.0, size=None, chunks=None):
+    def lognormal(self, mean=0.0, sigma=1.0, size=None, chunks="auto"):
         return self._wrap('lognormal', mean, sigma, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.logseries)
-    def logseries(self, p, size=None, chunks=None):
+    def logseries(self, p, size=None, chunks="auto"):
         return self._wrap('logseries', p, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.multinomial)
-    def multinomial(self, n, pvals, size=None, chunks=None):
+    def multinomial(self, n, pvals, size=None, chunks="auto"):
         return self._wrap('multinomial', n, pvals, size=size, chunks=chunks,
                           extra_chunks=((len(pvals),),))
 
     @doc_wraps(np.random.RandomState.negative_binomial)
-    def negative_binomial(self, n, p, size=None, chunks=None):
+    def negative_binomial(self, n, p, size=None, chunks="auto"):
         return self._wrap('negative_binomial', n, p, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.noncentral_chisquare)
-    def noncentral_chisquare(self, df, nonc, size=None, chunks=None):
+    def noncentral_chisquare(self, df, nonc, size=None, chunks="auto"):
         return self._wrap('noncentral_chisquare', df, nonc, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.noncentral_f)
-    def noncentral_f(self, dfnum, dfden, nonc,  size=None, chunks=None):
+    def noncentral_f(self, dfnum, dfden, nonc,  size=None, chunks="auto"):
         return self._wrap('noncentral_f', dfnum, dfden, nonc, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.normal)
-    def normal(self, loc=0.0, scale=1.0, size=None, chunks=None):
+    def normal(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
         return self._wrap('normal', loc, scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.pareto)
-    def pareto(self, a, size=None, chunks=None):
+    def pareto(self, a, size=None, chunks="auto"):
         return self._wrap('pareto', a, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.poisson)
-    def poisson(self, lam=1.0, size=None, chunks=None):
+    def poisson(self, lam=1.0, size=None, chunks="auto"):
         return self._wrap('poisson', lam, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.power)
-    def power(self, a, size=None, chunks=None):
+    def power(self, a, size=None, chunks="auto"):
         return self._wrap('power', a, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.randint)
-    def randint(self, low, high=None, size=None, chunks=None):
+    def randint(self, low, high=None, size=None, chunks="auto"):
         return self._wrap('randint', low, high, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.random_integers)
-    def random_integers(self, low, high=None, size=None, chunks=None):
+    def random_integers(self, low, high=None, size=None, chunks="auto"):
         return self._wrap('random_integers', low, high, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.random_sample)
-    def random_sample(self, size=None, chunks=None):
+    def random_sample(self, size=None, chunks="auto"):
         return self._wrap('random_sample', size=size, chunks=chunks)
 
     random = random_sample
 
     @doc_wraps(np.random.RandomState.rayleigh)
-    def rayleigh(self, scale=1.0, size=None, chunks=None):
+    def rayleigh(self, scale=1.0, size=None, chunks="auto"):
         return self._wrap('rayleigh', scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.standard_cauchy)
-    def standard_cauchy(self, size=None, chunks=None):
+    def standard_cauchy(self, size=None, chunks="auto"):
         return self._wrap('standard_cauchy', size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.standard_exponential)
-    def standard_exponential(self, size=None, chunks=None):
+    def standard_exponential(self, size=None, chunks="auto"):
         return self._wrap('standard_exponential', size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.standard_gamma)
-    def standard_gamma(self, shape, size=None, chunks=None):
+    def standard_gamma(self, shape, size=None, chunks="auto"):
         return self._wrap('standard_gamma', shape, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.standard_normal)
-    def standard_normal(self, size=None, chunks=None):
+    def standard_normal(self, size=None, chunks="auto"):
         return self._wrap('standard_normal', size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.standard_t)
-    def standard_t(self, df, size=None, chunks=None):
+    def standard_t(self, df, size=None, chunks="auto"):
         return self._wrap('standard_t', df, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.tomaxint)
-    def tomaxint(self, size=None, chunks=None):
+    def tomaxint(self, size=None, chunks="auto"):
         return self._wrap('tomaxint', size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.triangular)
-    def triangular(self, left, mode, right, size=None, chunks=None):
+    def triangular(self, left, mode, right, size=None, chunks="auto"):
         return self._wrap('triangular', left, mode, right, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.uniform)
-    def uniform(self, low=0.0, high=1.0, size=None, chunks=None):
+    def uniform(self, low=0.0, high=1.0, size=None, chunks="auto"):
         return self._wrap('uniform', low, high, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.vonmises)
-    def vonmises(self, mu, kappa, size=None, chunks=None):
+    def vonmises(self, mu, kappa, size=None, chunks="auto"):
         return self._wrap('vonmises', mu, kappa, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.wald)
-    def wald(self, mean, scale, size=None, chunks=None):
+    def wald(self, mean, scale, size=None, chunks="auto"):
         return self._wrap('wald', mean, scale, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.weibull)
-    def weibull(self, a, size=None, chunks=None):
+    def weibull(self, a, size=None, chunks="auto"):
         return self._wrap('weibull', a, size=size, chunks=chunks)
 
     @doc_wraps(np.random.RandomState.zipf)
-    def zipf(self, a, size=None, chunks=None):
+    def zipf(self, a, size=None, chunks="auto"):
         return self._wrap('zipf', a, size=size, chunks=chunks)
 
 

diff --git a/dask/array/tests/test_array_core.py b/dask/array/tests/test_array_core.py
@@ -2321,8 +2321,6 @@ def test_raise_on_no_chunks():
         assert "dask" in str(e)
         assert ".org" in str(e)
 
-    pytest.raises(ValueError, lambda: da.ones(6))
-
 
 def test_chunks_is_immutable():
     x = da.ones(6, chunks=3)

diff --git a/dask/array/tests/test_creation.py b/dask/array/tests/test_creation.py
@@ -5,6 +5,7 @@
 import pytest
 from toolz import concat
 
+import dask
 import dask.array as da
 from dask.array.utils import assert_eq, same_keys
 
@@ -131,9 +132,8 @@ def test_arange():
     with pytest.raises(TypeError) as exc:
         da.arange(10, chunks=-1, whatsthis=1)
     assert 'whatsthis' in str(exc)
-    with pytest.raises(TypeError) as exc:
-        da.arange(10)
-    assert 'chunks' in str(exc)
+
+    assert da.arange(10).chunks == ((10,),)
 
 
 @pytest.mark.parametrize("start,stop,step,dtype", [
@@ -529,3 +529,9 @@ def udf_pad(vector, pad_width, iaxis, kwargs):
     da_r = da.pad(da_a, pad_width, udf_pad, kwargs=kwargs)
 
     assert_eq(np_r, da_r)
+
+
+def test_auto_chunks():
+    with dask.config.set({'array.chunk-size': '50 MiB'}):
+        x = da.ones((10000, 10000))
+        assert 4 < x.npartitions < 32
diff --git a/dask/array/tests/test_random.py b/dask/array/tests/test_random.py
@@ -312,3 +312,9 @@ def test_external_randomstate_class():
     b = rs.normal(0, 1, size=(10), chunks=(5,))
     assert a.name == b.name
     assert_eq(a, b)
+
+
+def test_auto_chunks():
+    with dask.config.set({'array.chunk-size': '50 MiB'}):
+        x = da.random.random((10000, 10000))
+        assert 4 < x.npartitions < 32
diff --git a/dask/array/wrap.py b/dask/array/wrap.py
@@ -30,7 +30,7 @@ def wrap_func_shape_as_first_arg(func, *args, **kwargs):
     if not isinstance(shape, (tuple, list)):
         shape = (shape,)
 
-    chunks = kwargs.pop('chunks', None)
+    chunks = kwargs.pop('chunks', 'auto')
 
     dtype = kwargs.pop('dtype', None)
     if dtype is None: