Skip to content

Commit

Permalink
Use chunks="auto" in array creation routines
Browse files: browse the repository at this point in the history.
This makes chunks="auto" the default in array creation routines such as
ones, zeros, random, arange, and so on, so the chunks= keyword no longer
has to be passed explicitly.

```python
>>> import dask.array as da

>>> x = da.ones((10000, 10000))
>>> x.chunks
((2500, 2500, 2500, 2500), (2500, 2500, 2500, 2500))
```
  • Loading branch information
mrocklin committed Jan 6, 2019
1 parent 2ca205b commit ccb12ac
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 64 deletions.
24 changes: 8 additions & 16 deletions dask/array/creation.py
Expand Up @@ -175,7 +175,7 @@ def full_like(a, fill_value, dtype=None, chunks=None):
)


def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks=None,
def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks='auto',
dtype=None):
"""
Return `num` evenly spaced values over the closed interval [`start`,
Expand Down Expand Up @@ -215,9 +215,6 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, chunks=None,
"""
num = int(num)

if chunks is None:
raise ValueError("Must supply a chunks= keyword argument")

chunks = normalize_chunks(chunks, (num,))

range_ = stop - start
Expand Down Expand Up @@ -296,17 +293,16 @@ def arange(*args, **kwargs):
arange takes 3 positional arguments: arange([start], stop, [step])
''')

try:
chunks = kwargs.pop('chunks')
except KeyError:
raise TypeError("Required argument 'chunks' not found")
chunks = kwargs.pop('chunks', 'auto')

num = int(max(np.ceil((stop - start) / step), 0))
chunks = normalize_chunks(chunks, (num,))

dtype = kwargs.pop('dtype', None)
if dtype is None:
dtype = np.arange(start, stop, step * num if num else step).dtype

chunks = normalize_chunks(chunks, (num,), dtype=dtype)

if kwargs:
raise TypeError("Unexpected keyword argument(s): %s" %
",".join(kwargs.keys()))
Expand Down Expand Up @@ -364,7 +360,7 @@ def meshgrid(*xi, **kwargs):
return grid


def indices(dimensions, dtype=int, chunks=None):
def indices(dimensions, dtype=int, chunks='auto'):
"""
Implements NumPy's ``indices`` for Dask Arrays.
Expand All @@ -388,9 +384,6 @@ def indices(dimensions, dtype=int, chunks=None):
-------
grid : dask array
"""
if chunks is None:
raise ValueError("Must supply a chunks= keyword argument")

dimensions = tuple(dimensions)
dtype = np.dtype(dtype)
chunks = tuple(chunks)
Expand Down Expand Up @@ -613,9 +606,8 @@ def offset_func(*args, **kwargs):


@wraps(np.fromfunction)
def fromfunction(func, chunks=None, shape=None, dtype=None, **kwargs):
if chunks:
chunks = normalize_chunks(chunks, shape)
def fromfunction(func, chunks='auto', shape=None, dtype=None, **kwargs):
chunks = normalize_chunks(chunks, shape)
name = 'fromfunction-' + tokenize(func, chunks, shape, dtype, kwargs)
keys = list(product([name], *[range(len(bd)) for bd in chunks]))
aggdims = [list(accumulate(add, (0,) + bd[:-1])) for bd in chunks]
Expand Down
4 changes: 2 additions & 2 deletions dask/array/fft.py
Expand Up @@ -237,7 +237,7 @@ def _fftfreq_block(i, n, d):


@wraps(np.fft.fftfreq)
def fftfreq(n, d=1.0, chunks=None):
def fftfreq(n, d=1.0, chunks='auto'):
n = int(n)
d = float(d)

Expand All @@ -247,7 +247,7 @@ def fftfreq(n, d=1.0, chunks=None):


@wraps(np.fft.rfftfreq)
def rfftfreq(n, d=1.0, chunks=None):
def rfftfreq(n, d=1.0, chunks='auto'):
n = int(n)
d = float(d)

Expand Down
80 changes: 40 additions & 40 deletions dask/array/random.py
Expand Up @@ -67,7 +67,7 @@ def _wrap(self, funcname, *args, **kwargs):
extra_chunks should be a chunks tuple to append to the end of chunks
"""
size = kwargs.pop('size', None)
chunks = kwargs.pop('chunks')
chunks = kwargs.pop('chunks', 'auto')
extra_chunks = kwargs.pop('extra_chunks', ())

if size is not None and not isinstance(size, (tuple, list)):
Expand Down Expand Up @@ -177,20 +177,20 @@ def _broadcast_any(ar, shape, chunks):
return Array(graph, name, chunks + extra_chunks, dtype=dtype)

@doc_wraps(np.random.RandomState.beta)
def beta(self, a, b, size=None, chunks=None):
def beta(self, a, b, size=None, chunks="auto"):
return self._wrap('beta', a, b, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.binomial)
def binomial(self, n, p, size=None, chunks=None):
def binomial(self, n, p, size=None, chunks="auto"):
return self._wrap('binomial', n, p, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.chisquare)
def chisquare(self, df, size=None, chunks=None):
def chisquare(self, df, size=None, chunks="auto"):
return self._wrap('chisquare', df, size=size, chunks=chunks)

with ignoring(AttributeError):
@doc_wraps(np.random.RandomState.choice)
def choice(self, a, size=None, replace=True, p=None, chunks=None):
def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
dependencies = []
# Normalize and validate `a`
if isinstance(a, Integral):
Expand Down Expand Up @@ -254,146 +254,146 @@ def choice(self, a, size=None, replace=True, p=None, chunks=None):
return Array(graph, name, chunks, dtype=dtype)

# @doc_wraps(np.random.RandomState.dirichlet)
# def dirichlet(self, alpha, size=None, chunks=None):
# def dirichlet(self, alpha, size=None, chunks="auto"):

@doc_wraps(np.random.RandomState.exponential)
def exponential(self, scale=1.0, size=None, chunks=None):
def exponential(self, scale=1.0, size=None, chunks="auto"):
return self._wrap('exponential', scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.f)
def f(self, dfnum, dfden, size=None, chunks=None):
def f(self, dfnum, dfden, size=None, chunks="auto"):
return self._wrap('f', dfnum, dfden, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.gamma)
def gamma(self, shape, scale=1.0, size=None, chunks=None):
def gamma(self, shape, scale=1.0, size=None, chunks="auto"):
return self._wrap('gamma', shape, scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.geometric)
def geometric(self, p, size=None, chunks=None):
def geometric(self, p, size=None, chunks="auto"):
return self._wrap('geometric', p, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.gumbel)
def gumbel(self, loc=0.0, scale=1.0, size=None, chunks=None):
def gumbel(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
return self._wrap('gumbel', loc, scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.hypergeometric)
def hypergeometric(self, ngood, nbad, nsample, size=None, chunks=None):
def hypergeometric(self, ngood, nbad, nsample, size=None, chunks="auto"):
return self._wrap('hypergeometric', ngood, nbad, nsample,
size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.laplace)
def laplace(self, loc=0.0, scale=1.0, size=None, chunks=None):
def laplace(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
return self._wrap('laplace', loc, scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.logistic)
def logistic(self, loc=0.0, scale=1.0, size=None, chunks=None):
def logistic(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
return self._wrap('logistic', loc, scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.lognormal)
def lognormal(self, mean=0.0, sigma=1.0, size=None, chunks=None):
def lognormal(self, mean=0.0, sigma=1.0, size=None, chunks="auto"):
return self._wrap('lognormal', mean, sigma, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.logseries)
def logseries(self, p, size=None, chunks=None):
def logseries(self, p, size=None, chunks="auto"):
return self._wrap('logseries', p, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.multinomial)
def multinomial(self, n, pvals, size=None, chunks=None):
def multinomial(self, n, pvals, size=None, chunks="auto"):
return self._wrap('multinomial', n, pvals, size=size, chunks=chunks,
extra_chunks=((len(pvals),),))

@doc_wraps(np.random.RandomState.negative_binomial)
def negative_binomial(self, n, p, size=None, chunks=None):
def negative_binomial(self, n, p, size=None, chunks="auto"):
return self._wrap('negative_binomial', n, p, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.noncentral_chisquare)
def noncentral_chisquare(self, df, nonc, size=None, chunks=None):
def noncentral_chisquare(self, df, nonc, size=None, chunks="auto"):
return self._wrap('noncentral_chisquare', df, nonc, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.noncentral_f)
def noncentral_f(self, dfnum, dfden, nonc, size=None, chunks=None):
def noncentral_f(self, dfnum, dfden, nonc, size=None, chunks="auto"):
return self._wrap('noncentral_f', dfnum, dfden, nonc, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.normal)
def normal(self, loc=0.0, scale=1.0, size=None, chunks=None):
def normal(self, loc=0.0, scale=1.0, size=None, chunks="auto"):
return self._wrap('normal', loc, scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.pareto)
def pareto(self, a, size=None, chunks=None):
def pareto(self, a, size=None, chunks="auto"):
return self._wrap('pareto', a, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.poisson)
def poisson(self, lam=1.0, size=None, chunks=None):
def poisson(self, lam=1.0, size=None, chunks="auto"):
return self._wrap('poisson', lam, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.power)
def power(self, a, size=None, chunks=None):
def power(self, a, size=None, chunks="auto"):
return self._wrap('power', a, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.randint)
def randint(self, low, high=None, size=None, chunks=None):
def randint(self, low, high=None, size=None, chunks="auto"):
return self._wrap('randint', low, high, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.random_integers)
def random_integers(self, low, high=None, size=None, chunks=None):
def random_integers(self, low, high=None, size=None, chunks="auto"):
return self._wrap('random_integers', low, high, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.random_sample)
def random_sample(self, size=None, chunks=None):
def random_sample(self, size=None, chunks="auto"):
return self._wrap('random_sample', size=size, chunks=chunks)

random = random_sample

@doc_wraps(np.random.RandomState.rayleigh)
def rayleigh(self, scale=1.0, size=None, chunks=None):
def rayleigh(self, scale=1.0, size=None, chunks="auto"):
return self._wrap('rayleigh', scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.standard_cauchy)
def standard_cauchy(self, size=None, chunks=None):
def standard_cauchy(self, size=None, chunks="auto"):
return self._wrap('standard_cauchy', size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.standard_exponential)
def standard_exponential(self, size=None, chunks=None):
def standard_exponential(self, size=None, chunks="auto"):
return self._wrap('standard_exponential', size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.standard_gamma)
def standard_gamma(self, shape, size=None, chunks=None):
def standard_gamma(self, shape, size=None, chunks="auto"):
return self._wrap('standard_gamma', shape, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.standard_normal)
def standard_normal(self, size=None, chunks=None):
def standard_normal(self, size=None, chunks="auto"):
return self._wrap('standard_normal', size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.standard_t)
def standard_t(self, df, size=None, chunks=None):
def standard_t(self, df, size=None, chunks="auto"):
return self._wrap('standard_t', df, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.tomaxint)
def tomaxint(self, size=None, chunks=None):
def tomaxint(self, size=None, chunks="auto"):
return self._wrap('tomaxint', size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.triangular)
def triangular(self, left, mode, right, size=None, chunks=None):
def triangular(self, left, mode, right, size=None, chunks="auto"):
return self._wrap('triangular', left, mode, right, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.uniform)
def uniform(self, low=0.0, high=1.0, size=None, chunks=None):
def uniform(self, low=0.0, high=1.0, size=None, chunks="auto"):
return self._wrap('uniform', low, high, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.vonmises)
def vonmises(self, mu, kappa, size=None, chunks=None):
def vonmises(self, mu, kappa, size=None, chunks="auto"):
return self._wrap('vonmises', mu, kappa, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.wald)
def wald(self, mean, scale, size=None, chunks=None):
def wald(self, mean, scale, size=None, chunks="auto"):
return self._wrap('wald', mean, scale, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.weibull)
def weibull(self, a, size=None, chunks=None):
def weibull(self, a, size=None, chunks="auto"):
return self._wrap('weibull', a, size=size, chunks=chunks)

@doc_wraps(np.random.RandomState.zipf)
def zipf(self, a, size=None, chunks=None):
def zipf(self, a, size=None, chunks="auto"):
return self._wrap('zipf', a, size=size, chunks=chunks)


Expand Down
2 changes: 0 additions & 2 deletions dask/array/tests/test_array_core.py
Expand Up @@ -2321,8 +2321,6 @@ def test_raise_on_no_chunks():
assert "dask" in str(e)
assert ".org" in str(e)

pytest.raises(ValueError, lambda: da.ones(6))


def test_chunks_is_immutable():
x = da.ones(6, chunks=3)
Expand Down
12 changes: 9 additions & 3 deletions dask/array/tests/test_creation.py
Expand Up @@ -5,6 +5,7 @@
import pytest
from toolz import concat

import dask
import dask.array as da
from dask.array.utils import assert_eq, same_keys

Expand Down Expand Up @@ -131,9 +132,8 @@ def test_arange():
with pytest.raises(TypeError) as exc:
da.arange(10, chunks=-1, whatsthis=1)
assert 'whatsthis' in str(exc)
with pytest.raises(TypeError) as exc:
da.arange(10)
assert 'chunks' in str(exc)

assert da.arange(10).chunks == ((10,),)


@pytest.mark.parametrize("start,stop,step,dtype", [
Expand Down Expand Up @@ -529,3 +529,9 @@ def udf_pad(vector, pad_width, iaxis, kwargs):
da_r = da.pad(da_a, pad_width, udf_pad, kwargs=kwargs)

assert_eq(np_r, da_r)


def test_auto_chunks():
with dask.config.set({'array.chunk-size': '50 MiB'}):
x = da.ones((10000, 10000))
assert 4 < x.npartitions < 32
6 changes: 6 additions & 0 deletions dask/array/tests/test_random.py
Expand Up @@ -312,3 +312,9 @@ def test_external_randomstate_class():
b = rs.normal(0, 1, size=(10), chunks=(5,))
assert a.name == b.name
assert_eq(a, b)


def test_auto_chunks():
with dask.config.set({'array.chunk-size': '50 MiB'}):
x = da.random.random((10000, 10000))
assert 4 < x.npartitions < 32
2 changes: 1 addition & 1 deletion dask/array/wrap.py
Expand Up @@ -30,7 +30,7 @@ def wrap_func_shape_as_first_arg(func, *args, **kwargs):
if not isinstance(shape, (tuple, list)):
shape = (shape,)

chunks = kwargs.pop('chunks', None)
chunks = kwargs.pop('chunks', 'auto')

dtype = kwargs.pop('dtype', None)
if dtype is None:
Expand Down

0 comments on commit ccb12ac

Please sign in to comment.