Merge branch 'master' of github.com:dask/dask into pandas-warnings

mrocklin · Mar 16, 2019 · de12ca6 · de12ca6
2 parents b44620a + 87f3a48
commit de12ca6
Show file tree

Hide file tree

Showing 24 changed files with 549 additions and 172 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -11,6 +11,7 @@ _base_envs:
   - &no_optimize XTRATESTARGS=
   - &imports TEST_IMPORTS='true'
   - &no_imports TEST_IMPORTS='false'
+  - &array_function NUMPY_EXPERIMENTAL_ARRAY_FUNCTION='1'
 
 jobs:
   fast_finish: true
@@ -44,7 +45,7 @@ jobs:
 
     - env: &py37_env
       - PYTHON=3.7
-      - NUMPY=1.15.0
+      - NUMPY=1.16.2
       - PANDAS>=0.24.1
       - *test_and_lint
       - *no_coverage

diff --git a/continuous_integration/travis/run_tests.sh b/continuous_integration/travis/run_tests.sh
@@ -13,3 +13,7 @@ else
     echo "py.test dask --runslow $XTRATESTARGS"
     py.test dask --runslow $XTRATESTARGS
 fi
+
+# This needs to be enabled to test __array_function__ protocol with
+# NumPy v1.16.x, enabled by default starting in v1.17
+export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1
diff --git a/dask/array/core.py b/dask/array/core.py
@@ -1002,6 +1002,20 @@ def __array__(self, dtype=None, **kwargs):
             x = np.array(x)
         return x
 
+    def __array_function__(self, func, types, args, kwargs):
+        import dask.array as module
+        for submodule in func.__module__.split('.')[1:]:
+            try:
+                module = getattr(module, submodule)
+            except AttributeError:
+                return NotImplemented
+        if not hasattr(module, func.__name__):
+            return NotImplemented
+        da_func = getattr(module, func.__name__)
+        if da_func is func:
+            return NotImplemented
+        return da_func(*args, **kwargs)
+
     @property
     def _elemwise(self):
         return elemwise

diff --git a/dask/array/tests/test_array_function.py b/dask/array/tests/test_array_function.py
@@ -0,0 +1,95 @@
+import pytest
+np = pytest.importorskip('numpy', minversion='1.16')
+
+import os
+
+import dask.array as da
+from dask.array.utils import assert_eq
+
+
+env_name = "NUMPY_EXPERIMENTAL_ARRAY_FUNCTION"
+missing_arrfunc_cond = env_name not in os.environ or os.environ[env_name] != "1"
+missing_arrfunc_reason = env_name + " undefined or disabled"
+
+
+@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
+@pytest.mark.parametrize('func', [
+    lambda x: np.concatenate([x, x, x]),
+    lambda x: np.cov(x, x),
+    lambda x: np.dot(x, x),
+    lambda x: np.dstack(x),
+    lambda x: np.flip(x, axis=0),
+    lambda x: np.hstack(x),
+    lambda x: np.matmul(x, x),
+    lambda x: np.mean(x),
+    lambda x: np.stack([x, x]),
+    lambda x: np.sum(x),
+    lambda x: np.var(x),
+    lambda x: np.vstack(x),
+    lambda x: np.fft.fft(x.rechunk(x.shape) if isinstance(x, da.Array) else x),
+    lambda x: np.fft.fft2(x.rechunk(x.shape) if isinstance(x, da.Array) else x),
+    lambda x: np.linalg.norm(x)])
+def test_array_function_dask(func):
+    x = np.random.random((100, 100))
+    y = da.from_array(x, chunks=(50, 50))
+    res_x = func(x)
+    res_y = func(y)
+
+    assert isinstance(res_y, da.Array)
+    assert_eq(res_y, res_x)
+
+
+@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
+@pytest.mark.parametrize('func', [
+    lambda x: np.min_scalar_type(x),
+    lambda x: np.linalg.det(x),
+    lambda x: np.linalg.eigvals(x)])
+def test_array_notimpl_function_dask(func):
+    x = np.random.random((100, 100))
+    y = da.from_array(x, chunks=(50, 50))
+
+    with pytest.raises(TypeError):
+        func(y)
+
+
+@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
+def test_array_function_sparse_transpose():
+    sparse = pytest.importorskip('sparse')
+    x = da.random.random((500, 500), chunks=(100, 100))
+    x[x < 0.9] = 0
+
+    y = x.map_blocks(sparse.COO)
+
+    assert_eq(np.transpose(x), np.transpose(y))
+
+
+@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
+@pytest.mark.xfail(reason="requires sparse support for __array_function__",
+                   strict=False)
+def test_array_function_sparse_tensordot():
+    sparse = pytest.importorskip('sparse')
+    x = np.random.random((2, 3, 4))
+    x[x < 0.9] = 0
+    y = np.random.random((4, 3, 2))
+    y[y < 0.9] = 0
+
+    xx = sparse.COO(x)
+    yy = sparse.COO(y)
+
+    assert_eq(np.tensordot(x, y, axes=(2, 0)),
+              np.tensordot(xx, yy, axes=(2, 0)).todense())
+
+
+@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
+def test_array_function_cupy_svd():
+    cupy = pytest.importorskip('cupy')
+    x = cupy.random.random((500, 100))
+
+    y = da.from_array(x, chunks=(100, 100), asarray=False)
+
+    u_base, s_base, v_base = da.linalg.svd(y)
+    u, s, v = np.linalg.svd(y)
+
+    assert_eq(u, u_base)
+    assert_eq(s, s_base)
+    assert_eq(v, v_base)
diff --git a/dask/array/tests/test_reductions.py b/dask/array/tests/test_reductions.py
@@ -468,35 +468,38 @@ def test_topk_argtopk1(npfunc, daskfunc, split_every):
     k = 5
     # Test at least 3 levels of aggregation when split_every=2
     # to stress the different chunk, combine, aggregate kernels
-    a = da.random.random(800, chunks=((120, 80, 100, 200, 300), ))
-    b = da.random.random((10, 20, 30), chunks=(4, 8, 8))
+    npa = np.random.random(800)
+    npb = np.random.random((10, 20, 30))
+
+    a = da.from_array(npa, chunks=((120, 80, 100, 200, 300), ))
+    b = da.from_array(npb, chunks=(4, 8, 8))
 
     # 1-dimensional arrays
     # top 5 elements, sorted descending
-    assert_eq(npfunc(a)[-k:][::-1],
+    assert_eq(npfunc(npa)[-k:][::-1],
               daskfunc(a, k, split_every=split_every))
     # bottom 5 elements, sorted ascending
-    assert_eq(npfunc(a)[:k],
+    assert_eq(npfunc(npa)[:k],
               daskfunc(a, -k, split_every=split_every))
 
     # n-dimensional arrays
     # also testing when k > chunk
     # top 5 elements, sorted descending
-    assert_eq(npfunc(b, axis=0)[-k:, :, :][::-1, :, :],
+    assert_eq(npfunc(npb, axis=0)[-k:, :, :][::-1, :, :],
               daskfunc(b, k, axis=0, split_every=split_every))
-    assert_eq(npfunc(b, axis=1)[:, -k:, :][:, ::-1, :],
+    assert_eq(npfunc(npb, axis=1)[:, -k:, :][:, ::-1, :],
               daskfunc(b, k, axis=1, split_every=split_every))
-    assert_eq(npfunc(b, axis=-1)[:, :, -k:][:, :, ::-1],
+    assert_eq(npfunc(npb, axis=-1)[:, :, -k:][:, :, ::-1],
               daskfunc(b, k, axis=-1, split_every=split_every))
     with pytest.raises(ValueError):
         daskfunc(b, k, axis=3, split_every=split_every)
 
     # bottom 5 elements, sorted ascending
-    assert_eq(npfunc(b, axis=0)[:k, :, :],
+    assert_eq(npfunc(npb, axis=0)[:k, :, :],
               daskfunc(b, -k, axis=0, split_every=split_every))
-    assert_eq(npfunc(b, axis=1)[:, :k, :],
+    assert_eq(npfunc(npb, axis=1)[:, :k, :],
               daskfunc(b, -k, axis=1, split_every=split_every))
-    assert_eq(npfunc(b, axis=-1)[:, :, :k],
+    assert_eq(npfunc(npb, axis=-1)[:, :, :k],
               daskfunc(b, -k, axis=-1, split_every=split_every))
     with pytest.raises(ValueError):
         daskfunc(b, -k, axis=3, split_every=split_every)
@@ -510,14 +513,15 @@ def test_topk_argtopk1(npfunc, daskfunc, split_every):
 @pytest.mark.parametrize('chunksize', [1, 2, 3, 4, 5, 10])
 def test_topk_argtopk2(npfunc, daskfunc, split_every, chunksize):
     """Fine test use cases when k is larger than chunk size"""
-    a = da.random.random((10, ), chunks=chunksize)
+    npa = np.random.random((10, ))
+    a = da.from_array(npa, chunks=chunksize)
     k = 5
 
     # top 5 elements, sorted descending
-    assert_eq(npfunc(a)[-k:][::-1],
+    assert_eq(npfunc(npa)[-k:][::-1],
               daskfunc(a, k, split_every=split_every))
     # bottom 5 elements, sorted ascending
-    assert_eq(npfunc(a)[:k],
+    assert_eq(npfunc(npa)[:k],
               daskfunc(a, -k, split_every=split_every))
 
 

diff --git a/dask/array/tests/test_routines.py b/dask/array/tests/test_routines.py
@@ -502,7 +502,7 @@ def test_bincount_with_weights():
 
     dweights = da.from_array(weights, chunks=2)
     e = da.bincount(d, weights=dweights, minlength=6)
-    assert_eq(e, np.bincount(x, weights=dweights, minlength=6))
+    assert_eq(e, np.bincount(x, weights=dweights.compute(), minlength=6))
     assert same_keys(da.bincount(d, weights=dweights, minlength=6), e)
 
 

diff --git a/dask/array/tests/test_ufunc.py b/dask/array/tests/test_ufunc.py
@@ -56,10 +56,10 @@ def test_ufunc():
 unary_ufuncs = ['absolute', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan',
                 'arctanh', 'bitwise_not', 'cbrt', 'ceil', 'conj', 'cos',
                 'cosh', 'deg2rad', 'degrees', 'exp', 'exp2', 'expm1', 'fabs',
-                'fix', 'floor', 'i0', 'invert','isfinite', 'isinf', 'isnan', 'log',
-                'log10', 'log1p', 'log2', 'logical_not', 'nan_to_num',
+                'fix', 'floor', 'invert','isfinite', 'isinf', 'isnan', 'log',
+                'log10', 'log1p', 'log2', 'logical_not',
                 'negative', 'rad2deg', 'radians', 'reciprocal', 'rint', 'sign',
-                'signbit', 'sin', 'sinc', 'sinh', 'spacing', 'sqrt', 'square',
+                'signbit', 'sin', 'sinh', 'spacing', 'sqrt', 'square',
                 'tan', 'tanh', 'trunc']
 
 
@@ -276,6 +276,17 @@ def test_issignedinf():
     assert_eq(np.isposinf(arr), da.isposinf(darr))
 
 
+@pytest.mark.parametrize('func', ['i0', 'sinc', 'nan_to_num'])
+def test_non_ufunc_others(func):
+    arr = np.random.randint(1, 100, size=(20, 20))
+    darr = da.from_array(arr, 3)
+
+    dafunc = getattr(da, func)
+    npfunc = getattr(np, func)
+
+    assert_eq(dafunc(darr), npfunc(arr), equal_nan=True)
+
+
 def test_frompyfunc():
     myadd = da.frompyfunc(add, 2, 1)
     np_myadd = np.frompyfunc(add, 2, 1)

diff --git a/dask/bytes/local.py b/dask/bytes/local.py
@@ -33,7 +33,10 @@ def _normalize_path(self, path):
 
     def glob(self, path):
         """For a template path, return matching files"""
-        return sorted(glob(self._normalize_path(path)))
+        try:
+            return sorted(glob(self._normalize_path(path), recursive=True))
+        except TypeError:  # recursive kwarg is new in Python 3.5
+            return sorted(glob(self._normalize_path(path)))
 
     def mkdirs(self, path):
         """Make any intermediate directories to make path writable"""

diff --git a/dask/bytes/tests/test_local.py b/dask/bytes/tests/test_local.py
@@ -31,7 +31,9 @@
 csv_files = {'.test.fakedata.1.csv': (b'a,b\n'
                                       b'1,2\n'),
              '.test.fakedata.2.csv': (b'a,b\n'
-                                      b'3,4\n')}
+                                      b'3,4\n'),
+             'subdir/.test.fakedata.2.csv': (b'a,b\n'
+                                             b'5,6\n')}
 
 
 try:
@@ -101,6 +103,15 @@ def test_urlpath_expand_read():
         assert len(paths) == 2
 
 
+@pytest.mark.skipif(sys.version_info < (3, 5),
+                    reason="Recursive glob is new in Python 3.5")
+def test_recursive_glob_expand():
+    """Make sure * is expanded in file paths when reading."""
+    with filetexts(csv_files, mode='b'):
+        _, _, paths = get_fs_token_paths('**/.*.csv')
+        assert len(paths) == 3
+
+
 def test_urlpath_expand_write():
     """Make sure * is expanded in file paths when writing."""
     _, _, paths = get_fs_token_paths('prefix-*.csv', mode='wb', num=2)