mars-project · qinxuye · Feb 8, 2022 · Feb 8, 2022
@@ -210,7 +210,9 @@ def merge_small_files(
     if merged_file_size is not None:
         merged_file_size = parse_readable_size(merged_file_size)[0]
     else:
-        merged_file_size = options.chunk_store_limit
+        # The estimated size is relatively larger than the real one,
+        # so we double the merged size
+        merged_file_size = options.chunk_store_limit * 2
     # sample files whose size equals `n_sample_file`
     sampled_chunks = np.random.choice(df.chunks, n_sample_file)
     max_chunk_size = 0

@@ -94,7 +94,11 @@ def _tile_series(cls, op):
 
     @classmethod
     def _tile(cls, op):
-        if op.inputs[0].ndim == 2:
+        inp = op.inputs[0]
+        if inp.shape[op.axis] == 0:
+            # if the length is zero, return input directly
+            return inp
+        if inp.ndim == 2:
             return (yield from cls._tile_dataframe(op))
         else:
             return (yield from cls._tile_series(op))

@@ -238,6 +238,21 @@ def test_sort_values_execution(setup, distinct_opt):
         result.reset_index(drop=True), expected.reset_index(drop=True)
     )
 
+    # test for empty input (GH #2649)
+    pd_df = pd.DataFrame(np.random.rand(10, 3), columns=["col1", "col2", "col3"])
+    df = DataFrame(pd_df, chunk_size=4)
+    df = df[df["col2"] > 1].execute()
+    result = df.sort_values(by="col1").execute().fetch()
+    expected = pd_df[pd_df["col2"] > 1].sort_values(by="col1")
+    pd.testing.assert_frame_equal(result, expected)
+
+    pd_s = pd.Series(np.random.rand(10))
+    s = Series(pd_s, chunk_size=4)
+    s = s[s > 1].execute()
+    result = s.sort_values().execute().fetch()
+    expected = pd_s[pd_s > 1].sort_values()
+    pd.testing.assert_series_equal(result, expected)
+
 
 def test_sort_index_execution(setup):
     raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

@@ -47,7 +47,7 @@ def test_pairwise_distances_execution(setup):
     weight = np.random.rand(5)
     d = pairwise_distances(x, y, metric="wminkowski", p=3, w=weight)
     result = d.execute().fetch()
-    expected = sk_pairwise_distances(raw_x, raw_y, metric="wminkowski", p=3, w=weight)
+    expected = sk_pairwise_distances(raw_x, raw_y, metric="minkowski", p=3, w=weight)
     np.testing.assert_almost_equal(result, expected)
 
     # test pdist

@@ -47,11 +47,20 @@ def execute(cls, ctx, op):
 def fromzarr(path, group=None, dataset=None, chunk_size=None):
     import zarr
 
+    try:
+        # since v2.11.0, zarr converts mutable mappings to KVStore
+        from zarr.storage import KVStore as zarr_kvstore
+    except ImportError:  # pragma: no cover
+        zarr_kvstore = None
+
     if isinstance(path, zarr.Array):
         arr = path
-        if isinstance(arr.store, FSMap):
+        if zarr_kvstore is None and isinstance(arr.store, FSMap):  # pragma: no cover
             root = arr.store.root
             path, dataset = root.rsplit("/", 1)
+        elif zarr_kvstore and isinstance(arr.store, zarr_kvstore):
+            root = arr.store._mutable_mapping.root
+            path, dataset = root.rsplit("/", 1)
         else:
             path = arr.store.path
             if "/" in arr.path and group is None:

@@ -149,7 +149,7 @@ def test_sparse_randint_execution(setup):
     assert res.shape == (30, 50)
     np.testing.assert_array_less(res.data, 2)
     np.testing.assert_array_less(0, res.data)
-    assert pytest.approx((res >= 1).toarray().sum(), 30 * 50 * 0.1, abs=20)
+    assert (res >= 1).toarray().sum() == pytest.approx(30 * 50 * 0.1, abs=20)
 
 
 random_test_options = namedtuple("random_test_options", ["func_name", "args", "kwargs"])

@@ -532,6 +532,11 @@ def cdist(XA, XB, metric="euclidean", **kwargs):
             "3rd argument metric must be a string identifier " "or a function."
         )
 
+    # scipy removed "wminkowski" in v1.8.0; use "minkowski" with the `w=`
+    # keyword argument for the given weight.
+    if metric == "wminkowski":
+        metric = "minkowski"
+
     p = kwargs.pop("p", None)
     w = kwargs.pop("w", None)
     if w is not None:

@@ -697,6 +697,11 @@ def pdist(X, metric="euclidean", **kwargs):
             "2nd argument metric must be a string identifier " "or a function."
         )
 
+    # scipy removed "wminkowski" in v1.8.0; use "minkowski" with the `w=`
+    # keyword argument for the given weight.
+    if metric == "wminkowski":
+        metric = "minkowski"
+
     p = kwargs.pop("p", None)
     w = kwargs.pop("w", None)
     if w is not None:

@@ -81,7 +81,7 @@ def test_pdist_execution(setup):
         w = tensor(weight, chunk_size=7)
         dist = distance.pdist(x, metric="wminkowski", p=3, w=w)
         result = dist.execute().fetch()
-        expected = sp_pdist(raw, metric="wminkowski", p=3, w=weight)
+        expected = sp_pdist(raw, metric="minkowski", p=3, w=weight)
         np.testing.assert_array_equal(result, expected)
 
         # test V
@@ -157,7 +157,7 @@ def test_cdist_execution(setup):
         w = tensor(weight, chunk_size=7)
         dist = distance.cdist(xa, xb, metric="wminkowski", p=3, w=w)
         result = dist.execute().fetch()
-        expected = sp_cdist(raw_a, raw_b, metric="wminkowski", p=3, w=weight)
+        expected = sp_cdist(raw_a, raw_b, metric="minkowski", p=3, w=weight)
         np.testing.assert_array_equal(result, expected)
 
         # test V