Fix up groupby-standard deviation again on object dtype keys

mrocklin · Mar 4, 2019 · 0e09333 · 0e09333
1 parent f3c2d5d
commit 0e09333
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 4 deletions.
diff --git a/dask/dataframe/groupby.py b/dask/dataframe/groupby.py
@@ -240,14 +240,17 @@ def _apply_chunk(df, *index, **kwargs):
 def _var_chunk(df, *index):
     if is_series_like(df):
         df = df.to_frame()
-    df = df._get_numeric_data()
+
+    df = df.copy()
+    cols = df._get_numeric_data().columns
+
     g = _groupby_raise_unaligned(df, by=index)
     x = g.sum()
 
-    n = g.count().rename(columns=lambda c: c + '-count')
+    n = g[x.columns].count().rename(columns=lambda c: c + '-count')
 
-    df2 = df ** 2
-    g2 = _groupby_raise_unaligned(df2, by=index)
+    df[cols] = df[cols] ** 2
+    g2 = _groupby_raise_unaligned(df, by=index)
     x2 = g2.sum().rename(columns=lambda c: c + '-x2')
 
     x2.index = x.index

diff --git a/dask/dataframe/tests/test_groupby.py b/dask/dataframe/tests/test_groupby.py
@@ -1535,3 +1535,9 @@ def test_std_object_dtype(func):
     ddf = dd.from_pandas(df, npartitions=2)
 
     assert_eq(func(df), func(ddf))
+
+
+def test_timeseries():
+    df = dask.datasets.timeseries().partitions[:2]
+    assert_eq(df.groupby('name').std(),
+              df.groupby('name').std())