Do not return categoricals from Classify (#77)

nens · Oct 23, 2020 · 9a8c1d8 · 9a8c1d8
1 parent fd5a33c
commit 9a8c1d8
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 17 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,7 +4,9 @@ Changelog of dask-geomodeling
 2.3.1 (unreleased)
 ------------------
 
-- Nothing changed yet.
+- Never return Categorical dtypes from field_operations.Classify and
+  ClassifyFromColumns. Categoricals are incompatible with later pandas
+  operations (round, subtract, where, mask).
 
 
 2.3.0 (2020-10-09)

diff --git a/dask_geomodeling/geometry/field_operations.py b/dask_geomodeling/geometry/field_operations.py
@@ -112,9 +112,8 @@ def process(series, bins, labels, right):
         if series.dtype == object:
             series = series.fillna(value=np.nan)
         result = pd.cut(series, bins, right, labels)
-        labels_dtype = pd.Series(labels).dtype
-        if labels_dtype.name != "object":
-            result = pd.Series(result, dtype=labels_dtype)
+        # transform from categorical to whatever suits the "labels"
+        result = pd.Series(result, dtype=pd.Series(labels).dtype)
         if open_bounds:
             # patch the result, we actually want to classify np.inf
             if right:
@@ -218,17 +217,23 @@ def process(data, value_column, bin_columns, labels, right):
             else:
                 indices = np.sum(values[:, np.newaxis] >= bins, axis=1)
 
-        # If we have e.g. 2 labels and 3 bins, the outside intervals are closed
-        # any index that is 0 or 3 should become -1 (unclassified).
-        if len(labels) == n_bins + 1:  # open bounds
-            indices[np.isnan(values)] = -1  # else NaN gets classified
-        else:  # closed bounds
-            indices[indices == n_bins] = 0
-            indices -= 1
-
-        # The output of pd.cut is a categorical Series.
-        labeled_data = pd.Categorical.from_codes(indices, labels, ordered=True)
-        return pd.Series(labeled_data, index=features.index)
+        # A NaN value has been assigned index 0 (the first bin) at this point.
+        # Move it to index len(labels) so that it maps to NaN later on.
+        if len(labels) == n_bins + 1:
+            indices[np.isnan(values)] = len(labels)
+        else:
+            # If we have e.g. 2 labels and 3 bins, the outside intervals are
+            # closed. Therefore, indices 0 and 3 do not map to a bin. Index 0
+            # also covers the values = NaN situation.
+            indices -= 1  # indices become -1, 0, 1, 2
+            indices[indices == -1] = len(labels)  # -1 --> 2
+
+        # Convert indices to labels; np.nan is appended to the labels to cover
+        # unclassified data.
+        labeled_data = pd.Series(labels + [np.nan]).loc[indices]
+        # Set the index to the features index
+        labeled_data.index = features.index
+        return labeled_data
 
 
 class BaseFieldOperation(BaseSingleSeries):

diff --git a/dask_geomodeling/tests/test_geometry.py b/dask_geomodeling/tests/test_geometry.py
@@ -1531,11 +1531,11 @@ def test_classify_astype_category_int(self):
         ).get_data(**self.request)
         self.assertNotEqual(expected.dtypes.name, "category")
 
-    def test_classify_astype_category_object(self):
+    def test_classify_not_categorical(self):
         expected = field_operations.Classify(
             self.source["col_source"], bins=[0, 0.5, 1.0], labels=["A", "B", "C", "D"]
         ).get_data(**self.request)
-        self.assertEqual(expected.dtypes.name, "category")
+        self.assertEqual(expected.dtypes.name, "object")
 
     def test_classify_from_columns_left(self):
         source_with_bins = self.source.set("bin_1", 0, "bin_2", 1.2, "bin_3", 5.0)