Skip to content

Commit

Permalink
Do not return categoricals from Classify (#77)
Browse files Browse the repository at this point in the history
  • Loading branch information
caspervdw committed Oct 23, 2020
1 parent fd5a33c commit 9a8c1d8
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 17 deletions.
4 changes: 3 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ Changelog of dask-geomodeling
2.3.1 (unreleased)
------------------

- Nothing changed yet.
- Never return Categorical dtypes from field_operations.Classify and
ClassifyFromColumns. Categorical results lead to pandas incompatibilities
with later operations (round, subtract, where, mask).


2.3.0 (2020-10-09)
Expand Down
33 changes: 19 additions & 14 deletions dask_geomodeling/geometry/field_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,8 @@ def process(series, bins, labels, right):
if series.dtype == object:
series = series.fillna(value=np.nan)
result = pd.cut(series, bins, right, labels)
labels_dtype = pd.Series(labels).dtype
if labels_dtype.name != "object":
result = pd.Series(result, dtype=labels_dtype)
# transform from categorical to whatever suits the "labels"
result = pd.Series(result, dtype=pd.Series(labels).dtype)
if open_bounds:
# patch the result, we actually want to classify np.inf
if right:
Expand Down Expand Up @@ -218,17 +217,23 @@ def process(data, value_column, bin_columns, labels, right):
else:
indices = np.sum(values[:, np.newaxis] >= bins, axis=1)

# If we have e.g. 2 labels and 3 bins, the outside intervals are closed
# any index that is 0 or 3 should become -1 (unclassified).
if len(labels) == n_bins + 1: # open bounds
indices[np.isnan(values)] = -1 # else NaN gets classified
else: # closed bounds
indices[indices == n_bins] = 0
indices -= 1

# The output of pd.cut is a categorical Series.
labeled_data = pd.Categorical.from_codes(indices, labels, ordered=True)
return pd.Series(labeled_data, index=features.index)
# If a value is NaN, its index is 0 at this point (the first bin).
# Reassign it to the extra entry after the last label so that it maps to
# NaN later.
if len(labels) == n_bins + 1:
indices[np.isnan(values)] = len(labels)
else:
# If we have e.g. 2 labels and 3 bins, the outside intervals are
# closed. Therefore, indices 0 and 3 do not map to a bin. Index 0
# also covers the values = NaN situation.
indices -= 1 # indices become -1, 0, 1, 2
indices[indices == -1] = len(labels) # -1 --> 2

# Convert indices to labels; np.nan is appended to the labels to cover
# unclassified data.
labeled_data = pd.Series(labels + [np.nan]).loc[indices]
# Set the index to the features index
labeled_data.index = features.index
return labeled_data


class BaseFieldOperation(BaseSingleSeries):
Expand Down
4 changes: 2 additions & 2 deletions dask_geomodeling/tests/test_geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1531,11 +1531,11 @@ def test_classify_astype_category_int(self):
).get_data(**self.request)
self.assertNotEqual(expected.dtypes.name, "category")

def test_classify_astype_category_object(self):
def test_classify_not_categorical(self):
expected = field_operations.Classify(
self.source["col_source"], bins=[0, 0.5, 1.0], labels=["A", "B", "C", "D"]
).get_data(**self.request)
self.assertEqual(expected.dtypes.name, "category")
self.assertEqual(expected.dtypes.name, "object")

def test_classify_from_columns_left(self):
source_with_bins = self.source.set("bin_1", 0, "bin_2", 1.2, "bin_3", 5.0)
Expand Down

0 comments on commit 9a8c1d8

Please sign in to comment.