Merge pull request #6 from nens/casper-reclassify

Fix multiple issues in reclassify
nens · Oct 4, 2019 · a8c9862 · a8c9862
2 parents bfa121f + f5b0384
commit a8c9862
Show file tree

Hide file tree

Showing 5 changed files with 209 additions and 90 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -7,6 +7,12 @@ Changelog of dask-geomodeling
 
 - Added documentation.
 
+- Fixed MemorySource incase of a request outside of the data boundary.
+
+- Fixed multiple bugs in Reclassify and added some tests. The 'from' dtype can
+  now be boolean or integer, and the 'to' dtype integer or float. The returned
+  dtype is now decided by numpy (int64 or float64).
+
 
 2.0.2 (2019-09-04)
 ------------------

diff --git a/dask_geomodeling/raster/misc.py b/dask_geomodeling/raster/misc.py
@@ -283,103 +283,106 @@ class Reclassify(BaseSingle):
 
     :param store: rasterdata to reclassify
     :param data: list of (from, to) values defining the reclassification
+        the from values can be of bool or int datatype; the to values can be
+        of int or float datatype
     :param select: leave only reclassified values, set others to 'no data'.
       Default False.
 
     :type store: RasterBlock
     :type data: list
     :type select: bool
-
-    Specify the mapping in the form of a list of (from, to)-tuples. If there
-    is any float value in the 'to' data, then all 'to' values are treated as
-    float values.
-
-    The maximum of the store's fillvalue and the 'from' values determine the
-    size of the lookup array. For float target values, the dtype is 'f4' and
-    the fillvalue is the maximum possible value for that dtype. for integer
-    target values, the fillvalue of the raster is the maximum of:
-
-    - the target values plus one,
-    - the source values plus one and
-    - the fillvalue of the store
-
-    the raster's dtype is the unsigned integer dtype with the smallest
-    itemsize that still accomodates the raster's fillvalue.
-
-    These properties make it possible to keep the lookup array for the
-    conversion small, but the drawback is that stores with fillvalues below
-    data values will not work.
     """
 
     def __init__(self, store, data, select=False):
-        if not np.issubdtype(store.dtype, np.integer):
-            raise TypeError("The store must be of integer datatype")
+        dtype = store.dtype
+        if dtype != np.bool and not np.issubdtype(dtype, np.integer):
+            raise TypeError("The store must be of boolean or integer datatype")
+
+        # validate "data"
         if not hasattr(data, "__iter__"):
             raise TypeError("'{}' object is not allowed".format(type(data)))
+        try:
+            source, target = map(np.asarray, zip(*data))
+        except ValueError:
+            raise ValueError("Please supply a list of [from, to] values")
+        # "from" can have bool or int dtype, "to" can also be float
+        if source.dtype != np.bool and not np.issubdtype(source.dtype, np.integer):
+            raise TypeError(
+                "Cannot reclassify from value with type '{}'".format(source.dtype)
+            )
+        if len(np.unique(source)) != len(source):
+            raise ValueError("There are duplicates in the reclassify values")
+        if not np.issubdtype(target.dtype, np.number):
+            raise TypeError(
+                "Cannot reclassify to value with type '{}'".format(target.dtype)
+            )
+        # put 'data' into a list with consistent dtypes
+        data = [list(x) for x in zip(source.tolist(), target.tolist())]
+
         if select is not True and select is not False:
             raise TypeError("'{}' object is not allowed".format(type(select)))
-        super(Reclassify, self).__init__(store, data, select)
+        super().__init__(store, data, select)
 
     @property
     def data(self):
         return self.args[1]
 
     @property
-    def dtype(self):
-        source, target = map(np.asarray, zip(*self.data))
+    def select(self):
+        return self.args[2]
 
-        if np.issubdtype(target.dtype, np.floating):
-            dtype = np.dtype("f4")
-        else:
-            # integer mapping
-            fillvalue = max(source.max() + 1, target.max() + 1, self.store.fillvalue)
-            for dtype in np.uint8, np.uint16, np.uint32, np.uint64:
-                if fillvalue <= dtype(-1):
-                    break
-        return dtype
+    @property
+    def dtype(self):
+        _, target = map(np.asarray, zip(*self.data))
+        return target.dtype
 
     @property
     def fillvalue(self):
-        source, target = map(np.asarray, zip(*self.data))
+        return get_dtype_max(self.dtype)
 
-        if np.issubdtype(target.dtype, np.floating):
-            fillvalue = np.finfo("f4").max
-        else:
-            fillvalue = max(source.max() + 1, target.max() + 1, self.store.fillvalue)
-        return fillvalue
+    def get_sources_and_requests(self, **request):
+        process_kwargs = {
+            "dtype": self.dtype.str,
+            "fillvalue": self.fillvalue,
+            "data": self.data,
+            "select": self.select,
+        }
+        return [(self.store, request), (process_kwargs, None)]
 
     @staticmethod
-    def process(store_data, mapping_data, select):
+    def process(store_data, process_kwargs):
         if store_data is None or "values" not in store_data:
             return store_data
 
         no_data_value = store_data["no_data_value"]
-        source, target = map(np.asarray, zip(*mapping_data))
-
-        if np.issubdtype(target.dtype, np.floating):
-            dtype = np.dtype("f4")
-            fillvalue = np.finfo("f4").max
-        else:
-            # integer mapping
-            fillvalue = max(source.max() + 1, target.max() + 1, no_data_value)
-            for dtype in np.uint8, np.uint16, np.uint32, np.uint64:
-                if fillvalue <= dtype(-1):
-                    break
-
-        # create mapping
-        size = max(source.max(), no_data_value) + 1
-        if select:
-            # map everything, including old fillvalue, to fillvalue
-            mapping = np.full(size, fillvalue, dtype=dtype)
-        else:
-            # map every value onto itself
-            mapping = np.arange(size, dtype=dtype)
-
-        # modify mapping using provided data
-        mapping[source] = target
-
-        values = mapping[store_data["values"]]
-        return {"values": values, "no_data_value": fillvalue}
+        values = store_data["values"]
+        source, target = map(np.asarray, zip(*process_kwargs["data"]))
+        dtype = np.dtype(process_kwargs["dtype"])
+        fillvalue = process_kwargs["fillvalue"]
+
+        # add the nodata value to the source array and map it to the target
+        # nodata
+        if no_data_value is not None and no_data_value not in source:
+            source = np.append(source, no_data_value)
+            target = np.append(target, fillvalue)
+
+        # sort the source and target values
+        inds = np.argsort(source)
+        source = source[inds]
+        target = target[inds]
+
+        # create the result array
+        if process_kwargs["select"]:  # select = True: initialize with nodata
+            result = np.full(values.shape, fillvalue, dtype=dtype)
+        else:  # select = True: initialize with existing data
+            result = values.astype(dtype)  # makes a copy
+
+        # find all values in the source data that are to be mapped
+        mask = np.in1d(values.ravel(), source)
+        mask.shape = values.shape
+        # place the target values (this also maps nodata values)
+        result[mask] = target[np.searchsorted(source, values[mask])]
+        return {"values": result, "no_data_value": fillvalue}
 
 
 class Rasterize(RasterBlock):

diff --git a/dask_geomodeling/tests/test_raster.py b/dask_geomodeling/tests/test_raster.py
@@ -1606,27 +1606,6 @@ def test_step(self):
         self.assertEqual(view.get_data(**self.meta_request)["meta"], self.expected_meta)
         self.assertEqual(view.get_data(**self.time_request)["time"], self.expected_time)
 
-    def test_reclassify_meta_time(self):
-        view = raster.Reclassify(store=self.raster, data=[[7, 1000]])
-        self.assertEqual(view.get_data(**self.meta_request)["meta"], self.expected_meta)
-        self.assertEqual(view.get_data(**self.time_request)["time"], self.expected_time)
-
-    def test_reclassify_with_integers(self):
-        for select in True, False:
-            view = raster.Reclassify(store=self.raster, data=[[7, 1000]], select=select)
-
-            data = view.get_data(**self.vals_request)
-            self.assertEqual(view.dtype, np.uint16)
-            assert_equal(data["values"], 1000)
-
-    def test_reclassify_with_floats(self):
-        for select in True, False:
-            view = raster.Reclassify(store=self.raster, data=[[7, 8.0]], select=select)
-            data = view.get_data(**self.vals_request)
-            self.assertEqual(view.dtype, np.float32)
-            assert_equal(data["values"], 8.0)
-            self.assertEqual(data["no_data_value"], view.fillvalue)
-
     def test_classify_meta_time(self):
         view = raster.Classify(store=self.raster, bins=[1, 2, 3])
         self.assertEqual(view.get_data(**self.meta_request)["meta"], self.expected_meta)

diff --git a/dask_geomodeling/tests/test_raster_misc.py b/dask_geomodeling/tests/test_raster_misc.py
@@ -0,0 +1,129 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+from numpy.testing import assert_equal
+
+from dask_geomodeling import raster
+from dask_geomodeling.raster.sources import MemorySource
+
+
+@pytest.fixture
+def source():
+    bands = 3
+    time_first = datetime(2000, 1, 1)
+    time_delta = timedelta(hours=1)
+    yield MemorySource(
+        data=[
+            np.full((10, 10), 1, dtype=np.uint8),
+            np.full((10, 10), 7, dtype=np.uint8),
+            np.full((10, 10), 255, dtype=np.uint8),
+        ],
+        no_data_value=255,
+        projection="EPSG:28992",
+        pixel_size=0.5,
+        pixel_origin=(135000, 456000),
+        time_first=time_first,
+        time_delta=time_delta,
+        metadata=["Testmeta for band {}".format(i) for i in range(bands)],
+    )
+
+
+@pytest.fixture
+def vals_request():
+    bands = 3
+    time_first = datetime(2000, 1, 1)
+    time_delta = timedelta(hours=1)
+    yield {
+        "mode": "vals",
+        "start": time_first,
+        "stop": time_first + bands * time_delta,
+        "width": 4,
+        "height": 6,
+        "bbox": (135000, 456000 - 3, 135000 + 2, 456000),
+        "projection": "EPSG:28992",
+    }
+
+
+@pytest.fixture
+def expected_meta():
+    bands = 3
+    return ["Testmeta for band {}".format(i) for i in range(bands)]
+
+
+@pytest.fixture
+def expected_time():
+    bands = 3
+    time_first = datetime(2000, 1, 1)
+    time_delta = timedelta(hours=1)
+    return [time_first + i * time_delta for i in range(bands)]
+
+
+def test_reclassify(source, vals_request):
+    view = raster.Reclassify(store=source, data=[[7, 1000]])
+    data = view.get_data(**vals_request)
+
+    assert_equal(data["values"][:, 0, 0], [1, 1000, data["no_data_value"]])
+
+
+def test_reclassify_select(source, vals_request):
+    view = raster.Reclassify(store=source, data=[[7, 1000]], select=True)
+    data = view.get_data(**vals_request)
+
+    expected = [data["no_data_value"], 1000, data["no_data_value"]]
+    assert_equal(data["values"][:, 0, 0], expected)
+
+
+def test_reclassify_to_float(source, vals_request):
+    view = raster.Reclassify(store=source, data=[[7, 8.2]])
+    data = view.get_data(**vals_request)
+
+    assert_equal(data["values"][:, 0, 0], [1.0, 8.2, data["no_data_value"]])
+
+
+def test_reclassify_bool(source, vals_request):
+    source_bool = source == 7
+    view = raster.Reclassify(store=source_bool, data=[[True, 1000]])
+    data = view.get_data(**vals_request)
+
+    assert_equal(data["values"][:, 0, 0], [0, 1000, 0])
+
+
+def test_reclassify_int32(source, vals_request):
+    # this will have a high fillvalue that may lead to a MemoryError
+    source_int32 = source * 1
+    assert source_int32.dtype == np.int32
+
+    view = raster.Reclassify(store=source_int32, data=[[7, 1000]])
+    data = view.get_data(**vals_request)
+
+    assert_equal(data["values"][:, 0, 0], [1, 1000, data["no_data_value"]])
+
+
+def test_reclassify_float_raster(source):
+    source_float = source / 2
+    assert source_float.dtype == np.float32
+    with pytest.raises(TypeError):
+        raster.Reclassify(store=source_float, data=[[7.0, 1000]])
+
+
+def test_reclassify_float_data(source):
+    with pytest.raises(TypeError):
+        raster.Reclassify(store=source, data=[[7.4, 1000]])
+
+
+def test_reclassify_wrong_mapping_shape(source):
+    with pytest.raises(ValueError):
+        raster.Reclassify(store=source, data=[[[7, 1000]], [1, 100]])
+
+
+def test_reclassify_meta_request(source, vals_request, expected_meta):
+    view = raster.Reclassify(store=source, data=[[7, 1000]])
+    vals_request["mode"] = "meta"
+    assert view.get_data(**vals_request)["meta"] == expected_meta
+
+
+def test_reclassify_time_request(source, vals_request, expected_time):
+    view = raster.Reclassify(store=source, data=[[7, 1000]])
+    vals_request["mode"] = "time"
+    assert view.get_data(**vals_request)["time"] == expected_time
diff --git a/dask_geomodeling/utils.py b/dask_geomodeling/utils.py
@@ -296,13 +296,15 @@ def get_array_ranges(self, bbox, shape):
         if j1 == j2:
             j2 += 1
 
-        # clip the indices to the array bounds and get the data
+        # clip the indices to the array bounds for getting the data
         _i1, _i2 = np.clip([i1, i2], 0, shape[1])
         _j1, _j2 = np.clip([j1, j2], 0, shape[2])
         ranges = (_i1, _i2), (_j1, _j2)
 
         # pad the data to the shape given by the index
-        padding = (_i1 - i1, i2 - _i2), (_j1 - j1, j2 - _j2)
+        padding_i = (i2 - i1, 0) if _i1 == _i2 else (_i1 - i1, i2 - _i2)
+        padding_j = (j2 - j1, 0) if _j1 == _j2 else (_j1 - j1, j2 - _j2)
+        padding = padding_i, padding_j
         if np.all(np.array(padding) <= 0):
             padding = None
         return ranges, padding