ENH: structured_to_unstructured: view more often

Converting with structured_to_unstructured() now returns a view, if all the fields have a constant stride.
numpy · Apr 24, 2023 · 582d6b4 · 582d6b4
1 parent 99f08e3
commit 582d6b4
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 5 deletions.
diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
@@ -885,6 +885,52 @@ def count_elem(dt):
                     fields.extend([(d, c, o + i*size) for d, c, o in subfields])
     return fields
 
+def _common_stride(offsets, counts, itemsize):
+    """
+    Returns the stride between the fields, or None if the stride is not
+    constant. The values in "counts" designate the lengths of
+    sub-arrays. Sub-arrays are treated as many contiguous fields, with
+    always positive stride.
+    """
+
+    if len(offsets) <= 1:
+        return itemsize
+
+    negative = offsets[1] < offsets[0]  # negative stride
+    if negative:
+        # reverse, so offsets will be ascending
+        it = zip(reversed(offsets), reversed(counts))
+    else:
+        it = zip(offsets, counts)
+
+    prev_offset = None
+    stride = None
+    for offset, count in it:
+        if count != 1:  # sub array: always c-contiguous
+            if negative:
+                return None  # sub-arrays can never have a negative stride
+            if stride is None:
+                stride = itemsize
+            if stride != itemsize:
+                return None
+            end_offset = offset + (count - 1) * itemsize
+        else:
+            end_offset = offset
+
+        if prev_offset is not None:
+            new_stride = offset - prev_offset
+            if stride is None:
+                stride = new_stride
+            if stride != new_stride:
+                return None
+
+        prev_offset = end_offset
+
+    if stride is not None:
+        if negative:
+            return -stride
+        return stride
+
 
 def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
                                            casting=None):
@@ -960,7 +1006,7 @@ def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
     if dtype is None:
         out_dtype = np.result_type(*[dt.base for dt in dts])
     else:
-        out_dtype = dtype
+        out_dtype = np.dtype(dtype)
 
     # Use a series of views and casts to convert to an unstructured array:
 
@@ -972,6 +1018,30 @@ def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
                                  'itemsize': arr.dtype.itemsize})
     arr = arr.view(flattened_fields)
 
+    if (not copy) and all(dt.base == out_dtype for dt in dts):
+        # all elements have the right dtype already; if they have a common
+        # stride, we can just return a view
+        common_stride = _common_stride(offsets, counts, out_dtype.itemsize)
+        if common_stride is not None:
+            # ensure that we have a real ndarray; other types (e.g. matrix)
+            # have strange slicing behavior
+            arr = arr.view(type=np.ndarray)
+            new_shape = arr.shape + (sum(counts), out_dtype.itemsize)
+            new_strides = arr.strides + (abs(common_stride), 1)
+
+            arr = arr[..., None].view(np.uint8)  # view as bytes
+            arr = arr[..., min(offsets):]  # remove the leading unused data
+            arr = np.lib.stride_tricks.as_strided(arr,
+                                                  new_shape,
+                                                  new_strides)
+
+            # cast and drop the last dimension again
+            arr = arr.view(out_dtype)[..., 0]
+
+            if common_stride < 0:
+                arr = arr[..., ::-1]  # reverse, if the stride was negative
+            return arr
+
     # next cast to a packed format with all fields converted to new dtype
     packed_fields = np.dtype({'names': names,
                               'formats': [(out_dtype, dt.shape) for dt in dts]})

diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py
@@ -263,8 +263,13 @@ def test_structured_to_unstructured(self):
                      dtype=[('x', 'i4'), ('y', 'i4'), ('z', 'i4')])
         dd = structured_to_unstructured(d)
         ddd = unstructured_to_structured(dd, d.dtype)
-        assert_(dd.base is d)
-        assert_(ddd.base is d)
+        assert_(np.shares_memory(dd, d))
+        assert_(np.shares_memory(ddd, d))
+
+        # check that reversing the order of attributes works
+        dd_attrib_rev = structured_to_unstructured(d[['z', 'x']])
+        assert_equal(dd_attrib_rev, [[5, 1], [7, 4], [11, 7], [12, 10]])
+        assert_(np.shares_memory(dd_attrib_rev, d))
 
         # including uniform fields with subarrays unpacked
         d = np.array([(1, [2,  3], [[ 4,  5], [ 6,  7]]),
@@ -273,8 +278,30 @@ def test_structured_to_unstructured(self):
                             ('x2', ('i4', (2, 2)))])
         dd = structured_to_unstructured(d)
         ddd = unstructured_to_structured(dd, d.dtype)
-        assert_(dd.base is d)
-        assert_(ddd.base is d)
+        assert_(np.shares_memory(dd, d))
+        assert_(np.shares_memory(ddd, d))
+
+        # check that reversing with sub-arrays works as expected
+        d_rev = d[::-1]
+        dd_rev = structured_to_unstructured(d_rev)
+        assert_equal(dd_rev, [[8, 9, 10, 11, 12, 13, 14],
+                              [1, 2, 3, 4, 5, 6, 7]])
+
+        # check that sub-arrays keep the order of their values
+        d_attrib_rev = d[['x2', 'x1', 'x0']]
+        dd_attrib_rev = structured_to_unstructured(d_attrib_rev)
+        assert_equal(dd_attrib_rev, [[4, 5, 6, 7, 2, 3, 1],
+                                     [11, 12, 13, 14, 9, 10, 8]])
+
+        # with ignored field at the end
+        d = np.array([(1, [2,  3], [[ 4,  5], [ 6,  7]], 32),
+                      (8, [9, 10], [[11, 12], [13, 14]], 64)],
+                     dtype=[('x0', 'i4'), ('x1', ('i4', 2)),
+                            ('x2', ('i4', (2, 2))), ('ignored', 'u1')])
+        dd = structured_to_unstructured(d[['x0', 'x1', 'x2']])
+        assert_(np.shares_memory(dd, d))
+        assert_equal(dd, [[1, 2, 3, 4, 5, 6, 7],
+                          [8, 9, 10, 11, 12, 13, 14]])
 
         # test that nested fields with identical names don't break anything
         point = np.dtype([('x', int), ('y', int)])