ENH: add support for big-endian dumps data
neutrinoceros committed Jul 15, 2023
1 parent e266bf2 commit 2d4ca76
Showing 2 changed files with 134 additions and 35 deletions.
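The diff below boils down to one technique: read binary payloads with an explicit byte order, then convert them to native order in memory. A minimal, self-contained sketch of that idea (the sample values are illustrative, not taken from an actual dump):

```python
import struct

import numpy as np

# Pack a float64 in big-endian order, then unpack it with an explicit
# byte-order prefix: ">" is big endian, "<" is little endian, "=" is native.
buf = struct.pack(">d", 3.14)
(value,) = struct.unpack(">d", buf)
assert value == 3.14

# The same idea with numpy: read using the on-disk byte order, then convert
# to native order so downstream code never sees a foreign dtype.
raw = np.frombuffer(struct.pack(">3d", 1.0, 2.0, 3.0), dtype=">f8")
native = raw.astype("=f8")
assert native.dtype.isnative and native[1] == 2.0
```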
src/yt_idefix/_io/dmp_io.py (129 additions & 34 deletions)
@@ -23,10 +23,31 @@ class CharCount(IntEnum):
NAME = 16


+HEADER_REGEXP = re.compile(
+    r"Idefix (?P<version>[\w\.-]+) Dump Data" r"((?P<byteorder>(little|big)) endian)?"
+)
+
+ByteOrder = Literal["little", "big", "native"]
+
+
+def parse_byteorder(fh: BinaryIO) -> ByteOrder:
+    header = read_header(fh)
+    match = HEADER_REGEXP.match(header)
+    if match is None:
+        raise ValueError(f"received invalid dump header string input {header!r}")
+
+    if (res := match.group("byteorder")) in ("little", "big"):
+        return cast(Literal["little", "big"], res)
+    else:
+        # early versions of Idefix didn't include byteorder in dump headers,
+        # fall back to native for backward compatibility
+        return "native"
+
+
# emulating C++
# enum DataType {DoubleType, SingleType, IntegerType};
DTYPES: dict[int, Prec] = {0: "d", 1: "f", 2: "i", 3: "?"}
-DTYPES_2_NUMPY: dict[Prec, str] = {"d": "=f8", "f": "=f4", "i": "=i4"}
+DTYPES_2_NUMPY: dict[Prec, str] = {"d": "f8", "f": "f4", "i": "i4"}


def read_null_terminated_string(fh: BinaryIO, maxsize: int = CharCount.NAME) -> str:
@@ -39,6 +60,8 @@ def read_null_terminated_string(fh: BinaryIO, maxsize: int = CharCount.NAME) ->

def read_next_field_properties(
fh: BinaryIO,
+    *,
+    byteorder: ByteOrder,
) -> tuple[str, Prec, Dim, np.ndarray]:
"""Emulate Idefix's OutputDump::ReadNextFieldProperty"""
field_name = read_null_terminated_string(fh)
@@ -64,6 +87,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: bool,
skip_data: Literal[True],
) -> None:
@@ -77,6 +101,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: Literal[True],
skip_data: Literal[False],
) -> float:
@@ -90,6 +115,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: Literal[False],
skip_data: Literal[False],
) -> np.ndarray:
@@ -103,6 +129,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: bool,
skip_data: bool,
) -> float | np.ndarray | None:
@@ -115,6 +142,7 @@ def read_chunk(
dim,
dtype,
*,
+    byteorder: ByteOrder,
skip_data=False,
is_scalar=False,
):
@@ -129,83 +157,135 @@ def read_chunk(
fh.seek(size, 1)
return None

+    alignment = {"little": "<", "big": ">", "native": "="}[byteorder]
+
# note: this reversal may not be desirable in general
if is_scalar:
fmt = f"={count}{dtype}"
fmt = f"{alignement}{count}{dtype}"
retv = struct.unpack(fmt, fh.read(size))[0]
return retv
else:
-        data = np.fromfile(fh, DTYPES_2_NUMPY[dtype], count=count)
+        data = np.fromfile(fh, alignment + DTYPES_2_NUMPY[dtype], count=count).astype(
+            "=" + DTYPES_2_NUMPY[dtype]
+        )
data.shape = dim[::-1]
return data.T


@overload
def read_serial(
-    fh: BinaryIO, ndim: int, dim: np.ndarray, dtype: str, *, is_scalar: Literal[True]
+    fh: BinaryIO,
+    ndim: int,
+    dim: np.ndarray,
+    dtype: str,
+    *,
+    byteorder: ByteOrder,
+    is_scalar: Literal[True],
) -> float:
...


@overload
def read_serial(
-    fh: BinaryIO, ndim: int, dim: np.ndarray, dtype: str, *, is_scalar: Literal[False]
+    fh: BinaryIO,
+    ndim: int,
+    dim: np.ndarray,
+    dtype: str,
+    *,
+    byteorder: ByteOrder,
+    is_scalar: Literal[False],
) -> np.ndarray:
...


@overload
def read_serial(
-    fh: BinaryIO, ndim: int, dim: np.ndarray, dtype: str, *, is_scalar: bool = False
+    fh: BinaryIO,
+    ndim: int,
+    dim: np.ndarray,
+    dtype: str,
+    *,
+    byteorder: ByteOrder,
+    is_scalar: bool = False,
) -> float | np.ndarray:
...


-def read_serial(fh, ndim, dim, dtype, *, is_scalar=False):
+def read_serial(fh, ndim, dim, dtype, *, byteorder, is_scalar=False):
"""Emulate Idefix's OutputDump::ReadSerial"""
assert ndim == 1 # corresponds to an error raised in IDEFIX
return read_chunk(
-        fh, ndim=ndim, dim=dim, dtype=dtype, is_scalar=is_scalar, skip_data=False
+        fh,
+        ndim=ndim,
+        dim=dim,
+        dtype=dtype,
+        byteorder=byteorder,
+        is_scalar=is_scalar,
+        skip_data=False,
)


@overload
def read_distributed(
-    fh: BinaryIO, dim: np.ndarray, *, dtype: str, skip_data: Literal[False]
+    fh: BinaryIO,
+    dim: np.ndarray,
+    *,
+    byteorder: ByteOrder,
+    dtype: str,
+    skip_data: Literal[False],
) -> np.ndarray:
...


@overload
def read_distributed(
-    fh: BinaryIO, dim: np.ndarray, *, dtype: str, skip_data: Literal[True]
+    fh: BinaryIO,
+    dim: np.ndarray,
+    *,
+    byteorder: ByteOrder,
+    dtype: str,
+    skip_data: Literal[True],
) -> None:
...


@overload
def read_distributed(
-    fh: BinaryIO, dim: np.ndarray, *, dtype: str, skip_data: bool = False
+    fh: BinaryIO,
+    dim: np.ndarray,
+    *,
+    byteorder: ByteOrder,
+    dtype: str,
+    skip_data: bool = False,
) -> np.ndarray | None:
...


-def read_distributed(fh, dim, *, dtype, skip_data):
+def read_distributed(fh, dim, *, byteorder, dtype, skip_data):
"""Emulate Idefix's OutputDump::ReadDistributed"""
# note: OutputDump::ReadDistributed only reads doubles
# because chunks written as integers are small enough
    # that parallelization is counterproductive.
    # This is a design choice on Idefix's side.
-    return read_chunk(fh, ndim=len(dim), dim=dim, dtype=dtype, skip_data=skip_data)
+    return read_chunk(
+        fh,
+        ndim=len(dim),
+        dim=dim,
+        byteorder=byteorder,
+        dtype=dtype,
+        skip_data=skip_data,
+    )


# The following functions are originally designed for yt


-def read_header(filename: str) -> str:
-    with open(filename, "rb") as fh:
-        header = read_null_terminated_string(fh, maxsize=CharCount.HEADER)
-    return header
+def read_header(source: str | BinaryIO, /) -> str:
+    if isinstance(source, str):
+        with open(source, "rb") as fh:
+            return read_header(fh)
+    else:
+        return read_null_terminated_string(source, maxsize=CharCount.HEADER)


def get_field_offset_index(fh: BinaryIO) -> dict[str, int]:
Expand All @@ -217,33 +297,40 @@ def get_field_offset_index(fh: BinaryIO) -> dict[str, int]:
"""
field_index = {}

-    # skip header
-    fh.seek(CharCount.HEADER)
+    fh.seek(0)
+    byteorder = parse_byteorder(fh)

# skip grid properties
for _ in range(9):
-        _field_name, dtype, ndim, dim = read_next_field_properties(fh)
-        read_serial(fh, ndim, dim, dtype)
+        _field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )
+        read_serial(fh, ndim, dim, dtype, byteorder=byteorder)

while True:
offset = fh.tell()
-        field_name, dtype, ndim, dim = read_next_field_properties(fh)
+        field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )
if not re.match("^V[cs]-", field_name):
break
field_index[field_name] = offset
-        read_distributed(fh, dim, dtype=dtype, skip_data=True)
+        read_distributed(fh, dim, dtype=dtype, byteorder=byteorder, skip_data=True)

return field_index


-def read_single_field(fh: BinaryIO, field_offset: int) -> np.ndarray:
+def read_single_field(
+    fh: BinaryIO, field_offset: int, *, byteorder: ByteOrder
+) -> np.ndarray:
"""
Returns
-------
data: 3D np.ndarray with dtype float64
"""
fh.seek(field_offset)
-    field_name, dtype, ndim, dim = read_next_field_properties(fh)
-    data = read_distributed(fh, dim, dtype=dtype, skip_data=False)
+    field_name, dtype, ndim, dim = read_next_field_properties(fh, byteorder=byteorder)
+    data = read_distributed(fh, dim, dtype=dtype, byteorder=byteorder, skip_data=False)
return data


@@ -257,34 +344,42 @@ def read_idefix_dmpfile(
def read_idefix_dump_from_buffer(
fh: BinaryIO, skip_data: bool = False
) -> tuple[IdefixFieldProperties, IdefixMetadata]:
-    # skip header
-    fh.seek(CharCount.HEADER)
+    fh.seek(0)
+    byteorder = parse_byteorder(fh)

data: float | np.ndarray | None
fprops: IdefixFieldProperties = {}
fdata: IdefixMetadata = {}
for _ in range(9):
# read grid properties
# (cell centers, left and right edges in 3D -> 9 arrays)
-        field_name, dtype, ndim, dim = read_next_field_properties(fh)
-        data = read_serial(fh, ndim, dim, dtype)
+        field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )
+        data = read_serial(fh, ndim, dim, dtype, byteorder=byteorder)
fprops[field_name] = dtype, ndim, dim
fdata[field_name] = data

-    field_name, dtype, ndim, dim = read_next_field_properties(fh)
+    field_name, dtype, ndim, dim = read_next_field_properties(fh, byteorder=byteorder)
while field_name != "eof":
        # note that this could likely be implemented using a call to
        # `iter` with a sentinel value, on the condition that read_next_field_properties
        # be split into 2 parts (I don't think the sentinel pattern works with tuples)
fprops[field_name] = dtype, ndim, dim
if field_name.startswith(("Vc-", "Vs-")):
-            data = read_distributed(fh, dim, dtype=dtype, skip_data=skip_data)
+            data = read_distributed(
+                fh, dim, dtype=dtype, byteorder=byteorder, skip_data=skip_data
+            )
else:
is_scalar = ndim == 1 and dim[0] == 1
is_scalar &= field_name not in ("x1", "x2", "x3")
-            data = read_serial(fh, ndim, dim, dtype, is_scalar=is_scalar)
+            data = read_serial(
+                fh, ndim, dim, dtype, byteorder=byteorder, is_scalar=is_scalar
+            )
fdata[field_name] = data
-        field_name, dtype, ndim, dim = read_next_field_properties(fh)
+        field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )

if isinstance(fdata["geometry"], int):
fdata["geometry"] = KNOWN_GEOMETRIES[fdata["geometry"]]
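Regarding the `iter`-with-a-sentinel note inside `read_idefix_dump_from_buffer` above: here is a toy sketch of that pattern, assuming a hypothetical split where a first helper returns only the field name. The helper and the field names are illustrative, not part of the commit:

```python
# Toy stand-in for a binary stream of field names terminated by "eof".
fields = iter(["Vc-RHO", "Vc-VX1", "time", "eof"])

def read_next_field_name() -> str:
    # In dmp_io this would wrap read_null_terminated_string(fh); here we
    # just pull from the toy stream.
    return next(fields)

# iter(callable, sentinel) keeps calling until the sentinel is returned,
# which only works if the callable returns a directly comparable value
# (a bare str), not the full (name, dtype, ndim, dim) tuple.
for field_name in iter(read_next_field_name, "eof"):
    print(field_name)  # Vc-RHO, then Vc-VX1, then time
```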
src/yt_idefix/io.py (5 additions & 1 deletion)
@@ -70,8 +70,12 @@ def _read_particle_fields(self, chunks, ptf, selector):
class IdefixDmpIO(SingleGridIO, BaseParticleIOHandler):
_dataset_type = "idefix-dmp"

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._byteorder = dmp_io.parse_byteorder(self.ds.filename)
+
def _read_single_field(self, fh: BinaryIO, offset: int) -> np.ndarray:
-        return dmp_io.read_single_field(fh, offset)
+        return dmp_io.read_single_field(fh, offset, byteorder=self._byteorder)

def _read_particle_coords(self, chunks, ptf):
# This needs to *yield* a series of tuples of (ptype, (x, y, z)).
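End to end, the new keyword threads through the public helpers roughly as follows. This is a hedged usage sketch: the import path is inferred from the file location, and the dump file name and field name ("Vc-RHO") are hypothetical:

```python
from yt_idefix._io import dmp_io

with open("data.0001.dmp", "rb") as fh:  # hypothetical dump file
    byteorder = dmp_io.parse_byteorder(fh)  # "little", "big", or "native"
    offsets = dmp_io.get_field_offset_index(fh)
    rho = dmp_io.read_single_field(fh, offsets["Vc-RHO"], byteorder=byteorder)

print(byteorder, rho.shape, rho.dtype)  # data comes back in native byte order
```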
