ENH: add support for big-endian dumps data
neutrinoceros committed Jul 15, 2023
1 parent e266bf2 commit 2d4ca76
Showing 2 changed files with 134 additions and 35 deletions.
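The diff below boils down to one technique: read binary payloads with an explicit byte order, then convert them to native order in memory. A minimal, self-contained sketch of that idea (the sample values are illustrative, not taken from an actual dump):

```python
import struct

import numpy as np

# Pack a float64 in big-endian order, then unpack it with an explicit
# byte-order prefix: ">" is big endian, "<" is little endian, "=" is native.
buf = struct.pack(">d", 3.14)
(value,) = struct.unpack(">d", buf)
assert value == 3.14

# The same idea with numpy: read using the on-disk byte order, then convert
# to native order so downstream code never sees a foreign dtype.
raw = np.frombuffer(struct.pack(">3d", 1.0, 2.0, 3.0), dtype=">f8")
native = raw.astype("=f8")
assert native.dtype.isnative and native[1] == 2.0
```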
src/yt_idefix/_io/dmp_io.py (129 additions & 34 deletions)
@@ -23,10 +23,31 @@ class CharCount(IntEnum):
NAME = 16


+HEADER_REGEXP = re.compile(
+    r"Idefix (?P<version>[\w\.-]+) Dump Data" r"((?P<byteorder>(little|big)) endian)?"
+)
+
+ByteOrder = Literal["little", "big", "native"]
+
+
+def parse_byteorder(fh: BinaryIO) -> ByteOrder:
+    header = read_header(fh)
+    match = HEADER_REGEXP.match(header)
+    if match is None:
+        raise ValueError(f"received invalid dump header string input {header!r}")
+
+    if (res := match.group("byteorder")) in ("little", "big"):
+        return cast(Literal["little", "big"], res)
+    else:
+        # early versions of Idefix didn't include byteorder in dump headers,
+        # fall back to native for backward compatibility
+        return "native"
+
+
# emulating C++
# enum DataType {DoubleType, SingleType, IntegerType};
DTYPES: dict[int, Prec] = {0: "d", 1: "f", 2: "i", 3: "?"}
-DTYPES_2_NUMPY: dict[Prec, str] = {"d": "=f8", "f": "=f4", "i": "=i4"}
+DTYPES_2_NUMPY: dict[Prec, str] = {"d": "f8", "f": "f4", "i": "i4"}


def read_null_terminated_string(fh: BinaryIO, maxsize: int = CharCount.NAME) -> str:
@@ -39,6 +60,8 @@ def read_null_terminated_string(fh: BinaryIO, maxsize: int = CharCount.NAME) ->

def read_next_field_properties(
fh: BinaryIO,
+    *,
+    byteorder: ByteOrder,
) -> tuple[str, Prec, Dim, np.ndarray]:
"""Emulate Idefix's OutputDump::ReadNextFieldProperty"""
field_name = read_null_terminated_string(fh)
@@ -64,6 +87,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: bool,
skip_data: Literal[True],
) -> None:
@@ -77,6 +101,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: Literal[True],
skip_data: Literal[False],
) -> float:
@@ -90,6 +115,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: Literal[False],
skip_data: Literal[False],
) -> np.ndarray:
@@ -103,6 +129,7 @@ def read_chunk(
dim: list[int],
dtype: str,
*,
+    byteorder: ByteOrder,
is_scalar: bool,
skip_data: bool,
) -> float | np.ndarray | None:
@@ -115,6 +142,7 @@ def read_chunk(
dim,
dtype,
*,
+    byteorder: ByteOrder,
skip_data=False,
is_scalar=False,
):
@@ -129,83 +157,135 @@ def read_chunk(
fh.seek(size, 1)
return None

+    alignment = {"little": "<", "big": ">", "native": "="}[byteorder]
+
# note: this reversal may not be desirable in general
if is_scalar:
fmt = f"={count}{dtype}"
fmt = f"{alignement}{count}{dtype}"
retv = struct.unpack(fmt, fh.read(size))[0]
return retv
else:
-        data = np.fromfile(fh, DTYPES_2_NUMPY[dtype], count=count)
+        data = np.fromfile(fh, alignment + DTYPES_2_NUMPY[dtype], count=count).astype(
+            "=" + DTYPES_2_NUMPY[dtype]
+        )
data.shape = dim[::-1]
return data.T


@overload
def read_serial(
-    fh: BinaryIO, ndim: int, dim: np.ndarray, dtype: str, *, is_scalar: Literal[True]
+    fh: BinaryIO,
+    ndim: int,
+    dim: np.ndarray,
+    dtype: str,
+    *,
+    byteorder: ByteOrder,
+    is_scalar: Literal[True],
) -> float:
...


@overload
def read_serial(
-    fh: BinaryIO, ndim: int, dim: np.ndarray, dtype: str, *, is_scalar: Literal[False]
+    fh: BinaryIO,
+    ndim: int,
+    dim: np.ndarray,
+    dtype: str,
+    *,
+    byteorder: ByteOrder,
+    is_scalar: Literal[False],
) -> np.ndarray:
...


@overload
def read_serial(
-    fh: BinaryIO, ndim: int, dim: np.ndarray, dtype: str, *, is_scalar: bool = False
+    fh: BinaryIO,
+    ndim: int,
+    dim: np.ndarray,
+    dtype: str,
+    *,
+    byteorder: ByteOrder,
+    is_scalar: bool = False,
) -> float | np.ndarray:
...


-def read_serial(fh, ndim, dim, dtype, *, is_scalar=False):
+def read_serial(fh, ndim, dim, dtype, *, byteorder, is_scalar=False):
"""Emulate Idefix's OutputDump::ReadSerial"""
assert ndim == 1 # corresponds to an error raised in IDEFIX
return read_chunk(
-        fh, ndim=ndim, dim=dim, dtype=dtype, is_scalar=is_scalar, skip_data=False
+        fh,
+        ndim=ndim,
+        dim=dim,
+        dtype=dtype,
+        byteorder=byteorder,
+        is_scalar=is_scalar,
+        skip_data=False,
)


@overload
def read_distributed(
-    fh: BinaryIO, dim: np.ndarray, *, dtype: str, skip_data: Literal[False]
+    fh: BinaryIO,
+    dim: np.ndarray,
+    *,
+    byteorder: ByteOrder,
+    dtype: str,
+    skip_data: Literal[False],
) -> np.ndarray:
...


@overload
def read_distributed(
-    fh: BinaryIO, dim: np.ndarray, *, dtype: str, skip_data: Literal[True]
+    fh: BinaryIO,
+    dim: np.ndarray,
+    *,
+    byteorder: ByteOrder,
+    dtype: str,
+    skip_data: Literal[True],
) -> None:
...


@overload
def read_distributed(
-    fh: BinaryIO, dim: np.ndarray, *, dtype: str, skip_data: bool = False
+    fh: BinaryIO,
+    dim: np.ndarray,
+    *,
+    byteorder: ByteOrder,
+    dtype: str,
+    skip_data: bool = False,
) -> np.ndarray | None:
...


-def read_distributed(fh, dim, *, dtype, skip_data):
+def read_distributed(fh, dim, *, byteorder, dtype, skip_data):
"""Emulate Idefix's OutputDump::ReadDistributed"""
# note: OutputDump::ReadDistributed only reads doubles
# because chunks written as integers are small enough
    # that parallelization is counterproductive.
    # This is a design choice on Idefix's side.
-    return read_chunk(fh, ndim=len(dim), dim=dim, dtype=dtype, skip_data=skip_data)
+    return read_chunk(
+        fh,
+        ndim=len(dim),
+        dim=dim,
+        byteorder=byteorder,
+        dtype=dtype,
+        skip_data=skip_data,
+    )


# The following functions are originally designed for yt


-def read_header(filename: str) -> str:
-    with open(filename, "rb") as fh:
-        header = read_null_terminated_string(fh, maxsize=CharCount.HEADER)
-    return header
+def read_header(source: str | BinaryIO, /) -> str:
+    if isinstance(source, str):
+        with open(source, "rb") as fh:
+            return read_header(fh)
+    else:
+        return read_null_terminated_string(source, maxsize=CharCount.HEADER)


def get_field_offset_index(fh: BinaryIO) -> dict[str, int]:
Expand All @@ -217,33 +297,40 @@ def get_field_offset_index(fh: BinaryIO) -> dict[str, int]:
"""
field_index = {}

-    # skip header
-    fh.seek(CharCount.HEADER)
+    fh.seek(0)
+    byteorder = parse_byteorder(fh)

# skip grid properties
for _ in range(9):
-        _field_name, dtype, ndim, dim = read_next_field_properties(fh)
-        read_serial(fh, ndim, dim, dtype)
+        _field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )
+        read_serial(fh, ndim, dim, dtype, byteorder=byteorder)

while True:
offset = fh.tell()
-        field_name, dtype, ndim, dim = read_next_field_properties(fh)
+        field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )
if not re.match("^V[cs]-", field_name):
break
field_index[field_name] = offset
-        read_distributed(fh, dim, dtype=dtype, skip_data=True)
+        read_distributed(fh, dim, dtype=dtype, byteorder=byteorder, skip_data=True)

return field_index


-def read_single_field(fh: BinaryIO, field_offset: int) -> np.ndarray:
+def read_single_field(
+    fh: BinaryIO, field_offset: int, *, byteorder: ByteOrder
+) -> np.ndarray:
"""
Returns
-------
data: 3D np.ndarray with dtype float64
"""
fh.seek(field_offset)
-    field_name, dtype, ndim, dim = read_next_field_properties(fh)
-    data = read_distributed(fh, dim, dtype=dtype, skip_data=False)
+    field_name, dtype, ndim, dim = read_next_field_properties(fh, byteorder=byteorder)
+    data = read_distributed(fh, dim, dtype=dtype, byteorder=byteorder, skip_data=False)
return data


@@ -257,34 +344,42 @@ def read_idefix_dmpfile(
def read_idefix_dump_from_buffer(
fh: BinaryIO, skip_data: bool = False
) -> tuple[IdefixFieldProperties, IdefixMetadata]:
-    # skip header
-    fh.seek(CharCount.HEADER)
+    fh.seek(0)
+    byteorder = parse_byteorder(fh)

data: float | np.ndarray | None
fprops: IdefixFieldProperties = {}
fdata: IdefixMetadata = {}
for _ in range(9):
# read grid properties
# (cell centers, left and right edges in 3D -> 9 arrays)
-        field_name, dtype, ndim, dim = read_next_field_properties(fh)
-        data = read_serial(fh, ndim, dim, dtype)
+        field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )
+        data = read_serial(fh, ndim, dim, dtype, byteorder=byteorder)
fprops[field_name] = dtype, ndim, dim
fdata[field_name] = data

-    field_name, dtype, ndim, dim = read_next_field_properties(fh)
+    field_name, dtype, ndim, dim = read_next_field_properties(fh, byteorder=byteorder)
while field_name != "eof":
        # note that this could likely be implemented using a call to
        # `iter` with a sentinel value, on the condition that read_next_field_properties
        # be split into 2 parts (I don't think the sentinel pattern works with tuples)
fprops[field_name] = dtype, ndim, dim
if field_name.startswith(("Vc-", "Vs-")):
-            data = read_distributed(fh, dim, dtype=dtype, skip_data=skip_data)
+            data = read_distributed(
+                fh, dim, dtype=dtype, byteorder=byteorder, skip_data=skip_data
+            )
else:
is_scalar = ndim == 1 and dim[0] == 1
is_scalar &= field_name not in ("x1", "x2", "x3")
-            data = read_serial(fh, ndim, dim, dtype, is_scalar=is_scalar)
+            data = read_serial(
+                fh, ndim, dim, dtype, byteorder=byteorder, is_scalar=is_scalar
+            )
fdata[field_name] = data
-        field_name, dtype, ndim, dim = read_next_field_properties(fh)
+        field_name, dtype, ndim, dim = read_next_field_properties(
+            fh, byteorder=byteorder
+        )

if isinstance(fdata["geometry"], int):
fdata["geometry"] = KNOWN_GEOMETRIES[fdata["geometry"]]
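Regarding the `iter`-with-a-sentinel note inside `read_idefix_dump_from_buffer` above: here is a toy sketch of that pattern, assuming a hypothetical split where a first helper returns only the field name. The helper and the field names are illustrative, not part of the commit:

```python
# Toy stand-in for a binary stream of field names terminated by "eof".
fields = iter(["Vc-RHO", "Vc-VX1", "time", "eof"])

def read_next_field_name() -> str:
    # In dmp_io this would wrap read_null_terminated_string(fh); here we
    # just pull from the toy stream.
    return next(fields)

# iter(callable, sentinel) keeps calling until the sentinel is returned,
# which only works if the callable returns a directly comparable value
# (a bare str), not the full (name, dtype, ndim, dim) tuple.
for field_name in iter(read_next_field_name, "eof"):
    print(field_name)  # Vc-RHO, then Vc-VX1, then time
```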
src/yt_idefix/io.py (5 additions & 1 deletion)
@@ -70,8 +70,12 @@ def _read_particle_fields(self, chunks, ptf, selector):
class IdefixDmpIO(SingleGridIO, BaseParticleIOHandler):
_dataset_type = "idefix-dmp"

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._byteorder = dmp_io.parse_byteorder(self.ds.filename)
+
def _read_single_field(self, fh: BinaryIO, offset: int) -> np.ndarray:
-        return dmp_io.read_single_field(fh, offset)
+        return dmp_io.read_single_field(fh, offset, byteorder=self._byteorder)

def _read_particle_coords(self, chunks, ptf):
# This needs to *yield* a series of tuples of (ptype, (x, y, z)).
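End to end, the new keyword threads through the public helpers roughly as follows. This is a hedged usage sketch: the import path is inferred from the file location, and the dump file name and field name ("Vc-RHO") are hypothetical:

```python
from yt_idefix._io import dmp_io

with open("data.0001.dmp", "rb") as fh:  # hypothetical dump file
    byteorder = dmp_io.parse_byteorder(fh)  # "little", "big", or "native"
    offsets = dmp_io.get_field_offset_index(fh)
    rho = dmp_io.read_single_field(fh, offsets["Vc-RHO"], byteorder=byteorder)

print(byteorder, rho.shape, rho.dtype)  # data comes back in native byte order
```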
