Merge pull request #5523 from jenshnielsen/better_treshold
Better threshold calculation for chunked output
jenshnielsen committed Nov 16, 2023
2 parents a0a33ce + 3123af7 commit 136ee26
Showing 2 changed files with 39 additions and 7 deletions.
13 changes: 6 additions & 7 deletions src/qcodes/dataset/data_set.py
@@ -3,7 +3,6 @@
 import importlib
 import json
 import logging
-import sys
 import tempfile
 import time
 import uuid
@@ -1467,7 +1466,7 @@ def _export_as_netcdf(self, path: Path, file_name: str) -> Path:
             log.info(
                 "Dataset is expected to be larger than threshold. Using distributed export.",
                 extra={
-                    "file_name": file_path,
+                    "file_name": str(file_path),
                     "qcodes_guid": self.guid,
                     "ds_name": self.name,
                     "exp_name": self.exp_name,
@@ -1483,7 +1482,7 @@ def _export_as_netcdf(self, path: Path, file_name: str) -> Path:
             log.info(
                 "Writing individual files to temp dir.",
                 extra={
-                    "file_name": file_path,
+                    "file_name": str(file_path),
                     "qcodes_guid": self.guid,
                     "ds_name": self.name,
                     "exp_name": self.exp_name,
@@ -1504,7 +1503,7 @@ def _export_as_netcdf(self, path: Path, file_name: str) -> Path:
             log.info(
                 "Combining temp files into one file.",
                 extra={
-                    "file_name": file_path,
+                    "file_name": str(file_path),
                     "qcodes_guid": self.guid,
                     "ds_name": self.name,
                     "exp_name": self.exp_name,
@@ -1530,15 +1529,15 @@ def _estimate_ds_size(self) -> float:
         Give an estimated size of the dataset as the size of a single row
         times the len of the dataset. Result is returned in Mega Bytes.
-        Note that this does not take overhead into account so it is more accurate
-        if the row size is "large"
+        Note that this does not take overhead from storing the array into account
+        so it is assumed that the total array is large compared to the overhead.
         """
         sample_data = self.get_parameter_data(start=1, end=1)
         row_size = 0.0
 
         for param_data in sample_data.values():
             for array in param_data.values():
-                row_size += sys.getsizeof(array)
+                row_size += array.size * array.dtype.itemsize
         return row_size * len(self) / 1024 / 1024
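
Note: this hunk is the threshold fix named in the PR title. `_estimate_ds_size` samples a single row, and `sys.getsizeof` on a numpy array counts the roughly 100-byte object header (and, for a view, nothing of the underlying buffer), so the old estimate was dominated by per-array overhead. `array.size * array.dtype.itemsize` counts only the payload. A minimal sketch of the difference (sizes are approximate and NumPy-version dependent; this snippet is not from the commit):

    import sys

    import numpy as np

    one_row = np.array([1.0])                     # a sampled row holding one float64
    print(sys.getsizeof(one_row))                 # ~120 bytes: header dwarfs the 8-byte payload
    print(one_row.size * one_row.dtype.itemsize)  # 8 bytes: the actual data

    view = np.zeros(1_000_000)[:]                 # a view does not own its buffer
    print(sys.getsizeof(view))                    # ~120 bytes: the 8 MB buffer is not counted
    print(view.size * view.dtype.itemsize)        # 8000000 bytes

With the new formula, 10,000 rows of one float64 scalar each come out as 10000 * 8 / 1024 / 1024, roughly 0.08 MB.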


33 changes: 33 additions & 0 deletions tests/dataset/test_dataset_export.py
@@ -758,6 +758,39 @@ def test_export_dataset_small_no_delated(
assert "Writing netcdf file directly" in caplog.records[0].msg


def test_export_dataset_delayed_numeric(
tmp_path_factory: TempPathFactory, mock_dataset_grid: DataSet, caplog
) -> None:
tmp_path = tmp_path_factory.mktemp("export_netcdf")
mock_dataset_grid._export_limit = 0
with caplog.at_level(logging.INFO):
mock_dataset_grid.export(export_type="netcdf", path=tmp_path, prefix="qcodes_")

assert (
"Dataset is expected to be larger that threshold. Using distributed export."
in caplog.records[0].msg
)
assert "Writing individual files to temp dir" in caplog.records[1].msg
assert "Combining temp files into one file" in caplog.records[2].msg
assert "Writing netcdf file using Dask delayed writer" in caplog.records[3].msg

loaded_ds = xr.load_dataset(mock_dataset_grid.export_info.export_paths["nc"])
assert loaded_ds.x.shape == (10,)
assert_allclose(loaded_ds.x, np.arange(10))
assert loaded_ds.y.shape == (5,)
assert_allclose(loaded_ds.y, np.arange(20, 25, 1))

arrays = []
for i in range(10):
arrays.append(np.arange(20 + i, 25 + i))
expected_z = np.array(arrays)

assert loaded_ds.z.shape == (10, 5)
assert_allclose(loaded_ds.z, expected_z)

_assert_xarray_metadata_is_as_expected(loaded_ds, mock_dataset_grid)


def test_export_dataset_delayed(
tmp_path_factory: TempPathFactory, mock_dataset_numpy: DataSet, caplog
) -> None:
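
Note: setting `mock_dataset_grid._export_limit = 0` presumably makes any estimated size exceed the threshold, so the new test exercises the chunked, Dask-delayed export path end to end. The loop building `expected_z` encodes z[i, j] = 20 + i + j; an equivalent broadcasting one-liner (illustrative only, not part of the commit) would be:

    import numpy as np

    # z[i, j] = 20 + i + j on the 10 x 5 grid asserted by the test
    expected_z = np.arange(20, 25)[None, :] + np.arange(10)[:, None]
    assert expected_z.shape == (10, 5)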
