Merge pull request #5523 from jenshnielsen/better_treshold
Better threshold calculation for chunked output
jenshnielsen committed Nov 16, 2023
2 parents a0a33ce + 3123af7 commit 136ee26
Showing 2 changed files with 39 additions and 7 deletions.
13 changes: 6 additions & 7 deletions src/qcodes/dataset/data_set.py
@@ -3,7 +3,6 @@
 import importlib
 import json
 import logging
-import sys
 import tempfile
 import time
 import uuid
@@ -1467,7 +1466,7 @@ def _export_as_netcdf(self, path: Path, file_name: str) -> Path:
             log.info(
                 "Dataset is expected to be larger than threshold. Using distributed export.",
                 extra={
-                    "file_name": file_path,
+                    "file_name": str(file_path),
                     "qcodes_guid": self.guid,
                     "ds_name": self.name,
                     "exp_name": self.exp_name,
@@ -1483,7 +1482,7 @@ def _export_as_netcdf(self, path: Path, file_name: str) -> Path:
             log.info(
                 "Writing individual files to temp dir.",
                 extra={
-                    "file_name": file_path,
+                    "file_name": str(file_path),
                     "qcodes_guid": self.guid,
                     "ds_name": self.name,
                     "exp_name": self.exp_name,
@@ -1504,7 +1503,7 @@ def _export_as_netcdf(self, path: Path, file_name: str) -> Path:
             log.info(
                 "Combining temp files into one file.",
                 extra={
-                    "file_name": file_path,
+                    "file_name": str(file_path),
                     "qcodes_guid": self.guid,
                     "ds_name": self.name,
                     "exp_name": self.exp_name,
@@ -1530,15 +1529,15 @@ def _estimate_ds_size(self) -> float:
         Give an estimated size of the dataset as the size of a single row
         times the len of the dataset. Result is returned in Mega Bytes.
-        Note that this does not take overhead into account so it is more accurate
-        if the row size is "large"
+        Note that this does not take overhead from storing the array into account
+        so it is assumed that the total array is large compared to the overhead.
         """
         sample_data = self.get_parameter_data(start=1, end=1)
         row_size = 0.0
 
         for param_data in sample_data.values():
             for array in param_data.values():
-                row_size += sys.getsizeof(array)
+                row_size += array.size * array.dtype.itemsize
         return row_size * len(self) / 1024 / 1024
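
Note: this hunk is the threshold fix named in the PR title. `_estimate_ds_size` samples a single row, and `sys.getsizeof` on a numpy array counts the roughly 100-byte object header (and, for a view, nothing of the underlying buffer), so the old estimate was dominated by per-array overhead. `array.size * array.dtype.itemsize` counts only the payload. A minimal sketch of the difference (sizes are approximate and NumPy-version dependent; this snippet is not from the commit):

    import sys

    import numpy as np

    one_row = np.array([1.0])                     # a sampled row holding one float64
    print(sys.getsizeof(one_row))                 # ~120 bytes: header dwarfs the 8-byte payload
    print(one_row.size * one_row.dtype.itemsize)  # 8 bytes: the actual data

    view = np.zeros(1_000_000)[:]                 # a view does not own its buffer
    print(sys.getsizeof(view))                    # ~120 bytes: the 8 MB buffer is not counted
    print(view.size * view.dtype.itemsize)        # 8000000 bytes

With the new formula, 10,000 rows of one float64 scalar each come out as 10000 * 8 / 1024 / 1024, roughly 0.08 MB.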


33 changes: 33 additions & 0 deletions tests/dataset/test_dataset_export.py
@@ -758,6 +758,39 @@ def test_export_dataset_small_no_delated(
assert "Writing netcdf file directly" in caplog.records[0].msg


def test_export_dataset_delayed_numeric(
tmp_path_factory: TempPathFactory, mock_dataset_grid: DataSet, caplog
) -> None:
tmp_path = tmp_path_factory.mktemp("export_netcdf")
mock_dataset_grid._export_limit = 0
with caplog.at_level(logging.INFO):
mock_dataset_grid.export(export_type="netcdf", path=tmp_path, prefix="qcodes_")

assert (
"Dataset is expected to be larger that threshold. Using distributed export."
in caplog.records[0].msg
)
assert "Writing individual files to temp dir" in caplog.records[1].msg
assert "Combining temp files into one file" in caplog.records[2].msg
assert "Writing netcdf file using Dask delayed writer" in caplog.records[3].msg

loaded_ds = xr.load_dataset(mock_dataset_grid.export_info.export_paths["nc"])
assert loaded_ds.x.shape == (10,)
assert_allclose(loaded_ds.x, np.arange(10))
assert loaded_ds.y.shape == (5,)
assert_allclose(loaded_ds.y, np.arange(20, 25, 1))

arrays = []
for i in range(10):
arrays.append(np.arange(20 + i, 25 + i))
expected_z = np.array(arrays)

assert loaded_ds.z.shape == (10, 5)
assert_allclose(loaded_ds.z, expected_z)

_assert_xarray_metadata_is_as_expected(loaded_ds, mock_dataset_grid)


def test_export_dataset_delayed(
tmp_path_factory: TempPathFactory, mock_dataset_numpy: DataSet, caplog
) -> None:
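
Note: setting `mock_dataset_grid._export_limit = 0` presumably makes any estimated size exceed the threshold, so the new test exercises the chunked, Dask-delayed export path end to end. The loop building `expected_z` encodes z[i, j] = 20 + i + j; an equivalent broadcasting one-liner (illustrative only, not part of the commit) would be:

    import numpy as np

    # z[i, j] = 20 + i + j on the 10 x 5 grid asserted by the test
    expected_z = np.arange(20, 25)[None, :] + np.arange(10)[:, None]
    assert expected_z.shape == (10, 5)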
