From 171783436fa62111dcd13501c9fd5b90963fbd1e Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 15:00:59 +0000 Subject: [PATCH 1/9] fix False to false in yaml --- nowcasting_dataset/config/on_premises.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nowcasting_dataset/config/on_premises.yaml b/nowcasting_dataset/config/on_premises.yaml index 9834b3ff..0e12d7f3 100644 --- a/nowcasting_dataset/config/on_premises.yaml +++ b/nowcasting_dataset/config/on_premises.yaml @@ -29,7 +29,7 @@ input_data: pv: pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/passiv.netcdf pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/system_metadata_OCF_ONLY.csv - get_center: False + get_center: false #---------------------- Satellite ------------- satellite: From 91af68b5e5f0c7d26acc1efd8e8751a1d4dd34de Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 15:12:12 +0000 Subject: [PATCH 2/9] add failing test --- tests/test_manager.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_manager.py b/tests/test_manager.py index 836b5cc2..4ad6892a 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -163,3 +163,21 @@ def test_save_config(): manager.save_yaml_configuration() assert os.path.exists(f"{dst_path}/configuration.yaml") + + +def test_run(): + """Test to initila data sources and get batches""" + + manager = Manager() + local_path = Path(nowcasting_dataset.__file__).parent.parent + filename = local_path / "tests" / "config" / "test.yaml" + manager.load_yaml_configuration(filename=filename) + manager.initialise_data_sources() + + with tempfile.TemporaryDirectory() as local_temp_path, tempfile.TemporaryDirectory() as dst_path: # noqa 101 + + manager.config.output_data.filepath = Path(dst_path) + manager.local_temp_path = Path(local_temp_path) + + manager.create_files_specifying_spatial_and_temporal_locations_of_each_example_if_necessary() # noqa 101 + manager.create_batches(overwrite_batches=True) From 54aaded81ec2d19bef6823a24b7c7d6b8ef95e48 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 15:17:33 +0000 Subject: [PATCH 3/9] add get_center to config model for pv --- nowcasting_dataset/config/model.py | 5 +++++ tests/config/test.yaml | 1 + tests/test_manager.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nowcasting_dataset/config/model.py b/nowcasting_dataset/config/model.py index 9b55d19e..d933ef05 100644 --- a/nowcasting_dataset/config/model.py +++ b/nowcasting_dataset/config/model.py @@ -98,6 +98,11 @@ class PV(DataSourceMixin): ) pv_image_size_pixels: int = IMAGE_SIZE_PIXELS_FIELD pv_meters_per_pixel: int = METERS_PER_PIXEL_FIELD + get_center: bool = Field( + False, + description="If the batches are centered on one PV system (or not). " + "The other options is to have one GSP at the center of a batch. ", + ) class Satellite(DataSourceMixin): diff --git a/tests/config/test.yaml b/tests/config/test.yaml index 37f846cc..3ad593c2 100644 --- a/tests/config/test.yaml +++ b/tests/config/test.yaml @@ -14,6 +14,7 @@ input_data: pv: pv_filename: tests/data/pv_data/test.nc pv_metadata_filename: tests/data/pv_metadata/UK_PV_metadata.csv + get_center: false satellite: satellite_channels: - HRV diff --git a/tests/test_manager.py b/tests/test_manager.py index 4ad6892a..0cc91b1d 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -166,7 +166,7 @@ def test_save_config(): def test_run(): - """Test to initila data sources and get batches""" + """Test to initialize data sources and get batches""" manager = Manager() local_path = Path(nowcasting_dataset.__file__).parent.parent From faf6882c85b777f78364cb5c0a8a6da61ddfec3a Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 15:41:38 +0000 Subject: [PATCH 4/9] try fix for nwp --- nowcasting_dataset/data_sources/data_source.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/nowcasting_dataset/data_sources/data_source.py b/nowcasting_dataset/data_sources/data_source.py index 83c4acb9..76b1585b 100644 --- a/nowcasting_dataset/data_sources/data_source.py +++ b/nowcasting_dataset/data_sources/data_source.py @@ -81,7 +81,16 @@ def __post_init__(self): def _get_start_dt( self, t0_dt: Union[pd.Timestamp, pd.DatetimeIndex] ) -> Union[pd.Timestamp, pd.DatetimeIndex]: - return t0_dt - self.history_duration + + start_dt = t0_dt - self.history_duration + + # if t0_dt is not on the hour, e.g. 13.05. + # Then if the history_minutes is 1 hours, + # we want also to load the previous time step e.g. 12.00 + if t0_dt.minute % self.sample_period_minutes != 0: + start_dt -= pd.Timedelta(self.history_minutes, unit="minutes") + + return start_dt def _get_end_dt( self, t0_dt: Union[pd.Timestamp, pd.DatetimeIndex] From af80d83685f0b3f24904b2135ded406ba204bb2d Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 16:40:36 +0000 Subject: [PATCH 5/9] TDD: add failing test --- tests/data_sources/test_nwp_data_source.py | 29 +++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/data_sources/test_nwp_data_source.py b/tests/data_sources/test_nwp_data_source.py index 9200607d..cc384e3d 100644 --- a/tests/data_sources/test_nwp_data_source.py +++ b/tests/data_sources/test_nwp_data_source.py @@ -41,7 +41,7 @@ def test_nwp_data_source_batch(): # noqa: D103 nwp.open() - t0_datetimes = nwp._data.init_time[2:6].values + t0_datetimes = [pd.Timestamp(t) for t in nwp._data.init_time[2:6].values] x = nwp._data.x[0:4].values y = nwp._data.y[0:4].values @@ -54,6 +54,29 @@ def test_nwp_data_source_batch(): # noqa: D103 assert batch.data.shape == (4, 1, 3, 2, 2) +def test_nwp_data_source_batch_not_on_hour(): # noqa: D103 + nwp = NWPDataSource( + zarr_path=NWP_ZARR_PATH, + history_minutes=60, + forecast_minutes=60, + channels=["t"], + ) + + nwp.open() + + t0_datetimes = [pd.Timestamp("2019-01-01 12:05:00")] + x = nwp._data.x[0:1].values + y = nwp._data.y[0:1].values + + batch = nwp.get_batch(t0_datetimes=t0_datetimes, x_locations=x, y_locations=y) + + # batch size 4 + # channel 1 + # time series, 1 int he past, 1 now, 1 in the future + # x,y of size 2 + assert batch.data.shape == (4, 1, 3, 2, 2) + + def test_nwp_get_contiguous_time_periods(): # noqa: D103 nwp = NWPDataSource( zarr_path=NWP_ZARR_PATH, @@ -64,7 +87,7 @@ def test_nwp_get_contiguous_time_periods(): # noqa: D103 contiguous_time_periods = nwp.get_contiguous_time_periods() correct_time_periods = pd.DataFrame( - [{"start_dt": pd.Timestamp("2019-01-01 00:00"), "end_dt": pd.Timestamp("2019-01-02 02:00")}] + [{"start_dt": pd.Timestamp("2019-01-01 00:00"), "end_dt": pd.Timestamp("2019-01-02 04:00")}] ) pd.testing.assert_frame_equal(contiguous_time_periods, correct_time_periods) @@ -79,6 +102,6 @@ def test_nwp_get_contiguous_t0_time_periods(): # noqa: D103 contiguous_time_periods = nwp.get_contiguous_t0_time_periods() correct_time_periods = pd.DataFrame( - [{"start_dt": pd.Timestamp("2019-01-01 01:00"), "end_dt": pd.Timestamp("2019-01-02 01:00")}] + [{"start_dt": pd.Timestamp("2019-01-01 01:00"), "end_dt": pd.Timestamp("2019-01-02 03:00")}] ) pd.testing.assert_frame_equal(contiguous_time_periods, correct_time_periods) From fff8792b8c2d122ae3076ace37bacb885742e140 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 16:49:50 +0000 Subject: [PATCH 6/9] take floor of start_dt for nwp data --- nowcasting_dataset/data_sources/data_source.py | 10 +--------- nowcasting_dataset/data_sources/nwp/nwp_data_source.py | 8 +++++++- tests/config/test.yaml | 1 + tests/data_sources/test_nwp_data_source.py | 4 ++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/nowcasting_dataset/data_sources/data_source.py b/nowcasting_dataset/data_sources/data_source.py index 76b1585b..762bf911 100644 --- a/nowcasting_dataset/data_sources/data_source.py +++ b/nowcasting_dataset/data_sources/data_source.py @@ -82,15 +82,7 @@ def _get_start_dt( self, t0_dt: Union[pd.Timestamp, pd.DatetimeIndex] ) -> Union[pd.Timestamp, pd.DatetimeIndex]: - start_dt = t0_dt - self.history_duration - - # if t0_dt is not on the hour, e.g. 13.05. - # Then if the history_minutes is 1 hours, - # we want also to load the previous time step e.g. 12.00 - if t0_dt.minute % self.sample_period_minutes != 0: - start_dt -= pd.Timedelta(self.history_minutes, unit="minutes") - - return start_dt + return t0_dt - self.history_duration def _get_end_dt( self, t0_dt: Union[pd.Timestamp, pd.DatetimeIndex] diff --git a/nowcasting_dataset/data_sources/nwp/nwp_data_source.py b/nowcasting_dataset/data_sources/nwp/nwp_data_source.py index f1bd980d..a1aaed9b 100644 --- a/nowcasting_dataset/data_sources/nwp/nwp_data_source.py +++ b/nowcasting_dataset/data_sources/nwp/nwp_data_source.py @@ -119,6 +119,11 @@ def _post_process_example(self, selected_data: xr.Dataset, t0_dt: pd.Timestamp) start_dt = self._get_start_dt(t0_dt) end_dt = self._get_end_dt(t0_dt) + # if t0_dt is not on the hour, e.g. 13.05. + # Then if the history_minutes is 1 hours, + # so start_dt will be 12.05, but we want to the 12.00 time step too + start_dt = start_dt.floor("H") + selected_data = selected_data.sel(target_time=slice(start_dt, end_dt)) selected_data = selected_data.rename({"target_time": "time", "variable": "channels"}) selected_data.data = selected_data.data.astype(np.float16) @@ -131,11 +136,12 @@ def datetime_index(self) -> pd.DatetimeIndex: nwp = self._open_data() else: nwp = self._data - target_times = nwp["init_time"] + nwp["step"][:3] + target_times = nwp["init_time"] + nwp["step"] target_times = target_times.values.flatten() target_times = np.unique(target_times) target_times = np.sort(target_times) target_times = pd.DatetimeIndex(target_times) + return target_times @property diff --git a/tests/config/test.yaml b/tests/config/test.yaml index 3ad593c2..feffb673 100644 --- a/tests/config/test.yaml +++ b/tests/config/test.yaml @@ -11,6 +11,7 @@ input_data: nwp_image_size_pixels: 2 nwp_zarr_path: tests/data/nwp_data/test.zarr history_minutes: 60 + forecast_minutes: 60 pv: pv_filename: tests/data/pv_data/test.nc pv_metadata_filename: tests/data/pv_metadata/UK_PV_metadata.csv diff --git a/tests/data_sources/test_nwp_data_source.py b/tests/data_sources/test_nwp_data_source.py index cc384e3d..0d69442e 100644 --- a/tests/data_sources/test_nwp_data_source.py +++ b/tests/data_sources/test_nwp_data_source.py @@ -70,11 +70,11 @@ def test_nwp_data_source_batch_not_on_hour(): # noqa: D103 batch = nwp.get_batch(t0_datetimes=t0_datetimes, x_locations=x, y_locations=y) - # batch size 4 + # batch size 1 # channel 1 # time series, 1 int he past, 1 now, 1 in the future # x,y of size 2 - assert batch.data.shape == (4, 1, 3, 2, 2) + assert batch.data.shape == (1, 1, 3, 2, 2) def test_nwp_get_contiguous_time_periods(): # noqa: D103 From a89bdefd7da8977e8cb206ecf109daaa9d9e36e6 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 16:53:29 +0000 Subject: [PATCH 7/9] update gcp.yaml for pv centers --- nowcasting_dataset/config/gcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nowcasting_dataset/config/gcp.yaml b/nowcasting_dataset/config/gcp.yaml index b813abdb..3fecccfc 100644 --- a/nowcasting_dataset/config/gcp.yaml +++ b/nowcasting_dataset/config/gcp.yaml @@ -29,7 +29,7 @@ input_data: history_minutes: 30 pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv - get_center: False + get_center: false satellite: forecast_minutes: 60 history_minutes: 30 From 9746a8cae0415703c5ed9fcd0aab815c8d6a76a2 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 16:58:16 +0000 Subject: [PATCH 8/9] fix for sun data source --- nowcasting_dataset/data_sources/sun/sun_data_source.py | 2 +- .../data_sources/topographic/topographic_data_source.py | 2 +- nowcasting_dataset/dataset/xr_utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nowcasting_dataset/data_sources/sun/sun_data_source.py b/nowcasting_dataset/data_sources/sun/sun_data_source.py index 790de1a7..400bc40c 100644 --- a/nowcasting_dataset/data_sources/sun/sun_data_source.py +++ b/nowcasting_dataset/data_sources/sun/sun_data_source.py @@ -85,7 +85,7 @@ def get_example( sun = azimuth.to_dataset(name="azimuth") sun["elevation"] = elevation - return Sun(sun) + return sun def _load(self): diff --git a/nowcasting_dataset/data_sources/topographic/topographic_data_source.py b/nowcasting_dataset/data_sources/topographic/topographic_data_source.py index 07c7c514..3b8af08a 100644 --- a/nowcasting_dataset/data_sources/topographic/topographic_data_source.py +++ b/nowcasting_dataset/data_sources/topographic/topographic_data_source.py @@ -103,7 +103,7 @@ def get_example( # change to dataset topo_xd = selected_data.to_dataset(name="data") - return Topographic(topo_xd) + return topo_xd def _post_process_example( self, selected_data: xr.DataArray, t0_dt: pd.Timestamp diff --git a/nowcasting_dataset/dataset/xr_utils.py b/nowcasting_dataset/dataset/xr_utils.py index 12e37129..79c7c75b 100644 --- a/nowcasting_dataset/dataset/xr_utils.py +++ b/nowcasting_dataset/dataset/xr_utils.py @@ -37,7 +37,7 @@ def convert_coordinates_to_indexes(dataset: xr.Dataset) -> xr.Dataset: This is useful to align multiple examples into a single batch. """ - assert type(dataset) == xr.Dataset + assert type(dataset) == xr.Dataset, f" Should be xr.Dataset but found {type(dataset)}" original_dim_names = dataset.dims From feb5ab2ad987ff4abcc531ff54b3332ef3bdc1d2 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 16 Nov 2021 21:11:01 +0000 Subject: [PATCH 9/9] PR comments --- nowcasting_dataset/config/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nowcasting_dataset/config/model.py b/nowcasting_dataset/config/model.py index d933ef05..46bc2efe 100644 --- a/nowcasting_dataset/config/model.py +++ b/nowcasting_dataset/config/model.py @@ -101,7 +101,9 @@ class PV(DataSourceMixin): get_center: bool = Field( False, description="If the batches are centered on one PV system (or not). " - "The other options is to have one GSP at the center of a batch. ", + "The other options is to have one GSP at the center of a batch. " + "Typically, get_center would be set to true if and only if " + "PVDataSource is used to define the geospatial positions of each example.", )