In [1]:
from pathlib import Path
import shutil

In [36]:
# Directory the batches (and the locations CSV) are copied FROM.
# NOTE(review): the source is a v16 *validation* set while the destination is
# a v15 *train* set — confirm that mixing dataset versions is intended.
SRC_PATH = Path("/mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v16/validation/")
# Directory the batches are appended TO; its locations CSV is overwritten at the end.
DST_PATH = Path("/mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train/")
# Offset added to every source batch index to form the destination filename.
# Presumably the number of batches already present in DST_PATH — TODO confirm.
NUM_DST_BATCHES = 8000
# Multiplied by NUM_DST_BATCHES to offset the example index of the source
# locations table; presumably the number of examples per batch — TODO confirm.
BATCH_SIZE = 32

In [37]:
# Every sub-directory directly under SRC_PATH is treated as one data source;
# keep the names in whatever order the filesystem yields them.
data_source_names = [
    subdirectory.name
    for subdirectory in filter(Path.is_dir, SRC_PATH.iterdir())
]
data_source_names

['sun',
 'topographic',
 'gsp',
 'nwp',
 'opticalflow',
 'hrvsatellite',
 'satellite',
 'pv']

In [38]:
# Copy every NetCDF batch of every data source from SRC_PATH into the matching
# data-source directory under DST_PATH, renaming each file so that its batch
# index is shifted by NUM_DST_BATCHES.
# NOTE: shutil.copy2 replaces an existing destination file of the same name.
for data_source_name in data_source_names:
    src_data_source_directory = SRC_PATH / data_source_name
    # The destination directory is the same for every file of this data source.
    dst_data_source_directory = DST_PATH / data_source_name
    src_full_netcdf_filenames = sorted(src_data_source_directory.glob("*.nc"))
    print(f"Found {len(src_full_netcdf_filenames):,d} .nc files in {src_data_source_directory}.", flush=True)
    # Flush here as well so progress shows up promptly during a long copy.
    print("Copying...", flush=True)
    for src_full_netcdf_filename in src_full_netcdf_filenames:
        # The file stem is parsed as the integer batch index (raises ValueError
        # for any .nc file whose stem is not an integer).
        src_batch_idx = int(src_full_netcdf_filename.stem)
        dst_batch_idx = src_batch_idx + NUM_DST_BATCHES
        # Destination filenames are the shifted index, zero-padded to 6 digits.
        dst_netcdf_filename = f"{dst_batch_idx:06d}.nc"
        dst_full_netcdf_filename = dst_data_source_directory / dst_netcdf_filename
        shutil.copy2(src_full_netcdf_filename, dst_full_netcdf_filename)

print("Done!")

Found 400 .nc files in /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v16/validation/sun.
Copying...
Found 400 .nc files in /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v16/validation/topographic.
Copying...
Found 400 .nc files in /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v16/validation/gsp.
Copying...
Found 400 .nc files in /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v16/validation/nwp.
Copying...
Found 400 .nc files in /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v16/validation/opticalflow.
Copying...
Found 400 .nc files in /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v16/validation/hrvsatellite.
Copying...
Found 400 .nc files in /mnt

In [39]:
# Now append the spatial and temporal locations CSV file
import pandas as pd
# Name of the per-example locations CSV expected inside a dataset directory.
LOCATIONS_FILENAME = "spatial_and_temporal_locations_of_each_example.csv"


def load_locations(path: Path) -> pd.DataFrame:
    """Read the locations CSV found in the directory ``path``.

    Args:
        path: Directory containing a file named ``LOCATIONS_FILENAME``.

    Returns:
        The CSV contents as a DataFrame, with the first CSV column used as
        the index.
    """
    locations_csv_path = path / LOCATIONS_FILENAME
    locations = pd.read_csv(locations_csv_path, index_col=0)
    return locations

# Read the per-example locations table stored in the source directory.
src_locations = load_locations(path=SRC_PATH)
src_locations

Unnamed: 0,t0_datetime_UTC,x_center_OSGB,y_center_OSGB
0,2020-09-01 11:35:00,287366.663246,179660.016720
1,2020-07-01 13:55:00,436416.454716,554859.051404
2,2020-05-05 08:15:00,390309.405259,807407.310855
3,2020-08-07 12:35:00,515591.114429,201989.024505
4,2020-05-25 10:40:00,260650.477469,673514.946368
...,...,...,...
12795,2020-06-05 07:30:00,250784.156260,767262.504590
12796,2020-07-11 10:30:00,578023.142088,162566.741848
12797,2020-07-16 13:25:00,313273.984730,368847.134745
12798,2020-06-05 10:20:00,378384.580316,783152.675669


In [40]:
# Shift the source example indices by NUM_DST_BATCHES * BATCH_SIZE so they do
# not overlap the destination's indices (assumes the destination holds exactly
# that many examples — TODO confirm).
# Reload from disk first: the offset is applied in place, so without a fresh
# load, re-running this cell would shift an already-shifted index again.
src_locations = load_locations(SRC_PATH)
src_locations.index += NUM_DST_BATCHES * BATCH_SIZE
src_locations

Unnamed: 0,t0_datetime_UTC,x_center_OSGB,y_center_OSGB
256000,2020-09-01 11:35:00,287366.663246,179660.016720
256001,2020-07-01 13:55:00,436416.454716,554859.051404
256002,2020-05-05 08:15:00,390309.405259,807407.310855
256003,2020-08-07 12:35:00,515591.114429,201989.024505
256004,2020-05-25 10:40:00,260650.477469,673514.946368
...,...,...,...
268795,2020-06-05 07:30:00,250784.156260,767262.504590
268796,2020-07-11 10:30:00,578023.142088,162566.741848
268797,2020-07-16 13:25:00,313273.984730,368847.134745
268798,2020-06-05 10:20:00,378384.580316,783152.675669


In [41]:
# Read the locations table that already exists in the destination directory.
dst_locations = load_locations(path=DST_PATH)
dst_locations

Unnamed: 0,t0_datetime_UTC,x_center_OSGB,y_center_OSGB
0,2020-09-12 09:35:00,222049.973088,715656.079947
1,2020-09-22 11:10:00,182808.808298,51597.245949
2,2020-06-09 07:05:00,510723.528607,178402.994559
3,2020-10-01 10:00:00,507636.659095,197928.890020
4,2020-05-02 07:00:00,620261.387146,306036.118595
...,...,...,...
255995,2020-08-08 14:00:00,322274.030114,669219.297568
255996,2020-10-13 09:35:00,283909.494356,678872.185907
255997,2020-06-23 06:55:00,359530.429634,743344.527727
255998,2020-05-09 10:00:00,303724.425384,729183.041917


In [42]:
# Stack the destination rows first, then the (index-shifted) source rows.
# verify_integrity=True makes pandas raise ValueError if any example index
# occurs more than once in the result — e.g. when the destination CSV already
# contains the source rows from a previous run, or the index offset is wrong —
# rather than letting duplicated indices be written out.
combined_locations = pd.concat((dst_locations, src_locations), verify_integrity=True)
combined_locations

Unnamed: 0,t0_datetime_UTC,x_center_OSGB,y_center_OSGB
0,2020-09-12 09:35:00,222049.973088,715656.079947
1,2020-09-22 11:10:00,182808.808298,51597.245949
2,2020-06-09 07:05:00,510723.528607,178402.994559
3,2020-10-01 10:00:00,507636.659095,197928.890020
4,2020-05-02 07:00:00,620261.387146,306036.118595
...,...,...,...
268795,2020-06-05 07:30:00,250784.156260,767262.504590
268796,2020-07-11 10:30:00,578023.142088,162566.741848
268797,2020-07-16 13:25:00,313273.984730,368847.134745
268798,2020-06-05 10:20:00,378384.580316,783152.675669


In [43]:
# Replace the destination's locations CSV with the combined table.
dst_locations_csv_path = DST_PATH / LOCATIONS_FILENAME
combined_locations.to_csv(dst_locations_csv_path, mode="w")