In [1]:
import xarray as xr
import pandas as pd
from pathlib import Path
import numcodecs
import gcsfs
import dask

In [2]:
import rechunker

In [12]:
# Locations inside the Google Cloud Storage bucket. Only SOURCE_SAT_FILENAME
# carries the 'gs://' scheme; the other two are scheme-less path objects.
BUCKET = Path('solar-pv-nowcasting-data')
SAT_PATH = BUCKET.joinpath('satellite/EUMETSAT/SEVIRI_RSS/OSGB36/')

# Input: the existing int16 Zarr store, as a full gs:// URL string.
SOURCE_SAT_FILENAME = f"gs://{SAT_PATH.joinpath('all_zarr_int16')}"

# Output store and the intermediate store used while rechunking.
TARGET_SAT_FILENAME = SAT_PATH.joinpath('all_zarr_int16_single_timestep_uint8.zarr')
TEMP_STORE_FILENAME = SAT_PATH.joinpath('temp.zarr')

In [4]:
# Open the source Zarr store, asking xarray to use its consolidated metadata.
source_sat_dataset = xr.open_zarr(store=SOURCE_SAT_FILENAME, consolidated=True)

In [5]:
# Work on a subset of the data. The cut must land on a source chunk boundary:
# the array is chunked 36 timesteps at a time, so take 100 whole chunks.
source_sat_dataset = source_sat_dataset.isel({'time': slice(0, 100 * 36)})

In [6]:
# Show the dask-backed array's shape, dtype and chunk layout before rechunking.
source_sat_dataset['stacked_eumetsat_data']

Unnamed: 0,Array,Chunk
Bytes,31.04 GiB,26.49 MiB
Shape,"(3600, 704, 548, 12)","(36, 704, 548, 1)"
Count,55561 Tasks,1200 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 31.04 GiB 26.49 MiB Shape (3600, 704, 548, 12) (36, 704, 548, 1) Count 55561 Tasks 1200 Chunks Type int16 numpy.ndarray",3600  1  12  548  704,

Unnamed: 0,Array,Chunk
Bytes,31.04 GiB,26.49 MiB
Shape,"(3600, 704, 548, 12)","(36, 704, 548, 1)"
Count,55561 Tasks,1200 Chunks
Type,int16,numpy.ndarray


In [7]:
# Filesystem handle for Google Cloud Storage, built with gcsfs's defaults
# (no project or token is passed here).
# NOTE(review): presumably relies on ambient credentials -- confirm.
gcs = gcsfs.GCSFileSystem()

In [13]:
# Key/value mapping over the output location, handed to rechunker as the target.
# The path object is stringified explicitly, as is done for the source URL.
target_store = gcs.get_mapper(str(TARGET_SAT_FILENAME))

In [9]:
# Key/value mapping over the intermediate location used during rechunking.
# The path object is stringified explicitly, as is done for the source URL.
temp_store = gcs.get_mapper(str(TEMP_STORE_FILENAME))

In [15]:
# Desired layout for the satellite array: one chunk per (timestep, variable),
# each chunk spanning the full 704 x 548 image.
target_chunks = {
    'stacked_eumetsat_data': {"time": 1, "y": 704, "x": 548, "variable": 1},
}

# Per-variable options for the target store: Blosc with zstd at level 5.
# NOTE(review): the target filename says uint8 and the dataset opened afterwards
# is reported as uint8, but no int16 -> uint8 cast is visible in this notebook.
# Confirm where that conversion happens before relying on Restart & Run All.
encoding = {
    'stacked_eumetsat_data': {'compressor': numcodecs.Blosc(cname="zstd", clevel=5)},
}

# Build the rechunking plan only; nothing is copied until .execute() is called.
rechunk_plan = rechunker.rechunk(
    source=source_sat_dataset,
    target_chunks=target_chunks,
    target_store=target_store,
    temp_store=temp_store,
    target_options=encoding,
    max_mem="10GB",
)

In [16]:
%%time
# Run the rechunking plan. The cell displays the value returned by execute()
# and logs a `_copy_chunk(...)` line for each source chunk it copies.
# NOTE(review): presumably needs the target/temp stores to be empty -- confirm
# before re-running this cell.
rechunk_plan.execute()

_copy_chunk((slice(0, 36, None), slice(0, 704, None), slice(0, 548, None), slice(0, 1, None)))
_copy_chunk((slice(972, 1008, None), slice(0, 704, None), slice(0, 548, None), slice(10, 11, None)))
_copy_chunk((slice(2952, 2988, None), slice(0, 704, None), slice(0, 548, None), slice(6, 7, None)))
_copy_chunk((slice(252, 288, None), slice(0, 704, None), slice(0, 548, None), slice(3, 4, None)))
_copy_chunk((slice(2196, 2232, None), slice(0, 704, None), slice(0, 548, None), slice(11, 12, None)))
_copy_chunk((slice(1476, 1512, None), slice(0, 704, None), slice(0, 548, None), slice(4, 5, None)))
_copy_chunk((slice(180, 216, None), slice(0, 704, None), slice(0, 548, None), slice(9, 10, None)))
_copy_chunk((slice(3456, 3492, None), slice(0, 704, None), slice(0, 548, None), slice(0, 1, None)))
_copy_chunk((slice(2160, 2196, None), slice(0, 704, None), slice(0, 548, None), slice(5, 6, None)))
_copy_chunk((slice(1404, 1440, None), slice(0, 704, None), slice(0, 548, None), slice(10, 11, None)))
_co

<zarr.hierarchy.Group '/'>

In [17]:
# Re-open the freshly written target store to verify the result. Nothing visible
# in this notebook consolidates its metadata, hence consolidated=False.
opened_dataset = xr.open_zarr('gs://' + str(TARGET_SAT_FILENAME), consolidated=False)

In [18]:
# Inspect the rechunked dataset: check that chunks are now (1, 704, 548, 1)
# and note the dtype that was actually written.
opened_dataset

Unnamed: 0,Array,Chunk
Bytes,15.52 GiB,376.75 kiB
Shape,"(3600, 704, 548, 12)","(1, 704, 548, 1)"
Count,43201 Tasks,43200 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 15.52 GiB 376.75 kiB Shape (3600, 704, 548, 12) (1, 704, 548, 1) Count 43201 Tasks 43200 Chunks Type uint8 numpy.ndarray",3600  1  12  548  704,

Unnamed: 0,Array,Chunk
Bytes,15.52 GiB,376.75 kiB
Shape,"(3600, 704, 548, 12)","(1, 704, 548, 1)"
Count,43201 Tasks,43200 Chunks
Type,uint8,numpy.ndarray


In [None]:
# Full gs:// URL of the rechunked store, displayed for copy/paste.
'gs://' + str(TARGET_SAT_FILENAME)