In [72]:
from nowcasting_dataset.datamodule import NowcastingDataModule
from pathlib import Path
import pandas as pd
import numpy as np
import xarray as xr
import numcodecs
import gcsfs
from typing import List
import io
import hashlib
import os
import glob

import logging
# Install the default root handler, then turn on DEBUG-level messages for the
# 'nowcasting_dataset' logger only.
logging.basicConfig()
logger = logging.getLogger('nowcasting_dataset')
logger.setLevel(logging.DEBUG)

## Load Zarr batches

In [63]:
# Zarr store on GCS that is opened with xr.open_dataset further down.
SRC_ZARR_FILENAME = 'gs://solar-pv-nowcasting-data/prepared_ML_training_data/testing.zarr'
# GCS destination passed to gcs.put() when the per-batch NetCDF files are uploaded.
DST_NETCDF4_PATH = 'gs://solar-pv-nowcasting-data/prepared_ML_training_data/netcdf4/'
# Local staging directory (inside the user's home) where batches are written
# before upload; its *.nc files are deleted after each upload.
LOCAL_TEMP_PATH = Path('~/temp/').expanduser()

In [64]:
# Display the configured destination path as a sanity check.
DST_NETCDF4_PATH

'gs://solar-pv-nowcasting-data/prepared_ML_training_data/netcdf4/'

In [3]:
%%time
# Open the Zarr store lazily: the repr of `ds` shows chunked arrays with task
# counts rather than in-memory data.
# NOTE(review): chunks={} should make the chunking follow the store's own
# chunks — confirm against the xarray open_dataset documentation.
ds = xr.open_dataset(SRC_ZARR_FILENAME, engine='zarr', chunks={})

CPU times: user 21.8 s, sys: 935 ms, total: 22.7 s
Wall time: 35.5 s


In [4]:
# Display the dataset repr (array shapes, chunk sizes and dtypes).
ds

Unnamed: 0,Array,Chunk
Bytes,7.44 MiB,42.52 kiB
Shape,"(974880,)","(5442,)"
Count,181 Tasks,180 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 7.44 MiB 42.52 kiB Shape (974880,) (5442,) Count 181 Tasks 180 Chunks Type datetime64[ns] numpy.ndarray",974880  1,

Unnamed: 0,Array,Chunk
Bytes,7.44 MiB,42.52 kiB
Shape,"(974880,)","(5442,)"
Count,181 Tasks,180 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.76 GiB,95.00 kiB
Shape,"(974880, 10, 19, 2, 2)","(32, 10, 19, 2, 2)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.76 GiB 95.00 kiB Shape (974880, 10, 19, 2, 2) (32, 10, 19, 2, 2) Count 30466 Tasks 30465 Chunks Type float32 numpy.ndarray",10  974880  2  2  19,

Unnamed: 0,Array,Chunk
Bytes,2.76 GiB,95.00 kiB
Shape,"(974880, 10, 19, 2, 2)","(32, 10, 19, 2, 2)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,141.32 MiB,4.75 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 141.32 MiB 4.75 kiB Shape (974880, 19) (32, 19) Count 30466 Tasks 30465 Chunks Type datetime64[ns] numpy.ndarray",19  974880,

Unnamed: 0,Array,Chunk
Bytes,141.32 MiB,4.75 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.44 MiB,256 B
Shape,"(974880, 2)","(32, 2)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 7.44 MiB 256 B Shape (974880, 2) (32, 2) Count 30466 Tasks 30465 Chunks Type float32 numpy.ndarray",2  974880,

Unnamed: 0,Array,Chunk
Bytes,7.44 MiB,256 B
Shape,"(974880, 2)","(32, 2)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.44 MiB,256 B
Shape,"(974880, 2)","(32, 2)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 7.44 MiB 256 B Shape (974880, 2) (32, 2) Count 30466 Tasks 30465 Chunks Type float32 numpy.ndarray",2  974880,

Unnamed: 0,Array,Chunk
Bytes,7.44 MiB,256 B
Shape,"(974880, 2)","(32, 2)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.72 MiB,128 B
Shape,"(974880,)","(32,)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 3.72 MiB 128 B Shape (974880,) (32,) Count 30466 Tasks 30465 Chunks Type int32 numpy.ndarray",974880  1,

Unnamed: 0,Array,Chunk
Bytes,3.72 MiB,128 B
Shape,"(974880,)","(32,)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.72 MiB,128 B
Shape,"(974880,)","(32,)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 3.72 MiB 128 B Shape (974880,) (32,) Count 30466 Tasks 30465 Chunks Type int32 numpy.ndarray",974880  1,

Unnamed: 0,Array,Chunk
Bytes,3.72 MiB,128 B
Shape,"(974880,)","(32,)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,70.66 MiB,2.38 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 70.66 MiB 2.38 kiB Shape (974880, 19) (32, 19) Count 30466 Tasks 30465 Chunks Type float32 numpy.ndarray",19  974880,

Unnamed: 0,Array,Chunk
Bytes,70.66 MiB,2.38 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,141.32 MiB,4.75 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 141.32 MiB 4.75 kiB Shape (974880, 19) (32, 19) Count 30466 Tasks 30465 Chunks Type datetime64[ns] numpy.ndarray",19  974880,

Unnamed: 0,Array,Chunk
Bytes,141.32 MiB,4.75 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,847.90 GiB,28.50 MiB
Shape,"(974880, 19, 32, 32, 12)","(32, 19, 32, 32, 12)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 847.90 GiB 28.50 MiB Shape (974880, 19, 32, 32, 12) (32, 19, 32, 32, 12) Count 30466 Tasks 30465 Chunks Type float32 numpy.ndarray",19  974880  12  32  32,

Unnamed: 0,Array,Chunk
Bytes,847.90 GiB,28.50 MiB
Shape,"(974880, 19, 32, 32, 12)","(32, 19, 32, 32, 12)"
Count,30466 Tasks,30465 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,141.32 MiB,4.75 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 141.32 MiB 4.75 kiB Shape (974880, 19) (32, 19) Count 30466 Tasks 30465 Chunks Type datetime64[ns] numpy.ndarray",19  974880,

Unnamed: 0,Array,Chunk
Bytes,141.32 MiB,4.75 kiB
Shape,"(974880, 19)","(32, 19)"
Count,30466 Tasks,30465 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,119.00 MiB,4.00 kiB
Shape,"(974880, 32)","(32, 32)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 119.00 MiB 4.00 kiB Shape (974880, 32) (32, 32) Count 30466 Tasks 30465 Chunks Type int32 numpy.ndarray",32  974880,

Unnamed: 0,Array,Chunk
Bytes,119.00 MiB,4.00 kiB
Shape,"(974880, 32)","(32, 32)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,119.00 MiB,4.00 kiB
Shape,"(974880, 32)","(32, 32)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 119.00 MiB 4.00 kiB Shape (974880, 32) (32, 32) Count 30466 Tasks 30465 Chunks Type int32 numpy.ndarray",32  974880,

Unnamed: 0,Array,Chunk
Bytes,119.00 MiB,4.00 kiB
Shape,"(974880, 32)","(32, 32)"
Count,30466 Tasks,30465 Chunks
Type,int32,numpy.ndarray


In [52]:
# Number of examples written to each output file. 32 is also the chunk length
# along the first dimension shown in the dataset repr.
batch_size = 32
# Floor division: trailing examples that do not fill a whole batch are not counted.
n_batches = len(ds.example) // batch_size

# Filesystem handle used later for gcs.put(). Constructed with no arguments —
# presumably relies on default credentials; TODO confirm.
gcs = gcsfs.GCSFileSystem()

In [81]:
def get_filename(batch_i: int) -> str:
    """Generate the full filename for a batch, excluding the directory path.

    The filename is prefixed with the first 6 hex digits of the MD5 hash of
    ``'{batch_i}.nc'``, as recommended by Google Cloud in order to distribute
    data across multiple back-end servers.

    Args:
        batch_i: Index of the batch.

    Returns:
        A plain string of the form ``'<6 hex digits>_<batch_i>.nc'``.
        (The function has always returned ``str``; the previous ``Path``
        annotation was incorrect.)
    """
    filename = f'{batch_i}.nc'
    hash_of_filename = hashlib.md5(filename.encode()).hexdigest()
    return f'{hash_of_filename[:6]}_{filename}'


def write_batch_locally(batch: xr.Dataset, batch_i: int):
    """Write one batch to a NetCDF file inside LOCAL_TEMP_PATH.

    Every data variable in ``batch`` is written with LZF compression using
    the h5netcdf engine. The file is named by ``get_filename(batch_i)`` and
    is opened with ``mode='w'``.

    Args:
        batch: Dataset holding the examples of a single batch.
        batch_i: Index of the batch; determines the output filename.
    """
    # Make sure the staging directory exists before writing into it, so a
    # fresh machine (or a cleaned-up temp directory) does not break the write.
    LOCAL_TEMP_PATH.mkdir(parents=True, exist_ok=True)
    encoding = {
        name: {'compression': 'lzf'}
        for name in batch.data_vars}
    filename = get_filename(batch_i)
    local_filename = LOCAL_TEMP_PATH / filename
    batch.to_netcdf(local_filename, engine='h5netcdf', mode='w', encoding=encoding)

In [82]:
# Re-display the upload destination before running the upload loop.
DST_NETCDF4_PATH

'gs://solar-pv-nowcasting-data/prepared_ML_training_data/netcdf4/'

In [83]:
# Number of batches to accumulate locally between uploads to GCS.
UPLOAD_EVERY_N_BATCHES = 64

# For each batch: load it, write it to the local staging directory, and
# periodically upload the staged files to DST_NETCDF4_PATH before deleting them.
for batch_i in range(n_batches):
    print(f'\r{batch_i}/{n_batches}', end='', flush=True)
    start_example = batch_i * batch_size
    end_example = start_example + batch_size - 1  # -1 because slice gets range [start, end] not [start, end).
    batch = ds.sel(example=slice(start_example, end_example)).load()
    write_batch_locally(batch, batch_i)
    is_upload_step = batch_i > 0 and (batch_i % UPLOAD_EVERY_N_BATCHES) == 0
    # Also flush on the final batch; otherwise the files written after the
    # last multiple of UPLOAD_EVERY_N_BATCHES would never be uploaded.
    is_last_batch = batch_i == n_batches - 1
    if is_upload_step or is_last_batch:
        print('\ruploading***************', end='', flush=True)
        gcs.put(str(LOCAL_TEMP_PATH), DST_NETCDF4_PATH, recursive=True)
        # Remove the staged files so they are not uploaded again next time.
        files = glob.glob(str(LOCAL_TEMP_PATH / '*.nc'))
        for f in files:
            os.remove(f)
    
print()
print('Done!')

uploading***************
117/30465

  w = _WorkItem(f, fn, args, kwargs)
  w = _WorkItem(f, fn, args, kwargs)


uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
uploading***************
