In [1]:
"""Compute standard deviation and means by streaming in training batches.

Adapted from https://stackoverflow.com/a/5543790/732596
"""

'Compute standard deviation and means by streaming in training batches.\n\nAdapted from https://stackoverflow.com/a/5543790/732596\n'

TODO: Compute mins and maxes too.

In [2]:
import xarray as xr
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
# Root directory of the prepared training batches. Batches for a data source are read
# from BASE_PATH / DATA_SOURCE_NAME.
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
BASE_PATH = Path("/mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train")
# DATA_SOURCE_NAMES = ("gsp", "hrvsatellite", "nwp", "pv", "satellite", "sun", "topographic")
# Data source to process: used as the sub-directory name under BASE_PATH and as the
# prefix of the CSV files written at the end of the notebook.
DATA_SOURCE_NAME = "hrvsatellite"

In [4]:
def compute_accumulators(data_array: xr.DataArray) -> pd.DataFrame:
    """Reduce one batch to the running totals needed for a streaming mean and std.

    Every dimension except ``channels_index`` is aggregated away, so the result has
    one row per entry of ``channels_index``.

    Args:
        data_array: The batch values. Expected to have a ``channels_index`` dimension.

    Returns:
        DataFrame indexed by ``channels_index`` with columns:
          * ``count``: number of values counted by ``DataArray.count``.
          * ``sum``: sum of the values.
          * ``sum_of_squares``: sum of the squared values.
        The two sums are stored as ``np.longdouble`` so that adding the totals of
        many batches together loses as little precision as possible.
    """
    # Build an ordered list (not a set) so the reduction dims are deterministic.
    dims_to_aggregate_over = [dim for dim in data_array.dims if dim != 'channels_index']
    data_array = data_array.astype(np.float64)  # Minimise numerical instability.
    _count = data_array.count(dim=dims_to_aggregate_over).to_series()
    _sum = data_array.sum(dim=dims_to_aggregate_over).to_series()
    _sum_of_squares = (data_array ** 2).sum(dim=dims_to_aggregate_over).to_series()
    # np.longdouble is the portable spelling of the extended-precision float:
    # np.float128 is only defined by numpy on some platforms, whereas np.longdouble
    # always exists (and is the same type wherever np.float128 is available).
    return pd.DataFrame({
        'count': _count,
        'sum': _sum.astype(np.longdouble),
        'sum_of_squares': _sum_of_squares.astype(np.longdouble)
    })

In [5]:
def compute_std(accumulators: pd.DataFrame):
    """Compute the sample standard deviation (ddof=1) from streaming accumulators.

    Uses the identity ``var = (n * sum(x**2) - sum(x)**2) / (n * (n - 1))``.

    Args:
        accumulators: DataFrame with columns ``count``, ``sum`` and ``sum_of_squares``.

    Returns:
        pd.Series with one standard deviation per row of ``accumulators``. Rows with
        fewer than two values have no defined sample standard deviation (the result
        is typically NaN).
    """
    # `count` is an integer column. In int64, n * (n - 1) silently wraps around once
    # n exceeds roughly 3.04e9 (sqrt(2**63)), which real datasets here do. Convert to
    # float first; float64 represents every count up to 2**53 exactly.
    count = accumulators['count'].astype(np.float64)
    total = accumulators['sum']
    numerator = count * accumulators['sum_of_squares'] - total * total
    denominator = count * (count - 1)
    # Rounding can leave the variance of (near-)constant data a hair below zero,
    # which would make sqrt return NaN. Clamp at zero; NaN inputs stay NaN.
    variance = np.maximum(numerator / denominator, 0)
    return np.sqrt(variance)

In [6]:
def compute_mean(accumulators: pd.DataFrame):
    """Return the mean of each row: the running ``sum`` divided by the running ``count``."""
    running_total = accumulators['sum']
    n_values = accumulators['count']
    return running_total / n_values

In [7]:
def load_and_check_batch(filename: Path) -> pd.DataFrame:
    """Load one NetCDF batch, report suspicious values, and return its accumulators.

    The batch's ``data`` variable is checked for non-finite values, negative values
    and values above 1,023. If any check fails, a single warning line naming the file
    and the failed checks is printed; processing continues regardless.

    Args:
        filename: Path of the NetCDF batch file to load.

    Returns:
        The DataFrame produced by ``compute_accumulators`` for this batch.
    """
    batch = xr.load_dataset(filename, mode="r")
    values = batch['data']

    # Sanity checks: every failed check contributes its own fragment to the warning.
    problems = []
    if not np.isfinite(values).all():
        problems.append("NOT FINITE ")
    if (values < 0).any():
        problems.append(f"NEGATIVE!  min={values.min().values} ")
    if (values > 1023).any():
        problems.append(f"ABOVE 1,023!  max={values.max().values} ")

    warning = "".join(problems)
    if warning:
        print("\n", filename.stem, warning, "\n", flush=True)

    # Totals from which the mean and standard deviation are later derived.
    return compute_accumulators(values)

In [9]:
def run_on_all_files():
    """Sum the per-batch accumulators over every batch file of the data source.

    Processes every ``*.nc`` file in ``BASE_PATH / DATA_SOURCE_NAME`` in sorted order,
    printing a progress line per file, and adds up the DataFrames returned by
    ``load_and_check_batch``.

    Returns:
        DataFrame holding the accumulators summed over all files.

    Raises:
        FileNotFoundError: If the directory contains no ``*.nc`` files. Failing here
            gives a clear message instead of returning ``None`` and breaking later.
    """
    source_dir = BASE_PATH / DATA_SOURCE_NAME
    # The built-in sort orders Path objects directly; no numpy object array needed.
    filenames = sorted(source_dir.glob("*.nc"))
    n = len(filenames)
    print(n, "filenames found")
    if n == 0:
        raise FileNotFoundError(f"No *.nc files found in {source_dir}")

    accumulators = None
    for i, filename in enumerate(filenames, start=1):
        # Trailing "\r" with end="" rewrites the same console line for each file.
        print(f"{i:5,d}/{n:5,d}: {filename}\r", flush=True, end="")
        accumulators_for_filename = load_and_check_batch(filename)
        if accumulators is None:
            accumulators = accumulators_for_filename
        else:
            accumulators += accumulators_for_filename

    return accumulators

# One pass over every batch file; the result holds the totals summed over all batches.
accumulators = run_on_all_files()

4000 filenames found
  165/4,000: /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train/hrvsatellite/000164.nc
 000164 ABOVE 1,023!  max=1041  

  168/4,000: /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train/hrvsatellite/000167.nc
 000167 ABOVE 1,023!  max=1038  

  173/4,000: /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train/hrvsatellite/000172.nc
 000172 ABOVE 1,023!  max=1083  

  194/4,000: /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train/hrvsatellite/000193.nc
 000193 ABOVE 1,023!  max=1030  

  304/4,000: /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train/hrvsatellite/000303.nc
 000303 ABOVE 1,023!  max=1034  

  368/4,000: /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcast

In [10]:
# Display the accumulated count / sum / sum_of_squares.
accumulators

Unnamed: 0_level_0,count,sum,sum_of_squares
channels_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16252928000,3837846000000.0,1224429000000000.0


In [10]:
# Load a single batch so that the channel names can be read from it.
filename = BASE_PATH / DATA_SOURCE_NAME / "000000.nc"
dataset = xr.load_dataset(filename, mode="r")

In [17]:
# Re-index the accumulators by channel name instead of by integer channels_index.
# The guard makes this cell safe to re-run: without it, a second run adds the column
# in place and then fails in reset_index(), leaving `accumulators` half-modified.
if accumulators.index.name != "channel_name":
    # Work on a copy so `accumulators` is only rebound once every step has succeeded.
    _labelled = accumulators.copy()
    # NOTE(review): presumably `channels` holds one row of names per example and [0]
    # takes the first example's names -- confirm against the batch schema.
    _labelled["channel_name"] = dataset.channels[0]
    accumulators = _labelled.reset_index().set_index("channel_name").sort_values("channels_index")
accumulators

Unnamed: 0_level_0,channels_index,count,sum,sum_of_squares
channel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IR_016,0,2285568000,666508700000.0,261989200000000.0
IR_039,1,2285568000,1962855000000.0,1707008000000000.0
IR_087,2,2285568000,1687458000000.0,1267026000000000.0
IR_097,3,2285568000,1766952000000.0,1383207000000000.0
IR_108,4,2285568000,1388555000000.0,899360500000000.0
IR_120,5,2285568000,1967124000000.0,1717936000000000.0
IR_134,6,2285568000,2114260000000.0,1980686000000000.0
VIS006,7,2285568000,521158300000.0,170425100000000.0
VIS008,8,2285568000,588678500000.0,204539600000000.0
WV_062,9,2285568000,1448130000000.0,946126100000000.0


In [18]:
# Save the raw accumulators to a CSV file in the current working directory.
accumulators.to_csv(f"{DATA_SOURCE_NAME}_accumulators.csv")

In [19]:
# Standard deviation per row of `accumulators`: saved to CSV, then shown as a dict.
std = compute_std(accumulators)
std.to_csv(f"{DATA_SOURCE_NAME}_std.csv")
std.to_dict()

{'IR_016': 172.01044433112992,
 'IR_039': 96.53756504807913,
 'IR_087': 96.21369354283686,
 'IR_097': 86.72892737648276,
 'IR_108': 156.20651744208888,
 'IR_120': 104.35287930753246,
 'IR_134': 104.36462050405994,
 'VIS006': 150.2399269307514,
 'VIS008': 152.16086321818398,
 'WV_062': 111.8514878214775,
 'WV_073': 106.8855172848904}

In [20]:
# Mean per row of `accumulators`: saved to CSV, then shown as a dict.
mean = compute_mean(accumulators)
mean.to_csv(f"{DATA_SOURCE_NAME}_mean.csv")
mean.to_dict()

{'IR_016': 291.61620182554185,
 'IR_039': 858.8040610176552,
 'IR_087': 738.3103442750336,
 'IR_097': 773.0910794778366,
 'IR_108': 607.5318145165666,
 'IR_120': 860.6716261423857,
 'IR_134': 925.0477987594331,
 'VIS006': 228.02134593063957,
 'VIS008': 257.56333202381205,
 'WV_062': 633.5975770915588,
 'WV_073': 543.4963868823854}