Compute the mean and standard deviation of the training data by streaming over the training batches and accumulating the count, sum and sum of squares.

Adapted from https://stackoverflow.com/a/5543790/732596

In [70]:
import xarray as xr
from pathlib import Path
import pandas as pd
import numpy as np

In [71]:
# Root directory of the prepared training batches. Each data source lives in
# its own sub-directory (BASE_PATH / DATA_SOURCE_NAME) containing *.nc files.
BASE_PATH = Path("/mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train")
# Data source to compute statistics for. Used both as the sub-directory name
# under BASE_PATH and as the prefix of the CSV file written further down.
DATA_SOURCE_NAME = "nwp"

In [72]:
def compute_accumulators(data_array: xr.DataArray) -> pd.Series:
    """Reduce one batch to the running totals needed for mean and std.

    Every dimension except ``channels_index`` is aggregated away, so the
    totals are kept separately for each channel (or collapse to a single
    value when the array has no ``channels_index`` dimension).

    Args:
        data_array: The batch data to summarise.

    Returns:
        A ``pd.Series`` with three entries:
          * ``'count'``: number of values counted by ``DataArray.count``.
          * ``'sum'``: sum of the values, as extended-precision floats.
          * ``'sum_of_squares'``: sum of the squared values, as
            extended-precision floats.
        Series returned for different batches can be added together to
        obtain the totals for the union of those batches.
    """
    # Keep the original dimension order; only the channel axis is preserved.
    dims_to_aggregate_over = [dim for dim in data_array.dims if dim != 'channels_index']
    data_array = data_array.astype(np.float64)  # Minimise numerical instability.
    _count = data_array.count(dim=dims_to_aggregate_over).to_pandas()
    _sum = data_array.sum(dim=dims_to_aggregate_over).to_pandas()
    _sum_of_squares = (data_array ** 2).sum(dim=dims_to_aggregate_over).to_pandas()
    # np.longdouble is the portable spelling of the platform's extended
    # precision float. np.float128 is only defined on some platforms, so
    # referring to it directly raises AttributeError elsewhere.
    return pd.Series({
        'count': _count,
        'sum': _sum.astype(np.longdouble),
        'sum_of_squares': _sum_of_squares.astype(np.longdouble),
    })

In [73]:
def compute_std(accumulators: pd.DataFrame):
    """Compute the sample standard deviation from streamed accumulators.

    Uses the sum-of-squares identity

        std = sqrt((n * sum(x^2) - sum(x)^2) / (n * (n - 1)))

    i.e. the unbiased (``n - 1``) estimator.

    Args:
        accumulators: Anything indexable by ``'count'``, ``'sum'`` and
            ``'sum_of_squares'`` (DataFrame, Series or plain mapping).

    Returns:
        The standard deviation, with the same shape as the accumulator
        entries. The result is NaN/inf where ``count`` is 0 or 1.
    """
    count = accumulators['count']
    # Promote the count to extended precision *before* multiplying. NumPy and
    # pandas integer counts are int64, and count * (count - 1) silently wraps
    # around once count exceeds ~3.04e9, which turns the result into NaN.
    if hasattr(count, 'astype'):
        n = count.astype(np.longdouble)
    else:
        n = np.longdouble(count)
    total = accumulators['sum']
    total_of_squares = accumulators['sum_of_squares']
    return np.sqrt(
        (n * total_of_squares - total * total)
        /
        (n * (n - 1))
    )

In [74]:
def compute_mean(accumulators: pd.DataFrame):
    """Return the mean implied by the accumulators: total sum over total count.

    ``accumulators`` is indexed by ``'sum'`` and ``'count'``; the division is
    element-wise when those entries are array-like.
    """
    total = accumulators['sum']
    n_values = accumulators['count']
    return total / n_values

In [75]:
def load_and_check_batch(filename: Path) -> pd.DataFrame:
    """Read one NetCDF batch, sanity-check it, and return its accumulators.

    The ``'data'`` variable of the file is checked for non-finite values; any
    problem found is printed together with the file stem, but does not stop
    processing. The return value is whatever ``compute_accumulators`` yields
    for that variable (a pd.Series mapping stat name to stat value).
    """
    batch = xr.load_dataset(filename, mode="r")['data']

    # Collect a short tag for every validation check that fails.
    problems = []
    if not np.isfinite(batch).all():
        problems.append("NOT FINITE ")
    # Range checks, currently disabled:
    #if (batch < 0).any():
    #    problems.append(f"NEGATIVE!  min={batch.min().values} ")
    #if (batch > 1023).any():
    #    problems.append(f"ABOVE 1,023!  max={batch.max().values} ")
    if problems:
        print("\n", filename.stem, "".join(problems), "\n", flush=True)

    # Running totals used later for the mean and standard deviation.
    return compute_accumulators(batch)

In [76]:
def run_on_all_files():
    """Accumulate statistics over every ``*.nc`` batch of the data source.

    Batches under ``BASE_PATH / DATA_SOURCE_NAME`` are visited in sorted
    order, with a single-line progress indicator printed as they are
    processed. Returns the element-wise sum of the per-batch accumulators,
    or ``None`` when no batch file is found.
    """
    batch_paths = sorted((BASE_PATH / DATA_SOURCE_NAME).glob("*.nc"))
    total_files = len(batch_paths)
    print(total_files, "filenames found")

    running_totals = None
    for position, batch_path in enumerate(batch_paths, start=1):
        # '\r' with end="" keeps the progress report on one console line.
        print(f"{position:5,d}/{total_files:5,d}: {batch_path}\r", flush=True, end="")
        batch_totals = load_and_check_batch(batch_path)
        if running_totals is None:
            running_totals = batch_totals
        else:
            running_totals += batch_totals

    return running_totals

# Stream over every batch of the selected data source and keep the summed
# count / sum / sum_of_squares accumulators.
# NOTE(review): the saved output of this cell shows a path under
# ".../train/topographic/", which does not match DATA_SOURCE_NAME = "nwp";
# the output is presumably stale from an earlier run -- re-run to confirm.
accumulators = run_on_all_files()

4000 filenames found
4,000/4,000: /mnt/storage_ssd_4tb/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v15/train/topographic/003999.nc

In [77]:
# Display the accumulated totals (count, sum, sum_of_squares).
accumulators

count                         524288000
sum                  71243672301.444901
sum_of_squares    20803280670966.800781
dtype: object

In [81]:
# Load the first batch file of the data source on its own so that its
# contents can be inspected.
filename = BASE_PATH / DATA_SOURCE_NAME / "000000.nc"
dataset = xr.load_dataset(filename, mode="r")
dataset

In [17]:
# Label each accumulator row with its channel name, taken from the first
# example of the batch loaded above, and index the table by that name.
# NOTE(review): this cell overwrites `accumulators` in place, so it is not
# idempotent -- re-running it without re-running run_on_all_files() operates
# on the already-relabelled table.
# NOTE(review): this presumably expects `accumulators` to be a per-channel
# DataFrame with a `channels_index` index; the saved output of the earlier
# display cell shows a 3-element Series instead, and this cell's execution
# count (17) is out of order -- confirm against a fresh Restart & Run All.
accumulators["channel_name"] = dataset.channels[0]
accumulators = accumulators.reset_index().set_index("channel_name").sort_values("channels_index")
accumulators

Unnamed: 0_level_0,channels_index,count,sum,sum_of_squares
channel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IR_016,0,2285568000,666508700000.0,261989200000000.0
IR_039,1,2285568000,1962855000000.0,1707008000000000.0
IR_087,2,2285568000,1687458000000.0,1267026000000000.0
IR_097,3,2285568000,1766952000000.0,1383207000000000.0
IR_108,4,2285568000,1388555000000.0,899360500000000.0
IR_120,5,2285568000,1967124000000.0,1717936000000000.0
IR_134,6,2285568000,2114260000000.0,1980686000000000.0
VIS006,7,2285568000,521158300000.0,170425100000000.0
VIS008,8,2285568000,588678500000.0,204539600000000.0
WV_062,9,2285568000,1448130000000.0,946126100000000.0


In [78]:
# Persist the raw accumulators so that mean / std can be recomputed later
# without streaming over all the batches again. The file is written to the
# current working directory as "<DATA_SOURCE_NAME>_accumulators.csv".
accumulators.to_csv(f"{DATA_SOURCE_NAME}_accumulators.csv")

In [79]:
# Mean of the whole training set, derived from the streamed accumulators.
# The commented-out lines are optional exports of the result.
mean = compute_mean(accumulators)
#mean.to_csv(f"{DATA_SOURCE_NAME}_mean.csv")
#mean.to_dict()
mean

135.88652096070271

In [80]:
# Standard deviation of the whole training set, derived from the streamed
# accumulators. The commented-out lines are optional exports of the result.
std = compute_std(accumulators)
#std.to_csv(f"{DATA_SOURCE_NAME}_std.csv")
#std.to_dict()
std

145.65013699767726867

In [84]:
# Sanity check: standard deviation of the single batch loaded above, taken
# over all of its dimensions, for comparison with the streamed estimate.
# NOTE(review): this presumably uses the population estimator (ddof=0),
# whereas compute_std divides by n * (n - 1) -- small differences are expected.
dataset['data'].std()