In [1]:
import pathlib as Path

import numpy as np
from tabulate import tabulate

from remake import load_remake

In [2]:
# Load each remakefile. The returned objects are used below via `.rules`,
# `.name`, and rule classes as attributes (e.g. plt.PlotCombineConditionalERA5Hist).
e5p = load_remake('../../remakefiles/era5_process.py')
e5h = load_remake('../../remakefiles/era5_histograms.py')
mle = load_remake('../../remakefiles/mcs_local_envs.py')
# NOTE: `plt` is the plotting remake here, NOT matplotlib.pyplot.
plt = load_remake('../../remakefiles/plotting.py')
# All loaded remakes in one list (not referenced by the cells shown here).
remakes = [e5p, e5h, mle, plt]


TODOS
* Make sure filenames are consistent
* Make sure variables names are sensible/consistent
* Docstrings for all fns, classes
* Validate all data
* Consistent attrs for all created .nc files
* Units on data vars etc.

loading pixel inputs cache

TODOS
* Make sure filenames are consistent
* Make sure variables names are sensible/consistent
* Docstrings for all fns, classes
* Validate all data
* Consistent attrs for all created .nc files
* Units on data vars etc.

loading pixel inputs cache

TODOS
* Make sure filenames are consistent
* Make sure variables names are sensible/consistent
* Docstrings for all fns, classes
* Validate all data
* Consistent attrs for all created .nc files
* Units on data vars etc.



In [3]:
def input_output_file_size(rule):
    """Measure the on-disk size of every input and output file of a rule.

    Args:
        rule: object with a `tasks` iterable; each task exposes `inputs` and
            `outputs` mappings whose values are path objects (they must
            support `exists()`, `lstat()`, and ordering).

    Returns:
        A 2-tuple `(input_file_sizes, output_file_sizes)` of dicts mapping
        path -> size in bytes. Input paths are de-duplicated across tasks and
        ordered by sorted path; output paths keep the order in which the tasks
        list them. A path that does not exist is recorded with size 0.
    """
    def _bytes_on_disk(path):
        # Missing files contribute zero bytes rather than raising.
        # NOTE(review): exists() follows symlinks but lstat() does not, so a
        # symlinked file reports the size of the link itself - confirm intent.
        if not path.exists():
            return 0
        return path.lstat().st_size

    unique_inputs = set()
    all_outputs = []
    for task in rule.tasks:
        # The same input can feed several tasks; count it once.
        unique_inputs |= set(task.inputs.values())
        all_outputs += list(task.outputs.values())

    input_file_sizes = {
        path: _bytes_on_disk(path) for path in sorted(unique_inputs)
    }
    output_file_sizes = {
        path: _bytes_on_disk(path) for path in all_outputs
    }
    return input_file_sizes, output_file_sizes

In [4]:
# Spot-check the helper on a single rule from the plotting remake.
insizes, outsizes = input_output_file_size(plt.PlotCombineConditionalERA5Hist)

In [5]:
# Show the input path -> size (bytes) mapping for that rule.
insizes

{PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_output/conditional_era5_histograms/2018/coreprecip_hourly_hist_2018_01.nc'): 107489937,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_output/conditional_era5_histograms/2018/coreprecip_hourly_hist_2018_02.nc'): 97121361,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_output/conditional_era5_histograms/2018/coreprecip_hourly_hist_2018_03.nc'): 107489937,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_output/conditional_era5_histograms/2018/coreprecip_hourly_hist_2018_04.nc'): 104033745,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_output/conditional_era5_histograms/2018/coreprecip_hourly_hist_2018_05.nc'): 107489937,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_output/conditional_era5_histograms/2018/coreprecip_hourly_hist_2018_06.nc'): 104033745,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_output/conditional_era5_histograms/2018/coreprecip_hourly_hist_2018

In [6]:
# Show the output path -> size (bytes) mapping for that rule.
outsizes

{PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_figs/conditional_era5_histograms/yearly_hist_cape_2018_tb.png'): 126054,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_figs/conditional_era5_histograms/yearly_hist_tcwv_2018_tb.png'): 160030,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_figs/conditional_era5_histograms/yearly_hist_shear_0_2018_tb.png'): 141135,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_figs/conditional_era5_histograms/yearly_hist_shear_1_2018_tb.png'): 126471,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_figs/conditional_era5_histograms/yearly_hist_shear_2_2018_tb.png'): 111908,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_figs/conditional_era5_histograms/yearly_hist_shear_3_2018_tb.png'): 138784,
 PosixPath('/gws/nopw/j04/mcs_prime/mmuetz/data/mcs_prime_figs/conditional_era5_histograms/yearly_hist_vertically_integrated_moisture_flux_div_2018_tb.png'): 138177,
 PosixPath('/gws/nopw/j04/mcs_prime/

In [12]:
class RemakeInfo:
    """Collect and tabulate input/output file sizes for every rule of a remake.

    Usage: construct with a loaded remake, call `gen_rule_data()` to measure
    the files, then `display()` to print a per-rule summary table.
    """

    # Divisors converting a byte count into each supported decimal unit.
    _UNIT_DIVISORS = {
        'B': 1,
        'kB': 1e3,
        'MB': 1e6,
        'GB': 1e9,
        'TB': 1e12,
    }

    def __init__(self, rmk):
        """Store the remake; no files are inspected until `gen_rule_data()`.

        Args:
            rmk: object exposing `rules` (iterable of rule classes) and `name`.
        """
        self.rmk = rmk
        # rule -> (input_file_sizes, output_file_sizes); filled by gen_rule_data.
        self.rule_data = {}

    def gen_rule_data(self):
        """Fill `self.rule_data` with the file sizes of every rule in the remake.

        Sizes come from `input_output_file_size(rule)`. Calling this again
        overwrites the entry of each rule currently in `self.rmk.rules`.
        """
        for rule in self.rmk.rules:
            input_file_sizes, output_file_sizes = input_output_file_size(rule)
            self.rule_data[rule] = (input_file_sizes, output_file_sizes)

    @staticmethod
    def _size_stats(file_sizes):
        """Return (mean, std, total) in bytes for a path -> size mapping.

        An empty mapping yields (0, 0, 0): numpy's mean/std of an empty array
        would emit RuntimeWarnings and produce NaN, which then prints as 'nan'.
        """
        sizes = np.array(list(file_sizes.values()))
        if sizes.size == 0:
            return 0, 0, 0
        return sizes.mean(), sizes.std(), sizes.sum()

    def display(self, size_fmt='GB'):
        """Print the remake name and a table of per-rule file-size statistics.

        Each rule contributes two rows (inputs, outputs) holding the mean,
        standard deviation and total size, rounded to whole `size_fmt` units.
        Only data gathered by `gen_rule_data()` is shown; if it has not been
        called the table has no rows.

        Args:
            size_fmt: one of 'B', 'kB', 'MB', 'GB', 'TB'.

        Raises:
            KeyError: if `size_fmt` is not a supported unit.
        """
        # Resolve the unit once, up front, so a bad unit fails immediately.
        divisor = self._UNIT_DIVISORS[size_fmt]

        rows = []
        for rule, (input_file_sizes, output_file_sizes) in self.rule_data.items():
            for label, file_sizes in (
                ('inputs:', input_file_sizes),
                ('outputs:', output_file_sizes),
            ):
                stats = self._size_stats(file_sizes)
                rows.append(
                    [rule.__name__, label] + [f'{v / divisor:.0f}' for v in stats]
                )
        print(f'{self.rmk.name}')
        print(tabulate(
            rows,
            headers=['rule', 'type', f'mean ({size_fmt})', f'std ({size_fmt})', f'total ({size_fmt})']
        ))

def individual_rule_input_output_info(rule, size_fmt='GB'):
    """Print mean/std/total input and output file sizes for a single rule.

    Sizes come from `input_output_file_size(rule)` and are rounded to whole
    `size_fmt` units. The rule itself is printed as a title above the table.

    Args:
        rule: rule object accepted by `input_output_file_size`.
        size_fmt: one of 'B', 'kB', 'MB', 'GB', 'TB'.

    Raises:
        KeyError: if `size_fmt` is not a supported unit (raised before any
            file is inspected).
    """
    fmts = {
        'B': 1,
        'kB': 1e3,
        'MB': 1e6,
        'GB': 1e9,
        'TB': 1e12,
    }
    # Resolve the unit once, before doing any file-system work.
    divisor = fmts[size_fmt]

    def summarise(file_sizes):
        """Return [mean, std, total] of a path -> size mapping as formatted strings."""
        sizes = np.array(list(file_sizes.values()))
        if sizes.size == 0:
            # mean()/std() of an empty array warn and give NaN; report zeros.
            stats = (0, 0, 0)
        else:
            stats = (sizes.mean(), sizes.std(), sizes.sum())
        return [f'{v / divisor:.0f}' for v in stats]

    input_file_sizes, output_file_sizes = input_output_file_size(rule)
    print(f'{rule}')
    print(tabulate(
        [
            ['inputs:'] + summarise(input_file_sizes),
            ['outputs:'] + summarise(output_file_sizes),
        ],
        headers=['', f'mean ({size_fmt})', f'std ({size_fmt})', f'total ({size_fmt})']
    ))

def rule_input_output_info(rmk, size_fmt='GB'):
    """Print a per-rule table of input/output file-size statistics for a remake.

    Convenience wrapper around `RemakeInfo`: it measures every rule's files
    and prints the table straight away. The gathering and table-building
    logic lives in `RemakeInfo` so it is not duplicated here.

    Args:
        rmk: remake object exposing `rules` and `name`.
        size_fmt: one of 'B', 'kB', 'MB', 'GB', 'TB'.

    Raises:
        KeyError: if `size_fmt` is not a supported unit.
    """
    info = RemakeInfo(rmk)
    info.gen_rule_data()
    info.display(size_fmt)

In [17]:
# File-size summary for the era5_process remake, in GB.
rmk_info = RemakeInfo(e5p)
rmk_info.gen_rule_data()
rmk_info.display('GB')

era5_process
rule                       type        mean (GB)    std (GB)    total (GB)
-------------------------  --------  -----------  ----------  ------------
GenRegridder               inputs:             0           0             0
GenRegridder               outputs:            0           0             0
CalcERA5Shear              inputs:             0           0          3975
CalcERA5Shear              outputs:            0           0           114
CalcERA5VIMoistureFluxDiv  inputs:             0           0          6756
CalcERA5VIMoistureFluxDiv  outputs:            0           0            57
CalcERA5LayerMeans         inputs:             0           0          2781
CalcERA5LayerMeans         outputs:            0           0           171
CalcERA5Delta              inputs:             0           0            24
CalcERA5Delta              outputs:            0           0            57
GenPixelDataOnERA5Grid     inputs:             0           0           125
GenPixelData

In [18]:
# File-size summary for the era5_histograms remake, in GB.
# NOTE: rebinds the notebook-global `rmk_info` from the previous cell.
rmk_info = RemakeInfo(e5h)
rmk_info.gen_rule_data()
rmk_info.display('GB')

era5_histograms
rule                                   type        mean (GB)    std (GB)    total (GB)
-------------------------------------  --------  -----------  ----------  ------------
ConditionalERA5HistHourly              inputs:             0           0           383
ConditionalERA5HistHourly              outputs:            0           0             3
ConditionalERA5HistHourlyMCSLifecycle  inputs:             0           0           383
ConditionalERA5HistHourlyMCSLifecycle  outputs:            0           0             2
ConditionalERA5HistGridpoint           inputs:             0           0           383
ConditionalERA5HistGridpoint           outputs:            1           0             6
ConditionalERA5HistMeanfield           inputs:             0           0            22
ConditionalERA5HistMeanfield           outputs:            0           0             1
CombineConditionalERA5HistGridpoint    inputs:             1           0             6
CombineConditionalERA5HistG

In [19]:
# File-size summary for the mcs_local_envs remake, in GB.
# NOTE: rebinds the notebook-global `rmk_info` from the previous cell.
rmk_info = RemakeInfo(mle)
rmk_info.gen_rule_data()
rmk_info.display('GB')

mcs_local_envs
rule                       type        mean (GB)    std (GB)    total (GB)
-------------------------  --------  -----------  ----------  ------------
GenLatLonDistance          inputs:             0           0             0
GenLatLonDistance          outputs:            3           0             3
CheckLatLonDistance        inputs:             3           0             3
CheckLatLonDistance        outputs:            0           0             0
McsLocalEnv                inputs:             0           0           364
McsLocalEnv                outputs:            0           0           311
LifecycleMcsLocalEnvHist   inputs:             0           0           379
LifecycleMcsLocalEnvHist   outputs:           62           4           739
CombineMonthlyMcsLocalEnv  inputs:             0           0           311
CombineMonthlyMcsLocalEnv  outputs:            0           0            10


In [20]:
# File-size summary for the plotting remake (`plt` is the remake, not
# matplotlib), reported in MB rather than GB.
rmk_info = RemakeInfo(plt)
rmk_info.gen_rule_data()
rmk_info.display('MB')

plotting
rule                                       type        mean (MB)    std (MB)    total (MB)
-----------------------------------------  --------  -----------  ----------  ------------
PlotCombineConditionalERA5Hist             inputs:           105           3          2531
PlotCombineConditionalERA5Hist             outputs:            0           0             3
PlotCombineVarConditionalERA5Hist          inputs:           105           3          1266
PlotCombineVarConditionalERA5Hist          outputs:            0           0             1
PlotCombineConvectionConditionalERA5Hist   inputs:           105           3          2531
PlotCombineConvectionConditionalERA5Hist   outputs:            0           0             1
PlotGridpointConditionalERA5Hist           inputs:           573         573          1146
PlotGridpointConditionalERA5Hist           outputs:            0           0             2
PlotGridpointGlobal                        inputs:           573         573     