In [1]:
import pathlib as Path

import numpy as np
from tabulate import tabulate

from remake import load_remake

In [2]:
# Load the remake definitions whose rules are summarised further down.
e5p = load_remake('../../remakefiles/era5_process.py')
# NOTE(review): the three loads below are disabled, yet the unexecuted cells at the
# end of this notebook still use e5h, mle and plt. Those cells raise NameError on a
# fresh kernel until the matching load is re-enabled (or the cells are removed).
# e5h = load_remake('../../remakefiles/era5_histograms.py')
# mle = load_remake('../../remakefiles/mcs_local_envs.py')
# plt = load_remake('../../remakefiles/plotting.py')
# remakes = [e5p, e5h, mle, plt]
remakes = [e5p]

loading pixel inputs cache


In [3]:
def input_output_file_size(rule):
    """Return the on-disk size in bytes of every input and output file of a rule.

    Parameters
    ----------
    rule
        Object exposing ``tasks``; each task exposes ``inputs`` and ``outputs``
        mappings whose values are ``pathlib.Path``-like objects.

    Returns
    -------
    (input_file_sizes, output_file_sizes)
        Two dicts mapping path -> size in bytes. Input paths are de-duplicated
        across tasks and ordered by sorted path; output paths keep task order.
        A path that does not exist is reported with size 0.
    """
    def _size_or_zero(path):
        # A single stat() call replaces the previous exists()/lstat() pair:
        #  * stat() follows symlinks, so a linked file reports the size of the
        #    data it points to instead of the length of the link itself
        #    (exists() already followed links, lstat() did not);
        #  * there is no window in which the file can vanish between the
        #    existence check and the size lookup.
        try:
            return path.stat().st_size
        except (FileNotFoundError, NotADirectoryError):
            # Missing file or dangling symlink: counted as 0 bytes, as before.
            return 0

    input_paths = set()
    output_paths = []
    for task in rule.tasks:
        # The same input is often shared by several tasks; count it once.
        input_paths.update(task.inputs.values())
        output_paths.extend(task.outputs.values())

    input_file_sizes = {p: _size_or_zero(p) for p in sorted(input_paths)}
    output_file_sizes = {p: _size_or_zero(p) for p in output_paths}
    return input_file_sizes, output_file_sizes

In [4]:
# Disabled example: needs the plotting remake loaded as `plt` (its load is commented out above).
# insizes, outsizes = input_output_file_size(plt.PlotCombineConditionalERA5Hist)

In [5]:
# insizes

In [6]:
# outsizes

In [7]:
class RemakeInfo:
    """Collect and print input/output file-size statistics for each rule of a remake.

    Typical use::

        info = RemakeInfo(rmk)
        info.gen_rule_data()   # looks up the size of every input/output file
        info.display('GB')     # prints one inputs row and one outputs row per rule
    """

    # Number of bytes per unit label accepted by display() (decimal multiples).
    SIZE_UNITS = {
        'B': 1,
        'kB': 1e3,
        'MB': 1e6,
        'GB': 1e9,
        'TB': 1e12,
    }

    def __init__(self, rmk):
        """Store the remake object; no file is inspected until gen_rule_data()."""
        # Object exposing `.rules` (iterable of rules) and `.name`.
        self.rmk = rmk
        # rule -> (input_file_sizes, output_file_sizes); filled by gen_rule_data().
        self.rule_data = {}

    def gen_rule_data(self):
        """Populate ``rule_data`` with the file sizes of every rule in the remake.

        Calling it again refreshes the entry of each rule currently in
        ``rmk.rules``; existing entries for other rules are left in place.
        """
        for rule in self.rmk.rules:
            self.rule_data[rule] = input_output_file_size(rule)

    @staticmethod
    def _size_stats(file_sizes):
        """Return (mean, std, total) in bytes for a path -> size mapping.

        An empty mapping yields zeros. NumPy's mean/std of an empty array would
        otherwise return nan and emit a RuntimeWarning, which used to surface as
        'nan' cells in the printed table.
        """
        # Explicit int64 keeps the total 64-bit regardless of the platform's
        # default NumPy integer width.
        sizes = np.array(list(file_sizes.values()), dtype=np.int64)
        if sizes.size == 0:
            return 0.0, 0.0, 0
        return sizes.mean(), sizes.std(), sizes.sum()

    def display(self, size_fmt='GB'):
        """Print a table of per-rule mean, std and total file size.

        Parameters
        ----------
        size_fmt : str
            Unit used for every figure; one of the keys of ``SIZE_UNITS``.
            Values are rounded to whole units, so per-file figures much
            smaller than one unit print as 0.

        Raises
        ------
        KeyError
            If ``size_fmt`` is not a known unit. Raised before anything is
            printed.
        """
        # Looked up once, up front: an unknown unit fails immediately instead
        # of part-way through building the table.
        scale = self.SIZE_UNITS[size_fmt]

        rows = []
        for rule, (input_file_sizes, output_file_sizes) in self.rule_data.items():
            for label, file_sizes in (
                ('inputs:', input_file_sizes),
                ('outputs:', output_file_sizes),
            ):
                rows.append(
                    [rule.__name__, label]
                    + [f'{v / scale:.0f}' for v in self._size_stats(file_sizes)]
                )

        print(f'{self.rmk.name}')
        print(tabulate(
            rows,
            headers=['rule', 'type', f'mean ({size_fmt})', f'std ({size_fmt})', f'total ({size_fmt})']
        ))

def individual_rule_input_output_info(rule, size_fmt='GB'):
    """Print mean, std and total file size of a single rule's inputs and outputs.

    Parameters
    ----------
    rule
        Rule passed straight to ``input_output_file_size``; it is also printed
        as the heading above the table.
    size_fmt : str
        Unit for every figure: 'B', 'kB', 'MB', 'GB' or 'TB' (decimal
        multiples). Values are rounded to whole units.

    Raises
    ------
    KeyError
        If ``size_fmt`` is not one of the known units. Raised before any file
        is inspected or anything is printed.
    """
    fmts = {
        'B': 1,
        'kB': 1e3,
        'MB': 1e6,
        'GB': 1e9,
        'TB': 1e12,
    }
    # Resolve the unit once, up front, so a typo fails fast.
    scale = fmts[size_fmt]

    def summarise(file_sizes):
        """Return [mean, std, total] of a path -> size mapping as formatted strings."""
        # Explicit int64 keeps the total 64-bit regardless of the platform's
        # default NumPy integer width.
        sizes = np.array(list(file_sizes.values()), dtype=np.int64)
        if sizes.size == 0:
            # mean()/std() of an empty array give nan plus a RuntimeWarning;
            # a rule with no files should simply read as zero.
            stats = (0.0, 0.0, 0)
        else:
            stats = (sizes.mean(), sizes.std(), sizes.sum())
        return [f'{v / scale:.0f}' for v in stats]

    input_file_sizes, output_file_sizes = input_output_file_size(rule)
    print(f'{rule}')
    print(tabulate(
        [
            ['inputs:'] + summarise(input_file_sizes),
            ['outputs:'] + summarise(output_file_sizes),
        ],
        headers=['', f'mean ({size_fmt})', f'std ({size_fmt})', f'total ({size_fmt})']
    ))

def rule_input_output_info(rmk, size_fmt='GB'):
    """Print the per-rule input/output file-size table for a remake.

    Convenience wrapper around ``RemakeInfo``: it gathers the sizes for every
    rule in ``rmk.rules`` and prints the same table as ``RemakeInfo.display``.
    This function used to carry its own copy of that logic; delegating keeps a
    single implementation so the two entry points cannot drift apart.

    Parameters
    ----------
    rmk
        Remake object exposing ``rules`` and ``name``.
    size_fmt : str
        Unit for every figure: 'B', 'kB', 'MB', 'GB' or 'TB'.

    Raises
    ------
    KeyError
        If ``size_fmt`` is not a known unit.
    """
    info = RemakeInfo(rmk)
    info.gen_rule_data()
    info.display(size_fmt)

In [8]:
# Summarise input/output file sizes for every rule of the era5_process remake.
# display() rounds to whole units, so the per-file mean/std columns print as 0
# at 'GB' even though the totals are large; pass 'MB' to see per-file figures.
rmk_info = RemakeInfo(e5p)
rmk_info.gen_rule_data()
rmk_info.display('GB')

era5_process
rule                       type        mean (GB)    std (GB)    total (GB)
-------------------------  --------  -----------  ----------  ------------
GenRegridder               inputs:             0           0             0
GenRegridder               outputs:            0           0             0
CalcERA5Shear              inputs:             0           0         81924
CalcERA5Shear              outputs:            0           0          1975
CalcERA5VIMoistureFluxDiv  inputs:             0           0        138315
CalcERA5VIMoistureFluxDiv  outputs:            0           0           995
CalcERA5LayerMeans         inputs:             0           0         56391
CalcERA5LayerMeans         outputs:            0           0          2954
CalcERA5Delta              inputs:             0           0           416
CalcERA5Delta              outputs:            0           0           995
GenPixelDataOnERA5Grid     inputs:             0           0          2408
GenPixelData

In [None]:
# NOTE(review): `e5h` is not defined on a fresh kernel because its load_remake()
# call near the top of the notebook is commented out, so this cell raises NameError.
# Re-enable that load before running it.
rmk_info = RemakeInfo(e5h)
rmk_info.gen_rule_data()
rmk_info.display('GB')

In [None]:
# NOTE(review): `mle` is not defined on a fresh kernel because its load_remake()
# call near the top of the notebook is commented out, so this cell raises NameError.
# Re-enable that load before running it.
rmk_info = RemakeInfo(mle)
rmk_info.gen_rule_data()
rmk_info.display('GB')

In [None]:
# NOTE(review): `plt` (the plotting remake, not matplotlib) is not defined on a
# fresh kernel because its load_remake() call near the top of the notebook is
# commented out, so this cell raises NameError. Re-enable that load before running it.
# Sizes are shown in MB here rather than GB.
rmk_info = RemakeInfo(plt)
rmk_info.gen_rule_data()
rmk_info.display('MB')