In [None]:
import sys
import os
from tempfile import NamedTemporaryFile
from gc import collect
from time import sleep
sys.path.append('../util')
from meters import ThroughputMeter, clear_host_cache
from ncgen import make_nc
from grids import *
import netCDF4
import numpy as np

In [None]:
def write_netcdf_file(timescale, time_major=True, grid=canada_5k):
    """Create a temporary NetCDF benchmark file and return its (closed) handle.

    The file is generated by ``make_nc`` inside ``/app/tmp`` and is NOT deleted
    automatically on success; the caller is responsible for removing it.

    Parameters
    ----------
    timescale : sized
        Forwarded to ``make_nc``; only its length is used here (for logging).
    time_major : bool, optional
        Forwarded to ``make_nc`` as ``timemajor``.
    grid : dict, optional
        Grid description forwarded to ``make_nc``. ``grid['lon']['count']`` and
        ``grid['lat']['count']`` are read here for the log message.

    Returns
    -------
    The ``NamedTemporaryFile`` object. It is already closed when returned, but
    its ``name`` attribute still holds the path of the file on disk.
    """
    layout = 'major' if time_major else 'minor'
    print("Creating a time-{} NetCDF file with {}x{} grid and {} time steps".format(
        layout, grid['lon']['count'], grid['lat']['count'], len(timescale)))
    with NamedTemporaryFile(suffix='.nc', delete=False, dir='/app/tmp') as f:
        try:
            nc = make_nc(f.name, grid=grid, timescale=timescale, timemajor=time_major)
            nc.close()
        except BaseException:
            # delete=False means nothing else will clean up a partially written
            # file, so remove it here before propagating the error.
            if os.path.exists(f.name):
                os.remove(f.name)
            raise
    print("File size: {:.2f}Mb".format(os.path.getsize(f.name)/1024/1024))
    return f

The `../tmp` path in the Docker container points at rotating media (spinning disk) storage.

In [None]:
def netcdf_read_test(f, time_major, num_time_steps=None):
    """Time the read of one slice of ``var_0`` from a NetCDF file, then delete the file.

    Parameters
    ----------
    f : object with a ``name`` attribute and a ``close()`` method
        Handle of the NetCDF file to read. The file at ``f.name`` is removed
        before this function returns, whether or not the read succeeded.
    time_major : bool
        If true the slice ``[0, :, :]`` is read, otherwise ``[:, :, 0]``.
    num_time_steps : int, optional
        Number of time steps to record in the result. When omitted it is taken
        from the shape of ``var_0`` instead of from a notebook-level global.

    Returns
    -------
    list of tuple
        A one-element list ``[(time_major, num_time_steps, throughput)]`` where
        ``throughput`` is the value of ``ThroughputMeter.megabytes_per_second``.
    """
    # Same slices as ``[0,:,:]`` / ``[:,:,0]``, selected up front so that only
    # the variable lookup and the read itself happen inside the timed region.
    if time_major:
        index = (0, slice(None), slice(None))
    else:
        index = (slice(None), slice(None), 0)

    # Open the file just created
    nc = netCDF4.Dataset(f.name, 'r')
    try:
        with ThroughputMeter() as t:
            a = nc.variables['var_0'][index]

        if num_time_steps is None:
            # Assumes the axis indexed with 0 above is the time axis (first for
            # time-major files, last otherwise) -- TODO confirm against make_nc.
            num_time_steps = nc.variables['var_0'].shape[0 if time_major else -1]

        results = [(time_major, num_time_steps, t.megabytes_per_second(a))]
    finally:
        # python-netCDF4 seems to leak file descriptors
        # We have to take a lot of steps to make sure that the files get closed and that
        # the space gets reclaimed by the OS
        try:
            nc.close()
        finally:
            del nc
            print("Removing {}".format(f.name))
            os.remove(f.name)
            f.close()
            collect()
    return results

In [None]:
# Annual timescale: benchmark every grid in both storage layouts.
# Each entry appended to `results` is the list returned by netcdf_read_test.
results = []

# The timescale is fixed for this cell, so bind it once rather than through a
# one-element loop.
timescale = timescales['annual']
for time_major in (True, False):
    for grid in (world_250k, world_125k, canada_5k, bc_400m):
        testfile = write_netcdf_file(timescale, grid=grid, time_major=time_major)
        # clear_host_cache comes from util/meters; presumably it drops the OS
        # page cache so the read is served from disk -- confirm.
        clear_host_cache()
        results.append(netcdf_read_test(testfile, time_major))

Monthly

In [None]:

# Monthly timescale: benchmark every grid in both storage layouts, appending
# to the `results` list started by the annual run.
timescale = timescales['monthly']
for time_major in (True, False):
    for grid in (world_250k, world_125k, canada_5k, bc_400m):
        testfile = write_netcdf_file(timescale, grid=grid, time_major=time_major)
        # clear_host_cache comes from util/meters; presumably it drops the OS
        # page cache so the read is served from disk -- confirm.
        clear_host_cache()
        results.append(netcdf_read_test(testfile, time_major))

Daily - omit BC400m grid

In [None]:
# Daily timescale, both storage layouts. The bc_400m grid is left out of this
# run; results are appended to the shared `results` list.
timescale = timescales['daily']
for time_major in (True, False):
    for grid in (world_250k, world_125k, canada_5k):
        testfile = write_netcdf_file(timescale, grid=grid, time_major=time_major)
        # clear_host_cache comes from util/meters; presumably it drops the OS
        # page cache so the read is served from disk -- confirm.
        clear_host_cache()
        results.append(netcdf_read_test(testfile, time_major))

## Summarize Results

In [None]:
# Grids in the order in which the monthly and daily benchmark cells iterate
# over them. Entries of `results` carry no grid name, so the runs are mapped
# back to grids by this order.
monthly_run_grids = ['world_250k', 'world_125k', 'canada_5k', 'bc_400m']
daily_run_grids = ['world_250k', 'world_125k', 'canada_5k']

# Grids shown in the summary plots. The daily benchmark does not run bc_400m,
# so that grid cannot appear in the daily comparison.
monthly_grid_sizes = ['world_125k', 'canada_5k', 'bc_400m']
daily_grid_sizes = ['world_250k', 'canada_5k']


def _select_runs(time_major, timescale_name, run_grids, wanted_grids):
    """Pick the entries of `results` for one layout/timescale, one per wanted grid.

    Each entry of `results` is a one-element list ``[(time_major, n_steps, throughput)]``.
    Runs are matched on the layout flag and on the number of time steps, then
    paired with `run_grids` in benchmark order.

    Raises ValueError when the number of matching runs differs from the number
    of grids benchmarked (e.g. a benchmark cell was skipped or run twice).
    """
    n_steps = len(timescales[timescale_name])
    runs = [run for run in results
            if run[0][0] == time_major and run[0][1] == n_steps]
    if len(runs) != len(run_grids):
        raise ValueError(
            "expected {} {} runs with time_major={}, found {}".format(
                len(run_grids), timescale_name, time_major, len(runs)))
    by_grid = dict(zip(run_grids, runs))
    return [by_grid[name] for name in wanted_grids]


monthly_step_read_throughput_time_major = _select_runs(
    True, 'monthly', monthly_run_grids, monthly_grid_sizes)
monthly_step_read_throughput_time_minor = _select_runs(
    False, 'monthly', monthly_run_grids, monthly_grid_sizes)

daily_step_read_throughput_time_major = _select_runs(
    True, 'daily', daily_run_grids, daily_grid_sizes)

daily_step_read_throughput_time_minor = _select_runs(
    False, 'daily', daily_run_grids, daily_grid_sizes)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
monthly_tmaj, = plt.plot([point[0][2] for point in monthly_step_read_throughput_time_major], label='time-major')
monthly_tmin, = plt.plot([point[0][2] for point in monthly_step_read_throughput_time_minor], label='time-minor')
plt.xticks(range(len(monthly_grid_sizes)), monthly_grid_sizes)
plt.title('Monthly (1800 time steps) Throughput Comparison [MB/s]')
plt.legend(loc='upper center', shadow=True)
plt.show()

daily_tmaj, = plt.plot([point[0][2] for point in daily_step_read_throughput_time_major], label='time-major')
daily_tmin, = plt.plot([point[0][2] for point in daily_step_read_throughput_time_minor], label='time-minor')
plt.xticks(range(len(daily_grid_sizes)), daily_grid_sizes)
plt.title('Daily (54787 time steps) Throughput Comparison [MB/s]')
plt.legend(loc='upper center', shadow=True)
plt.show()

### Reading from large time-major NetCDF data files appears to give slightly higher throughput than reading from time-minor files (a difference of roughly 7 MB/s)