In [1]:
import glob
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from tpot.export_utils import set_param_recursive
import xarray as xr
from SALib.sample import saltelli
from SALib.analyze import sobol
import joblib
import re
import os
import dask
import dask.bag as db
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cartopy.crs as ccrs
from cartopy.feature import ShapelyFeature
from cartopy.io.shapereader import Reader
import cloudpickle
# Global matplotlib styling applied to every figure produced by this notebook.
params = {
    # matplotlib >= 3.3 expects the LaTeX preamble as a single string; a list
    # of strings is rejected by newer releases, while a plain string is
    # accepted by older ones as well.
    'text.latex.preamble': '\\usepackage{gensymb}',
    'axes.grid': False,
    'savefig.dpi': 700,
    'font.size': 12,
    'text.usetex': False,
    'figure.figsize': [5, 5],
    'font.family': 'serif',
}
matplotlib.rcParams.update(params)

In [2]:
from dask_jobqueue import SGECluster
from dask.distributed import Client

# Keyword arguments describing each job submitted through SGECluster.
worker_job_spec = dict(
    walltime='12:00:00',
    memory='2 G',
    resource_spec='h_vmem=2G',
    project='admiralty',
)

# The dashboard is served on port 5757.
cluster = SGECluster(
    scheduler_options={'dashboard_address': ':5757'},
    **worker_job_spec
)

# Client connected to the cluster created above.
client = Client(cluster)

In [3]:
# Number of scheduler jobs requested from the queue.
n_worker_jobs = 50
cluster.scale(jobs=n_worker_jobs)

In [None]:
# Tear down the client first, then the cluster.
# NOTE(review): this cell sits before the compute cells, so a top-to-bottom
# "Run All" would close the cluster before any work is submitted -- run it
# manually once the computations have finished.
for handle in (client, cluster):
    handle.close()

In [4]:
# Default emulator output and the root directory of the emulator data.
output = 'PM2_5_DRY'
path = '/nobackup/earlacoa/machinelearning/data/'

# The training data is stored as a pickled dictionary of data frames.
# NOTE: pickle.load executes arbitrary code; only load files you trust.
train_pickle = path + 'dict_train.pickle'
with open(train_pickle, 'rb') as handle:
    dict_train = pickle.load(handle)

# Stack every frame into one table and list the unique (lat, lon) pairs.
df_train = pd.concat(dict_train, ignore_index=True)
unique_locations = df_train[['lat', 'lon']].drop_duplicates()
gridcells = unique_locations.values.tolist()

#### create control using emulators

In [None]:
# Outputs for which a control map is written.
outputs = [
    'PM2_5_DRY',
    'o3',
    'AOD550_sfc',
    'asoaX_2p5',
    'bc_2p5',
    'bsoaX_2p5',
    'nh4_2p5',
    'no3_2p5',
    'oc_2p5',
    'oin_2p5',
    'so4_2p5'
]

# Control scenario: every input fraction is held at 1.0.
fraction_res = 1.0
fraction_ind = 1.0
fraction_tra = 1.0
fraction_agr = 1.0
fraction_ene = 1.0

# Single-row input matrix, shape (1, 5), in the order RES, IND, TRA, AGR, ENE.
custom_inputs = np.array([
    fraction_res,
    fraction_ind,
    fraction_tra,
    fraction_agr,
    fraction_ene
]).reshape(1, -1)

# NaN-filled template for the 0.25-degree grid (580 lats x 1440 lons).
empty_values = np.empty((580, 1440))
empty_values[:] = np.nan

for output in outputs:
    emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')

    # Copy the template so that no two outputs ever share the same buffer.
    ds_custom_output = xr.DataArray(
        empty_values.copy(),
        dims=('lat', 'lon'),
        coords={'lat': np.arange(-60, 85, 0.25), 'lon': np.arange(-180, 180, 0.25)}
    )

    for emulator_file in emulator_files:
        # The gridcell coordinates are encoded in the emulator file name.
        # NOTE(review): this pattern does not capture a leading minus sign, so
        # negative coordinates would be read as positive -- confirm that no
        # emulator file refers to a negative latitude or longitude.
        lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
        emulator = joblib.load(emulator_file)

        try:
            custom_output = emulator.predict(custom_inputs)
            ds_custom_output = xr.where(
                (ds_custom_output.coords['lat'] == lat) & (ds_custom_output.coords['lon'] == lon),
                custom_output,
                ds_custom_output
            )
        except Exception as error:
            # Best effort: a failing gridcell is left as NaN, but it is
            # reported instead of being silently discarded (and interrupts
            # such as Ctrl-C are no longer swallowed).
            print('skipping ' + emulator_file + ': ' + repr(error))

    ds_custom_output.name = output
    ds_custom_output.to_netcdf(
        path + 'summary/ds_ctl_' + output + '.nc'
    )

#### create individual 10% emulators while holding other inputs at 1.0

In [None]:
output = 'PM2_5_DRY'
#output = 'o3'

emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')

# Input columns, in order, and the scaling fractions swept for each of them.
sector_tags = ('RES', 'IND', 'TRA', 'AGR', 'ENE')
scalings = np.linspace(0, 1.5, 16)

# One-at-a-time design: sweep one column over `scalings` while the other
# columns stay at 1.0, then stack the five sweeps (RES first, ENE last).
sweeps = []
for sector_index in range(len(sector_tags)):
    sweep = np.ones((scalings.size, len(sector_tags)))
    sweep[:, sector_index] = scalings
    sweeps.append(sweep)
matrix_stacked = np.vstack(sweeps)

grid_lats = np.arange(-60, 85, 0.25)
grid_lons = np.arange(-180, 180, 0.25)

# The all-ones scenario appears once per sweep; remember which files were
# already produced so that it is only computed and written a single time.
written_filenames = set()

for matrix in matrix_stacked:
    custom_inputs = matrix.reshape(1, -1)
    filename = '_'.join(
        tag + str(np.round(value, decimals=1))
        for tag, value in zip(sector_tags, custom_inputs[0])
    )
    if filename in written_filenames:
        continue
    written_filenames.add(filename)

    # Fresh NaN grid for every scenario (no buffer shared between scenarios).
    ds_custom_output = xr.DataArray(
        np.full((grid_lats.size, grid_lons.size), np.nan),
        dims=('lat', 'lon'),
        coords={'lat': grid_lats, 'lon': grid_lons}
    )

    for emulator_file in emulator_files:
        # The gridcell coordinates are encoded in the emulator file name.
        # NOTE(review): the pattern ignores a leading minus sign -- confirm
        # that no emulator file refers to a negative latitude or longitude.
        lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
        emulator = joblib.load(emulator_file)
        custom_output = emulator.predict(custom_inputs)
        ds_custom_output = xr.where(
            (ds_custom_output.coords['lat'] == lat) & (ds_custom_output.coords['lon'] == lon),
            custom_output,
            ds_custom_output
        )

    ds_custom_output.name = output

    # `path` already ends with a slash, so no leading slash before 'summary'.
    ds_custom_output.to_netcdf(
        path + 'summary/ds_' + filename + '_' + output + '.nc'
    )

#### create 10% emulators - pangeo
Parallelise over the custom inputs: these are independent of one another, whereas the gridcell predictions for a given input all feed into the same dataset and are therefore dependent.

Dask bag good practices
- no inter-worker communication
    - use the bag to load data
- minimise IO
- cloudpickle functions

Dask bag features
- immutable
- multi-processing (by default)
- multiple bags need identical partitions (number and size)

In [87]:
# Full factorial design: 16 scaling levels (0.0 to 1.5) for each of 5 inputs.
n_inputs = 5
levels = np.linspace(0, 1.5, 16)

mesh = np.meshgrid(*([levels] * n_inputs))
matrix_stacked = np.array(mesh).T.reshape(-1, n_inputs)

# One (1, 5) row vector per scenario, ready to be passed to `predict`.
custom_inputs = []
for row in matrix_stacked:
    custom_inputs.append(row.reshape(1, -1))

In [None]:
# remove duplicates of ones already completed
# Drop the scenarios that already have a summary file, so a run can resume.
# Only files written for the current `output` count as completed; files of
# other outputs share the same directory and naming scheme.
custom_inputs_completed_filenames = glob.glob(path + 'summary/ds_*_' + output + '.nc')
custom_inputs_completed_list = []
for custom_inputs_completed_filename in custom_inputs_completed_filenames:
    # Parse the base name only, so digits in the directory cannot interfere.
    completed_values = [
        float(item)
        for item in re.findall(r'\d+\.\d+', os.path.basename(custom_inputs_completed_filename))
    ]
    # Scenario files carry exactly five fractions; anything else in the
    # directory (e.g. the control file) is not a scenario.
    if len(completed_values) == 5:
        custom_inputs_completed_list.append(completed_values)

# A set of tuples gives constant-time membership tests instead of scanning a
# list for each of the ~10^6 candidate scenarios.
completed_lookup = {tuple(item) for item in custom_inputs_completed_list}

# Round to the single decimal used in the file names. This replaces parsing
# the printed representation of each array with a regular expression.
if len(custom_inputs) > 0:
    custom_inputs_list = np.round(np.vstack(custom_inputs), decimals=1).tolist()
else:
    custom_inputs_list = []

custom_inputs = [
    np.array(item).reshape(1, -1)
    for item in custom_inputs_list
    if tuple(item) not in completed_lookup
]

In [227]:
# Small subset (the first 20 scenarios) used for the timing experiments.
custom_inputs_sample = custom_inputs[:20]

In [228]:
# Output to emulate and the emulator files found for it.
output = 'PM2_5_DRY'
emulator_pattern = '{}{}/emulator_{}_*.joblib'.format(path, output, output)
emulator_files = glob.glob(emulator_pattern)

def load_emulator(emulator_file):
    """Load one emulator together with the coordinates found in its path.

    Parameters
    ----------
    emulator_file : str
        Path of a joblib file; the path must contain exactly two decimal
        numbers, read in order as latitude and longitude.

    Returns
    -------
    tuple
        ``(lat, lon, emulator)``.

    Raises
    ------
    ValueError
        If the path does not contain exactly two decimal numbers.
    """
    # NOTE(review): the pattern ignores a leading minus sign -- confirm that
    # no emulator file refers to a negative latitude or longitude.
    coordinates = re.findall(r'\d+\.\d+', emulator_file)
    lat, lon = map(float, coordinates)
    return lat, lon, joblib.load(emulator_file)


def custom_predict(emulator, custom_input):
    """Predict one scenario with one loaded emulator.

    Parameters
    ----------
    emulator : tuple
        ``(lat, lon, model)`` as returned by ``load_emulator``.
    custom_input : array-like
        Input passed unchanged to ``model.predict``.

    Returns
    -------
    tuple
        ``(lat, lon, custom_input, prediction)`` where ``prediction`` is the
        first element of ``model.predict(custom_input)``.
    """
    lat, lon, model = emulator
    prediction = model.predict(custom_input)
    return lat, lon, custom_input, prediction[0]

In [229]:
# Round-trip both functions through cloudpickle (serialise) and pickle
# (deserialise) before they are mapped over the dask bag.
pickled_load_emulator = cloudpickle.dumps(load_emulator)
pickled_custom_predict = cloudpickle.dumps(custom_predict)

depickled_load_emulator = pickle.loads(pickled_load_emulator)
depickled_custom_predict = pickle.loads(pickled_custom_predict)

In [230]:
# Bag of emulator paths; each element is mapped through the loader.
bag_emulator_files = db.from_sequence(emulator_files)
bag_emulators = bag_emulator_files.map(depickled_load_emulator)

In [231]:
%%time
results = bag_emulators.map(depickled_custom_predict, custom_input).compute()

CPU times: user 405 ms, sys: 16 ms, total: 421 ms
Wall time: 1.55 s


In [232]:
%%time
results = []
for custom_input in custom_inputs_sample:
    results.append(bag_emulators.map(depickled_custom_predict, custom_input).compute())

CPU times: user 8.65 s, sys: 264 ms, total: 8.91 s
Wall time: 31.5 s


Cannot use multiple bags, because the gridcells (15,372) and the inputs (1,000,000) cannot be given identical partitions.

If run in serial at 400 ms per input, 1,000,000 inputs would take roughly 110 hours (400,000 s).

In [233]:
# The coordinates depend only on the file name, so parse them once per file
# rather than once per (scenario, file) pair.
# NOTE(review): the pattern ignores a leading minus sign -- confirm that no
# emulator file refers to a negative latitude or longitude.
emulator_gridcells = []
for emulator_file in emulator_files:
    lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
    emulator_gridcells.append((lat, lon, emulator_file))

# One task description per (scenario, gridcell) pair, scenario-major order.
mp_options = []
for custom_input in custom_inputs_sample:
    for lat, lon, emulator_file in emulator_gridcells:
        mp_options.append({
            'custom_input': custom_input,
            'lat': lat,
            'lon': lon,
            'emulator_file': emulator_file,
        })

In [235]:
def custom_predict(options):
    """Load one emulator from disk and predict one scenario with it.

    This definition replaces the earlier two-argument ``custom_predict``.

    Parameters
    ----------
    options : dict
        Must provide the keys ``'custom_input'``, ``'emulator_file'``,
        ``'lat'`` and ``'lon'``.

    Returns
    -------
    tuple
        ``(lat, lon, custom_input, prediction)`` where ``prediction`` is the
        first element returned by the emulator's ``predict``.
    """
    scenario = options['custom_input']
    model = joblib.load(options['emulator_file'])
    prediction = model.predict(scenario)[0]
    return options['lat'], options['lon'], scenario, prediction

In [236]:
# Serialise the task function with cloudpickle, then restore it with pickle.
pickled_custom_predict = cloudpickle.dumps(custom_predict)
depickled_custom_predict = pickle.loads(
    pickled_custom_predict
)

In [237]:
# Bag with one element per task description, mapped through the predictor.
bag_mp_options = db.from_sequence(mp_options).map(depickled_custom_predict)

In [238]:
%%time
results = bag_mp_options.compute()

CPU times: user 1min 12s, sys: 2.97 s, total: 1min 15s
Wall time: 1min 42s


Afterwards, create the datasets from the results.

In [None]:
# Persist the raw results next to the emulator data.
results_file = path + 'results_' + output + '.joblib'
joblib.dump(results, results_file)

In [None]:
# Reload previously saved results for the current output.
results_file = path + 'results_' + output + '.joblib'
results = joblib.load(results_file)

In [220]:
# Split the result records into parallel lists, one entry per record.
results_lats = []
results_lons = []
results_custom_inputs = []
results_custom_outputs = []
for record in results:
    results_lats.append(record[0])
    results_lons.append(record[1])
    # First row of the stored input array.
    results_custom_inputs.append(record[2][0])
    results_custom_outputs.append(record[3])

In [None]:
# Write one NetCDF file per scenario from the flat, per-record result lists
# (`results_lats`, `results_lons`, `results_custom_inputs` and
# `results_custom_outputs` all have one entry per (scenario, gridcell) pair).
sector_tags = ('RES', 'IND', 'TRA', 'AGR', 'ENE')
grid_lats = np.arange(-60, 85, 0.25)
grid_lons = np.arange(-180, 180, 0.25)

# Group the record indices by scenario label; dicts keep insertion order, so
# scenarios are written in the order in which they first appear.
record_indices_by_filename = {}
for record_index, custom_input in enumerate(results_custom_inputs):
    # np.ravel accepts both a flat row of five fractions and a (1, 5) array.
    filename = '_'.join(
        tag + str(np.round(value, decimals=1))
        for tag, value in zip(sector_tags, np.ravel(custom_input))
    )
    record_indices_by_filename.setdefault(filename, []).append(record_index)

for filename, record_indices in record_indices_by_filename.items():
    # A fresh NaN grid per scenario: xarray wraps the array it is given, so
    # reusing one array would let `.loc` assignments leak between scenarios.
    ds_custom_output = xr.DataArray(
        np.full((grid_lats.size, grid_lons.size), np.nan),
        dims=('lat', 'lon'),
        coords={
            'lat': grid_lats,
            'lon': grid_lons
        }
    )

    print(filename)
    for record_index in record_indices:
        lat = results_lats[record_index]
        lon = results_lons[record_index]
        ds_custom_output.loc[dict(lat=lat, lon=lon)] = results_custom_outputs[record_index]

    ds_custom_output.name = output
    ds_custom_output.to_netcdf(
        path + 'summary/ds_' + filename + '_' + output + '.nc'
    )