In [1]:
import glob
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from tpot.export_utils import set_param_recursive
import xarray as xr
from SALib.sample import saltelli
from SALib.analyze import sobol
import joblib
import re
import os
import dask
import dask.bag as db
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cartopy.crs as ccrs
from cartopy.feature import ShapelyFeature
from cartopy.io.shapereader import Reader
params = {
    'text.latex.preamble': ['\\usepackage{gensymb}'],
    'axes.grid': False,
    'savefig.dpi': 700,
    'font.size': 12,
    'text.usetex': False,
    'figure.figsize': [5, 5],
    'font.family': 'serif',
}
matplotlib.rcParams.update(params)

In [2]:
from dask_jobqueue import SGECluster
from dask.distributed import Client

cluster = SGECluster(
    walltime='12:00:00', 
    memory='2 G',
    resource_spec='h_vmem=2G',
    scheduler_options={
        'dashboard_address': ':5757',
    },
    project='admiralty'
)

client = Client(cluster)

In [3]:
cluster.scale(jobs=50)

In [None]:
client.close()
cluster.close()

In [4]:
output = 'PM2_5_DRY'
path = '/nobackup/earlacoa/machinelearning/data/'

with open(path + 'dict_train.pickle', 'rb') as ds:
    dict_train = pickle.load(ds)
    
df_train = pd.concat(dict_train, ignore_index=True)
gridcells = df_train[['lat', 'lon']].drop_duplicates().values.tolist()

#### create control using emulators

In [None]:
outputs = [
    'PM2_5_DRY',
    'o3',
    'AOD550_sfc',
    'asoaX_2p5',
    'bc_2p5',
    'bsoaX_2p5',
    'nh4_2p5',
    'no3_2p5',
    'oc_2p5',
    'oin_2p5',
    'so4_2p5'
]

fraction_res = 1.0
fraction_ind = 1.0
fraction_tra = 1.0
fraction_agr = 1.0
fraction_ene = 1.0

custom_inputs = np.array([
    fraction_res,
    fraction_ind,
    fraction_tra,
    fraction_agr,
    fraction_ene
]).reshape(1, -1)

empty_values = np.empty((580, 1440))
empty_values[:] = np.nan

for output in outputs:
    emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')
    
    ds_custom_output = xr.DataArray(
        empty_values, 
        dims=('lat', 'lon'), 
        coords={'lat': np.arange(-60, 85, 0.25), 'lon': np.arange(-180, 180, 0.25)}
    )
    
    for emulator_file in emulator_files:
        lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
        emulator = joblib.load(emulator_file)
        
        try:
            custom_output = emulator.predict(custom_inputs)
            ds_custom_output = xr.where(
                (ds_custom_output.coords['lat'] == lat) & (ds_custom_output.coords['lon'] == lon),
                custom_output,
                ds_custom_output
            )
        except:
            RuntimeError
    
    ds_custom_output.name = output
    ds_custom_output.to_netcdf(
        path + 'summary/ds_ctl_' + output + '.nc'
    )

#### create individual 10% emulators while holding other inputs at 1.0

In [None]:
output = 'PM2_5_DRY'
#output = 'o3'

emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')

empty_values = np.empty((580, 1440))
empty_values[:] = np.nan

matrix1 = np.array(np.meshgrid(np.linspace(0, 1.5, 16), 1, 1, 1, 1)).T.reshape(-1, 5)
matrix2 = np.array(np.meshgrid(1, np.linspace(0, 1.5, 16), 1, 1, 1)).T.reshape(-1, 5)
matrix3 = np.array(np.meshgrid(1, 1, np.linspace(0, 1.5, 16), 1, 1)).T.reshape(-1, 5)
matrix4 = np.array(np.meshgrid(1, 1, 1, np.linspace(0, 1.5, 16), 1)).T.reshape(-1, 5)
matrix5 = np.array(np.meshgrid(1, 1, 1, 1, np.linspace(0, 1.5, 16))).T.reshape(-1, 5)
matrix_stacked = np.vstack((matrix1, matrix2, matrix3, matrix4, matrix5))

for matrix in matrix_stacked:
    custom_inputs = matrix.reshape(1, -1)
    filename = 'RES' + str(np.round(custom_inputs[0][0], decimals=1)) \
                + '_IND' + str(np.round(custom_inputs[0][1], decimals=1)) \
                + '_TRA' + str(np.round(custom_inputs[0][2], decimals=1)) \
                + '_AGR' + str(np.round(custom_inputs[0][3], decimals=1)) \
                + '_ENE' + str(np.round(custom_inputs[0][4], decimals=1))

    ds_custom_output = xr.DataArray(
        empty_values, 
        dims=('lat', 'lon'), 
        coords={'lat': np.arange(-60, 85, 0.25), 'lon': np.arange(-180, 180, 0.25)}
    )

    for emulator_file in emulator_files:
        lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
        emulator = joblib.load(emulator_file)
        custom_output = emulator.predict(custom_inputs)
        ds_custom_output = xr.where(
            (ds_custom_output.coords['lat'] == lat) & (ds_custom_output.coords['lon'] == lon),
            custom_output,
            ds_custom_output
        )
    
    ds_custom_output.name = output
    
    ds_custom_output.to_netcdf(
        path + '/summary/ds_' + filename + '_' + output + '.nc'
    )

#### create 10% emulators - pangeo
parallelise over the custom inputs (as these are independent, while the dataset for gridcells are dependent)

In [5]:
matrix_stacked = np.array(np.meshgrid(
    np.linspace(0, 1.5, 16), 
    np.linspace(0, 1.5, 16),
    np.linspace(0, 1.5, 16),
    np.linspace(0, 1.5, 16),
    np.linspace(0, 1.5, 16)
)).T.reshape(-1, 5)

custom_inputs = [item.reshape(1, -1) for item in matrix_stacked]

In [None]:
# remove duplicates of ones already completed
custom_inputs_completed_filenames = glob.glob(path + 'summary/ds*')
custom_inputs_completed_list = []
for custom_inputs_completed_filename in custom_inputs_completed_filenames:
    custom_inputs_completed_list.append(
        [float(item) for item in re.findall(r'\d+\.\d+', custom_inputs_completed_filename)]
    )
    
custom_inputs_list = []
for custom_input in custom_inputs:
    custom_inputs_list.append(
        [float(item) for item in re.findall(r'[0-9]\.[0-9]?', str(custom_input))]
    )
    
custom_inputs = [np.array(item).reshape(1, -1) for item in custom_inputs_list if item not in custom_inputs_completed_list]

In [None]:
joblib.dump(custom_inputs, path + 'custom_inputs_' + output + '.joblib')

In [None]:
emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')

emulators = {}

for emulator_file in emulator_files:
    lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
    emulators.update({
        str(lat) + '_' + str(lon): joblib.load(emulator_file)
    })

In [None]:
joblib.dump(emulators, path + 'emulators_' + output + '.joblib')

In [6]:
custom_inputs = joblib.load(path + 'custom_inputs_' + output + '.joblib')

In [7]:
emulators = joblib.load(path + 'emulators_' + output + '.joblib')

testing ideas

everything in one ...

In [111]:
empty_values = np.empty((580, 1440))
empty_values[:] = np.nan

def custom_predict(custom_input):  
    filename = 'RES' + str(np.round(custom_input[0][0], decimals=1)) \
                + '_IND' + str(np.round(custom_input[0][1], decimals=1)) \
                + '_TRA' + str(np.round(custom_input[0][2], decimals=1)) \
                + '_AGR' + str(np.round(custom_input[0][3], decimals=1)) \
                + '_ENE' + str(np.round(custom_input[0][4], decimals=1))
    
    ds_custom_output = xr.DataArray(
        empty_values, 
        dims=('lat', 'lon'), 
        coords={
            'lat': np.arange(-60, 85, 0.25), 
            'lon': np.arange(-180, 180, 0.25)
        }
    )
    
    for gridcell in gridcells:
        lat, lon = gridcell
        emulator = emulators[str(lat) + '_' + str(lon)]
        custom_output = emulator.predict(custom_input)[0]
        ds_custom_output.loc[dict(lat=lat, lon=lon)] = custom_output
        
    
    ds_custom_output.name = output
    ds_custom_output.to_netcdf(
        path + 'summary/ds_' + filename + '_' + output + '.nc'
    )

In [112]:
custom_inputs_sample = custom_inputs[0:2]

In [114]:
%%time
for custom_input in custom_inputs_sample:
    custom_predict(custom_input)

CPU times: user 50.2 s, sys: 921 ms, total: 51.1 s
Wall time: 56 s


In [115]:
%load_ext line_profiler
%lprun -f custom_predict custom_predict(custom_inputs_sample[0])

Timer unit: 1e-06 s

Total time: 51.998 s
File: <ipython-input-111-fbcf2891a3ca>
Function: custom_predict at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
     4                                           def custom_predict(custom_input):  
     5                                               filename = 'RES' + str(np.round(custom_input[0][0], decimals=1)) \
     6                                                           + '_IND' + str(np.round(custom_input[0][1], decimals=1)) \
     7                                                           + '_TRA' + str(np.round(custom_input[0][2], decimals=1)) \
     8                                                           + '_AGR' + str(np.round(custom_input[0][3], decimals=1)) \
     9         1        117.0    117.0      0.0                  + '_ENE' + str(np.round(custom_input[0][4], decimals=1))
    10                                               
    11         1          2.0      2.0      0.0      ds_custom_outp

In [113]:
bag_custom_inputs = db.from_sequence(custom_inputs_sample)
bag_custom_inputs = bag_custom_inputs.map_partitions(custom_predict)

In [None]:
%%time
results = bag_custom_inputs.compute()

dask bag over the custom inputs only

In [98]:
def custom_predict(custom_input):  
    lats = []
    lons = []
    custom_inputs = []
    custom_outputs = []
    for gridcell in gridcells:
        lat, lon = gridcell
        emulator = emulators[str(lat) + '_' + str(lon)]
        custom_output = emulator.predict(custom_input)[0]
        lats.append(lat)
        lons.append(lon)
        custom_inputs.append(custom_input)
        custom_outputs.append(custom_output)
        
    return lats, lons, custom_inputs, custom_outputs

In [102]:
%%time
for custom_input in custom_inputs_sample:
    custom_predict(custom_input)

CPU times: user 27 s, sys: 504 ms, total: 27.5 s
Wall time: 28.5 s


In [103]:
%lprun -f custom_predict custom_predict(custom_inputs_sample[0])

Timer unit: 1e-06 s

Total time: 26.6969 s
File: <ipython-input-98-38f65a81f203>
Function: custom_predict at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def custom_predict(custom_input):  
     2         1          2.0      2.0      0.0      lats = []
     3         1        979.0    979.0      0.0      lons = []
     4         1          1.0      1.0      0.0      custom_inputs = []
     5         1          0.0      0.0      0.0      custom_outputs = []
     6     15279       8369.0      0.5      0.0      for gridcell in gridcells:
     7     15278       6797.0      0.4      0.0          lat, lon = gridcell
     8     15278      58308.0      3.8      0.2          emulator = emulators[str(lat) + '_' + str(lon)]
     9     15278   26581385.0   1739.8     99.6          custom_output = emulator.predict(custom_input)[0]
    10     15278      17955.0      1.2      0.1          lats.append(lat)
    11     15278     

In [100]:
bag_custom_inputs = db.from_sequence(custom_inputs_sample)
bag_custom_inputs = bag_custom_inputs.map_partitions(custom_predict)

In [None]:
%%time
results = bag_custom_inputs.compute()

# stalling

dask bag over the gridcells only

In [105]:
def custom_predict(gridcell):  
    lats = []
    lons = []
    custom_inputs = []
    custom_outputs = []
    
    lat, lon = gridcell
    emulator = emulators[str(lat) + '_' + str(lon)]
    lats.append(lat)
    lons.append(lon)
    
    for custom_input in custom_inputs_sample:
        custom_output = emulator.predict(custom_input)[0]
        
        custom_inputs.append(custom_input)
        custom_outputs.append(custom_output)
        
    return lats, lons, custom_inputs, custom_outputs

In [106]:
%%time
for gridcell in gridcells:
    custom_predict(gridcell)

CPU times: user 26.9 s, sys: 486 ms, total: 27.4 s
Wall time: 28.8 s


In [107]:
%lprun -f custom_predict custom_predict(gridcells[0])

Timer unit: 1e-06 s

Total time: 0.001924 s
File: <ipython-input-105-ce45e5633c31>
Function: custom_predict at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def custom_predict(gridcell):  
     2         1          2.0      2.0      0.1      lats = []
     3         1          1.0      1.0      0.1      lons = []
     4         1          1.0      1.0      0.1      custom_inputs = []
     5         1          0.0      0.0      0.0      custom_outputs = []
     6                                               
     7         1          0.0      0.0      0.0      lat, lon = gridcell
     8         1          4.0      4.0      0.2      emulator = emulators[str(lat) + '_' + str(lon)]
     9         1          1.0      1.0      0.1      lats.append(lat)
    10         1          1.0      1.0      0.1      lons.append(lon)
    11                                               
    12         3          1.0      0.3      

In [108]:
bag_gridcells = db.from_sequence(gridcells)
bag_gridcells = bag_custom_inputs.map(custom_predict)

In [None]:
%%time
results = bag_gridcells.compute()

dask bag for gridcells and custom inputs combined only

In [81]:
gridcells_and_inputs = [[custom_input, gridcell] for custom_input in custom_inputs_sample for gridcell in gridcells]

In [94]:
def custom_predict(gridcell_and_input):
    custom_input = gridcell_and_input[0]
    lat, lon = gridcell_and_input[1]
    emulator = emulators[str(lat) + '_' + str(lon)]   
    custom_output = emulator.predict(custom_input)[0]
    return lat, lon, custom_input, custom_output

In [95]:
%%time
for gridcell_and_input in gridcells_and_inputs:
    custom_predict(gridcell_and_input)

CPU times: user 26.2 s, sys: 274 ms, total: 26.5 s
Wall time: 27.6 s


In [82]:
bag_gridcells_and_inputs = db.from_sequence(gridcells_and_inputs)
bag_gridcells_and_inputs = bag_gridcells_and_inputs.map(custom_predict)

In [None]:
%%time
results = bag_gridcells_and_inputs.compute()

multiple dask bags

In [72]:
partitions=2
bag_gridcells = db.from_sequence(gridcells, npartitions=partitions)
bag_custom_inputs = db.from_sequence(custom_inputs_sample, npartitions=partitions)

In [67]:
def custom_predict(gridcell):
    lat, lon = gridcell[0]
    emulator = emulators[str(lat) + '_' + str(lon)]   
    custom_output = emulator.predict(custom_input)[0]
    return lat, lon, custom_input, custom_output

In [75]:
bag_combined = bag_custom_inputs.map_partitions(custom_predict, bag_gridcells)

In [None]:
%%time
results = bag_combined.compute()

if split up the process, then once the above dask bag is computed can add the values to a dataset

In [None]:
joblib.dump(results, path + 'results_' + output + '.joblib')

In [None]:
results = joblib.load(path + 'results_' + output + '.joblib')

In [None]:
results_lats = [item[0] for item in results]
results_lons = [item[1] for item in results]
results_custom_inputs = [item[2][0] for item in results]
results_custom_outputs = [item[3] for item in results]

In [None]:
empty_values = np.empty((580, 1440))
empty_values[:] = np.nan

for custom_input_index, custom_input in enumerate(results_custom_inputs):
    filename = 'RES' + str(np.round(custom_input[0][0], decimals=1)) \
                + '_IND' + str(np.round(custom_input[0][1], decimals=1)) \
                + '_TRA' + str(np.round(custom_input[0][2], decimals=1)) \
                + '_AGR' + str(np.round(custom_input[0][3], decimals=1)) \
                + '_ENE' + str(np.round(custom_input[0][4], decimals=1))
    
    ds_custom_output = xr.DataArray(
        empty_values, 
        dims=('lat', 'lon'), 
        coords={
            'lat': np.arange(-60, 85, 0.25), 
            'lon': np.arange(-180, 180, 0.25)
        }
    )
    
    print(filename)
    for custom_output_index, custom_output in enumerate(results_custom_outputs[custom_input_index]):
        lat = results_lats[custom_input_index][custom_output_index]
        lon = results_lons[custom_input_index][custom_output_index]
        ds_custom_output.loc[dict(lat=lat, lon=lon)] = custom_output
        
    
    ds_custom_output.name = output
    ds_custom_output.to_netcdf(
        path + 'summary/ds_' + filename + '_' + output + '.nc'
    )