In [None]:
import glob
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from tpot.export_utils import set_param_recursive
import xarray as xr
from SALib.sample import saltelli
from SALib.analyze import sobol
import joblib
import re
import os
import dask
import dask.bag as db
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cartopy.crs as ccrs
from cartopy.feature import ShapelyFeature
from cartopy.io.shapereader import Reader
# Matplotlib defaults applied to every figure drawn after this cell runs.
params = {
    # The LaTeX preamble must be a single string; a list of strings is
    # deprecated in recent Matplotlib releases. It is only consulted when
    # 'text.usetex' is True (it is False below).
    'text.latex.preamble': '\\usepackage{gensymb}',
    'axes.grid': False,
    'savefig.dpi': 700,
    'font.size': 12,
    'text.usetex': False,
    'figure.figsize': [5, 5],
    'font.family': 'serif',
}
matplotlib.rcParams.update(params)

In [None]:
from dask_jobqueue import SGECluster
from dask.distributed import Client

# Keyword arguments handed unchanged to SGECluster.
cluster_options = dict(
    walltime='12:00:00',
    memory='2 G',
    resource_spec='h_vmem=2G',
    scheduler_options=dict(dashboard_address=':5757'),
)
cluster = SGECluster(**cluster_options)

# Attach a distributed client to the cluster created above.
client = Client(cluster)

In [None]:
# Request 150 jobs from the cluster.
cluster.scale(jobs=150)

In [None]:
# Close the client, then the cluster.
# NOTE(review): this cell appears before the cells that call `.compute()` on
# Dask bags, so running the notebook top to bottom closes the cluster before
# that work starts. Presumably it is meant to be run manually at the end - confirm.
client.close()
cluster.close()

In [None]:
output = 'PM2_5_DRY'
path = '/nobackup/earlacoa/machinelearning/data/'

# Unpickle the training data stored under `path`.
with open(path + 'dict_train.pickle', 'rb') as pickle_file:
    dict_train = pickle.load(pickle_file)

# Concatenate the unpickled object into one frame with a fresh index, then
# list each distinct [lat, lon] pair once.
df_train = pd.concat(dict_train, ignore_index=True)
gridcells = (
    df_train[['lat', 'lon']]
    .drop_duplicates()
    .values
    .tolist()
)

#### create control using emulators

In [None]:
# Output variables for which a control map is written; each one has its own
# directory of per-gridcell emulator files under `path`.
outputs = [
    'PM2_5_DRY',
    'o3',
    'AOD550_sfc',
    'asoaX_2p5',
    'bc_2p5',
    'bsoaX_2p5',
    'nh4_2p5',
    'no3_2p5',
    'oc_2p5',
    'oin_2p5',
    'so4_2p5'
]

# Control scenario: all five scaling fractions are held at 1.0.
fraction_res = 1.0
fraction_ind = 1.0
fraction_tra = 1.0
fraction_agr = 1.0
fraction_ene = 1.0

# Single row of five inputs, in the order the emulators are called with.
custom_inputs = np.array([
    fraction_res,
    fraction_ind,
    fraction_tra,
    fraction_agr,
    fraction_ene
]).reshape(1, -1)

# NaN-filled 580 x 1440 template matching the lat/lon coordinates below.
empty_values = np.full((580, 1440), np.nan)

for output in outputs:
    emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')

    ds_custom_output = xr.DataArray(
        empty_values,
        dims=('lat', 'lon'),
        coords={'lat': np.arange(-60, 85, 0.25), 'lon': np.arange(-180, 180, 0.25)}
    )

    for emulator_file in emulator_files:
        # NOTE(review): the pattern carries no sign, so a negative coordinate
        # in a file name would be read as positive - confirm the file naming.
        lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
        emulator = joblib.load(emulator_file)

        try:
            custom_output = emulator.predict(custom_inputs)
            # Write the prediction into the one cell whose coordinates match.
            ds_custom_output = xr.where(
                (ds_custom_output.coords['lat'] == lat) & (ds_custom_output.coords['lon'] == lon),
                custom_output,
                ds_custom_output
            )
        except Exception as error:
            # Best effort: a failing gridcell stays NaN, but the failure is
            # reported rather than silently discarded, and Ctrl-C still works.
            print('skipping ' + emulator_file + ': ' + repr(error))

    ds_custom_output.name = output
    ds_custom_output.to_netcdf(
        path + 'summary/ds_ctl_' + output + '.nc'
    )

#### create individual 10% emulators while holding other inputs at 1.0

In [None]:
output = 'PM2_5_DRY'
#output = 'o3'

emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')

# NaN-filled 580 x 1440 template matching the lat/lon coordinates below.
empty_values = np.full((580, 1440), np.nan)

# One-at-a-time design: each of the five inputs in turn sweeps 0.0-1.5 in
# 16 steps while the other four stay at 1.0 (5 x 16 = 80 rows in total).
levels = np.linspace(0, 1.5, 16)
matrices = []
for column in range(5):
    sweep = np.ones((levels.size, 5))
    sweep[:, column] = levels
    matrices.append(sweep)
matrix1, matrix2, matrix3, matrix4, matrix5 = matrices
matrix_stacked = np.vstack(matrices)

# Load each emulator once and keep it in memory, instead of re-reading every
# file from disk for every scenario row.
loaded_emulators = []
for emulator_file in emulator_files:
    # NOTE(review): the pattern carries no sign, so a negative coordinate in a
    # file name would be read as positive - confirm the file naming.
    lat, lon = [float(item) for item in re.findall(r'\d+\.\d+', emulator_file)]
    loaded_emulators.append((lat, lon, joblib.load(emulator_file)))

labels = ['RES', 'IND', 'TRA', 'AGR', 'ENE']

for matrix in matrix_stacked:
    custom_inputs = matrix.reshape(1, -1)
    # e.g. RES0.5_IND1.0_TRA1.0_AGR1.0_ENE1.0
    filename = '_'.join(
        label + str(np.round(custom_inputs[0][position], decimals=1))
        for position, label in enumerate(labels)
    )

    ds_custom_output = xr.DataArray(
        empty_values,
        dims=('lat', 'lon'),
        coords={'lat': np.arange(-60, 85, 0.25), 'lon': np.arange(-180, 180, 0.25)}
    )

    for lat, lon, emulator in loaded_emulators:
        custom_output = emulator.predict(custom_inputs)
        # Write the prediction into the one cell whose coordinates match.
        ds_custom_output = xr.where(
            (ds_custom_output.coords['lat'] == lat) & (ds_custom_output.coords['lon'] == lon),
            custom_output,
            ds_custom_output
        )

    ds_custom_output.name = output

    # `path` already ends with '/', so no leading slash before 'summary'.
    ds_custom_output.to_netcdf(
        path + 'summary/ds_' + filename + '_' + output + '.nc'
    )

#### create 10% emulators - pangeo
Parallelise over the custom inputs: these are independent of one another, whereas the gridcells of a single dataset are dependent.

In [None]:
# Full-factorial design: every combination of the 16 levels (0.0-1.5) across
# the five inputs, one combination per row.
levels = np.linspace(0, 1.5, 16)
grids = np.meshgrid(levels, levels, levels, levels, levels)
matrix_stacked = np.array(grids).T.reshape(-1, 5)

# One (1, 5) array per scenario, the shape passed to `emulator.predict`.
custom_inputs = []
for row in matrix_stacked:
    custom_inputs.append(row.reshape(1, -1))

In [None]:
# Remove scenarios that already have a summary file for the current `output`.
# Only ds_RES..._<output>.nc files are considered, so a scenario finished for
# a different output (or a ds_ctl_* control file) does not count as completed.
custom_inputs_completed_filenames = glob.glob(path + 'summary/ds_RES*_' + output + '.nc')
custom_inputs_completed_list = []
for custom_inputs_completed_filename in custom_inputs_completed_filenames:
    # The five decimals in the file name are the scenario's inputs.
    custom_inputs_completed_list.append(
        [float(item) for item in re.findall(r'\d+\.\d+', os.path.basename(custom_inputs_completed_filename))]
    )

# Set of tuples gives constant-time membership checks in the filter below.
custom_inputs_completed = {tuple(item) for item in custom_inputs_completed_list}

custom_inputs_list = []
for custom_input in custom_inputs:
    # Round to one decimal (the precision used in the file names) instead of
    # parsing numpy's printed representation, which depends on print options.
    custom_inputs_list.append(
        np.round(np.asarray(custom_input, dtype=float), decimals=1).ravel().tolist()
    )

custom_inputs = [
    np.array(item).reshape(1, -1)
    for item in custom_inputs_list
    if tuple(item) not in custom_inputs_completed
]

In [None]:
# Save the remaining scenarios to a per-output joblib file under `path`.
joblib.dump(custom_inputs, path + 'custom_inputs_' + output + '.joblib')

In [None]:
emulator_files = glob.glob(path + output + '/emulator_' + output + '_*.joblib')

# Map '<lat>_<lon>' keys to the emulator loaded from the matching file; the
# two coordinates are the decimal numbers found in the file path.
emulators = {}
for emulator_file in emulator_files:
    coordinates = re.findall(r'\d+\.\d+', emulator_file)
    lat, lon = (float(coordinate) for coordinate in coordinates)
    emulators[str(lat) + '_' + str(lon)] = joblib.load(emulator_file)

In [None]:
# Save the whole emulator dictionary to a single per-output joblib file.
joblib.dump(emulators, path + 'emulators_' + output + '.joblib')

In [None]:
# Reload the scenario list from its per-output joblib file.
custom_inputs = joblib.load(path + 'custom_inputs_' + output + '.joblib')

In [None]:
# Reload the emulator dictionary from its per-output joblib file.
emulators = joblib.load(path + 'emulators_' + output + '.joblib')

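Testing different options to find the most efficient approach. Each variant below redefines `custom_predict`, so only the most recently executed definition is in effect.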

In [None]:
def custom_predict(custom_input):
    """Predict one scenario at every gridcell, serially.

    Reads the module-level `gridcells` (sequence of (lat, lon) pairs) and
    `emulators` (dict keyed by '<lat>_<lon>').

    Parameters
    ----------
    custom_input
        Input passed unchanged to each emulator's `predict`.

    Returns
    -------
    tuple of four lists, aligned by gridcell
        Latitudes, longitudes, `custom_input` repeated once per gridcell, and
        the first element of each emulator's prediction.
    """
    lats, lons, repeated_inputs, predictions = [], [], [], []
    for lat, lon in gridcells:
        model = emulators[str(lat) + '_' + str(lon)]
        prediction = model.predict(custom_input)[0]
        lats.append(lat)
        lons.append(lon)
        repeated_inputs.append(custom_input)
        predictions.append(prediction)

    return lats, lons, repeated_inputs, predictions

In [None]:
@dask.delayed
def custom_predict_per_gridcell(gridcell, custom_input):
    """Delayed prediction of one scenario at one gridcell.

    Looks up the emulator in the module-level `emulators` dict using the key
    '<lat>_<lon>' and returns (lat, lon, custom_input, prediction), where
    prediction is the first element of what `predict` returns.
    """
    lat, lon = gridcell
    key = str(lat) + '_' + str(lon)
    prediction = emulators[key].predict(custom_input)[0]
    return lat, lon, custom_input, prediction



@dask.delayed
def custom_predict(custom_input):
    """Delayed prediction of one scenario at every gridcell.

    The previous body referenced an undefined `gridcell` and tried to unpack
    the Delayed object returned by `custom_predict_per_gridcell`, which cannot
    be iterated. The per-gridcell work is done inline here rather than through
    that delayed helper, because calling a delayed function from inside
    another delayed function is discouraged by Dask.

    Reads the module-level `gridcells` and `emulators`.

    Parameters
    ----------
    custom_input
        Input passed unchanged to each emulator's `predict`.

    Returns
    -------
    tuple of four lists, aligned by gridcell
        Latitudes, longitudes, `custom_input` repeated once per gridcell, and
        the first element of each emulator's prediction.
    """
    lats = []
    lons = []
    custom_inputs = []
    custom_outputs = []
    for gridcell in gridcells:
        lat, lon = gridcell
        emulator = emulators[str(lat) + '_' + str(lon)]
        custom_output = emulator.predict(custom_input)[0]
        lats.append(lat)
        lons.append(lon)
        custom_inputs.append(custom_input)
        custom_outputs.append(custom_output)

    return lats, lons, custom_inputs, custom_outputs

In [None]:
# Two scenarios (indices 200 and 201) used as a small sample by the cells below.
custom_inputs_sample = custom_inputs[200:202]

In [None]:
# Build a Dask bag from the sampled scenarios and apply `custom_predict` to
# each partition of it.
# NOTE(review): map_partitions hands the function a whole partition (a
# sequence of inputs), not a single input - confirm that is what
# `custom_predict` expects here.
bag_custom_inputs = db.from_sequence(custom_inputs_sample)
bag_custom_inputs = bag_custom_inputs.map_partitions(custom_predict)

In [None]:
%%time
# Run the bag computation and keep the collected output.
results = bag_custom_inputs.compute()

In [None]:
%%time
# Serial loop over the sampled scenarios; the return value is discarded.
# NOTE(review): this passes three arguments (gridcells, custom_input,
# emulators), but every `custom_predict` definition visible in this notebook
# takes one or two parameters - presumably written for an earlier signature; confirm.
for custom_input in custom_inputs_sample:
    custom_predict(gridcells, custom_input, emulators)

In [None]:
%load_ext line_profiler
%lprun -f custom_predict custom_predict(gridcells, custom_inputs_sample[0], emulators)

In [None]:
# Narrow the sample to a single scenario (index 200); still a list of length one.
custom_inputs_sample = custom_inputs[200:201]

In [None]:
# Build a Dask bag of gridcells and map `custom_predict` over its elements,
# passing `emulators` and `custom_inputs_sample` as extra positional arguments.
# NOTE(review): no `custom_predict` definition visible in this notebook takes
# (gridcell, emulators, custom_inputs) - confirm which signature this targets.
bag_gridcells = db.from_sequence(gridcells)
bag_gridcells = bag_gridcells.map(custom_predict, emulators, custom_inputs_sample)

In [None]:
# Drop the previous results; raises NameError if `results` is not defined.
del results

In [None]:
%%time
# Run the per-gridcell bag computation and keep the collected output.
results = bag_gridcells.compute()

In [None]:
%%time
# Serial loop over all gridcells; the return value is discarded. After the
# loop the module-level name `gridcell` stays bound to the last gridcell.
for gridcell in gridcells:
    custom_predict(gridcell)

In [None]:
%load_ext line_profiler
%lprun -f custom_predict custom_predict(gridcells[0])

In [None]:
# Every (scenario, gridcell) combination as a two-item list:
# index 0 is the custom input, index 1 is the gridcell.
gridcells_and_inputs = [[custom_input, gridcell] for custom_input in custom_inputs_sample for gridcell in gridcells]

In [None]:
# Build a Dask bag from the [custom_input, gridcell] pairs.
bag_gridcells_and_inputs = db.from_sequence(gridcells_and_inputs)

In [None]:
def custom_predict(gridcell_and_input):
    """Predict one scenario at one gridcell from a [custom_input, gridcell] pair.

    Parameters
    ----------
    gridcell_and_input
        Sequence whose item 0 is the input passed to `predict` and whose
        item 1 is the (lat, lon) pair of the gridcell.

    Returns
    -------
    tuple
        (lat, lon, custom_input, prediction), where prediction is the first
        element of what the emulator's `predict` returns.

    Raises
    ------
    KeyError
        If the module-level `emulators` dict has no '<lat>_<lon>' entry.
    """
    custom_input = gridcell_and_input[0]
    # Take the gridcell from the argument itself, not from an unrelated
    # module-level `gridcell` left behind by another cell.
    lat, lon = gridcell_and_input[1]
    emulator = emulators[str(lat) + '_' + str(lon)]
    custom_output = emulator.predict(custom_input)[0]
    return lat, lon, custom_input, custom_output

In [None]:
# Apply `custom_predict` to every [custom_input, gridcell] pair in the bag.
bag_gridcells_and_inputs = bag_gridcells_and_inputs.map(custom_predict)

In [None]:
%%time
# Run the pairwise bag computation and keep the collected output.
results = bag_gridcells_and_inputs.compute()

In [None]:
# Build the gridcell bag and the scenario bag with the same number of partitions (2).
partitions=2
bag_gridcells = db.from_sequence(gridcells, npartitions=partitions)
bag_custom_inputs = db.from_sequence(custom_inputs_sample, npartitions=partitions)

In [None]:
def custom_predict(gridcell):
    """Predict the module-level `custom_input` at the first item of `gridcell`.

    Only `gridcell[0]` is used and it is unpacked as (lat, lon), so the
    argument is presumably a sequence of gridcells such as a bag partition -
    TODO confirm. The emulator comes from the module-level `emulators` dict.

    Returns (lat, lon, custom_input, prediction), where prediction is the
    first element of what `predict` returns.
    """
    first_cell = gridcell[0]
    lat, lon = first_cell
    key = str(lat) + '_' + str(lon)
    model = emulators[key]
    prediction = model.predict(custom_input)[0]
    return lat, lon, custom_input, prediction

In [None]:
def custom_predict(gridcell, custom_input):
    """Predict `custom_input` at the first item of `gridcell`.

    Only `gridcell[0]` is used and it is unpacked as (lat, lon), so the first
    argument is presumably a sequence of gridcells such as a bag partition -
    TODO confirm. The emulator comes from the module-level `emulators` dict.

    Returns (lat, lon, custom_input, prediction), where prediction is the
    first element of what `predict` returns.
    """
    first_cell = gridcell[0]
    lat, lon = first_cell
    key = str(lat) + '_' + str(lon)
    prediction = emulators[key].predict(custom_input)[0]
    return lat, lon, custom_input, prediction

In [None]:
# Apply `custom_predict` partition-wise, with `bag_gridcells` as an extra argument.
# NOTE(review): here the scenario partition is the first argument and the
# gridcell bag the second, while `custom_predict(gridcell, custom_input)`
# declares them the other way round - confirm the intended order.
bag_combined = bag_custom_inputs.map_partitions(custom_predict, bag_gridcells)

In [None]:
%%time
# Run the combined bag computation and keep the collected output.
results = bag_combined.compute()

In [None]:
# Split each result tuple into four parallel lists. For the inputs, only the
# first element of the stored value (item 2) is kept.
results_lats = []
results_lons = []
results_custom_inputs = []
results_custom_outputs = []
for result in results:
    results_lats.append(result[0])
    results_lons.append(result[1])
    results_custom_inputs.append(result[2][0])
    results_custom_outputs.append(result[3])

In [None]:
# NaN-filled 580 x 1440 template; it is copied per scenario below and is
# never written to itself.
empty_values = np.full((580, 1440), np.nan)

labels = ['RES', 'IND', 'TRA', 'AGR', 'ENE']

for custom_input_index, custom_input in enumerate(results_custom_inputs):
    # e.g. RES0.5_IND1.0_TRA1.0_AGR1.0_ENE1.0
    filename = '_'.join(
        label + str(np.round(custom_input[0][position], decimals=1))
        for position, label in enumerate(labels)
    )

    # Use a fresh copy for each scenario: the `.loc` assignment below writes
    # in place, and a DataArray built directly on `empty_values` would share
    # that buffer with every other scenario.
    ds_custom_output = xr.DataArray(
        empty_values.copy(),
        dims=('lat', 'lon'),
        coords={
            'lat': np.arange(-60, 85, 0.25),
            'lon': np.arange(-180, 180, 0.25)
        }
    )

    print(filename)
    # The three per-scenario lists are aligned by gridcell.
    scenario_cells = zip(
        results_lats[custom_input_index],
        results_lons[custom_input_index],
        results_custom_outputs[custom_input_index],
    )
    for lat, lon, custom_output in scenario_cells:
        ds_custom_output.loc[dict(lat=lat, lon=lon)] = custom_output

    ds_custom_output.name = output
    ds_custom_output.to_netcdf(
        path + 'summary/ds_' + filename + '_' + output + '.nc'
    )