In [1]:
import numpy as np
import xarray as xr
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import re
from collections import OrderedDict
import seaborn as sns

In [2]:
sns.set_style('darkgrid')
sns.set_context('notebook')

## Load data

In [3]:
var_dict = {
    'geopotential': ('z', [500]),
    'temperature': ('t', [850]),
    'specific_humidity': ('q', [500, 850]),
    '2m_temperature': ('t2m', None),
    'toa_incident_solar_radiation': ('tisr', None),
    'potential_vorticity': ('pv', [500, 850]),
    
    'constants': ['lat2d', 'orography', 'lsm']
}

In [4]:
DATADIR = '/rds/general/user/mc4117/home/WeatherBench/data/'
ds = [xr.open_mfdataset(f'{DATADIR}/{var}/*.nc', combine='by_coords') for var in var_dict.keys()]

In [5]:
ds_whole = xr.merge(ds)

In [17]:
class DataGenerator(keras.utils.Sequence):
    def __init__(self, ds, var_dict, lead_time, batch_size=32, shuffle=True, load=True, 
                 mean=None, std=None, output_vars=None):
        """
        Data generator for WeatherBench data.
        Template from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
        Args:
            ds: Dataset containing all variables
            var_dict: Dictionary of the form {'var': level}. Use None for level if data is of single level
            lead_time: Lead time in hours
            batch_size: Batch size
            shuffle: bool. If True, data is shuffled.
            load: bool. If True, datadet is loaded into RAM.
            mean: If None, compute mean from data.
            std: If None, compute standard deviation from data.
        """

        self.ds = ds
        self.var_dict = var_dict
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.lead_time = lead_time

        data = []
        level_names = []
        generic_level = xr.DataArray([1], coords={'level': [1]}, dims=['level'])
        for long_var, params in var_dict.items():
            if long_var == 'constants': 
                for var in params:
                    data.append(ds[var].expand_dims(
                        {'level': generic_level, 'time': ds.time}, (1, 0)
                    ))
                    level_names.append(var)
            else:
                var, levels = params
                try:
                    data.append(ds[var].sel(level=levels))
                    level_names += [f'{var}_{level}' for level in levels]
                except ValueError:
                    data.append(ds[var].expand_dims({'level': generic_level}, 1))
                    level_names.append(var)

        self.data = xr.concat(data, 'level').transpose('time', 'level')
        self.data['level_names'] = xr.DataArray(
            level_names, dims=['level'], coords={'level': self.data.level})
        if output_vars is None:
            self.output_idxs = range(len(dg_valid.data.level))
        else:
            self.output_idxs = [i for i, l in enumerate(self.data.level_names.values) 
                                if any([bool(re.match(o, l)) for o in output_vars])]
        
        # Normalize
        self.mean = self.data.mean(('time')).compute() if mean is None else mean
#         self.std = self.data.std('time').mean(('lat', 'lon')).compute() if std is None else std
        self.std = self.data.std(('time')).compute() if std is None else std
        self.data = (self.data - self.mean) / self.std
        
        self.n_samples = self.data.isel(time=slice(0, -lead_time)).shape[0]
        self.init_time = self.data.isel(time=slice(None, -lead_time)).time
        self.valid_time = self.data.isel(time=slice(lead_time, None)).time

        self.on_epoch_end()

        # For some weird reason calling .load() earlier messes up the mean and std computations
        if load: print('Loading data into RAM'); self.data.load()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.n_samples / self.batch_size))

    def __getitem__(self, i):
        'Generate one batch of data'
        idxs = self.idxs[i * self.batch_size:(i + 1) * self.batch_size]
        X = self.data.isel(time=idxs).values
        y = self.data.isel(time=idxs + self.lead_time, level=self.output_idxs).values
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.idxs = np.arange(self.n_samples)
        if self.shuffle == True:
            np.random.shuffle(self.idxs)

In [19]:
ds_whole.lat

In [38]:
ds_2015_06 = ds_whole.sel(time = '2015-06', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2015_07 = ds_whole.sel(time = '2015-07', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2015_08 = ds_whole.sel(time = '2015-08', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2016_06 = ds_whole.sel(time = '2016-06', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2016_07 = ds_whole.sel(time = '2016-07', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2016_08 = ds_whole.sel(time = '2016-08', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2017_06 = ds_whole.sel(time = '2017-06', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2017_07 = ds_whole.sel(time = '2017-07', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2017_08 = ds_whole.sel(time = '2017-08', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2018_06 = ds_whole.sel(time = '2018-06', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2018_07 = ds_whole.sel(time = '2018-07', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')
ds_2018_08 = ds_whole.sel(time = '2018-08', level = 500, lon = '0', lat = '2.8125').drop('lon').drop('lat').drop('level')


ds_train = xr.merge([ds_2015_06, ds_2015_07, ds_2015_08])
ds_valid = xr.merge([ds_2016_06, ds_2016_07, ds_2016_08])
ds_test = xr.merge([ds_2017_06, ds_2017_07, ds_2017_08, ds_2018_06, ds_2018_07, ds_2018_08])

In [39]:
bs=32
lead_time=72
output_vars = ['z_500'] #, 't_850']

In [40]:
# Create a training and validation data generator. Use the train mean and std for validation as well.
dg_train = DataGenerator(ds_train, var_dict, lead_time, batch_size=bs, load=True, 
                         output_vars=output_vars)
dg_valid = DataGenerator(ds_valid, var_dict, lead_time, batch_size=bs, mean=dg_train.mean, std=dg_train.std, 
                         shuffle=False, output_vars=output_vars)

Loading data into RAM


  return func(*(_execute_task(a, cache) for a in args))


Loading data into RAM


  return func(*(_execute_task(a, cache) for a in args))


In [41]:
# Now also a generator for testing. Impartant: Shuffle must be False!
dg_test = DataGenerator(ds_test, var_dict, lead_time, batch_size=bs, mean=dg_train.mean, std=dg_train.std, 
                         shuffle=False, output_vars=output_vars)

Loading data into RAM


  return func(*(_execute_task(a, cache) for a in args))


In [42]:
X, y = dg_train[0]; X.shape, y.shape

((32, 9), (32, 0))

In [44]:
ds_train.to_array().shape

(11, 2208)

## Load model

In [47]:
def build_cnn(filters, kernels, input_shape, dr=0):
    """Fully convolutional network"""
    x = input = Input(shape=input_shape)
    for f, k in zip(filters[:-1], kernels[:-1]):
        x = Conv1D(f, k)(x)
        x = LeakyReLU()(x)
        if dr > 0: x = Dropout(dr)(x, training = True)
    output = Conv1D(filters[-1], kernels[-1], padding = 'same')(x)
    return keras.models.Model(input, output)


cnn_new = build_cnn([64, 64, 1], [2, 2, 2], (11, 2208))

cnn_new.compile(keras.optimizers.Adam(1e-4), 'mse')

print(cnn_new.summary())



#dg_valid = DataGenerator(dg_valid_data, var_dict, lead_time, batch_size=bs, mean=dg_train.mean, std=dg_train.std, 
#                         shuffle=False, output_vars=output_vars)

cnn_new.fit(x = np.expand_dims(1, ds_train.to_array()), epochs=100, validation_data=ds_valid.to_array(), 
          callbacks=[tf.keras.callbacks.EarlyStopping(
                        monitor='val_loss',
                        min_delta=0,
                        patience=2,
                        verbose=1, 
                        mode='auto')])

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        [(None, 11, 2208)]        0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 10, 64)            282688    
_________________________________________________________________
leaky_re_lu_22 (LeakyReLU)   (None, 10, 64)            0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 9, 64)             8256      
_________________________________________________________________
leaky_re_lu_23 (LeakyReLU)   (None, 9, 64)             0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 9, 1)              129       
Total params: 291,073
Trainable params: 291,073
Non-trainable params: 0
____________________________________________________

TypeError: only size-1 arrays can be converted to Python scalars

In [None]:
dg_valid.expand_dims(axis = 3)

In [None]:
dg_valid.data

In [None]:
dg_train = dg_train.data.drop('lon')

In [None]:
cnn.load_weights('/rds/general/user/mc4117/home/WeatherBench/saved_models/train_72_multi_data_gpu_seasonal.h5')

In [None]:
X1, y1 = dg_train[0]

for i in range(1, len(dg_train)):
    X2, y2 = dg_train[i]
    X1 = np.concatenate((X1, X2))
    y1 = np.concatenate((y1, y2))

In [None]:
X1_test, y1_test = dg_test[0]
for i in range(1, len(dg_test)):
    X2_test, y2_test = dg_test[i]
    X1_test = np.concatenate((X1_test, X2_test))
    y1_test = np.concatenate((y1_test, y2_test))

In [None]:
no_samples, nx, ny, nz = X1.shape
x1_reshaped = X1.reshape((no_samples, nx*ny*nz))
no_samples, nx, ny, nz = y1.shape
y1_reshaped = y1.reshape((no_samples, nx*ny*nz))

In [None]:
no_samples, nx, ny, nz = X1_test.shape
x1_test_reshaped = X1_test.reshape((no_samples, nx*ny*nz))
no_samples, nx, ny, nz = y1_test.shape
y1_test_reshaped = y1_test.reshape((no_samples, nx*ny*nz))

In [None]:
def func_predict(X):
    
    return cnn.predict(X)

In [None]:
feature_list = list(ds_test.keys())
len(feature_list)

In [None]:
from lime import lime_tabular

explainer = lime_tabular.RecurrentTabularExplainer(X1[:, 0, :, :], training_labels=y1[:, 0, :, :], feature_names=feature_list)

In [None]:

exp = explainer.explain_instance(X1_test[1, 0, :, :], cnn.predict, num_features=11)#, labels=(1,))
exp.show_in_notebook()