#### Specify NetCDF files used for training and prediction inputs
These are low resolution versions of NCAR CAM inputs/outputs, located in the `example_data` directory of this project's git repository.

In [20]:
# NOTE: absolute, machine-specific location of the project checkout
root_dir = "/home/adamsjam/git/model_learn"

# every example file lives under <root_dir>/example_data and shares the same case-name prefix;
# the "h0" files are used for features and the "h1" files for labels
_example_prefix = root_dir + "/example_data/fv091x180L26_moist_HS.cam."
_lowres_suffix = "-00000_lowres"
_train_stamps = ("2001-01-11", "2001-01-26", "2001-02-10")
_predict_stamp = "2001-02-25"

# files used as feature inputs for model training
netcdf_features_train = [_example_prefix + "h0." + stamp + _lowres_suffix + ".nc"
                         for stamp in _train_stamps]

# files used as label inputs for model training
netcdf_labels_train = [_example_prefix + "h1." + stamp + _lowres_suffix + ".nc"
                       for stamp in _train_stamps]

# files used as feature inputs for model prediction
netcdf_features_predict = [_example_prefix + "h0." + _predict_stamp + _lowres_suffix + ".nc"]

# files used as label outputs for model prediction (one output file per model type)
netcdf_labels_predict_dense = [_example_prefix + "h1." + _predict_stamp + _lowres_suffix + "_predicted_dense.nc"]
netcdf_labels_predict_conv3d = [_example_prefix + "h1." + _predict_stamp + _lowres_suffix + "_predicted_conv3d.nc"]

### Load datasets for training and prediction

We'll define one function to extract an array of variable(s) for a single level from an xarray Dataset, and another to extract both features and labels from NetCDF files.

In [31]:
import numpy as np
import xarray as xr

def extract_data_array(dataset,
                       variables,
                       lev):
    """
    Collects the values of the named variables from a dataset into a single 4-D float64 array.

    Variables with dimensions (time, lev, lat, lon) contribute only the slice at index `lev`;
    variables with dimensions (time, lat, lon) contribute all of their values.

    :param dataset: object exposing `time`, `lat`, and `lon` sizes, a `variables` mapping whose
        entries have `dims`, and item access returning objects with `values` (e.g. an xarray Dataset)
    :param variables: names of the variables to extract, in the order they should appear on the last axis
    :param lev: index of the level to extract from variables that have a level dimension
    :return: array of shape (time, lat, lon, len(variables))
    :raises ValueError: if a variable's dimensions are neither of the two supported layouts
    """
    dims_with_level = ('time', 'lev', 'lat', 'lon')
    dims_without_level = ('time', 'lat', 'lon')

    # output buffer, one slot on the last axis per requested variable
    out_shape = (dataset.time.size, dataset.lat.size, dataset.lon.size, len(variables))
    result = np.empty(out_shape, dtype=np.float64)

    position = 0
    for name in variables:

        dimensions = dataset.variables[name].dims

        # reject anything that is not one of the two supported layouts
        if dimensions != dims_with_level and dimensions != dims_without_level:
            raise ValueError("Unsupported variable dimensions: {dims}".format(dims=dimensions))

        data = dataset[name].values
        if dimensions == dims_with_level:
            # keep only the requested level
            data = data[:, lev, :, :]

        # store this variable's values at its position on the last axis
        result[:, :, :, position] = data
        position += 1

    return result
    
    
def extract_features_labels(netdcf_features, 
                            netcdf_labels,
                            feature_vars,
                            label_vars,
                            level=0):
    """
    Extracts feature and label data from specified NetCDF files for a single level as numpy arrays.
    
    The feature and label NetCDFs are expected to have matching time, level, lat, and lon coordinate variables.
    
    Returns two arrays: the first for features and the second for labels. Arrays will have shape (time, lat, lon, var),
    where var is the number of feature or label variables. For example if the dimensions of feature data variables in 
    the NetCDF is (time: 360, lev: 26, lat: 120, lon: 180) and the features specified are ["T", "U"] then the resulting
    features array will have shape (360, 120, 180, 2), with the first feature variable "T" corresponding to array[:, :, :, 0]
    and the second feature variable "U" corresponding to array[:, :, :, 1].
    
    :param netdcf_features: one or more NetCDF files containing feature variables, can be single file or list
    :param netcdf_labels: one or more NetCDF files containing label variables, can be single file or list
    :param feature_vars: list of feature variable names to be extracted from the features NetCDF
    :param label_vars: list of label variable names to be extracted from the labels NetCDF
    :param level: index of the level to be extracted (all times/lats/lons at this level for each feature/label variable)
    :return: two 4-D numpy arrays, the first for features and the second for labels
    :raises ValueError: if the time, lev, lat, or lon coordinate values differ between the feature and label datasets
    """

    # coordinate variable name -> word used for it in the error message
    coordinates = (('time', 'time'), ('lev', 'level'), ('lat', 'lat'), ('lon', 'lon'))

    # open the features (flows) and labels (tendencies) as xarray DataSets; the context managers
    # guarantee that the underlying files are closed even if validation or extraction fails
    with xr.open_mfdataset(paths=netdcf_features) as ds_features, \
            xr.open_mfdataset(paths=netcdf_labels) as ds_labels:

        # confirm that we have datasets that match on the time, lev, lat, and lon dimension/coordinate
        # (array_equal also treats coordinates of differing lengths as non-matching)
        for coord, description in coordinates:
            if not np.array_equal(ds_features.variables[coord].values,
                                  ds_labels.variables[coord].values):
                raise ValueError('Non-matching {name} values between feature and label datasets'.format(name=description))

        # extract feature and label arrays at the specified level; the values are copied into
        # numpy arrays here, so they remain valid after the datasets are closed
        array_features = extract_data_array(ds_features, feature_vars, level)
        array_labels = extract_data_array(ds_labels, label_vars, level)

    return array_features, array_labels

The feature variables being used are 'PS', 'T', 'U', and 'V'. The label variable is 'PTTEND'.

In [32]:
# label (output) variable names; each occupies one position on the last axis of the label arrays
labels = ["PTTEND"]

# feature (input) variable names; their order here fixes their position on the last axis of the feature arrays
features = [
    "PS",
    "T",
    "U",
    "V",
]

Read NetCDF files to load feature and label datasets that will be used for training and prediction.

All files should share the same time, level, lat, and lon coordinates. Each feature or label variable is expected to have shape (times, levels, lats, lons) or, for variables without a level dimension, (times, lats, lons).

In [33]:
# training features/labels at the first level (only the first file of each training list is used here)
train_x, train_y = extract_features_labels(netcdf_features_train[0],
                                           netcdf_labels_train[0],
                                           features,
                                           labels,
                                           level=0)

# prediction features at the first level; the dataset is opened in a context manager so the
# file is closed as soon as the values have been copied into the numpy array
with xr.open_dataset(netcdf_features_predict[0]) as ds_predict:
    predict_x = extract_data_array(ds_predict,
                                   features,
                                   lev=0)

Get the dimension sizes for later use.

In [35]:
# number of time steps (axis 0) in the training and prediction arrays; the prediction
# size is read from the prediction array itself so the two inputs may differ in length
size_times_train = train_x.shape[0]
size_times_predict = predict_x.shape[0]

# lat (axis 1) and lon (axis 2) sizes
size_lat = train_x.shape[1]
size_lon = train_x.shape[2]

Our neural network models will work much better if all values are scaled into a range such as between 0 and 1. We'll use scikit-learn's MinMaxScaler for now. The scaler being used for labels will be reused later for inverse scaling of the predicted label values.

In [36]:
from sklearn.preprocessing import MinMaxScaler

# initialize a list to store scalers for each feature/label; a comprehension is used so that
# every variable gets its own scaler (`[scaler] * n` would repeat one shared instance n times)
scalers_x = [MinMaxScaler(feature_range=(0, 1)) for _ in features]
scalers_y = [MinMaxScaler(feature_range=(0, 1)) for _ in labels]

# data is 4-D with shape (times, lats, lons, vars), scalers can only work on 2-D arrays,
# so for each feature we scale the corresponding 3-D array of values after flattening it
# into a single column, then reshape back into the original shape
for feature_ix, scaler in enumerate(scalers_x):
    feature_train = train_x[:, :, :, feature_ix]
    feature_predict = predict_x[:, :, :, feature_ix]

    # fit on the training values only, then apply that same scaling to the prediction
    # values so both inputs are mapped consistently (refitting on the prediction data
    # would scale it differently and overwrite the fitted training range)
    scaled_train = scaler.fit_transform(feature_train.reshape(-1, 1))
    scaled_predict = scaler.transform(feature_predict.reshape(-1, 1))

    # each result is reshaped to the shape of the slice it came from
    train_x[:, :, :, feature_ix] = scaled_train.reshape(feature_train.shape)
    predict_x[:, :, :, feature_ix] = scaled_predict.reshape(feature_predict.shape)

for label_ix, scaler in enumerate(scalers_y):
    label_train = train_y[:, :, :, label_ix]
    scaled_train = scaler.fit_transform(label_train.reshape(-1, 1))
    train_y[:, :, :, label_ix] = scaled_train.reshape(label_train.shape)

### Define Keras models to use for prediction
We'll define two neural network models using the Keras library with TensorFlow as its backend. One of these models will contain only simple densely connected layers, and the other will contain both a convolutional layer and densely connected layers. We'll use both of these for prediction of labels corresponding to the results of NCAR CAM model runs involving computation of the Held-Suarez test case. Initially we'll focus on the input feature variables PS, T, U, and V and the output label PTTEND.

##### Dense layer-only model

In [38]:
from keras.models import Sequential
from keras.layers import Dense

# define the model as an ordered stack of layers:
#   1. fully-connected hidden layer with the same number of neurons as input attributes (features)
#   2. fully-connected hidden layer with twice the number of neurons as input attributes (features)
#   3. output layer with one neuron per label and no activation function, since we are
#      interested in predicting numerical values directly without transform
dense_model = Sequential([
    Dense(len(features), input_dim=len(features), activation='relu'),
    Dense(len(features) * 2, activation='relu'),
    Dense(len(labels)),
])

# ADAM optimization algorithm with a mean squared error loss function
dense_model.compile(optimizer='adam', loss='mse')

# display some summary information
dense_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 69
Trainable params: 69
Non-trainable params: 0
_________________________________________________________________


##### Convolutional layer model

In [39]:
from keras.models import Sequential
from keras.layers import Conv3D, Dense

# the initial 3-D convolutional layer takes one sample of shape (times, lats, lons, features)
conv_input_shape = (size_times_train, size_lat, size_lon, len(features))

# define the model as an ordered stack of layers:
#   1. 3-D convolutional layer
#   2. fully-connected hidden layer with twice the number of neurons as input attributes (features)
#   3. output layer with one neuron per label and no activation function, since we are
#      interested in predicting numerical values directly without transform
cnn_model = Sequential([
    Conv3D(filters=32,
           kernel_size=(3, 3, 3),
           activation="relu",
           data_format="channels_last",
           input_shape=conv_input_shape,
           padding='same'),
    Dense(len(features) * 2, activation='relu'),
    Dense(len(labels)),
])

# ADAM optimization algorithm with a mean squared error loss function
cnn_model.compile(optimizer='adam', loss='mse')

# display some summary information
cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d_1 (Conv3D)            (None, 720, 12, 23, 32)   3488      
_________________________________________________________________
dense_4 (Dense)              (None, 720, 12, 23, 8)    264       
_________________________________________________________________
dense_5 (Dense)              (None, 720, 12, 23, 1)    9         
Total params: 3,761
Trainable params: 3,761
Non-trainable params: 0
_________________________________________________________________


##### Reshape data for model input

In [41]:
# prepend a samples axis of length 1 so each array becomes a single 5-D sample,
# i.e. (1, times, lats, lons, vars)
shape_x = (1, ) + train_x.shape
shape_y = (1, ) + train_y.shape
train_x = np.reshape(train_x, shape_x)
train_y = np.reshape(train_y, shape_y)

# the prediction array is reshaped from its own shape rather than the training shape,
# so it does not need to have the same number of time steps as the training data
predict_x = np.reshape(predict_x, (1, ) + predict_x.shape)

#### Train the models for the first level

In [43]:
# train the convolutional model for 8 epochs, logging one line per epoch;
# the returned History object is left as the cell's displayed result
cnn_model.fit(
    train_x,
    train_y,
    shuffle=True,
    epochs=8,
    verbose=2,
)

Epoch 1/8
 - 2s - loss: 0.3783
Epoch 2/8
 - 0s - loss: 0.2326
Epoch 3/8
 - 0s - loss: 0.1847
Epoch 4/8
 - 0s - loss: 0.1725
Epoch 5/8
 - 0s - loss: 0.1593
Epoch 6/8
 - 0s - loss: 0.1469
Epoch 7/8
 - 0s - loss: 0.1341
Epoch 8/8
 - 0s - loss: 0.1208


<keras.callbacks.History at 0x2b873ae901d0>