Read NetCDF files to load feature and label datasets that will be used for training the model.

All files will share the same time, level, lat, and lon dimensions, ie. each file's feature or label variable having shape (times, levels, lats, lons).

We'll use three input files for training that contain 720 time steps (for a total of 2160 training time steps), and then use the trained model to predict label values corresponding to a single features input dataset which contains 720 time steps.  

The feature variables being used are 'PS', 'T', 'U', and 'V'. The label variable is 'PTTEND'.


In [24]:
import numpy as np
import xarray as xr

netcdf_features_train = ["/home/adamsjam/data/lowres/fv091x180L26_moist_HS.cam.h0.2001-01-11-00000_lowres.nc",
                         "/home/adamsjam/data/lowres/fv091x180L26_moist_HS.cam.h0.2001-01-26-00000_lowres.nc"]
netcdf_labels_train = ["/home/adamsjam/data/lowres/fv091x180L26_moist_HS.cam.h1.2001-01-11-00000_lowres.nc",
                       "/home/adamsjam/data/lowres/fv091x180L26_moist_HS.cam.h1.2001-01-26-00000_lowres.nc"]
netcdf_features_predict = ["/home/adamsjam/data/lowres/fv091x180L26_moist_HS.cam.h0.2001-02-10-00000_lowres.nc"]
                         
# # open the features (flows) and labels (tendencies) as xarray DataSets
# ds_train_features = xr.open_mfdataset(paths=netcdf_features_train)
# ds_train_labels = xr.open_mfdataset(paths=netcdf_labels_train)
# ds_predict_features = xr.open_mfdataset(paths=netcdf_features_predict)

# # confirm that we have training datasets that match on the time, lev, lat, and lon dimension/coordinate
# if np.any(ds_train_features.variables['time'].values != ds_train_labels.variables['time'].values):
#     raise ValueError('Non-matching time values between feature and label datasets')
# if np.any(ds_train_features.variables['lev'].values != ds_train_labels.variables['lev'].values):
#     raise ValueError('Non-matching level values between feature and label datasets')
# if np.any(ds_train_features.variables['lat'].values != ds_train_labels.variables['lat'].values):
#     raise ValueError('Non-matching lat values between feature and label datasets')
# if np.any(ds_train_features.variables['lon'].values != ds_train_labels.variables['lon'].values):
#     raise ValueError('Non-matching lon values between feature and label datasets')

# # confirm that the training and prediction datasets match on the lev, lat, and lon dimension/coordinate
# # it's likely that we'll use more times for training than for prediction, so we ignore those differences
# if np.any(ds_train_features.variables['lev'].values != ds_predict_features.variables['lev'].values):
#     raise ValueError('Non-matching level values between train and predict feature datasets')
# if np.any(ds_train_features.variables['lat'].values != ds_predict_features.variables['lat'].values):
#     raise ValueError('Non-matching lat values between train and predict feature datasets')
# if np.any(ds_train_features.variables['lon'].values != ds_predict_features.variables['lon'].values):
#     raise ValueError('Non-matching lon values between train and predict feature datasets')

# train/fit/score models using the dry features and corresponding labels
features = ['PS', 'T', 'U', 'V']
labels = ['PTTEND']
        
# # trim out all non-relevant data variables from the datasets
# for var in ds_train_features.data_vars:
#     if var not in features:
#         ds_train_features = ds_train_features.drop(var)
# for var in ds_train_labels.data_vars:
#     if var not in labels:
#         ds_train_labels = ds_train_labels.drop(var)
# for var in ds_predict_features.data_vars:
#     if var not in features:
#         ds_predict_features = ds_predict_features.drop(var)

#### Load single training dataset

We'll define a function to read arrays of features and labels from NetCDF for a single level.

In [21]:
def load_feature_label_data(netdcf_features, 
                            netcdf_labels,
                            feature_vars,
                            label_vars,
                            level=0):
    """
    Loads feature and label data from specified NetCDF files for a single level.
    
    The feature and label NetCDFs are expected to have matching time, level, lat, and lon coordinate variables.
    
    Returns two arrays: the first for features and the second for labels. Arrays will have shape (time, lat, lon, var),
    where var is the number of feature or label variables. For example if the dimensions of feature data variables in 
    the NetCDF is (time: 360, lev: 26, lat: 120, lon: 180) and the features specified are ["T", "U"] then the resulting
    features array will have shape (360, 120, 180, 2), with the first feature variable "T" corresponding to array[:, :, :, 0]
    and the second feature variable "U" corresponding to array[:, :, :, 1].
    
    :param netdcf_features: one or more NetCDF files containing feature variables, can be single file or list
    :param netdcf_features: one or more NetCDF files containing label variables, can be single file or list
    :param feature_vars: list of feature variable names to be extracted from the features NetCDF
    :param label_vars: list of label variable names to be extracted from the labels NetCDF
    :param level: index of the level to be extracted (all times/lats/lons at this level for each feature/label variable)
    :return: two 4-D numpy arrays, the first for features and the second for labels
    """
    
    # open the features (flows) and labels (tendencies) as xarray DataSets
    ds_features = xr.open_mfdataset(paths=netdcf_features)
    ds_labels = xr.open_mfdataset(paths=netcdf_labels)

    # confirm that we have datasets that match on the time, lev, lat, and lon dimension/coordinate
    if np.any(ds_features.variables['time'].values != ds_labels.variables['time'].values):
        raise ValueError('Non-matching time values between feature and label datasets')
    if np.any(ds_features.variables['lev'].values != ds_labels.variables['lev'].values):
        raise ValueError('Non-matching level values between feature and label datasets')
    if np.any(ds_features.variables['lat'].values != ds_labels.variables['lat'].values):
        raise ValueError('Non-matching lat values between feature and label datasets')
    if np.any(ds_features.variables['lon'].values != ds_labels.variables['lon'].values):
        raise ValueError('Non-matching lon values between feature and label datasets')

    # allocate arrays
    array_features = np.empty(shape=[ds_features.time.size, 
                                     ds_features.lat.size, 
                                     ds_features.lon.size, 
                                     len(feature_vars)], 
                              dtype=np.float64)
    array_labels = np.empty(shape=[ds_features.time.size, 
                                   ds_features.lat.size, 
                                   ds_features.lon.size, 
                                   len(label_vars)], 
                            dtype=np.float64)
    
    # get variable vales into the arrays for both features and labels
    for vars, dataset, arr in zip([feature_vars, label_vars], 
                                  [ds_features, ds_labels], 
                                  [array_features, array_labels]):
        
        # for each variable we'll extract the values 
        for var_index, var in enumerate(vars):

            # if we have (time, lev, lat, lon), then use level parameter
            dimensions = dataset.variables[var].dims
            if dimensions == ('time', 'lev', 'lat', 'lon'):
                values = dataset[var].values[:, level, :, :]
            elif dimensions == ('time', 'lat', 'lon'):
                values = dataset[var].values[:, :, :]
            else:
                raise ValueError("Unsupported feature variable dimensions: {dims}".format(dims=dimensions))

        # add the values into the array at the variable's position
        arr[:, :, :, var_index] = values

    return array_features, array_labels

In [22]:
train_x, train_y = load_feature_label_data(netcdf_features_train[0],
                                           netcdf_labels_train[0],
                                           features,
                                           labels,
                                           level=0)

In [20]:
train_y.shape

(720, 12, 23, 1)

Use a single file's time steps as a measure of how many time steps we should use as a unit of data. We'll use three of these as inputs to the model for training, then a single unit of times.

In [11]:
# size_times_train = ds_train_features.variables['time'].size
# size_times_predict = ds_predict_features.variables['time'].size
# size_lat = ds_predict_features.variables['lat'].size
# size_lon = ds_predict_features.variables['lon'].size

size_times_train = train_x.shape[0]
# size_times_predict = ds_predict_features.variables['time'].size
size_lat = train_x.shape[1]
size_lon = train_x.shape[2]

We'll define a function to pull an array of data for a single level, i.e. all times, lats, lons, and which returns an array with shape: `(times, lats, lons, variables)`

In [25]:
def pull_vars_into_array(dataset,
                         variables,
                         level):
    """
    Create a Numpy array from the specified variables of an xarray DataSet.

    :param dataset: xarray.DataSet
    :param variables: list of variables to be extracted from the DataSet and included in the resulting DataFrame
    :param level: the level index (all times, lats, and lons included from this indexed level)
    :return: an array with shape (ds.time.size, ds.lat.size, ds.lon.size, len(variables)) and dtype float
    """

    # the array we'll populate and return
    arr = np.empty(shape=[dataset.time.size, dataset.lat.size, dataset.lon.size, len(variables)], dtype=float)

    # loop over each variable, adding each into the array
    for index, var in enumerate(variables):

        # if we have (time, lev, lat, lon), then use level parameter
        dimensions = dataset.variables[var].dims
        if dimensions == ('time', 'lev', 'lat', 'lon'):
            values = dataset[var].values[:, level, :, :]
        elif dimensions == ('time', 'lat', 'lon'):
            values = dataset[var].values[:, :, :]
        else:
            raise ValueError("Unsupported variable dimensions: {dims}".format(dims=dimensions))

        # add the values into the array at the variable's position
        arr[:, :, :, index] = values

    return arr

Get an array of training features and labels, and a corresponding array of features to use for prediction.

In [26]:
# # use the first level
# lev = 0

# # get the array of training features for this level (all times/lats/lons)
# train_x = pull_vars_into_array(ds_train_features,
#                                features,
#                                lev)

# # get the array of training labels for this level (all times/lats/lons)
# train_y = pull_vars_into_array(ds_train_labels,
#                                labels,
#                                lev)

# # get the array of features from which we'll predict new label(s)
# predict_x = pull_vars_into_array(ds_predict_features,
#                                  features,
#                                  lev)

The neural network model will work much better if all values are scaled into a range such as between 0 and 1. We'll use scikit-learn's MinMaxScaler for now. The scaler being used for labels will be reused later for inverse scaling the predicted label values.

In [23]:
from sklearn.preprocessing import MinMaxScaler

# initialize a list to store scalers for each feature/label
scalers_x = [MinMaxScaler(feature_range=(0, 1))] * len(features)
scalers_y = [MinMaxScaler(feature_range=(0, 1))] * len(labels)

# data is 4-D with shape (times, lats, lons, vars), scalers can only work on 2-D arrays,
# so for each feature we scale the corresponding 3-D array of values after flattening it,
# then reshape back into the original shape
for feature_ix in range(len(features)):
    scaler = scalers_x[feature_ix]
    feature_train = train_x[:, :, :, feature_ix].flatten().reshape(-1, 1)
#     feature_predict = predict_x[:, :, :, feature_ix].flatten().reshape(-1, 1)
    scaled_train = scaler.fit_transform(feature_train)
#     scaled_predict = scaler.fit_transform(feature_predict)
    reshaped_scaled_train = np.reshape(scaled_train, newshape=(size_times_train, size_lat, size_lon))
#     reshaped_scaled_predict = np.reshape(scaled_predict, newshape=(size_times_predict, size_lat, size_lon))
    train_x[:, :, :, feature_ix] = reshaped_scaled_train
#     predict_x[:, :, :, feature_ix] = reshaped_scaled_predict
for label_ix in range(len(labels)):
    scaler = scalers_y[label_ix]
    label_train = train_y[:, :, :, label_ix].flatten().reshape(-1, 1)
    scaled_train = scaler.fit_transform(label_train)
    reshaped_scaled_train = np.reshape(scaled_train, newshape=(size_times_train, size_lat, size_lon))
    train_y[:, :, :, label_ix] = reshaped_scaled_train

A batch of data for the model will be the full lat/lon grid for a single level geospatially, and the number of time steps of the individual files, since we'll predict for a single file's worth of data. For example we're using three files worth of data for training the model, then using the fitted model to predict the label variables corresponding to a single file of feature inputs.

In [32]:
training_batches = int(size_times_train / size_times_predict)
times_per_batch = int(size_times_train / training_batches)
training_features_shape = (training_batches, times_per_batch, size_lat, size_lon, len(features))
training_labels_shape = (training_batches, times_per_batch, size_lat, size_lon, len(labels))
predict_features_shape = (1, times_per_batch, size_lat, size_lon, len(features))
train_x = np.reshape(train_x, newshape=training_features_shape)
train_y = np.reshape(train_y, newshape=training_labels_shape)
predict_x = np.reshape(predict_x, newshape=predict_features_shape)

### Define the model using convolutional and dense layers

In [26]:
from keras.models import Sequential
from keras.layers import Conv3D, Dense

# define the model
model = Sequential()

# add an initial 3-D convolutional layer
model.add(Conv3D(filters=32,
                 kernel_size=(3, 3, 3),
                 activation="relu",
                 data_format="channels_last",
                 input_shape=(size_times_train, size_lat, size_lon, len(features)),
                 padding='same'))

# add a fully-connected hidden layer with the number of neurons as input attributes (features)
model.add(Dense(len(features), activation='relu'))

# output layer uses no activation function since we are interested
# in predicting numerical values directly without transform
model.add(Dense(len(labels)))

# compile the model using the ADAM optimization algorithm and a mean squared error loss function
model.compile(optimizer='adam', loss='mse')

# display summary info
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d_1 (Conv3D)            (None, 720, 12, 23, 32)   3488      
_________________________________________________________________
dense_1 (Dense)              (None, 720, 12, 23, 4)    132       
_________________________________________________________________
dense_2 (Dense)              (None, 720, 12, 23, 1)    5         
Total params: 3,625
Trainable params: 3,625
Non-trainable params: 0
_________________________________________________________________


#### Train the model for the first level

In [27]:
#model.fit(train_x, train_y, batch_size=times_per_batch, shuffle=True, epochs=8, verbose=2)
model.fit(train_x, train_y, shuffle=True, epochs=8, verbose=2)

ValueError: Error when checking input: expected conv3d_1_input to have 5 dimensions, but got array with shape (720, 12, 23, 4)