In [58]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append("../")

location = "remote"
if location == "remote":
    # TODO: hacky, shouldn't be necessary
    os.environ["WANDB_NOTEBOOK_NAME"] = "lustre_scratch/coralshift/notebooks/rnn.ipynb"
    os.chdir("/lustre_scratch/orlando-code/coralshift/")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
from __future__ import annotations

from pathlib import Path
import xarray as xa
import numpy as np
# import math as m
# import pandas as pd
import tensorflow as tf
# import matplotlib.pyplot as plt
# import matplotlib.patches as patches

import wandb
from tqdm import tqdm
from sklearn import model_selection
# from sklearn.preprocessing import normalize
from scipy.interpolate import interp2d
from sklearn.utils import class_weight
# from scipy.ndimage import gaussian_gradient_magnitude
import xbatcher

# import rasterio
# from rasterio.plot import show
# import rioxarray as rio

# from bs4 import BeautifulSoup
# import requests


#issues with numpy deprecation in pytorch_env
from coralshift.processing import spatial_data
from coralshift.utils import file_ops, directories
from coralshift.plotting import spatial_plots, model_results
from coralshift.dataloading import data_structure, climate_data

In [60]:
cells = 12800
seq_len = 10000
num_fs = 15
target_frac = 0.1

# Creating the features array
features = np.random.random((cells, seq_len, num_fs)).astype(np.float32)

# Creating the label array
labels = np.random.choice((2), size=(cells,), p=[1-target_frac,target_frac])

class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels)

# Verifying the shape of the arrays
print("Features shape:", features.shape)
print("Label shape:", labels.shape)
print("Class weights:", class_weights)

Features shape: (12800, 10000, 15)
Label shape: (12800,)
Class weights: [0.55483312 5.05928854]


In [61]:
# send numpy features/label array to tf.Data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((features, labels))

BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 10

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
print("Number of batches:", len(train_dataset))

Number of batches: 200


## Replacing toys with real data

In [62]:
X_train = features
y_train = labels

# wandb.init(
#     project="coralshift",
#     entity="orlando-code",
#     settings=wandb.Settings(start_method="fork")
#     # config={    }
#     )

# initialize optimiser: will need hyperparameter scan for learning rate and others
# https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam
optimizer = tf.keras.optimizers.Adam(3e-4)

# X = ds_man.get_dataset("monthly_climate_1_12_X_np")
# y = ds_man.get_dataset("monthly_climate_1_12_y_np")
# # check that untrained model runs (should output array of non-nan values)
# # why values change?
# # g_model(X[:32])

# X_train, X_test, y_train, y_test = model_selection.train_test_split(
#     X, y, test_size=0.2, random_state=42)

# X_train, X_test, y_train, y_test = model_selection.train_test_split(
#     sub_X, sub_y, test_size=0.2, random_state=42)

# Define Gated Recurrent Unit model class in TensorFlow
class gru_model(tf.keras.Model):
    # initialise class instance to define layers of the model
    def __init__(self, rnn_units: list[int], num_layers: int, 
        # dff: int
        ):
        """Sets up a GRU model architecture with multiple layers and dense layers for mapping the outputs of the GRU 
        layers to a desired output shape

        Parameters
        ----------
        rnn_units (list[int]): list containing the number of neurons to use in each layer
        num_layers (int): number of layers in GRU model
        """
        super(gru_model, self).__init__()   # initialise GRU model as subclass of tf.keras.Model
        # store values for later use
        self.num_layers = num_layers    # number of layers in GRU model
        self.rnn_units = rnn_units
        # self.dff = dff
        # define model layers: creating new `tf.keras.layers.GRU` layer for each iteration
        self.grus = [tf.keras.layers.GRU(rnn_units[i],  # number (integer) of rnn units/neurons to use in each model layer
                                   return_sequences=True,   # return full sequence of outputs for each timestep
                                   return_state=True) for i in range(num_layers)] # return last hidden state of RNN at end of sequence
        
        # dense layers are linear mappings of RNN layer outputs to desired output shape
        # self.w1 = tf.keras.layers.Dense(dff) # 10 units
        self.w1 = tf.keras.layers.Dense(10) # 10 units

        self.w2 = tf.keras.layers.Dense(1)  # 1 unit (dimension 1 required before final sigmoid function)
        # self.A = tf.keras.layers.Dense(30)
        # self.B = tf.keras.layers.Dense(dff)



    def call(self, inputs: np.ndarray, training: bool=False):
        """Processes an input sequence of data through several layers of GRU cells, followed by a couple of
        fully-connected dense layers, and outputs the probability of an event happening.
        
        Parameters
        ----------
        inputs (np.ndarray): input tensor of shape (batch_size, seq_length, features)
            batch_size - defines the size of the sample drawn from datapoints
            seq_length - number of timesteps in sequence
            features - number of features associated with each datapoint
        training (bool, defaults to False): True if model is in training, False if in inference mode

        Returns
        -------
        target: probability of an event occuring, with shape (batch_size, 1)
        """
        # input shape: (batch_size, seq_length, features)
       
        assert self.num_layers == len(self.rnn_units)

        # check that input tensor has correct shape
        if (len(inputs.shape) != 3):
            print(f"Incorrect shape of input tensor. Expected 3D array. Recieved {len(inputs.shape)}D array.")

        # print('input dim ({}, {}, {})'.format(inputs.shape[0], inputs.shape[1], inputs.shape[2]))
        # whole_seq, static_input = inputs
        whole_seq = inputs


        # iteratively passes input tensor to GRU layers, overwriting preceding sequence 'whole_seq'
        for layer_num in range(self.num_layers):
            whole_seq, final_s = self.grus[layer_num](whole_seq, training=training)

        # adding extra layers
        # static = self.B(tf.nn.gelu(self.A(static_input)))
        # target = self.w1(final_s)  + static # final hidden state of last layer used as input to fully connected dense layers...
        target = self.w1(final_s)   # final hidden state of last layer used as input to fully connected dense layers...

        target = tf.nn.relu(target) # via ReLU activation function
        target = self.w2(target)    # final hidden layer must have dimension 1 
        
        # obtain a probability value between 0 and 1
        target = tf.nn.sigmoid(target)
        
        return target


# initialise GRU model with 500 hidden layers, one GRU unit per layer 
g_model = gru_model([100], 1) # N.B. [x] is number of hidden layers in GRU network


def negative_log_likelihood(y: np.ndarray, y_pred: np.ndarray, class_weights: np.ndarray = None) -> float:
    """Compute binary cross-entropy loss between ground-truth binary labels and predicted probabilities,
    incorporating class weights.

    Parameters
    ----------
    y (np.ndarray): true binary labels, where 0 represents the negative class
    y_pred (np.ndarray): predicted labels (as probability value between 0 and 1)
    class_weights (np.ndarray): weights for each class. If None, no class weights will be applied.

    Returns
    -------
    float: negative log likelihood loss computed using binary cross-entropy loss between 'y' and 'y_pred',
    incorporating class weights if provided
    """
    bce = tf.keras.losses.BinaryCrossentropy()  

    if class_weights is not None:
        sample_weights = tf.gather(class_weights, np.asarray(y,dtype=np.int32))
        # reshape to match size of y and y_pred
        return bce(y, y_pred, sample_weight=tf.reshape(sample_weights, (-1, 1)))

    return bce(y, y_pred)


def training_batches(X: np.ndarray, y: np.ndarray, batch_num: int, batch_size: int=32):
    start_idx = batch_num * batch_size
    end_idx = (batch_num + 1) * batch_size

    X_batch = X[start_idx:end_idx]
    y_batch = y[start_idx:end_idx]
    
    return X_batch, y_batch

# https://stackoverflow.com/questions/52357542/attributeerror-tensor-object-has-no-attribute-numpy
# should aim to delete the following to speed up training: but can't figure out a way to make wandb reporting work
# without it
tf.config.run_functions_eagerly(True)

def build_graph():
    
    # compile function as graph using tf's autograph feature: leads to faster execution times, at expense of limitations
    # to Python objects/certain control flow structures (somewhat relaxed by experimental_relax_shapes)
    @tf.function(experimental_relax_shapes=True)
    def train_step(gru: tf.keras.Model, optimizer: tf.keras.optimizers.Optimizer, X: tf.Tensor, y: tf.Tensor, 
               training: bool = True, class_weights=class_weights) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Train model using input `X` and target data `y` by computing gradients of the loss (via 
        negative_log_likelihood)
        
        Parameters
        ----------
        y (np.ndarray): true binary labels, where 0 represents the negative class
        y_pred (np.ndarray): predicted labels (as probability value between 0 and 1)

        Returns
        -------
        float: negative log likelihood loss computed using binary cross-entropy loss between 'y' and 'y_pred'
        """
        if training:
            # num_samples = X.shape[0]
            # num_batches = num_samples // batch_size
            # num_batches = batch_num
            # total_epoch_loss = 0.0
            # for batch_num in tqdm(range(num_batches), desc="batches", position=0, leave=True):
            # for batch_num, batch in tqdm(enumerate(ds), desc="batches", position=0, leave=True):
            total_batch_loss = 0

            y_pred = gru(X, training)
            xent = negative_log_likelihood(y, y_pred, class_weights)

                # X_batch, y_batch = training_batches(X, y, batch_num=batch_num, batch_size=batch_size)

            with tf.GradientTape(persistent=True) as tape:
                y_pred = gru(X, training) 
                xent = negative_log_likelihood(y, y_pred, class_weights)
                # y_pred = gru(X, training) # TO DELETE
                # xent = negative_log_likelihood(y, y_pred)
            
            gradients = tape.gradient(xent, gru.trainable_variables)
            optimizer.apply_gradients(zip(gradients, gru.trainable_variables))
            # print("xent", xent.numpy())
            # print("total_epoch_loss", total_epoch_loss)
            total_batch_loss += xent
                # learning rate?
                # wandb.log({"batch": batch_num, "loss": xent, "total_epoch_loss": total_epoch_loss})

            average_loss = total_batch_loss / num_batches
            # return predicted output values and total loss value
            return y_pred, xent, total_batch_loss

    # set default float type
    tf.keras.backend.set_floatx('float32')
    # TODO: this isn't assigned... What should it return otherwise? OOH yeas it is!
    return train_step


with tf.device("/GPU:0"):
    num_epochs = 2
    # will update so that subsamples are fed in from which batches are taken: will require recomputation
    # of class_weight for each subsample

    tr_step = build_graph()

    for epoch in tqdm(range(num_epochs), desc= " epochs", position=1, leave=True):
        total_epoch_loss = 0.0
        for X_batch, y_batch in tqdm(train_dataset, position=0, desc=" training on batches", leave=True):
            y_pred, xent, batch_loss = tr_step(
                g_model, optimizer, X_batch, y_batch, class_weights=class_weights, training=True)

            total_epoch_loss += batch_loss

        average_loss = total_epoch_loss / len(train_dataset)


# wandb.finish()


 training on batches:   0%|          | 0/200 [00:00<?, ?it/s]

2023-06-16 22:44:52.763747: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401
 training on batches: 100%|██████████| 200/200 [01:47<00:00,  1.87it/s]
 training on batches: 100%|██████████| 200/200 [01:39<00:00,  2.01it/s]
 epochs: 100%|██████████| 2/2 [03:26<00:00, 103.34s/it]


In [71]:
g_model.summary()

Model: "gru_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_1 (GRU)                 multiple                  35100     
                                                                 
 dense_20 (Dense)            multiple                  1010      
                                                                 
 dense_21 (Dense)            multiple                  11        
                                                                 
Total params: 36,121
Trainable params: 36,121
Non-trainable params: 0
_________________________________________________________________
