# 5 Modelling - Recurrent neural network

In [1]:
#general
import pandas as pd
import numpy as np
import os
import plotly.express as px
from datetime import datetime
import csv

#ml
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#data folder
data_folder : str = "data"

#plot styles
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#decide if the data gets saved or not
run_optim : bool = False

In [2]:
import warnings
warnings.filterwarnings('ignore')

example: https://towardsdatascience.com/time-series-prediction-with-lstm-in-tensorflow-42104db39340

Activation function: "selu". Does not suffer from:
- dying relu
- vanshing gradient
- source: https://towardsdatascience.com/gentle-introduction-to-selus-b19943068cd9
- code: model.add(layers.Dense(64, activation='selu'))

Kernel initializer: "HeNormal"
- procudes deterministic results with the same random seed
- source: https://keras.io/api/layers/initializers/
<br>


## 5.1 Basics and base class for inheritance

In [3]:
df_main = pd.read_csv(os.path.join("data", "df.csv"), index_col = "index")
df_main.shape

(527, 27)

In [4]:
#create test set
n_years_test = 4
years = list(set(df_main["year"].to_list()))
years.sort()

#set relevant years
indexes = [int(len(years) * (i/n_years_test)) - 1 for i in range(1, 1 + n_years_test)]
test_years = [years[i-1] for i in indexes] ##added minus 2 to get a better distributed data set

#create df_test und df
df = df_main[~df_main["year"].isin(test_years)]
df_test = df_main[df_main["year"].isin(test_years)]

print(f"test years:\t{test_years}")
print(f"test set:\t{df_test.shape[0]}\ntrain set:\t{df.shape[0]}")


test years:	[1988, 1999, 2010, 2021]
test set:	48
train set:	479


In [5]:
#see: https://scikit-learn.org/stable/auto_examples/neural_networks/plot_mlp_alpha.html#sphx-glr-auto-examples-neural-networks-plot-mlp-alpha-py
#see: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
#see: https://becominghuman.ai/what-does-feature-scaling-mean-when-to-normalize-data-and-when-to-standardize-data-c3de654405ed
#see: https://www.tensorflow.org/tutorials/structured_data/time_series
#see: https://towardsdatascience.com/preventing-data-leakage-in-your-machine-learning-model-9ae54b3cd1fb

class Keras_base():

    def __init__ (self, y_col : str, df : object, df_test : object ,run_optim : bool, data_folder : str, model_splitter : int):

        #set base infromation
        self.y_col : str            = y_col
        self.df : object            = df
        self.df_test                = df_test
        self.run_optim : bool       = run_optim
        self.model_splitter         = model_splitter #1 = v1, 2 = v2
        self.valid_frac             = 0.2

        #fixed model random seed
        self.random_state            = 42
        tf.random.set_seed(self.random_state)

        #nn parameters
        self.acitvation_func        = "selu"
        self.solver                 = tf.keras.optimizers.legacy.SGD(learning_rate=0.1) #tf.keras.optimizers.SGD(learning_rate=0.0001) #tf.keras.optimizers.Adam(learning_rate=0.001)
        self.initializer            = tf.keras.initializers.HeNormal(seed = self.random_state)
        self.loss_func              = tf.keras.losses.BinaryCrossentropy()
        self.n_epochs               = 10

        #arch params
        self.lin_arch_scaling       = 2
        self.cone_arch_scaling      = 2
        self.cone_arch_base_power   = 4
        self.n_layers               = 5

        #set saving infos
        self.log_file : str         = os.path.join(data_folder, "optim_log.txt")
        self.result_file : str      = os.path.join(data_folder, "rnnc_results.csv")

        #construct keras initializer with given random seed
        self.initializer = tf.keras.initializers.HeNormal(seed = self.random_state)

        #prepara data
        self.__standardize()
        self.__df_split()

        if self.df_test is not None:
            self.__prep_test_df()

        self.__generate_archs()
        self.n_features = self.X.shape[1]

        #execute code bases on inputs
        if self.run_optim:
            self.run_optim_child()

        #retrieve results from automation if it exists
        self.__get_results()

        return

    def __generate_archs(self):

        self.architectures : list = []
        n_features = len(self.X.columns.tolist())

        nodes_pow_2 = [2 ** (self.cone_arch_base_power + p) for p in range(1, self.n_layers + 1)][::-1]

        for n_layer in range(1, self.n_layers + 1):

            #linear
            for size in [n_features / self.lin_arch_scaling ,n_features, n_features * self.lin_arch_scaling]:
                arch_lin = [int(size)] * n_layer
                self.architectures.append(arch_lin)

            #cone
            arch_cone = nodes_pow_2[:n_layer]
            self.architectures.append(arch_cone)

        return

    def __standardize(self):

        #unstandardized copy
        self.df_unstand = self.df.copy()

        #standardize columns
        x_cols = self.df.columns.to_list(); x_cols.remove(self.y_col)
        self.df[x_cols] = (self.df[x_cols] - self.df[x_cols].mean()) / self.df[x_cols].std()

        #clean up all columns with no variation
        #unneded_cols = self.df.describe().T["std"].loc[self.df.describe().T["std"] == 0].index.to_list()
        #self.df.drop(labels = unneded_cols, axis = 1, inplace = True)

        true_indexes = []

        for index, value in self.df.std().isna().items():
                if value == True:
                    true_indexes.append(index)

        self.df.drop(labels = true_indexes, axis = 1, inplace = True)

        return

    def __df_split(self):

        #create X and y dfs
        self.X = self.df.drop(labels = self.y_col, axis = 1)
        self.y = self.df[self.y_col]

        #create train and valid models
        if self.model_splitter == 1:
            self.__single_model_split_v1()
        elif self.model_splitter == 2:
            self.__single_model_split_v2()

        return

    def save_result(self, model_type, train_score, valid_score, arch): #add features

        if os.path.isfile(self.result_file) is False:

            file = open(self.result_file, "w", newline='')
            writer = csv.writer(file)
            writer.writerow(["model_type","train_score", "valid_score","arch", "valid_model_splitter"]) #add features
            file.close()

        file = open(self.result_file, "a", newline='')
        writer = csv.writer(file)
        writer.writerow([model_type, train_score, valid_score, arch, self.model_splitter]) #add features
        file.close()

        return

    def __log(self, message):

        #create log entry
        log_time : str = datetime.now()
        message = f"source_mlp,{log_time},{message}\n"

        #write log entry
        file_object = open(self.log_file, 'a')
        file_object.write(message)
        file_object.close()

        return

    def __get_results(self):

        self.results = pd.read_csv(self.result_file)

    def __single_model_split_v1(self):

        #split
        index = round(self.df.shape[0] * self.valid_frac)

        self.X_train = self.X.iloc[index:]
        self.y_train = self.y.iloc[index:]

        self.X_valid = self.X.iloc[:index]
        self.y_valid = self.y.iloc[:index]

        return

    def __single_model_split_v2(self):

        years = list(set(self.df["year"].to_list()))
        years.sort()

        n_years = int(len(years) * self.valid_frac)
        n_years_half = ((n_years % 2) + n_years) / 2 #round to even numbers and split in half

        #get target year list
        valid_years = years[round(((len(years) - 1) / 2) - n_years_half) : round((len(years) / 2 ) -1 )] + years[int(-n_years_half):]
        train_years = [year for year in years if year not in valid_years]

        #generate valid and train dfs
        df_valid = self.df[self.df["year"].isin(valid_years)]
        df_train =self.df[self.df["year"].isin(train_years)]

        #generate x and y
        self.X_train = df_train.drop(labels = self.y_col, axis = 1)
        self.y_train = df_train[self.y_col]

        self.X_valid = df_valid.drop(labels = self.y_col, axis = 1)
        self.y_valid = df_valid[self.y_col]

        del df_valid, df_train, valid_years, train_years #free up memory

        return

    def __prep_test_df(self):

        #standardize
        x_cols = self.df.columns.to_list(); x_cols.remove(self.y_col)
        self.df_test[x_cols] = (self.df_test[x_cols] - self.df_unstand[x_cols].mean()) / self.df_unstand[x_cols].std()

        #split
        self.X_test = self.df_test.drop(labels = self.y_col, axis = 1)
        self.y_test = self.df_test[self.y_col]

        self.X_test = (self.X_test - self.X_test.mean()) / self.X_test.std()
        self.X_test.drop(labels = self.X_test.columns[self.X_test.isna().any()].tolist(), axis = 1, inplace = True)

        return

    def get_same_guess_accuracy(self):

        same_guess_prop_test = [self.y_test.describe()["mean"].round(2), 1 - self.y_test.describe()["mean"].round(2)]
        same_guess_prop_valid = [self.y_valid.describe()["mean"].round(2), 1 - self.y_valid.describe()["mean"].round(2)]

        self.same_guess_acc = {
            "same_guess_prop_valid" : same_guess_prop_valid,
            "same_guess_prop_test" : same_guess_prop_test,
        }

        print(self.same_guess_acc)

        return


    def clean_results (self):
        """Remove all same guess anwsers form the results tabel"""

        self.results[["train_score", "valid_score"]] = self.results[["train_score", "valid_score"]].round(2)

        #same_guess_prop_train = [self.y_train.describe()["mean"].round(2), 1 - self.y_train.describe()["mean"].round(2)]
        same_guess_prop_valid = [self.y_valid.describe()["mean"].round(2), 1 - self.y_valid.describe()["mean"].round(2)]

        #clean up results df
        self.results["same_guess"] = 0
        self.results.loc[self.results["valid_score"].isin(same_guess_prop_valid), "same_guess"] = 1

        self.results = self.results.loc[self.results["same_guess"] != 1].sort_values(by = "valid_score", axis = 0, ascending = False)

        print(same_guess_prop_valid)

        return

    def get_single_model_results(self, train_score = None, valid_score = None):

        self.model_score = {
            "train" : train_score,
            "valid": valid_score,
            "test" : None
        }

        if self.df_test is None:
            return

        #predictions = self.model.predict(self.X_test)
        valid_score, valid_acc = self.model.evaluate(self.X_valid, self.y_valid)
        test_score, test_acc = self.model.evaluate(self.X_test, self.y_test)

        self.single_model_result = {
            "valid_acc" : round(valid_acc,2),
            "test_acc" : round(test_acc,2),
        }

        print(self.single_model_result)
        return

    def run_optim_child(self):
        raise NotImplementedError("Must override by calling child class")


In [6]:
class EarlyStopping(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs={}):

      if(logs.get("accuracy") < 0.001):
          print("\nMAEthreshold reached. Training stopped.")
          self.model.stop_training = True

## 5.2 RNNC Sequential

In [7]:
#recurrent neural network classifier liner

class RNNC_1(Keras_base):

    def run_optim_child(self):

        self.early_stopping = EarlyStopping()

        for arch in self.architectures:

            self.__create_uncompiled_model(arch)
            self.__create_model(arch)
            print(self.model.summary())
            del self.model

    def __create_windowed_dataset(self, window_size = 12, batch_size=36, shuffle_buffer=1000):

        y = np.array(self.df[self.y_col])

        # Extract the feature columns and convert to numpy array.
        features = np.array(df.drop(self.y_col, axis=1))

        # Create a TensorFlow dataset from the feature and y arrays.
        dataset = tf.data.Dataset.from_tensor_slices((features, y))

        # Create a windowed dataset.
        dataset = dataset.window(window_size, shift=1, drop_remainder=True)
        dataset = dataset.flat_map(lambda x, y: tf.data.Dataset.zip((x.batch(window_size), y.batch(window_size))))

        # Shuffle and batch the dataset.
        dataset = dataset.shuffle(shuffle_buffer).batch(batch_size)

        self.dataset = dataset

        return

    def create_single_model_windowd(self, arch):

        self.early_stopping = EarlyStopping()
        self.__create_windowed_dataset()
        self.__create_uncompiled_model(arch, windowing = True)
        self.__create_model(windowing = True)

    def create_single_model(self, arch):

        self.early_stopping = EarlyStopping()
        self.__create_uncompiled_model(arch)
        self.__create_model()

    def __create_uncompiled_model(self, arch, windowing = False):

        #init model and initializer
        self.model = tf.keras.models.Sequential()

        #add a automaticali scaling input layer
        if windowing is False:
            shape = [self.n_features]
        else:
            shape = []

        self.model.add(tf.keras.layers.Lambda(
            lambda x: tf.expand_dims(x, axis=-1),
            input_shape=shape, #working: [None]
            #kernel_initializer = self.initializer,
            #activation = self.acitvation_func
        ))

        #add LSTM layers as hidden layers
        for i in range(len(arch)):


            if (i + 1) == len(arch):
                return_sequences = False
            else:
                return_sequences = True

            #hidden layers
            self.model.add(tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(
                    arch[i],
                    #kernel_initializer = self.initializer,
                    activation = self.acitvation_func,
                    return_sequences = return_sequences,
                )
            ))

        #add a Dense output layer
        self.model.add(tf.keras.layers.Dense(1, activation = self.acitvation_func))

        return

    def __create_model(self, arch = None, windowing = False):
        #see https://keras.io/api/models/model_training_apis/

        #compile model
        self.model.compile(
            loss = self.loss_func,
            optimizer = self.solver,
            metrics = "accuracy" #binary_accuracy
        )

        #fit model and fetch history
        if windowing is False:
            self.history = self.model.fit(
                x = self.X_train,
                y = self.y_train,
                validation_data = (self.X_valid,self.y_valid),
                shuffle = False, #keep in order because it is time series data
                epochs = self.n_epochs,
                callbacks =[self.early_stopping]
            )

        else:
            self.history = self.model.fit(
                self.dataset,
                shuffle = False, #keep in order because it is time series data
                epochs = self.n_epochs,
                callbacks =[self.early_stopping]
            )

        #get results and write them
        train_accuracy = self.history.history['accuracy'][-1] # binary_accuracy
        valid_accuracy = self.history.history["val_accuracy"][-1]

        if self.run_optim:
            self.save_result(model_type = "RNNC_SEQ", train_score = train_accuracy, valid_score = valid_accuracy, arch = arch)
        else:
            self.get_single_model_results(train_score = train_accuracy, valid_score = valid_accuracy)

        return


In [8]:
rnnc_optim = RNNC_1(

    y_col = "t2m_cat_offset",
    df = df,
    df_test = df_test,
    run_optim = run_optim,
    data_folder = data_folder,
    model_splitter = 2,
)

In [9]:
rnnc_optim.results

Unnamed: 0,model_type,train_score,valid_score,arch,valid_model_splitter
0,RNNC_SEQ,0.507576,0.349398,[13],2
1,RNNC_SEQ,0.507576,0.349398,[26],2
2,RNNC_SEQ,0.507576,0.349398,[52],2
3,RNNC_SEQ,0.507576,0.349398,[512],2
4,RNNC_SEQ,0.492424,0.650602,"[13, 13]",2
5,RNNC_SEQ,0.492424,0.650602,"[26, 26]",2
6,RNNC_SEQ,0.472222,0.421687,"[52, 52]",2
7,RNNC_SEQ,0.507576,0.349398,"[512, 256]",2
8,RNNC_SEQ,0.517677,0.481928,"[13, 13, 13]",2
9,RNNC_SEQ,0.507576,0.349398,"[26, 26, 26]",2


In [10]:
rnnc_optim.clean_results()
rnnc_optim.results

[0.65, 0.35]


Unnamed: 0,model_type,train_score,valid_score,arch,valid_model_splitter,same_guess
8,RNNC_SEQ,0.52,0.48,"[13, 13, 13]",2,0
12,RNNC_SEQ,0.49,0.47,"[13, 13, 13, 13]",2,0
6,RNNC_SEQ,0.47,0.42,"[52, 52]",2,0
18,RNNC_SEQ,0.5,0.39,"[52, 52, 52, 52, 52]",2,0
15,RNNC_SEQ,0.48,0.37,"[512, 256, 128, 64]",2,0
16,RNNC_SEQ,0.47,0.3,"[13, 13, 13, 13, 13]",2,0


In [11]:
rnnc_optim.create_single_model(arch = [13, 13, 13, 13, 13])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
{'valid_acc': 0.35, 'test_acc': 0.56}


In [12]:
rnnc_optim.get_single_model_results()


{'valid_acc': 0.35, 'test_acc': 0.56}


In [13]:
rnnc_optim.get_same_guess_accuracy()

{'same_guess_prop_valid': [0.65, 0.35], 'same_guess_prop_test': [0.44, 0.56]}


Findings: 
- The model creates same guess outputs
- Not enough data is available for the model to converge

## 5.3 RNNC Sequential with shifting Window
see https://www.tensorflow.org/tutorials/structured_data/time_series

In [14]:
rnnc_windowd = RNNC_1(
    y_col = "t2m_cat_offset",
    df = df,
    df_test = df_test,
    run_optim = False,
    data_folder = data_folder,
    model_splitter = 2,
)

In [18]:
try:
    rnnc_windowd.create_single_model_windowd(arch = [13, 13, 13, 13, 13])
except:
    print("Joel hadn't time to implement this or fix the problem")

Joel hadn't time to implement this or fix the problem


In [16]:
rnnc_optim.get_single_model_results()

{'valid_acc': 0.35, 'test_acc': 0.56}


In [17]:
rnnc_optim.get_same_guess_accuracy()

{'same_guess_prop_valid': [0.65, 0.35], 'same_guess_prop_test': [0.44, 0.56]}
