In [None]:

######################################################################################
######################################################################################
################################      Question 3     #####################################
######################################################################################
######################################################################################
import h5py
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sn
import time
import sys


class humanActivityRecognition(object):


    def __init__(self, size, part_Q):

        self.part_Q = part_Q
        self.size = size
        self.sizeOfLayer = len(size) - 1
        self.mlp_sizeOfLayer = None
        self.mlp_coeff, self.coeffOf_firstLayer, self.mlp_action, self.actionOf_firstLayer = None, None, None, None
        self.init_coeff()


    def init_coeff(self):
        """
        This is a function that appears to be initializing the coefficients 
        (also known as parameters) of a machine learning model. It looks like 
        the function is part of a class, as it uses the self keyword to access 
        class attributes.

        The function appears to be implementing initialization for a multi-layer
        perceptron (MLP) and also for the first layer of the model. It looks like 
        the initialization for the first layer depends on the value of the part_Q 
        attribute of the class, which can be 1, 2, or 3.

        The function uses the np.random.uniform function to initialize the 
        coefficients using the Xavier distribution, which is a common initialization 
        technique that helps prevent the vanishing and exploding gradient problems 
        in deep neural networks. The function also initializes momentum, which is
        a hyperparameter used in optimization algorithms such as stochastic 
        gradient descent (SGD) with momentum.


        """


        part_Q = self.part_Q
        size = self.size
        sizeOfLayer = self.sizeOfLayer

        W = []
        b = []
        for i in range(1, sizeOfLayer):
            
            r = np.sqrt(6 / (size[i] + size[i + 1]))
            W.append(np.random.uniform(-r, r, size=(size[i], size[i + 1])))
            b.append(np.zeros((1, size[i + 1])))

        self.mlp_sizeOfLayer = len(W)
        coeff = {"W": W, "b": b}
        action = {"W": [0] * self.mlp_sizeOfLayer, "b": [0] * self.mlp_sizeOfLayer}
        self.mlp_coeff = coeff
        self.mlp_action = action

        N = size[0]
        H = size[1]
        Z = N + H

        if part_Q == 1:
            r = np.sqrt(6 / (N + H))
            Wih = np.random.uniform(-r, r, size=(N, H))
            r = np.sqrt(6 / (H + H))
            Whh = np.random.uniform(-r, r, size=(H, H))
            b = np.zeros((1, H))

            coeff = {"Wih": Wih, "Whh": Whh, "b": b}

        if part_Q == 2:
            r = np.sqrt(6 / (Z + H))

            Wf = np.random.uniform(-r, r, size=(Z, H))
            Wi = np.random.uniform(-r, r, size=(Z, H))
            Wc = np.random.uniform(-r, r, size=(Z, H))
            Wo = np.random.uniform(-r, r, size=(Z, H))

            bf = np.zeros((1, H))
            bi = np.zeros((1, H))
            bc = np.zeros((1, H))
            bo = np.zeros((1, H))

            coeff = {"Wf": Wf, "bf": bf,
                      "Wi": Wi, "bi": bi,
                      "Wc": Wc, "bc": bc,
                      "Wo": Wo, "bo": bo}

        if part_Q == 3:
            rN = np.sqrt(6 / (N + H))
            rH = np.sqrt(6 / (H + H))

            Wz = np.random.uniform(-rN, rN, size=(N, H))
            Uz = np.random.uniform(-rH, rH, size=(H, H))
            bz = np.zeros((1, H))

            Wr = np.random.uniform(-rN, rN, size=(N, H))
            Ur = np.random.uniform(-rH, rH, size=(H, H))
            br = np.zeros((1, H))

            Wh = np.random.uniform(-rN, rN, size=(N, H))
            Uh = np.random.uniform(-rH, rH, size=(H, H))
            bh = np.zeros((1, H))

            coeff = {"Wz": Wz, "Uz": Uz, "bz": bz,
                      "Wr": Wr, "Ur": Ur, "br": br,
                      "Wh": Wh, "Uh": Uh, "bh": bh}


        action = dict.fromkeys(coeff.keys(), 0)
        self.coeffOf_firstLayer = coeff
        self.actionOf_firstLayer = action


    def train(self, X, Y, eta, alpha, batch_size, epoch):
        """
        This is a method for training a neural network using mini-batch gradient
         descent. The method takes the following arguments:

        X: a 2D array of input data, where each row represents a sample and each 
        column represents a feature.
        Y: a 2D array of labels for the input data, where each row represents a 
        sample and each column represents a label.
        eta: the learning rate.
        alpha: the momentum coefficient.
        batch_size: the size of the mini-batches used to train the model.
        epoch: the number of epochs to train the model.
        The method first divides the data into a training set and a validation 
        set, and then trains the model on the training set using mini-batches. 
        It tracks the training loss, validation loss, training accuracy, and 
        validation accuracy over the course of training and returns these values 
        in a dictionary at the end of training. The method also has an early 
        stopping mechanism that stops training if the validation loss reaches 
        convergence, defined as being within a certain range of the average 
        validation loss over the past 15 epochs.
        """

        listof_trainLoss = []
        listOf_valueLoss = []
        listOf_trainAccuracy = []
        listOf_valueAccuracy = []

        # create validation set
        sizeOfValue = int(X.shape[0] / 10)
        p = np.random.permutation(X.shape[0])
        valueOfX = X[p][:sizeOfValue]
        valueOfY = Y[p][:sizeOfValue]
        X = X[p][sizeOfValue:]
        Y = Y[p][sizeOfValue:]

        sample_size = X.shape[0]
        iterationOf_perEpoch = int(sample_size / batch_size)

        for i in range(epoch):

            time_start = time.time()

            start = 0
            end = batch_size
            p = np.random.permutation(X.shape[0])
            X = X[p]
            Y = Y[p]

            for j in range(iterationOf_perEpoch):

                X_batch = X[start:end]
                Y_batch = Y[start:end]

              
                pred, o, drv, h, h_derivative, memory = self.forward_propagation_pass(X_batch)

              
                delta = pred
                delta[Y_batch == 1] -= 1
                delta = delta / batch_size

                
                fl_grads, mlp_grads = self.Backpropagation_pass(X_batch, o, drv, delta, h, h_derivative, memory)

                
                self.update_coeff(eta, alpha, fl_grads, mlp_grads)

                start = end
                end += batch_size

           
            pred = self.predict(X, acc=False)
            lossOfTrain = self.cross_entropy(Y, pred)

      
            accuracyOfTrain = self.predict(X, Y, acc=True)

           
            accuracyOfValue = self.predict(valueOfX, valueOfY, acc=True)

            # val loss
            pred = self.predict(valueOfX, acc=False)
            valueOfLoss = self.cross_entropy(valueOfY, pred)


            print('Train Loss: %.2f, Val Loss: %.2f, Train Acc: %.2f, Val Acc: %.2f [Epoch: %d of %d]'
                  % (lossOfTrain, valueOfLoss, accuracyOfTrain, accuracyOfValue, i + 1, epoch))

            listof_trainLoss.append(lossOfTrain)
            listOf_valueLoss.append(valueOfLoss)
            listOf_trainAccuracy.append(accuracyOfTrain)
            listOf_valueAccuracy.append(accuracyOfValue)


            # stop if the cross entropy of validation set converged
            if i > 15:
                conv = listOf_valueLoss[-16:-1]
                conv = sum(conv) / len(conv)

                limit = 0.02
                if (conv - limit) < valueOfLoss < (conv + limit):
                    print("\nTraining stopped since validation C-E reached convergence.")
                    return {"listof_trainLoss": listof_trainLoss, "listOf_valueLoss": listOf_valueLoss,
                            "listOf_trainAccuracy": listOf_trainAccuracy, "listOf_valueAccuracy": listOf_valueAccuracy}


        return {"listof_trainLoss": listof_trainLoss, "listOf_valueLoss": listOf_valueLoss,
                "listOf_trainAccuracy": listOf_trainAccuracy, "listOf_valueAccuracy": listOf_valueAccuracy}


    def forward_propagation_pass(self, X):
        """
        This is a method for performing a forward pass through a neural network. 
        The method takes as input a 2D array X of input data and returns the 
        following outputs:

        pred: the predicted output of the model.
        o: a list of the functionOfActivations of the hidden layers.
        drv: a list of the derivatives of the functionOfActivations of the hidden layers.
        h: the hidden state of the recurrent layer.
        h_derivative: the resultOfDerivation of the hidden state of the recurrent layer.
        memory: the memory of the recurrent layer.
        The method first checks the type of the recurrent layer (either 
        RNN, LSTM, or GRU) and computes the hidden state h and its resultOfDerivation 
        h_derivative using the appropriate method. It then applies the ReLU functionOfActivation 
        function to the hidden state and computes the output of the model using 
        the softmax functionOfActivation function. The functionOfActivations and derivatives of the 
        hidden layers are stored in the o and drv lists, respectively. The memory 
        of the recurrent layer is also stored.

        """

        part_Q = self.part_Q
        P_ofMlp = self.mlp_coeff
        p_fl = self.coeffOf_firstLayer

        o = []
        drv = []

        h = 0
        h_derivative = 0
        memory = 0

        # first layer
        if part_Q == 1:
            h, h_derivative = self.forward_recurrent(X, p_fl)
            o.append(h[:, -1, :])
            drv.append(h_derivative[:, -1, :])
        if part_Q == 2:
            h, memory = self.forward_lstm(X, p_fl)
            o.append(h)
            drv.append(1)
        if part_Q == 3:
            h, memory = self.forward_gru(X, p_fl)
            o.append(h)
            drv.append(1)

        # relu layers
        for i in range(self.mlp_sizeOfLayer - 1):
            functionOfActivation, resultOfDerivation = self.forward_perceptron(o[-1], P_ofMlp["W"][i], P_ofMlp["b"][i], "relu")
            o.append(functionOfActivation)
            drv.append(resultOfDerivation)

        # output layer
        pred = self.forward_perceptron(o[-1], P_ofMlp["W"][-1], P_ofMlp["b"][-1], "softmax")[0]

        return pred, o, drv, h, h_derivative, memory


    def Backpropagation_pass(self, X, o, drv, delta, h=None, h_derivative=None, memory=None):
        """
        This is a method for performing a backward pass through a neural network 
        to compute the gradients of the model's parameters. The method takes the 
        following arguments:

        X: a 2D array of input data, where each row represents a sample and each
         column represents a feature.
        o: a list of the functionOfActivations of the hidden layers.
        drv: a list of the derivatives of the functionOfActivations of the hidden layers.
        delta: the error of the output layer.
        h: the hidden state of the recurrent layer (optional, only used for RNN).
        h_derivative: the resultOfDerivation of the hidden state of the recurrent layer (optional,
        only used for  RNN).
        memory: the memory of the recurrent layer (optional, only used for LSTM and GRU).
        The method first uses the error of the output layer to compute the gradients of 
        the parameters of the output layer and the fully-connected layers. It then 
        backpropagates the error through the fully-connected layers using the functionOfActivations 
        and derivatives stored in the o and drv lists. Finally, it computes the gradients 
        of the parameters of the recurrent layer using the appropriate method depending 
        on the type of the recurrent layer (either RNN, LSTM, or GRU). The method 
        returns the gradients of the parameters of the recurrent layer and the fully-connected layers.
                """
        part_Q = self.part_Q
        p_fl = self.coeffOf_firstLayer
        P_ofMlp = self.mlp_coeff

        fl_grads = dict.fromkeys(p_fl.keys())
        mlp_grads = {"W": [0] * self.mlp_sizeOfLayer, "b": [0] * self.mlp_sizeOfLayer}

        # backpropagation until recurrent
        for i in reversed(range(self.mlp_sizeOfLayer)):
            mlp_grads["W"][i], mlp_grads["b"][i], delta = self.backward_perceptron(P_ofMlp["W"][i], o[i], drv[i], delta)

        # backpropagation through time
        if part_Q == 1:
            fl_grads = self.backward_recurrent(X, h, h_derivative, delta, p_fl)
        if part_Q == 2:
            fl_grads = self.backward_lstm(memory, p_fl, delta)
        if part_Q == 3:
            fl_grads = self.backward_gru(X, memory, p_fl, delta)

        return fl_grads, mlp_grads


    def update_coeff(self, eta, alpha, fl_grads, mlp_grads):
       
        """
        This is a method for updating the model's parameters using the gradients
         computed in the backward pass. The method takes the following arguments:

        eta: the learning rate.
        alpha: the momentum coefficient.
        fl_grads: a dictionary of the gradients of the parameters of the recurrent layer.
        mlp_grads: a dictionary of the gradients of the parameters of the fully-connected layers.
        """

       
        p_fl = self.coeffOf_firstLayer
        fl_m = self.actionOf_firstLayer
        P_ofMlp = self.mlp_coeff
        mlp_m = self.mlp_action

 
        for p in self.coeffOf_firstLayer:
            fl_m[p] = eta * fl_grads[p] + alpha * fl_m[p]
            p_fl[p] -= fl_m[p]

        
        for i in range(self.mlp_sizeOfLayer):
            mlp_m["W"][i] = eta * mlp_grads["W"][i] + alpha * mlp_m["W"][i]
            mlp_m["b"][i] = eta * mlp_grads["b"][i] + alpha * mlp_m["b"][i]
            P_ofMlp["W"][i] -= mlp_m["W"][i]
            P_ofMlp["b"][i] -= mlp_m["b"][i]

       
        self.coeffOf_firstLayer = p_fl
        self.actionOf_firstLayer = fl_m
        self.mlp_coeff = P_ofMlp
        self.mlp_action = mlp_m


    def forward_perceptron(self, X, W, b, a):
       
        u = X @ W + b
        return self.functionOfActivation(u, a)


    def backward_perceptron(self, W, o, drv, delta):
       
        dW = o.T @ delta
        db = delta.sum(axis=0, keepdims=True)
        delta = drv * (delta @ W.T)
        return dW, db, delta


    def forward_recurrent(self, X, p_fl):
        """
        This is a method for performing a forward pass through a RNN 
        layer. The method takes as input a 3D array X of input data and a dictionary 
        p_fl of the parameters of the recurrent layer, and returns the following 
        outputs:

        h: the hidden state of the recurrent layer.
        h_drv: the resultOfDerivation of the hidden state of the recurrent layer.
        The method loops over the time steps of the input data and computes the 
        hidden state h and its resultOfDerivation h_drv using the tanh functionOfActivation function. 
        The hidden state at each time step is computed using the previous hidden
        state and the current input, as well as the weights Wih and Whh and the 
        bias term b of the recurrent layer. The resultOfDerivation of the hidden state 
        is computed using the resultOfDerivation of the tanh function. The method returns
        the hidden state and its resultOfDerivation for each time step.
        """

        N, T, D = X.shape
        H = self.size[1]

        Wih = p_fl["Wih"]
        Whh = p_fl["Whh"]
        b = p_fl["b"]

        previousOf_h = np.zeros((N, H))
        h = np.empty((N, T, H))
        h_derivative = np.empty((N, T, H))

        for t in range(T):
            x = X[:, t, :]
            u = x @ Wih + previousOf_h @ Whh + b
            h[:, t, :], h_derivative[:, t, :] = self.functionOfActivation(u, "tanh")
            previousOf_h = h[:, t, :]

        return h, h_derivative


    def backward_recurrent(self, X, h, h_derivative, delta, p_fl):
        """
        This is a method for performing a backward pass through a RNN layer to 
        compute the gradients of the layer's parameters. The method takes the 
        following arguments:

        X: a 3D array of input data, where the first dimension represents the batch
        size, the second dimension represents the time steps, and the third dimension
        represents the features.
        h: the hidden state of the recurrent layer.
        h_derivative: the resultOfDerivation of the hidden state of the recurrent layer.
        delta: the error of the output layer.
        p_fl: a dictionary of the parameters of the recurrent layer.
        The method loops over the time steps in reverse order and computes the 
        gradients of the parameters Wih, Whh, and b using the error delta and the
        hidden state and its resultOfDerivation at each time step. The error at each time
        step is computed by backpropagating the error from the next time step
        using the weights Whh of the recurrent layer. The method returns the 
        gradients of the parameters as a dictionary.
        """

        N, T, D = X.shape
        H = self.size[1]

        Whh = p_fl["Whh"]

        dWih = 0
        dWhh = 0
        db = 0

        for t in reversed(range(T)):
            x = X[:, t, :]

            if t > 0:
                previousOf_h = h[:, t - 1, :]
                previousOf_h_derv = h_derivative[:, t - 1, :]
            else:
                previousOf_h = np.zeros((N, H))
                previousOf_h_derv = 0

            dWih += x.T @ delta
            dWhh += previousOf_h.T @ delta
            db += delta.sum(axis=0, keepdims=True)
            delta = previousOf_h_derv * (delta @ Whh)

        return {"Wih": dWih, "Whh": dWhh, "b": db}


    def forward_lstm(self, X, p_fl):
        """
        The LSTM is a type of recurrent neural network that is often used for 
        modeling sequential data.

        In this implementation, the input X is a three-dimensional tensor with dimensions
        N, T, and D, where N is the batch size, T is the sequence length, and D is the 
        input feature dimension. The p_fl argument is a dictionary that contains the weight 
        matrices and biases for the LSTM. The function returns the hidden state of the LSTM 
        at the final time step and a memory containing intermediate values that are needed 
        for backpropagation.

        The LSTM computes the hidden state at each time step t by using the hidden state at
        the previous time step, the input at the current time step, and some other 
        intermediate values. These intermediate values are computed using element-wise
        multiplication and non-linear functionOfActivation functions such as sigmoid and tanh. 
        The final hidden state at time T is returned as the output of the function.
        """

        N, T, D = X.shape
        H = self.size[1]

        Wf, bf = p_fl["Wf"], p_fl["bf"]
        Wi, bi = p_fl["Wi"], p_fl["bi"]
        Wc, bc = p_fl["Wc"], p_fl["bc"]
        Wo, bo = p_fl["Wo"], p_fl["bo"]

        previousOf_h = np.zeros((N, H))
        previousOf_c = np.zeros((N, H))
        z = np.empty((N, T, D + H))
        c = np.empty((N, T, H))
        tanhc = np.empty((N, T, H))
        hf = 0
        hi = np.empty((N, T, H))
        hc = np.empty((N, T, H))
        ho = np.empty((N, T, H))
        tanhc_d = np.empty((N, T, H))
        hf_d = np.empty((N, T, H))
        hi_d = np.empty((N, T, H))
        hc_d = np.empty((N, T, H))
        ho_d = np.empty((N, T, H))

        for t in range(T):
            z[:, t, :] = np.column_stack((previousOf_h, X[:, t, :]))
            z_cur = z[:, t, :]

            hf, hf_d[:, t, :] = self.functionOfActivation(z_cur @ Wf + bf, "sigmoid")
            hi[:, t, :], hi_d[:, t, :] = self.functionOfActivation(z_cur @ Wi + bi, "sigmoid")
            hc[:, t, :], hc_d[:, t, :] = self.functionOfActivation(z_cur @ Wc + bc, "tanh")
            ho[:, t, :], ho_d[:, t, :] = self.functionOfActivation(z_cur @ Wo + bo, "sigmoid")

            c[:, t, :] = hf * previousOf_c + hi[:, t, :] * hc[:, t, :]
            tanhc[:, t, :], tanhc_d[:, t, :] = self.functionOfActivation(c[:, t, :], "tanh")
            previousOf_h = ho[:, t, :] * tanhc[:, t, :]
            previousOf_c = c[:, t, :]

            memory = {"z": z,
                     "c": c,
                     "tanhc": (tanhc, tanhc_d),
                     "hf_d": hf_d,
                     "hi": (hi, hi_d),
                     "hc": (hc, hc_d),
                     "ho": (ho, ho_d)}

        return previousOf_h, memory


    def backward_lstm(self, memory, p_fl, delta):
        """
      This is a function for implementing the backward pass of an LSTM (Long 
      Short-Term Memory) network, a type of recurrent neural network. In the backward pass,
      the function is responsible for computing the gradients of the loss function with 
      respect to the model's parameters (i.e., the weights and biases of the LSTM).

      The function takes in several arguments:

      memory: a dictionary containing the functionOfActivations and intermediate values 
      computed during the forward pass of the LSTM.
      p_fl: a dictionary containing the weights and biases of the LSTM.
      delta: the error gradient with respect to the output of the LSTM at the 
      final time step.
      The function first unpacks the weights and biases from the p_fl dictionary.
      It then loops through the time steps in the reverse order, starting from the 
      final time step and going back to the first time step. At each time step, it 
      computes the gradients of the loss function with respect to the four "gates" 
      in the LSTM (i.e., the forget gate, input gate, output gate, and cell state). 
      It then updates the gradients of the weights and biases by adding the gradients 
      at that time step. Finally, it updates the error gradient delta by adding up the 
      gradients with respect to the input to the LSTM at that time step.

        """
        # unpack variables
        Wf = p_fl["Wf"]
        Wi = p_fl["Wi"]
        Wc = p_fl["Wc"]
        Wo = p_fl["Wo"]

        z = memory["z"]
        c = memory["c"]
        tanhc, tanhc_d = memory["tanhc"]
        hf_d = memory["hf_d"]
        hi, hi_d = memory["hi"]
        hc, hc_d = memory["hc"]
        ho, ho_d = memory["ho"]

        H = self.size[1]
        T = z.shape[1]

        # initialize gradients to zero
        dWf = 0
        dWi = 0
        dWc = 0
        dWo = 0
        dbf = 0
        dbi = 0
        dbc = 0
        dbo = 0

        for t in reversed(range(T)):

            z_cur = z[:, t, :]

        
            if t > 0:
                previousOf_c = c[:, t - 1, :]
            else:
                previousOf_c = 0

            dc = delta * ho[:, t, :] * tanhc_d[:, t, :]
            dhf = dc * previousOf_c * hf_d[:, t, :]
            dhi = dc * hc[:, t, :] * hi_d[:, t, :]
            dhc = dc * hi[:, t, :] * hc_d[:, t, :]
            dho = delta * tanhc[:, t, :] * ho_d[:, t, :]

        
            dWf += z_cur.T @ dhf
            dbf += dhf.sum(axis=0, keepdims=True)

            dWi += z_cur.T @ dhi
            dbi += dhi.sum(axis=0, keepdims=True)

            dWc += z_cur.T @ dhc
            dbc += dhc.sum(axis=0, keepdims=True)

            dWo += z_cur.T @ dho
            dbo += dho.sum(axis=0, keepdims=True)

          
            dxf = dhf @ Wf.T[:, :H]
            dxi = dhi @ Wi.T[:, :H]
            dxc = dhc @ Wc.T[:, :H]
            dxo = dho @ Wo.T[:, :H]

            delta = (dxf + dxi + dxc + dxo)  
        grads = {"Wf": dWf, "bf": dbf,
                 "Wi": dWi, "bi": dbi,
                 "Wc": dWc, "bc": dbc,
                 "Wo": dWo, "bo": dbo}

        return grads


    def forward_gru(self, X, p_fl):
        """
        This is a function for implementing the forward pass of a GRU (Gated 
        Recurrent Unit) network, another type of recurrent neural network. In the 
        forward pass, the function is responsible for computing the functionOfActivations of 
        the GRU at each time step given the input data and the model's parameters 
        (i.e., the weights and biases of the GRU).

        The function takes in two arguments:

        X: a 3D array of shape (N, T, D), where N is the batch size, T is the 
        number of time steps, and D is the input dimension. This represents the 
        input data for the GRU, with one time step per row.
        p_fl: a dictionary containing the weights and biases of the GRU.
        The function first unpacks the weights and biases from the p_fl dictionary. 
        It then initializes several variables to hold the functionOfActivations of the GRU at 
        each time step and the intermediate values computed during the forward pass. 
        It then loops through the time steps, starting from the first time step and 
        going to the final time step. At each time step, it computes the functionOfActivations 
        of the update gate, reset gate, and hidden state using the sigmoid and tanh 
        functionOfActivation functions. It then updates the hidden state using the functionOfActivations 
        of the update and reset gates and the previous hidden state. Finally, it 
        updates the previous hidden state with the current hidden state.

        Once the loop is complete, the function returns the final hidden state of 
        the GRU and a dictionary containing the functionOfActivations and intermediate values 
        computed during the forward pass.

        """

        Wz = p_fl["Wz"]
        Wr = p_fl["Wr"]
        Wh = p_fl["Wh"]

        Uz = p_fl["Uz"]
        Ur = p_fl["Ur"]
        Uh = p_fl["Uh"]

        bz = p_fl["bz"]
        br = p_fl["br"]
        bh = p_fl["bh"]

        N, T, D = X.shape
        H = self.size[1]

        previousOf_h = np.zeros((N, H))

        z = np.empty((N, T, H))
        z_d = np.empty((N, T, H))
        r = np.empty((N, T, H))
        r_d = np.empty((N, T, H))
        h_tilde = np.empty((N, T, H))
        h_tilde_d = np.empty((N, T, H))
        h = np.empty((N, T, H))

        for t in range(T):
            x = X[:, t, :]
            z[:, t, :], z_d[:, t, :] = self.functionOfActivation(x @ Wz + previousOf_h @ Uz + bz, "sigmoid")
            r[:, t, :], r_d[:, t, :] = self.functionOfActivation(x @ Wr + previousOf_h @ Ur + br, "sigmoid")
            h_tilde[:, t, :], h_tilde_d[:, t, :] = self.functionOfActivation(x @ Wh + (r[:, t, :] * previousOf_h) @ Uh + bh, "tanh")
            h[:, t, :] = (1 - z[:, t, :]) * previousOf_h + z[:, t, :] * h_tilde[:, t, :]

            previousOf_h = h[:, t, :]

        memory = {"z": (z, z_d),
                 "r": (r, r_d),
                 "h_tilde": (h_tilde, h_tilde_d),
                 "h": h}

        return previousOf_h, memory


    def backward_gru(self, X, memory, p_fl, delta):
        """
        This is a function for implementing the backward pass of a GRU network. 
        In the backward pass, the function is responsible for computing the gradients 
        of the loss function with respect to the model's parameters (i.e., the weights and biases of the GRU).

        The function takes in several arguments:

        X: a 3D array of shape (N, T, D), where N is the batch size, T is the number of 
        time steps, and D is the input dimension. This represents the input data for the GRU.
        memory: a dictionary containing the functionOfActivations and intermediate values 
        computed during the forward pass of the GRU.

        p_fl: a dictionary containing the weights and biases of the GRU.
        delta: the error gradient with respect to the output of the GRU at the final time step.

        The function first unpacks the weights and biases from the p_fl dictionary 
        and the functionOfActivations and intermediate values from the memory dictionary. 
        It then initializes several variables to hold the gradients of the loss 
        function with respect to the weights and biases of the GRU. It then loops 
        through the time steps in the reverse order, starting from the final time 
        step and going back to the first time step. At each time step, it computes 
        the gradients of the loss function with respect to the update gate, reset gate, 
        and hidden state using the chain rule and the derivatives of the sigmoid 
        and tanh functionOfActivation functions. It then updates the gradients of the weights 
        and biases by adding the gradients at that time step. Finally, it updates 
        the error gradient delta by adding up the gradients with respect to the 
        input to the GRU at that time step.

        Once the loop is complete, the function returns a dictionary containing 
        the gradients of the loss function with respect to the weights and biases of the GRU.
        """

       
        Uz = p_fl["Uz"]
        Ur = p_fl["Ur"]
        Uh = p_fl["Uh"]

        z, z_d = memory["z"]
        r, r_d = memory["r"]
        h_tilde, h_tilde_d = memory["h_tilde"]
        h = memory["h"]

        H = self.size[1]
        N, T, D = X.shape

        
        dWz = 0
        dUz = 0
        dbz = 0
        dWr = 0
        dUr = 0
        dbr = 0
        dWh = 0
        dUh = 0
        dbh = 0

        for t in reversed(range(T)):
            x = X[:, t, :]

            # if t = 0 we want h(t-1) = 0
            if t > 0:
                previousOf_h = h[:, t - 1, :]
            else:
                previousOf_h = np.zeros((N, H))

            # similar to LSTM we find some intermediate values for each gate
            # dE/dz is named as dz for example, this is true for all naming
            dz = delta * (h_tilde[:, t, :] - previousOf_h) * z_d[:, t, :]
            dh_tilde = delta * z[:, t, :] * h_tilde_d[:, t, :]
            dr = (dh_tilde @ Uh.T) * previousOf_h * r_d[:, t, :]

            # add to the sum of gradients
            dWz += x.T @ dz
            dUz += previousOf_h.T @ dz
            dbz += dz.sum(axis=0, keepdims=True)

            dWr += x.T @ dr
            dUr += previousOf_h.T @ dr
            dbr += dr.sum(axis=0, keepdims=True)

            dWh += x.T @ dh_tilde
            dUh += previousOf_h.T @ dh_tilde
            dbh += dh_tilde.sum(axis=0, keepdims=True)

            # update delta, this step uses chain rule and resultOfDerivation of multiplication, at the end it simplifies to
            #the sum of these three terms
            d1 = delta * (1 - z[:, t, :])
            d2 = dz @ Uz.T
            d3 = (dh_tilde  @ Uh.T) * (r[:, t, :] + previousOf_h * (r_d[:, t, :] @ Ur.T))

            delta = d1 + d2 + d3


        grads = {"Wz": dWz, "Uz": dUz, "bz": dbz,
                 "Wr": dWr, "Ur": dUr, "br": dbr,
                 "Wh": dWh, "Uh": dUh, "bh": dbh}

        return grads


    def cross_entropy(self, desired, output):
      
        return np.sum(- desired * np.log(output)) / desired.shape[0]


    def functionOfActivation(self, X, a):
        """
        This is a function for implementing various functionOfActivation functions. functionOfActivation 
        functions are mathematical functions that are used to introduce non-linearity 
        into a neural network. They take in an input value or an array of input values 
        and return an output value or an array of output values.

        The function takes in two arguments:

        X: an array or a scalar value representing the input to the functionOfActivation function.
        
        a: a string indicating the type of functionOfActivation function to use.
        """

        if a == "tanh":
            functionOfActivation = np.tanh(X)
            resultOfDerivation = 1 - functionOfActivation ** 2
            return functionOfActivation, resultOfDerivation

        if a == "sigmoid":
            functionOfActivation = 1 / (1 + np.exp(-X))
            resultOfDerivation = functionOfActivation * (1 - functionOfActivation)
            return functionOfActivation, resultOfDerivation

        if a == "relu":
            functionOfActivation = X * (X > 0)
            resultOfDerivation = 1 * (X > 0)
            return functionOfActivation, resultOfDerivation

        if a == "softmax":
            functionOfActivation = np.exp(X) / np.sum(np.exp(X), axis=1, keepdims=True)
            resultOfDerivation = None
            return functionOfActivation, resultOfDerivation


    def predict(self, X, Y=None, acc=True, confusion = False):
        """
        This function method of a class that is used to predict labels for a given
         set of inputs X and compare the predictions to ground truth labels Y. 
         If acc is True, the function will compute the argmax of the prediction 
         and return the accuracy, which is the percentage of times that the predicted 
         labels match the ground truth labels. If confusion is True, the function will 
         return a confusion matrix, which is a KxK matrix where K is the number of classes. 
         The matrix will have the count of how many times each class was predicted as 
         each other class.

        """
        pred = self.forward_propagation_pass(X)[0]

        if not acc:
            return pred

        pred = pred.argmax(axis=1)
        Y = Y.argmax(axis=1)

        if not confusion:
            return (pred == Y).mean() * 100 #accuracy

        K = len(np.unique(Y))  # Number of classes
        c = np.zeros((K, K))

        for i in range(len(Y)):
            c[Y[i]][pred[i]] += 1

        return c


def _q3_plot_training_curves(title, eta, alpha, batch_size, size, history, tst_acc, out_path):
    """Draw the 2x2 grid of loss/accuracy curves for one run and save it to out_path."""
    train_loss, val_loss, train_acc, val_acc = history

    fig = plt.figure(figsize=(20, 10), dpi=160, facecolor='w', edgecolor='k')
    fig.suptitle(("{}\nLearning Rate {} | action = {} | Batch Size  = {} | Hidden Layers = {}\n"
                  "Train Accuracy: {:.1f} | Validation Accuracy: {:.1f} | Test Accuracy: {:.1f}\n ")
                 .format(title, eta, alpha, batch_size, size[2:-1], train_acc[-1], val_acc[-1], tst_acc),
                 fontsize=13)

    # (series, line colour, caption used for both label and title, y-axis label)
    panels = (
        (train_loss, "C2", "Train Cross Entropy Loss", "Loss"),
        (val_loss, "C3", "Validation Cross Entropy Loss", "Loss"),
        (train_acc, "C2", "Train Accuracy", "Accuracy"),
        (val_acc, "C3", "Validation Accuracy", "Accuracy"),
    )
    for position, (series, colour, caption, y_label) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(series, colour, label=caption)
        plt.title(caption)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)

    plt.savefig(out_path, bbox_inches='tight')


def _q3_plot_confusion_matrices(train_confusion, test_confusion, names, out_path):
    """Draw the train and test confusion heatmaps side by side and save them to out_path."""
    plt.figure(figsize=(20, 10), dpi=160)

    panels = (
        (train_confusion, "Train Confusion Matrix"),
        (test_confusion, "Test Confusion Matrix"),
    )
    for position, (matrix, caption) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sn.heatmap(matrix, annot=True, annot_kws={"size": 8}, xticklabels=names, yticklabels=names, cmap=sn.cm.rocket_r, fmt='g')
        plt.title(caption)
        plt.ylabel("Actual")
        plt.xlabel("Prediction")

    plt.savefig(out_path, bbox_inches='tight')


def q3():
    """
    Train and evaluate the three network variants of Question 3.

    Reads trX/trY/tstX/tstY from "data3.h5", then for part_Q = 1, 2, 3
    (announced on stdout as the Recurrent, LSTM and GRU layers) it:
      * builds a humanActivityRecognition model and trains it,
      * prints the test accuracy,
      * saves the loss/accuracy curves to q3a.png / q3b.png / q3c.png,
      * saves the train/test confusion heatmaps to q3a_confusion.png /
        q3b_confusion.png / q3c_confusion.png.

    All three runs share the same hyper-parameters. Takes no arguments and
    returns nothing; its results are the printed text and the image files.
    """
    filename = "data3.h5"
    # Context manager guarantees the file is closed even if a dataset is
    # missing or a read fails part-way through.
    with h5py.File(filename, 'r') as h5:
        trX = h5['trX'][()].astype('float64')
        tstX = h5['tstX'][()].astype('float64')
        trY = h5['trY'][()].astype('float64')
        tstY = h5['tstY'][()].astype('float64')

    alpha = 0.85
    eta = 0.01
    epoch = 10
    batch_size = 32
    # First entry is taken from the third axis of trX; the remaining entries
    # are the fixed layer widths, ending in 6 outputs.
    size = [trX.shape[2], 128, 32, 16, 6]

    # Tick labels for the confusion heatmaps.
    names = [1, 2, 3, 4, 5, 6]

    # (stdout banner, figure title prefix, part_Q selector, output file stem)
    parts = (
        ("Recurrent Layer\n", "RNN", 1, "q3a"),
        ("\nLSTM Layer\n", "LSTM", 2, "q3b"),
        ("\nGRU Layer\n", "GRU", 3, "q3c"),
    )

    for banner, title, part_Q, stem in parts:
        print(banner)

        nn = humanActivityRecognition(size, part_Q)
        # NOTE(review): relies on train() returning a mapping whose values are,
        # in order, train loss, validation loss, train accuracy, validation
        # accuracy -- confirm against train().
        listof_trainLoss, listOf_valueLoss, listOf_trainAccuracy, listOf_valueAccuracy = nn.train(trX, trY, eta, alpha, batch_size, epoch).values()
        tst_acc = nn.predict(tstX, tstY, acc=True)

        print("\nTest Accuracy: ", tst_acc, "\n\n")

        history = (listof_trainLoss, listOf_valueLoss, listOf_trainAccuracy, listOf_valueAccuracy)
        _q3_plot_training_curves(title, eta, alpha, batch_size, size, history, tst_acc, stem + ".png")

        train_confusion = nn.predict(trX, trY, acc=True, confusion=True)
        test_confusion = nn.predict(tstX, tstY, acc=True, confusion=True)

        _q3_plot_confusion_matrices(train_confusion, test_confusion, names, stem + "_confusion.png")


