In [4]:
from keras.datasets import fashion_mnist
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
# Load Fashion-MNIST (fetched on first use) as separate training and test pairs
# of (images, labels).
train_split, test_split = fashion_mnist.load_data()
trainx, trainy = train_split
testx, testy = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [6]:
#onehot encoder
def onehot_encoder(Y, num_classes=10):
  """Turn integer class labels into a one-hot matrix with one column per example.

  Args:
    Y: 1-D array-like of integer labels, each in ``[0, num_classes)``.
    num_classes: number of rows in the result; defaults to 10.

  Returns:
    Float array of shape ``(num_classes, len(Y))`` holding 1.0 at
    ``[label, example]`` and 0.0 everywhere else.

  Raises:
    IndexError: if any label lies outside ``[0, num_classes)``. Negative
      labels are rejected explicitly because NumPy indexing would otherwise
      wrap them around to the last rows.
  """
  labels = np.asarray(Y).ravel()
  num_points = labels.shape[0]
  Y_encoded = np.zeros((num_classes, num_points))
  if num_points == 0:
    # Nothing to mark; also avoids indexing with an empty float array.
    return Y_encoded
  if labels.min() < 0 or labels.max() >= num_classes:
    raise IndexError("class label outside [0, %d)" % num_classes)
  # Fancy indexing sets every (label, column) pair in a single step.
  Y_encoded[labels, np.arange(num_points)] = 1.0
  return Y_encoded

In [7]:
# Example counts: 90% / 10% of the training images for train / validation,
# plus the size of the test set.
total_train_images = trainx.shape[0]
num_examples_train = int(0.9 * total_train_images)
num_examples_val = int(0.1 * total_train_images)
num_examples_test = testx.shape[0]


In [8]:
# Hold out 10% of the training images for validation. A fixed random_state
# makes the shuffle, and therefore every metric reported below, reproducible
# across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(
    trainx, trainy, test_size=0.1, shuffle=True, random_state=42)

In [9]:
# Flatten every 28x28 image into one column (features x examples) and divide
# the pixel values by 255.
X_train = x_train.reshape(x_train.shape[0], 28 * 28).T / 255
X_val = x_val.reshape(x_val.shape[0], 28 * 28).T / 255
X_test = testx.reshape(testx.shape[0], 28 * 28).T / 255


In [10]:
# One-hot encode the label vectors of the three splits (classes x examples).
Y_train, Y_val, Y_test = (
    onehot_encoder(split_labels) for split_labels in (y_train, y_val, testy)
)


In [13]:
pip install wandb


Collecting wandb
  Downloading wandb-0.12.10-py2.py3-none-any.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 21.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 25.4 MB/s eta 0:00:01[K     |▋                               | 30 kB 14.5 MB/s eta 0:00:01[K     |▊                               | 40 kB 10.7 MB/s eta 0:00:01[K     |█                               | 51 kB 3.7 MB/s eta 0:00:01[K     |█▏                              | 61 kB 4.4 MB/s eta 0:00:01[K     |█▍                              | 71 kB 4.8 MB/s eta 0:00:01[K     |█▌                              | 81 kB 4.5 MB/s eta 0:00:01[K     |█▊                              | 92 kB 5.0 MB/s eta 0:00:01[K     |██                              | 102 kB 4.3 MB/s eta 0:00:01[K     |██                              | 112 kB 4.3 MB/s eta 0:00:01[K     |██▎                             | 122 kB 4.3 MB/s eta 0:00:01[K     |██▌                             | 133 kB 4.3 MB/s eta 0:00:01

In [14]:
import wandb

In [16]:
#activations
import numpy as np


def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise."""
    denominator = 1 + np.exp(-(z))
    return 1.0 / denominator


def tanh(z):
    """Element-wise hyperbolic tangent (delegates to ``np.tanh``)."""
    squashed = np.tanh(z)
    return squashed


def sin(z):
    """Element-wise sine (delegates to ``np.sin``)."""
    wave = np.sin(z)
    return wave


def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    rectified = np.maximum(0, z)
    return rectified


def softmax(Z):
    """Numerically stable softmax.

    Args:
        Z: logits. A scalar or 1-D array is treated as a single distribution
            and normalized over all of its entries. A 2-D array is treated as
            ``(classes, examples)`` and each column is normalized separately,
            so a ``(classes, 1)`` column vector behaves as a single
            distribution.

    Returns:
        Array with the shape of ``Z`` whose entries are positive and sum to 1
        along the normalized axis.
    """
    Z = np.asarray(Z, dtype=float)
    axis = 0 if Z.ndim > 1 else None
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing to inf (which would yield nan after the
    # division).
    shifted = Z - np.max(Z, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)


def grad_sigmoid(z):
    """Derivative of the logistic function at ``z``: ``s * (1 - s)`` with ``s = sigmoid(z)``."""
    s = 1.0 / (1 + np.exp(-(z)))
    return s * (1 - s)

def grad_tanh(z):
    """Derivative of tanh at ``z``: ``1 - tanh(z)**2``, element-wise."""
    t = np.tanh(z)
    return 1 - t ** 2

def grad_relu(z):
    """Derivative of ReLU, element-wise: 1 where ``z > 0`` and 0 elsewhere.

    Accepts scalars as well as arrays. A plain ``if z > 0`` only works for a
    single value and raises ValueError for the column vectors used during
    back-propagation, so the comparison is done element-wise instead.

    Returns:
        Integer array with the shape of ``z`` (0-d for a scalar input).
    """
    return np.where(np.asarray(z) > 0, 1, 0)


In [1]:
 #initializers
def xavier_initializer(op,ip):
  """Draw an ``(op, ip)`` weight matrix from N(0, std) with ``std = sqrt(2 / (ip + op))``.

  ``op`` is the number of rows of the result and ``ip`` the number of columns.
  """
  fan_total = ip + op
  std = np.sqrt(2 / fan_total)
  weight_shape = (op, ip)
  return np.random.normal(loc=0, scale=std, size=weight_shape)

def random_initializer(op,ip):
  """Draw an ``(op, ip)`` weight matrix from the standard normal distribution."""
  weight_shape = (op, ip)
  return np.random.normal(loc=0, scale=1, size=weight_shape)

In [15]:
import numpy as np
import scipy as sp
#import wandb

class neural_network:
  """Feed-forward softmax classifier trained one example at a time with NumPy.

  Data layout: inputs are ``(features, examples)`` and labels are one-hot
  ``(classes, examples)``; every forward/backward pass handles one column.
  ``self.W[k]`` and ``self.b[k]`` (k = 1 .. len(layers) - 1) map layer k-1 to
  layer k.

  After construction ``self.optimizer`` is the bound training method selected
  by name; call it as
  ``optimizer(epochs, num_examples, batch_size, learning_rate, lamda)``.

  NOTE: the ``loss`` argument is stored in ``self.loss_function`` but the
  training loop always reports and differentiates cross-entropy.
  """

  def __init__(
      self,
      layer_neurons,
      X_train,
      Y_train,
      num_examples_train,
      X_val,
      Y_val,
      num_examples_val,
      X_test,
      Y_test,
      num_examples_test,
      optimizer,
      batch_size,
      lamda,
      learning_rate,
      max_epochs,
      activation,
      initializer,
      loss):
    """Store the data and hyperparameters, resolve the named components and initialize the weights.

    Args:
      layer_neurons: layer sizes, input first and output last.
      X_train, X_val, X_test: inputs, one example per column.
      Y_train, Y_val, Y_test: one-hot labels, one example per column.
      num_examples_train, num_examples_val, num_examples_test: number of
        columns to use from the matching split.
      optimizer: "sgd", "mgd", "nag", "rmsprop", "adam" or "nadam".
      batch_size, lamda, learning_rate, max_epochs: stored for the caller to
        pass on to ``self.optimizer``.
      activation: "sigmoid", "tanh" or "relu" (hidden layers).
      initializer: "random" or "xavier".
      loss: stored as ``self.loss_function``; not consulted by training.

    Raises:
      ValueError: for an unknown initializer, activation or optimizer name.
    """
    # network layout
    self.layers = layer_neurons

    # number of examples per split
    self.num_examples_train = num_examples_train
    self.num_examples_val = num_examples_val
    self.num_examples_test = num_examples_test

    # dataset
    self.X_train = X_train
    self.X_val = X_val
    self.X_test = X_test
    self.Y_train = Y_train
    self.Y_val = Y_val
    self.Y_test = Y_test

    # loss name (kept for callers; see class docstring)
    self.loss_function = loss

    # weight initializer
    initializers = {"random": random_initializer, "xavier": xavier_initializer}
    if initializer not in initializers:
      raise ValueError('this is wrong initializer')
    self.initializer = initializers[initializer]

    # hidden-layer activation and its derivative
    activations = {
        "sigmoid": (sigmoid, grad_sigmoid),
        "tanh": (tanh, grad_tanh),
        "relu": (relu, grad_relu),
    }
    if activation not in activations:
      raise ValueError('this is wrong activation')
    self.activation, self.grad_activation = activations[activation]

    # training routine
    optimizers = {
        "sgd": self.sgd,
        "mgd": self.mgd,
        "nag": self.nag,
        "rmsprop": self.rmsprop,
        "adam": self.adam,
        "nadam": self.nadam,
    }
    if optimizer not in optimizers:
      raise ValueError("wrong optimizer")
    self.optimizer = optimizers[optimizer]

    # hyperparameters
    self.max_epochs = max_epochs
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.lamda = lamda

    # weights and biases
    self.W, self.b = self.Neuralnet_init(self.layers)

  # ---------------------------------------------------------------- losses
  def cross_entropy_loss(self, Y_true, Y_pred):
    """Return ``-sum(Y_true * log(Y_pred))``.

    Predictions are clipped away from 0 before the log so that an exactly-zero
    probability gives a large finite loss instead of nan/inf.
    """
    clipped = np.clip(Y_pred, 1e-12, None)
    return np.sum(-(Y_true * np.log(clipped)))

  def L2_loss(self, lamda):
    """Return the L2 penalty ``(lamda / 2) * sum of squared weights`` over all layers (biases excluded)."""
    squared_total = sum(np.sum(self.W[i + 1] ** 2) for i in range(len(self.W)))
    return (lamda / 2) * squared_total

  def mse_loss(self, Y_true, Y_pred):
    """Return the summed squared error between ``Y_true`` and ``Y_pred``."""
    return np.sum((Y_true - Y_pred) ** 2)

  # ------------------------------------------------- accuracy and predict
  def accuracy(self, Y_true, Y_pred, num_examples):
    """Fraction of the first ``num_examples`` columns whose arg-max rows agree.

    Returns 0.0 when ``num_examples`` is 0 instead of dividing by zero.
    """
    if num_examples == 0:
      return 0.0
    true_labels = np.argmax(Y_true[:, :num_examples], axis=0)
    pred_labels = np.argmax(Y_pred[:, :num_examples], axis=0)
    return np.count_nonzero(true_labels == pred_labels) / num_examples

  def predict(self, X, num_examples):
    """Run the forward pass on the first ``num_examples`` columns of ``X``.

    Returns:
      Array of shape ``(layers[-1], num_examples)``; column i is the network
      output for ``X[:, i]``.
    """
    n_in, n_out = self.layers[0], self.layers[-1]
    Y_pred = np.empty((n_out, num_examples))
    for i in range(num_examples):
      y_hat, _, _ = self.forwardPropagate(X[:, i].reshape(n_in, 1), self.W, self.b)
      Y_pred[:, i] = y_hat.reshape(n_out)
    return Y_pred

  # ------------------------------------------------------- initialization
  def Neuralnet_init(self,layer_neurons):
    """Create weights with ``self.initializer`` and zero biases for every layer transition.

    Returns:
      ``(W, b)`` dicts keyed 1 .. len(self.layers) - 1, where ``W[k]`` has
      shape ``(layer_neurons[k], layer_neurons[k-1])`` and ``b[k]`` has shape
      ``(layer_neurons[k], 1)``.
    """
    num_layers = len(self.layers)
    W = {}
    b = {}
    for i in range(0, num_layers - 1):
      W[i + 1] = self.initializer(layer_neurons[i + 1], layer_neurons[i])
      b[i + 1] = np.zeros((layer_neurons[i + 1], 1))
    return W, b

  def _zero_params(self):
    """Return ``(w, b)`` dicts of zero arrays shaped like the weights and biases."""
    zero_w = {}
    zero_b = {}
    for k in range(1, len(self.layers)):
      zero_w[k] = np.zeros((self.layers[k], self.layers[k - 1]))
      zero_b[k] = np.zeros((self.layers[k], 1))
    return zero_w, zero_b

  # ------------------------------------------------- forward and backward
  def forwardPropagate(self,X, W, b):
    """Forward pass for one input column ``X`` using the given ``W`` and ``b``.

    Returns:
      ``(y_pred, H, A)``: the softmax output, the per-layer activations
      (``H[0]`` is the input, ``H[last]`` is ``y_pred``) and the per-layer
      pre-activations (``A[0]`` is the input).
    """
    num_layers = len(self.layers)
    H = {0: X}
    A = {0: X}

    # hidden layers
    for k in range(0, num_layers - 2):
      A[k + 1] = np.dot(W[k + 1], H[k]) + b[k + 1]
      H[k + 1] = self.activation(A[k + 1])

    # output layer
    A[num_layers - 1] = np.dot(W[num_layers - 1], H[num_layers - 2]) + b[num_layers - 1]
    y_pred = softmax(A[num_layers - 1])
    H[num_layers - 1] = y_pred
    return y_pred, H, A

  def backPropagate(self,Ypred,H,A,Y_train,lamda=0):
    """Gradients of cross-entropy plus the L2 penalty for one example.

    Args:
      Ypred: softmax output from ``forwardPropagate``.
      H, A: activations / pre-activations from ``forwardPropagate``.
      Y_train: one-hot target column.
      lamda: L2 coefficient; the weight gradient gets ``lamda * W``, the
        derivative of the ``(lamda / 2) * sum(W**2)`` penalty in ``L2_loss``.

    Returns:
      ``(grad_W, grad_b)`` dicts keyed like ``self.W`` / ``self.b``.
    """
    num_layers = len(self.layers)
    grad_W = {}
    grad_b = {}

    # Softmax followed by cross-entropy has this output-layer gradient.
    grad_A = {num_layers - 1: -(Y_train - Ypred)}

    for k in range(num_layers - 2, -1, -1):
      grad_W[k + 1] = np.outer(grad_A[k + 1], H[k]) + lamda * self.W[k + 1]
      grad_b[k + 1] = grad_A[k + 1]

      # The input layer (k == 0) has no parameters below it, so the error
      # only needs to be pushed back through hidden layers.
      if k > 0:
        grad_H = np.dot(self.W[k + 1].T, grad_A[k + 1])
        grad_A[k] = grad_H * self.grad_activation(A[k])

    return grad_W, grad_b

  # -------------------------------------------------- shared training loop
  def _report_epoch(self, epoch, train_loss, train_acc, val_acc):
    """Print the metrics of one epoch and send them to wandb."""
    print("epoch:",epoch,"training loss:",train_loss,"training accuracy",train_acc,"validation accuracy",val_acc)
    wandb.log({'epoch':epoch,'training_loss':train_loss,'training_accuracy':train_acc, 'validation_accuracy':val_acc })

  def _train(self, epochs, num_examples, batch_size, lamda, apply_update, before_batch=None):
    """Epoch / mini-batch loop shared by all optimizers.

    Per-example gradients are summed over a batch and handed to
    ``apply_update(del_w, del_b)``, which must update ``self.W`` / ``self.b``.
    ``before_batch()``, when given, runs just before the first example of each
    batch. A trailing partial batch is applied at the end of the epoch rather
    than discarded.

    Returns:
      Predictions on ``self.X_train`` (first ``self.num_examples_train``
      columns) made after the last epoch.

    Raises:
      ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
      raise ValueError("batch_size must be a positive integer")

    param_keys = range(1, len(self.layers))
    n_in, n_out = self.layers[0], self.layers[-1]
    X = self.X_train[:, :num_examples]
    Y = self.Y_train[:, :num_examples]
    n_points = X.shape[1]

    Y_pred_final = None
    for epoch in range(epochs):
      loss_sum = 0.0
      del_w, del_b = self._zero_params()
      in_batch = 0

      for i in range(n_points):
        if in_batch == 0 and before_batch is not None:
          before_batch()

        x = X[:, i].reshape(n_in, 1)
        y = Y[:, i].reshape(n_out, 1)
        y_hat, H, A = self.forwardPropagate(x, self.W, self.b)
        grad_w, grad_b = self.backPropagate(y_hat, H, A, y, lamda)

        # accumulate the batch gradient
        for k in param_keys:
          del_w[k] = del_w[k] + grad_w[k]
          del_b[k] = del_b[k] + grad_b[k]

        loss_sum += self.cross_entropy_loss(y, y_hat) + self.L2_loss(lamda)
        in_batch += 1

        if in_batch == batch_size or i == n_points - 1:
          apply_update(del_w, del_b)
          del_w, del_b = self._zero_params()
          in_batch = 0

      train_loss = loss_sum / n_points if n_points else 0.0

      Y_pred_final = self.predict(self.X_train, self.num_examples_train)
      train_acc = self.accuracy(Y, Y_pred_final, num_examples)
      val_acc = self.accuracy(
          self.Y_val,
          self.predict(self.X_val, self.num_examples_val),
          self.num_examples_val)
      self._report_epoch(epoch, train_loss, train_acc, val_acc)

    if Y_pred_final is None:
      # epochs == 0: nothing was trained, report the untrained predictions.
      Y_pred_final = self.predict(self.X_train, self.num_examples_train)
    return Y_pred_final

  # ------------------------------------------------------------ optimizers
  def sgd(self,epochs,num_examples,batch_size,learning_rate, lamda):
    """Stochastic gradient descent: one weight update per training example.

    ``batch_size`` is accepted so that all optimizers share a signature, but
    it is not used. Returns the final training-set predictions.
    """
    def apply_update(del_w, del_b):
      for k in self.W:
        self.W[k] = self.W[k] - learning_rate * del_w[k]
        self.b[k] = self.b[k] - learning_rate * del_b[k]

    return self._train(epochs, num_examples, 1, lamda, apply_update)

  def mgd(self,epochs,num_examples,batch_size,learning_rate, lamda):
    """Mini-batch gradient descent with momentum (gamma = 0.9).

    Update rule: ``v = gamma * v + eta * grad`` followed by ``w = w - v``, so
    the learning rate is applied exactly once. Returns the final training-set
    predictions.
    """
    gamma = 0.9
    eta = learning_rate
    v_w, v_b = self._zero_params()

    def apply_update(del_w, del_b):
      for k in self.W:
        v_w[k] = gamma * v_w[k] + eta * del_w[k]
        v_b[k] = gamma * v_b[k] + eta * del_b[k]
        self.W[k] = self.W[k] - v_w[k]
        self.b[k] = self.b[k] - v_b[k]

    return self._train(epochs, num_examples, batch_size, lamda, apply_update)

  def nag(self,epochs,num_examples,batch_size,learning_rate,lamda):
    """Nesterov accelerated gradient (gamma = 0.9).

    Before each batch the parameters are moved to the look-ahead point
    ``w - gamma * v``; the batch gradient is measured there and then
    ``v = gamma * v + eta * grad``, ``w = w - v`` is applied. Returns the final
    training-set predictions.
    """
    gamma = 0.9
    eta = learning_rate
    v_w, v_b = self._zero_params()

    def look_ahead():
      for k in self.W:
        self.W[k] = self.W[k] - gamma * v_w[k]
        self.b[k] = self.b[k] - gamma * v_b[k]

    def apply_update(del_w, del_b):
      for k in self.W:
        v_w[k] = gamma * v_w[k] + eta * del_w[k]
        v_b[k] = gamma * v_b[k] + eta * del_b[k]
        # The parameters already sit at w - gamma * v_old, so subtracting
        # eta * grad lands exactly on w - v_new.
        self.W[k] = self.W[k] - eta * del_w[k]
        self.b[k] = self.b[k] - eta * del_b[k]

    return self._train(epochs, num_examples, batch_size, lamda, apply_update,
                       before_batch=look_ahead)

  def rmsprop(self,epochs,num_examples, batch_size, learning_rate, lamda):
    """RMSProp (beta = 0.9, eps = 1e-8): scale each step by a running RMS of the gradient.

    Returns the final training-set predictions.
    """
    eta = learning_rate
    eps = 10 ** (-8)
    beta = 0.9
    s_w, s_b = self._zero_params()

    def apply_update(del_w, del_b):
      for k in self.W:
        s_w[k] = beta * s_w[k] + (1 - beta) * (del_w[k] ** 2)
        s_b[k] = beta * s_b[k] + (1 - beta) * (del_b[k] ** 2)
        self.W[k] = self.W[k] - (eta / np.sqrt(s_w[k] + eps)) * del_w[k]
        self.b[k] = self.b[k] - (eta / np.sqrt(s_b[k] + eps)) * del_b[k]

    return self._train(epochs, num_examples, batch_size, lamda, apply_update)

  def _adam_family(self, epochs, num_examples, batch_size, learning_rate, lamda, nesterov):
    """Shared implementation of Adam (``nesterov=False``) and Nadam (``nesterov=True``).

    Uses beta1 = 0.9, beta2 = 0.99, eps = 1e-8. Both moment estimates are
    bias-corrected by dividing by ``1 - beta ** t`` where t counts parameter
    updates starting at 1.
    """
    eta = learning_rate
    eps = 10 ** (-8)
    beta1 = 0.9
    beta2 = 0.99
    m_w, m_b = self._zero_params()
    v_w, v_b = self._zero_params()
    step_count = 0

    def apply_update(del_w, del_b):
      nonlocal step_count
      step_count += 1
      m_scale = 1 - beta1 ** step_count
      v_scale = 1 - beta2 ** step_count

      groups = ((self.W, del_w, m_w, v_w), (self.b, del_b, m_b, v_b))
      for params, grads, m, v in groups:
        for k in params:
          g = grads[k]
          m[k] = beta1 * m[k] + (1 - beta1) * g
          v[k] = beta2 * v[k] + (1 - beta2) * (g ** 2)
          m_hat = m[k] / m_scale
          v_hat = v[k] / v_scale
          if nesterov:
            # Nadam blends the corrected momentum with the current gradient.
            direction = beta1 * m_hat + ((1 - beta1) / m_scale) * g
          else:
            direction = m_hat
          params[k] = params[k] - (eta / (np.sqrt(v_hat) + eps)) * direction

    return self._train(epochs, num_examples, batch_size, lamda, apply_update)

  def adam(self, epochs,num_examples, batch_size, learning_rate, lamda):
    """Adam optimizer; returns the final training-set predictions."""
    return self._adam_family(epochs, num_examples, batch_size, learning_rate, lamda,
                             nesterov=False)

  def nadam(self, epochs,num_examples, batch_size, learning_rate, lamda ):
    """Nadam (Adam with a Nesterov-style momentum term); returns the final training-set predictions."""
    return self._adam_family(epochs, num_examples, batch_size, learning_rate, lamda,
                             nesterov=True)





In [18]:
# Experiment configuration.
layer_neurons = [784, 128, 10]   # 28*28 inputs, one hidden layer, 10 outputs
activation = "tanh"
initializer = "random"
loss = "cross"
optimizer = "rmsprop"
learning_rate = 10 ** (-3)
lamda = 0.5
batch_size = 16
max_epochs = 2

In [None]:
# Every optimizer calls wandb.log() once per epoch, which requires an active
# run. Start one here unless an earlier cell has already done so.
if wandb.run is None:
    wandb.init(config={
        "layer_neurons": layer_neurons,
        "optimizer": optimizer,
        "batch_size": batch_size,
        "lamda": lamda,
        "learning_rate": learning_rate,
        "max_epochs": max_epochs,
        "activation": activation,
        "initializer": initializer,
        "loss": loss,
    })

nn =neural_network(layer_neurons=layer_neurons,
        X_train=X_train,
        Y_train=Y_train,
        num_examples_train = num_examples_train,
        X_val = X_val,
        Y_val = Y_val,
        num_examples_val = num_examples_val,
        X_test = X_test,
        Y_test= Y_test,
        num_examples_test = num_examples_test,
        optimizer = optimizer,
        batch_size = batch_size,
        lamda = lamda,
        learning_rate = learning_rate,
        max_epochs = max_epochs,
        activation = activation,
        initializer = initializer,
        loss = loss
        )

# Train with the optimizer selected above; the return value holds the
# network's predictions on the training set after the last epoch.
Y_pred_train = nn.optimizer(nn.max_epochs, nn.num_examples_train, nn.batch_size, nn.learning_rate,nn.lamda)