In [None]:
class Identity(Activation):

    """
    Identity activation: the output is the input, unchanged.
    """

    # Provided ready-made as a worked example of the activation interface
    # (forward caches its result in ``self.state``; derivative reads it back).

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Cache ``x`` in ``self.state`` and hand it back untouched."""
        self.state = x
        return self.state

    def derivative(self):
        """Return the slope of the identity map, which is the constant 1.0."""
        return 1.0

In [None]:
class Sigmoid(Activation):

    """
    Logistic sigmoid activation, 1 / (1 + exp(-x)), applied element-wise.
    """

    # The method signatures must stay exactly as they are: the external
    # grader (AutoLab) calls them by name.

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Compute the sigmoid of ``x``, cache it in ``self.state`` and return it."""
        denominator = 1 + np.exp(-x)
        self.state = 1 / denominator
        return self.state

    def derivative(self):
        """Return s * (1 - s), where s is the output cached by ``forward``."""
        cached = self.state
        return cached * (1 - cached)
        

In [None]:
class Tanh(Activation):

    """
    Hyperbolic tangent activation, applied element-wise.
    """

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        """Compute tanh(x), cache it in ``self.state`` and return it.

        ``np.tanh`` is used instead of the explicit
        (e^x - e^-x) / (e^x + e^-x) quotient: for large |x| both exponentials
        in that quotient overflow to ``inf`` and the result becomes ``nan``,
        whereas ``np.tanh`` saturates cleanly at +/-1.
        """
        self.state = np.tanh(x)
        return self.state

    def derivative(self):
        """Return 1 - tanh(x)^2 using the output cached by ``forward``."""
        return 1 - self.state * self.state
     

In [None]:
class ReLU(Activation):

    """
    Rectified linear unit activation, max(0, x), applied element-wise.
    """

    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, x):
        """Compute max(0, x), cache it in ``self.state`` and return it."""
        self.state = np.maximum(0, x)
        return self.state

    def derivative(self):
        """Return the element-wise slope of ReLU for the cached output.

        The slope is 1.0 where the cached output is strictly positive and 0.0
        elsewhere (the value at exactly 0 is taken to be 0). The result is an
        array with the same shape as ``self.state``.
        """
        # np.where builds the mask array; a bare (cond, 1, 0) would only
        # create a Python tuple and break the element-wise multiply upstream.
        return np.where(self.state > 0, 1.0, 0.0)

In [None]:
class SoftmaxCrossEntropy(Criterion):
    """
    Softmax followed by cross-entropy loss, computed per sample.
    """

    def __init__(self):
        super(SoftmaxCrossEntropy, self).__init__()

    def forward(self, x, y):
        """
        Compute the per-sample softmax cross-entropy loss.

        Argument:
            x (np.array): (batch size, 10) raw logits
            y (np.array): (batch size, 10) target distribution (e.g. one-hot)
        Return:
            out (np.array): (batch size, )

        Side effects: stores ``self.logits``, ``self.labels``, ``self.expl``
        (shifted exponentials) and ``self.sig`` (softmax probabilities) for
        use by ``derivative``.
        """
        self.logits = x
        self.labels = y

        # Subtract the row-wise maximum before exponentiating so that the
        # largest exponent is exp(0) = 1 and nothing overflows.
        row_max = np.max(self.logits, axis=1, keepdims=True)
        self.expl = np.exp(self.logits - row_max)
        exp_total = self.expl.sum(axis=1, keepdims=True)

        # Softmax probabilities, shape (batch size, 10).
        self.sig = self.expl / exp_total

        # log softmax = logits - (max + log(sum(exp(logits - max)))).
        log_softmax = self.logits - row_max - np.log(exp_total)

        # Cross-entropy: reduce over the class axis to get one loss per sample.
        return -(self.labels * log_softmax).sum(axis=1)

    def derivative(self):
        """
        Gradient of the loss with respect to the logits.

        Return:
            out (np.array): (batch size, 10), equal to softmax(x) - y
        """
        return self.sig - self.labels

        
        

In [None]:
class Linear():
    """
    Fully connected (affine) layer computing ``x @ W + b``.
    """

    def __init__(self, in_feature, out_feature, weight_init_fn, bias_init_fn):

        """
        Argument:
            in_feature (int): number of input features
            out_feature (int): number of output features
            weight_init_fn (callable): called as weight_init_fn(in_feature,
                out_feature); its result is stored as W
            bias_init_fn (callable): called as bias_init_fn(out_feature);
                its result is stored as b

        Attributes:
            W (np.array): (in feature, out feature)
            dW (np.array): (in feature, out feature)
            momentum_W (np.array): (in feature, out feature)

            b (np.array): (1, out feature)
            db (np.array): (1, out feature)
            momentum_b (np.array): (1, out feature)
        """

        self.W = weight_init_fn(in_feature, out_feature)
        self.b = bias_init_fn(out_feature)

        # Gradient buffers start at zero with the same shapes as the
        # parameters they belong to.
        self.dW = np.zeros((in_feature, out_feature))
        self.db = np.zeros((1, out_feature))

        # Momentum buffers, likewise zero-initialised.
        self.momentum_W = np.zeros((in_feature, out_feature))
        self.momentum_b = np.zeros((1, out_feature))

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, out feature)
        """
        # Keep the input: backward() needs it to form the weight gradient.
        self.x = x
        # Input on the left so (batch, in) @ (in, out) -> (batch, out);
        # the (1, out) bias broadcasts over the batch axis.
        return np.matmul(self.x, self.W) + self.b

    def backward(self, delta):
        """
        Argument:
            delta (np.array): (batch size, out feature), gradient of the loss
                with respect to this layer's output
        Return:
            out (np.array): (batch size, in feature), gradient of the loss
                with respect to this layer's input

        Side effects: overwrites ``self.dW`` and ``self.db`` with gradients
        averaged over the batch.
        """
        batch_size = delta.shape[0]
        self.dW = np.dot(self.x.T, delta) / batch_size
        # keepdims keeps db at (1, out feature), matching b.
        self.db = np.sum(delta, axis=0, keepdims=True) / batch_size
        return np.dot(delta, self.W.T)

        

In [None]:
class MLP(object):

    """
    A simple multilayer perceptron: a stack of Linear layers, each followed
    by the activation at the same index in ``activations``.
    """

    def __init__(self, input_size, output_size, hiddens, activations, weight_init_fn,
                 bias_init_fn, criterion, lr):
        """
        Argument:
            input_size (int): number of input features
            output_size (int): number of output features
            hiddens (list of int): sizes of the hidden layers, in order
            activations (list): one activation per linear layer; each is
                called on the layer output and must expose ``state`` and
                ``derivative()``
            weight_init_fn (callable): forwarded to every Linear layer
            bias_init_fn (callable): forwarded to every Linear layer
            criterion: loss object, called as criterion(outputs, labels) and
                exposing ``derivative()``
            lr (float): learning rate used by ``step``
        """

        # Don't change this -->
        self.train_mode = True
        self.nlayers = len(hiddens) + 1
        self.input_size = input_size
        self.output_size = output_size
        self.activations = activations
        self.criterion = criterion
        self.lr = lr
        # <---------------------

        # Attribute names below are checked by the autograder.

        # Consecutive entries of `sizes` give each layer's (in, out) pair:
        # (input_size, hiddens[0]), ..., (hiddens[-1], output_size).
        sizes = [self.input_size, *hiddens, self.output_size]
        self.linear_layers = [
            Linear(fan_in, fan_out, weight_init_fn, bias_init_fn)
            for fan_in, fan_out in zip(sizes[:-1], sizes[1:])
        ]

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, input_size)
        Return:
            out (np.array): (batch size, output_size)

        Side effects: stores the result in ``self.output`` (read by
        ``error`` and ``total_loss``).
        """
        for i, layer in enumerate(self.linear_layers):
            # Affine transform, then the matching activation.
            x = self.activations[i](layer(x))
        # Return only after every layer has run.
        self.output = x
        return self.output

    def zero_grads(self):
        """Reset every layer's dW and db to zero, in place."""
        for layer in self.linear_layers:
            layer.dW.fill(0.0)
            layer.db.fill(0.0)

    def step(self):
        """Apply one gradient-descent update, param -= lr * grad, to every layer."""
        for layer in self.linear_layers:
            layer.W = layer.W - self.lr * layer.dW
            layer.b = layer.b - self.lr * layer.db

    def backward(self, labels):
        """
        Back-propagate the loss for ``labels`` through the network.

        Must be called after ``forward``: it reads the last activation's
        cached ``state`` as the network output. Each layer's ``backward``
        is invoked with the gradient at that layer's output.
        """
        final_outputs = self.activations[-1].state
        # The criterion call is needed for its side effect: it caches what
        # criterion.derivative() uses. The loss value itself is not used here.
        self.criterion(final_outputs, labels)
        delta = self.criterion.derivative()

        # Walk the layers last to first.
        for i in reversed(range(self.nlayers)):
            # Chain rule through the activation (element-wise) ...
            delta = delta * self.activations[i].derivative()
            # ... then through the linear layer, which returns the gradient
            # with respect to its own input.
            delta = self.linear_layers[i].backward(delta)

    def error(self, labels):
        """Count samples whose argmax prediction differs from the argmax label."""
        return (np.argmax(self.output, axis = 1) != np.argmax(labels, axis = 1)).sum()

    def total_loss(self, labels):
        """Return the criterion's loss on ``self.output``, summed over the batch."""
        return self.criterion(self.output, labels).sum()

    def __call__(self, x):
        return self.forward(x)

    def train(self):
        """Switch the network to training mode."""
        self.train_mode = True

    def eval(self):
        """Switch the network to evaluation mode."""
        self.train_mode = False