In [None]:
import numpy as np

**1. Using the Sigmoid as a template, implement the following activation functions:**

  a. ReLU

  b. Tanh

  c. [EXPONENTIAL LINEAR UNITS (ELUS)](https://arxiv.org/pdf/1511.07289.pdf)  

  d. [Mish](https://arxiv.org/pdf/1908.08681.pdf)


In [None]:
class Sigmoid:
    """Logistic sigmoid activation, ``y = 1 / (1 + exp(-x))``.

    The forward result is cached on the instance so that ``backward`` can
    form the local derivative ``y * (1 - y)`` without recomputing it.
    """

    def __init__(self):
        # Set by forward(); read by backward().
        self.output = None

    def forward(self, x):
        """Return the element-wise sigmoid of ``x`` and cache it.

        The value is evaluated as ``exp(-log(1 + exp(-x)))`` through
        ``np.logaddexp`` so that large-magnitude negative inputs do not
        overflow inside ``exp`` (the naive form emits an overflow warning).
        """
        self.output = np.exp(-np.logaddexp(0.0, -x))
        return self.output

    def backward(self, dL_dy):
        """Return ``dL/dx`` given the upstream gradient ``dL_dy``.

        Uses the output cached by the most recent ``forward`` call.
        """
        dy_dx = self.output * (1 - self.output)
        return dL_dy * dy_dx

Note:

- `forward`: Takes in an input $x$ and computes the sigmoid activation. It also stores the output for use in the backward pass. We will learn about that later in the course and also in the next assignment.
  
- `backward`: Takes in the upstream gradient (usually represented as $\frac{dL}{dy}$, where $L$ is the loss and $y$ is the output of the sigmoid) and multiplies it by the local gradient (the derivative of the sigmoid) to produce the gradient of the loss with respect to the input, $\frac{dL}{dx}$. Note that $\frac{dL}{dy}$ is typically denoted as `grad_output` in many PyTorch examples. It is also called the upstream gradient.

For learning purposes or if you're trying to implement certain custom functionalities not provided by PyTorch, then the object-oriented approach with `forward` and `backward` methods is a great way to go. In this assignment, we must rely on Numpy for implementation, avoiding the high-level frameworks like PyTorch or TensorFlow.

In [None]:
# TODO: Implement ReLu class
class ReLu:
    """Rectified linear unit: keeps positive values and zeroes everything else."""

    def __init__(self):
        # Populated by forward(); backward() derives its slope mask from it.
        self.output = None

    def forward(self, x):
        """Clamp ``x`` at zero element-wise, cache the result and return it."""
        rectified = np.maximum(0, x)
        self.output = rectified
        return rectified

    def backward(self, dL_dy):
        """Scale the upstream gradient by the ReLU slope.

        The slope is 1 wherever the cached forward output is strictly
        positive and 0 elsewhere.
        """
        slope = np.where(self.output > 0, 1, 0)
        return dL_dy * slope

In [None]:
# TODO: Implement Thanh class
class Thanh:
    """Hyperbolic tangent activation.

    The forward result is cached so ``backward`` can use the identity
    ``d tanh(x) / dx = 1 - tanh(x) ** 2``.
    """

    def __init__(self):
        # Set by forward(); read by backward().
        self.output = None

    def forward(self, x):
        """Return the element-wise ``tanh`` of ``x`` and cache it.

        ``np.tanh`` is used instead of the explicit exponential ratio
        because that ratio evaluates to ``inf / inf = nan`` once ``exp``
        overflows for large-magnitude inputs.
        """
        self.output = np.tanh(x)
        return self.output

    def backward(self, dL_dy):
        """Return ``dL/dx`` given the upstream gradient ``dL_dy``."""
        dy_dx = 1 - self.output ** 2
        return dL_dy * dy_dx


# Conventionally spelled alias; ``Thanh`` is kept so existing callers still work.
Tanh = Thanh

In [None]:
# TODO: Implement ELUS class
class ELUS:
    """Exponential linear unit (ELU).

    ``y = x`` for ``x > 0`` and ``y = alpha * (exp(x) - 1)`` otherwise.
    """

    def __init__(self, alpha=1):
        """Create the activation.

        ``alpha`` scales the negative branch. It defaults to 1 (the original
        author chose this to follow the torch documentation) so the class
        can be used without arguments; pass another value to override it.
        """
        self.input = None   # set by forward(); read by backward()
        self.output = None  # set by forward()
        self.alpha = alpha

    def forward(self, x):
        """Return the element-wise ELU of ``x``; caches input and output."""
        self.input = x
        # np.where evaluates both branches for every element, so the
        # exponential is taken of min(x, 0): this leaves the selected values
        # unchanged but keeps large positive inputs from overflowing exp().
        # expm1 keeps precision for inputs close to zero.
        negative_branch = self.alpha * np.expm1(np.minimum(x, 0))
        self.output = np.where(self.input > 0, x, negative_branch)
        return self.output

    def backward(self, dL_dy):
        """Return ``dL/dx`` given the upstream gradient ``dL_dy``.

        The local slope is 1 for ``x > 0`` and ``alpha * exp(x)`` otherwise.
        """
        # Same clamping as in forward(): only the x <= 0 values are selected.
        negative_slope = self.alpha * np.exp(np.minimum(self.input, 0))
        dy_dx = np.where(self.input > 0, 1, negative_slope)
        return dL_dy * dy_dx

In [None]:
# TODO: Implement Mish class
class Mish:
    """Mish activation, ``y = x * tanh(softplus(x))``.

    ``softplus(x) = log(1 + exp(x))``.
    """

    def __init__(self):
        self.input = None   # set by forward(); read by backward()
        self.output = None  # set by forward()

    def forward(self, x):
        """Return the element-wise Mish of ``x``; caches input and output."""
        self.input = x
        # logaddexp(0, x) == log(1 + exp(x)) without overflowing for large x.
        self.output = x * np.tanh(np.logaddexp(0.0, x))
        return self.output

    def backward(self, dL_dy):
        """Return ``dL/dx`` given the upstream gradient ``dL_dy``.

        By the product and chain rules,
        ``dy/dx = tanh(sp) + x * sigmoid(x) * (1 - tanh(sp) ** 2)``
        with ``sp = softplus(x)``. This equals the expanded exponential
        closed form, but that form evaluates to ``inf / inf = nan`` once
        its ``exp`` terms overflow, whereas every term here stays bounded.
        """
        x = self.input
        tanh_sp = np.tanh(np.logaddexp(0.0, x))
        # d softplus / dx is the logistic sigmoid, written overflow-free.
        sigmoid = np.exp(-np.logaddexp(0.0, -x))
        dy_dx = tanh_sp + x * sigmoid * (1.0 - tanh_sp ** 2)
        return dL_dy * dy_dx

**2. Implement the CrossEntropyLoss Class.**

In many deep learning tasks, especially classification, the CrossEntropyLoss function plays a pivotal role. For this assignment, your task is to implement the `CrossEntropyLoss` class with both `forward` and `backward` methods.

Specifications:

1. **Input**:
    - `logits`: A 2D numpy array of shape `(batch_size, num_classes)`. Each row represents the logit scores (pre-softmax outputs) for each class of a particular sample.
    - `target`: A 2D numpy array of shape `(batch_size, num_classes)`, representing the one-hot encoded true labels.

2. **Output**:
    - The `forward` method should return a scalar representing the average loss over the entire batch.
    - The `backward` method should return the gradient of the loss with respect to the logits, which will be used for gradient-based optimization.

3. **Numerical Stability**: Ensure that your implementation is numerically stable, avoiding potential pitfalls like overflow or underflow in the softmax and logarithm calculations.

Note:
- CrossEntropyLoss is typically computed using logit inputs, i.e., the outputs of the model before the softmax function is applied. This is for computational efficiency.


In [None]:
#TODO: Implement the CrossEntropyLoss here
class CrossEntropyLoss:
    """Softmax cross-entropy on raw logits, averaged over the batch.

    ``forward`` caches what ``backward`` needs: the row-shifted logits in
    ``self.logits`` and the targets in ``self.target``.
    """

    def __init__(self):
        self.logits = None  # row-shifted logits from the last forward()
        self.target = None  # targets from the last forward()

    def forward(self, logits, target):
        """Return the mean cross-entropy of a batch as a scalar.

        ``logits`` and ``target`` are 2D arrays of identical shape with one
        row per sample; ``target`` holds the per-class label weights
        (one-hot in the usual case).
        """
        self.target = target

        # Shift each row by its own maximum. Softmax is invariant to a
        # per-row constant, and this bounds every exponent by 0. A single
        # global maximum would let rows far below it underflow to all
        # zeros and yield 0/0.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        self.logits = shifted

        # Work with log-softmax directly (log-sum-exp) instead of
        # log(softmax): a probability that underflows to 0 would otherwise
        # produce 0 * -inf = nan in the sum below.
        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        log_probs = shifted - log_norm

        # Negated because log-probabilities are <= 0.
        cross_entropy = -np.sum(self.target * log_probs)
        return cross_entropy / len(shifted)

    def backward(self):
        """Return the gradient of the mean loss with respect to the logits.

        The result is ``(softmax(logits) - target) / batch_size``.

        Raises:
            RuntimeError: if called before ``forward``.
        """
        if self.logits is None or self.target is None:
            raise RuntimeError("forward() must be called before backward()")
        exps = np.exp(self.logits)  # cached logits are already row-shifted
        probs = exps / np.sum(exps, axis=1, keepdims=True)
        return (probs - self.target) / len(self.logits)

In [None]:
# This is the test code
# Uniform logits give a uniform softmax (1/3 per class).
# NOTE: `target` is all ones rather than one-hot, so the printed value is
# 3 * ln(3) (about 3.2958), not a true single-label loss.
logits = np.ones((10, 3))
target = np.ones((10, 3))
myCrossEntropyLoss = CrossEntropyLoss()
answers = myCrossEntropyLoss.forward(logits, target)
print(answers)

3.29583686600433



**3. Implement a Simple ANN Class**

Write a Python class named `ANN` that simulates a basic feed-forward artificial neural network.

Specifications:

1. **Initialization (`__init__` method)**:
    - The network should be initialized with 784 inputs and 10 outputs.
    - It must have two hidden layers, each containing 256 neurons.
    - Use the ReLU activation function for each hidden layer.
    - Initialize weights using a Gaussian distribution with zero mean and a variance of `2 / (number of input nodes + number of output nodes)`.
    - Include bias terms for each neuron and initialize them to zero.

2. **Forward Propagation (`forward` method)**:
    - The method should take a batch of `k` inputs (a 2D numpy array of shape `(k, 784)`).
    - It should process the batch through the network and produce a batch of `k` logit outputs, each being a vector of size 10 (thus, the output shape should be `(k, 10)`).

Note: For this assignment, use `numpy` for all mathematical operations.

---

This version provides a clearer set of requirements and expectations for the students.

In [None]:
# TODO: Implement the ANN class here
class ANN:
    """Feed-forward network: input -> hidden -> hidden -> output.

    Both hidden layers use the ``ReLu`` activation defined earlier in this
    notebook (that cell must have been run). The output layer is linear and
    returns raw logits.
    """

    def __init__(self, inputs=784, hidden=256, outputs=10):
        """Build the layers.

        The defaults reproduce the assignment's fixed architecture
        (784 inputs, two hidden layers of 256 units, 10 outputs).
        """
        self.inputs = inputs
        self.hidden = hidden
        self.outputs = outputs

        # Weights are drawn in the order weight1, weight2, weight3, so a
        # fixed NumPy seed yields the same values as drawing them inline.
        self.weight1 = self._init_weight(self.inputs, self.hidden)
        self.weight2 = self._init_weight(self.hidden, self.hidden)
        self.weight3 = self._init_weight(self.hidden, self.outputs)

        # One bias per neuron, initialised to zero; the leading axis of
        # size 1 broadcasts over the batch.
        self.bias1 = np.zeros(shape=(1, self.hidden))
        self.bias2 = np.zeros(shape=(1, self.hidden))
        self.bias3 = np.zeros(shape=(1, self.outputs))

        # One activation object per hidden layer. ReLu caches its last
        # output, so sharing a single instance would overwrite the first
        # layer's cache with the second layer's.
        self.relu1 = ReLu()
        self.relu2 = ReLu()

    @staticmethod
    def _init_weight(fan_in, fan_out):
        """Draw a (fan_in, fan_out) matrix from N(0, 2 / (fan_in + fan_out))."""
        # np.random.normal takes the standard deviation, hence the sqrt of
        # the required variance.
        std = np.sqrt(2 / (fan_in + fan_out))
        return np.random.normal(loc=0, scale=std, size=(fan_in, fan_out))

    def forward(self, X):
        """Map a batch ``X`` of shape (k, inputs) to logits of shape (k, outputs).

        The pre- and post-activation values of every layer are stored on
        the instance.
        """
        self.hiddenlayer1_nonactivation = np.dot(X, self.weight1) + self.bias1
        self.hiddenlayer1 = self.relu1.forward(self.hiddenlayer1_nonactivation)
        self.hiddenlayer2_nonactivation = np.dot(self.hiddenlayer1, self.weight2) + self.bias2
        self.hiddenlayer2 = self.relu2.forward(self.hiddenlayer2_nonactivation)
        # Output layer is linear: the logits are returned without an activation.
        self.hiddenlayer3 = np.dot(self.hiddenlayer2, self.weight3) + self.bias3
        return self.hiddenlayer3

In [None]:
# This is the test code
# Six identical all-ones samples: checks the (k, 10) output shape. Every
# output row should be the same because every input row is the same.
model1 = ANN()
X = np.ones((6, 784))
result = model1.forward(X)
print(result.shape, result, sep="\n")

(6, 10)
[[-0.83941167 -0.26125574  1.90310268  0.36106172  0.96642744  1.15667227
   0.20331913  0.15909078  0.62160771  1.03611572]
 [-0.83941167 -0.26125574  1.90310268  0.36106172  0.96642744  1.15667227
   0.20331913  0.15909078  0.62160771  1.03611572]
 [-0.83941167 -0.26125574  1.90310268  0.36106172  0.96642744  1.15667227
   0.20331913  0.15909078  0.62160771  1.03611572]
 [-0.83941167 -0.26125574  1.90310268  0.36106172  0.96642744  1.15667227
   0.20331913  0.15909078  0.62160771  1.03611572]
 [-0.83941167 -0.26125574  1.90310268  0.36106172  0.96642744  1.15667227
   0.20331913  0.15909078  0.62160771  1.03611572]
 [-0.83941167 -0.26125574  1.90310268  0.36106172  0.96642744  1.15667227
   0.20331913  0.15909078  0.62160771  1.03611572]]
