# LIS 640 Applied Deep Learning: Computation Graphs in PyTorch

# Code Blocks for Problem 3

In [1]:
import torch
from utils import svm_loss, softmax_loss

def hello_fully_connected_networks():
    """Print a fixed greeting; handy as a smoke test that this code loaded."""
    greeting = 'Hello from problem3.ipynb!'
    print(greeting)


class Linear(object):
    """Bias-free linear (fully-connected) layer exposed as static methods."""

    @staticmethod
    def forward(x, w):
        """
        Computes the forward pass for a linear (fully-connected) layer.
        The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
        examples, where each example x[i] has shape (d_1, ..., d_k). We will
        reshape each input into a vector of dimension D = d_1 * ... * d_k, and
        then transform it to an output vector of dimension M.
        Inputs:
        - x: A tensor containing input data, of shape (N, d_1, ..., d_k)
        - w: A tensor of weights, of shape (D, M)
        Returns a tuple of:
        - out: output, of shape (N, M)
        - cache: (x, w)
        """
        # reshape (unlike view) also accepts non-contiguous inputs, e.g. a
        # permuted or sliced tensor; it only copies when it has to.
        out = x.reshape(x.shape[0], -1).mm(w)
        cache = (x, w)
        return out, cache

    @staticmethod
    def backward(dout, cache):
        """
        Computes the backward pass for a linear layer.
        Inputs:
        - dout: Upstream derivative, of shape (N, M)
        - cache: Tuple of:
          - x: Input data, of shape (N, d_1, ... d_k)
          - w: Weights, of shape (D, M)
        Returns a tuple of:
        - dx: Gradient with respect to x, of shape
          (N, d1, ..., d_k)
        - dw: Gradient with respect to w, of shape (D, M)
        """
        x, w = cache
        # dout @ w^T has shape (N, D); fold it back into the input's shape.
        dx = dout.mm(w.t()).reshape(x.shape)
        # Flatten x exactly as in forward so that dw has shape (D, M).
        dw = x.reshape(x.shape[0], -1).t().mm(dout)
        return dx, dw


class ReLU(object):
    """Rectified linear unit activation exposed as static methods."""

    @staticmethod
    def forward(x):
        """
        Forward pass of the ReLU activation: negative entries become zero.

        Input:
        - x: Input tensor of any shape; it is not modified

        Returns a tuple of:
        - out: Tensor with the same shape as x
        - cache: The input x itself, kept for the backward pass
        """
        # clamp is out-of-place, so the caller's tensor stays untouched.
        activated = x.clamp(min=0)
        return activated, x

    @staticmethod
    def backward(dout, cache):
        """
        Backward pass of the ReLU activation.

        Input:
        - dout: Upstream derivatives, of any shape; it is not modified
        - cache: The input x given to forward, of the same shape as dout

        Returns:
        - dx: Gradient with respect to x
        """
        x = cache
        # Work on a copy so the upstream gradient is left intact.
        dx = dout.clone()
        # No gradient flows through positions where the input was <= 0.
        blocked = x <= 0
        dx[blocked] = 0
        return dx


class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network with ReLU nonlinearity and
    softmax loss. We assume an input dimension of D, a hidden dimension of H,
    and perform classification over C classes.
    The architecture is linear - relu - linear - softmax (no bias terms).
    Note that this class does not implement gradient descent; instead, it
    will interact with a separate Solver object that is responsible for running
    optimization.

    The learnable parameters of the model are stored in the dictionary
    self.params that maps parameter names to PyTorch tensors.
    """

    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
                 weight_scale=1e-3, reg=0.0,
                 dtype=torch.float32, device='cpu'):
        """
        Initialize a new network.
        Inputs:
        - input_dim: An integer giving the size of the input
        - hidden_dim: An integer giving the size of the hidden layer
        - num_classes: An integer giving the number of classes to classify
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - reg: Scalar giving L2 regularization strength.
        - dtype: A torch data type object; all computations will be
          performed using this datatype. float is faster but less accurate,
          so you should use double for numeric gradient checking.
        - device: device to use for computation. 'cpu' or 'cuda'
        """
        self.reg = reg
        # Weights are drawn from N(0, weight_scale^2); W1 is sampled before W2.
        self.params = {
            'W1': weight_scale * torch.randn(
                input_dim, hidden_dim, dtype=dtype, device=device),
            'W2': weight_scale * torch.randn(
                hidden_dim, num_classes, dtype=dtype, device=device),
        }

    def save(self, path):
        """Write the regularization strength and parameters to `path`."""
        checkpoint = {
          'reg': self.reg,
          'params': self.params,
        }

        torch.save(checkpoint, path)
        print("Saved in {}".format(path))

    def load(self, path, dtype, device):
        """
        Restore `reg` and `params` from a checkpoint written by `save`.
        Inputs:
        - path: Location of the checkpoint file
        - dtype: Type every loaded parameter is converted to
        - device: Device every loaded parameter is moved to
        """
        # SECURITY NOTE: torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        checkpoint = torch.load(path, map_location='cpu')
        self.reg = checkpoint['reg']
        self.params = {
            name: tensor.type(dtype).to(device)
            for name, tensor in checkpoint['params'].items()
        }
        print("load checkpoint file: {}".format(path))

    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Tensor of input data of shape (N, d_1, ..., d_k)
        - y: int64 Tensor of labels, of shape (N,). y[i] gives the
          label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model
        and return:
        - scores: Tensor of shape (N, C) giving classification scores,
          where scores[i, c] is the classification score for X[i]
          and class c.
        If y is not None, then run a training-time forward and backward
        pass and return a tuple of:
        - loss: Scalar value giving the loss. When reg is non-zero this
          includes the L2 penalty 0.5 * reg * (sum(W1^2) + sum(W2^2)), so
          that the returned gradients are the gradients of this value.
        - grads: Dictionary with the same keys as self.params, mapping
          parameter names to gradients of the loss with respect to
          those parameters.
        """
        W1, W2 = self.params['W1'], self.params['W2']

        # Forward pass: flatten each example to a row vector so inputs of
        # shape (N, d_1, ..., d_k) are accepted, then linear - relu - linear.
        x_flat = X.reshape(X.shape[0], -1)
        hidden = torch.nn.functional.relu(x_flat.mm(W1))
        scores = hidden.mm(W2)

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        # Data loss and its derivative with respect to the scores.
        loss, dout = softmax_loss(scores, y)

        # Add the L2 penalty whose gradient is the reg * W term used below.
        # Skipped when reg == 0 so that the unregularized loss is returned
        # exactly as softmax_loss produced it.
        if self.reg != 0:
            loss = loss + 0.5 * self.reg * ((W1 * W1).sum() + (W2 * W2).sum())

        grads = {}

        # Second linear layer.
        grads['W2'] = hidden.t().mm(dout) + self.reg * W2

        # Back through the ReLU: hidden > 0 exactly where the pre-activation
        # was positive, so it serves as the pass-through mask.
        dhidden = dout.mm(W2.t())
        dhidden = dhidden * (hidden > 0).type(dhidden.dtype)

        # First linear layer.
        grads['W1'] = x_flat.t().mm(dhidden) + self.reg * W1

        return loss, grads


def sgd(w, dw, config=None):
    """
    Apply one step of vanilla stochastic gradient descent.

    Inputs:
    - w: Current parameter value; updated with the -= operator
    - dw: Gradient of the loss with respect to w
    - config: Optional dictionary of hyperparameters:
      - learning_rate: Scalar learning rate (defaults to 1e-2 if absent).

    Returns a tuple of:
    - w: The updated parameter
    - config: The configuration dictionary, with defaults filled in
    """
    config = {} if config is None else config
    # setdefault both installs the default and hands back the value in use.
    step_size = config.setdefault('learning_rate', 1e-2)

    w -= step_size * dw
    return w, config

# Questions for Problem 3

# Set up code

In [2]:
import utils
import torch

In this exercise we will implement a two-layer network using a more modular approach. For each layer we will implement a `forward` and a `backward` function. The `forward` function will receive inputs, weights, and other parameters and will return both an output and a `cache` object storing data needed for the backward pass, like this:

```python
def forward(x, w):
  """ Receive inputs x and weights w """
  # Do some computations ...
  z = # ... some intermediate value
  # Do some more computations ...
  out = # the output
   
  cache = (x, w, z, out) # Values we need to compute gradients
   
  return out, cache
```

The backward pass will receive upstream derivatives and the `cache` object, and will return gradients with respect to the inputs and weights, like this:

```python
def backward(dout, cache):
  """
  Receive dout (derivative of loss with respect to outputs) and cache,
  and compute derivative with respect to inputs.
  """
  # Unpack cache values
  x, w, z, out = cache
  
  # Use values in cache to compute derivatives
  dx = # Derivative of loss with respect to x
  dw = # Derivative of loss with respect to w
  
  return dx, dw
```

After implementing a bunch of layers this way, we will be able to easily combine them to build classifiers with different architectures. Your task here is to implement the `ReLU` activation function using this modular approach.


To validate our implementation, we will compare the analytically computed gradients with numerical approximations of the gradient as done in previous assignments. You can inspect the numeric gradient function `utils.compute_numeric_gradient`. Please note that we have updated the function to accept upstream gradients to allow us to debug intermediate layers easily.
  

# ReLU activation function

We will now implement the ReLU activation function. As above, we will define a class with two empty static methods, and implement them in upcoming cells. The class structure can be found in **Code Blocks for Problem 3**

## ReLU activation: forward
Implement the forward pass for the ReLU activation function in the `ReLU.forward` function. You **should not** change the input tensor with an in-place operation.

Run the following to test your implementation of the ReLU forward pass. Your errors should be less than `1e-7`.

In [3]:
# Sanity check for ReLU.forward against hand-computed reference values.

# Use the GPU when one is available; otherwise fall back to the CPU so the
# check also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

utils.reset_seed(0)
# Twelve evenly spaced values in [-0.5, 0.5], arranged as a 3x4 input.
x = torch.linspace(-0.5, 0.5, steps=12, dtype=torch.float64, device=device)
x = x.reshape(3, 4)

out, _ = ReLU.forward(x)
correct_out = torch.tensor([[0.,         0.,         0.,         0.],
                            [0.,         0.,         0.04545455, 0.13636364],
                            [0.22727273, 0.31818182, 0.40909091, 0.5]],
                           dtype=torch.float64,
                           device=device)

# Compare your output with ours. The error should be on the order of e-8
# (the reference values above are only given to 8 decimal places).
print('Testing ReLU.forward function:')
print('difference: ', utils.rel_error(out, correct_out))

Testing ReLU.forward function:
difference:  4.5454545613554664e-09


## ReLU activation: backward
Now implement the backward pass for the ReLU activation function.

Again, you should not change the input tensor with an in-place operation.

Run the following to test your implementation of `ReLU.backward`. Your errors should be less than `1e-8`.

In [4]:
# Gradient check for ReLU.backward against a numerically estimated gradient.

# Use the GPU when one is available; otherwise fall back to the CPU so the
# check also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

utils.reset_seed(0)
x = torch.randn(10, 10, dtype=torch.float64, device=device)
dout = torch.randn(*x.shape, dtype=torch.float64, device=device)

# Numerical gradient of the forward output with respect to x, given the
# upstream gradient dout.
dx_num = utils.compute_numeric_gradient(lambda x: ReLU.forward(x)[0], x, dout)

# Analytic gradient from the implementation under test.
_, cache = ReLU.forward(x)
dx = ReLU.backward(dout, cache)

# The error should be less than 1e-8
print('Testing ReLU.backward function:')
print('dx error: ', utils.rel_error(dx_num, dx))

Testing ReLU.backward function:
dx error:  2.6317796097761553e-10


# Two-layer network
In the previous problem (Problem 2), you implemented a two-layer neural network in a single monolithic class. Now that you have implemented modular versions of the necessary layers, you will reimplement the two-layer network using these modular implementations.

Complete the implementation of the `TwoLayerNet` class. This class will serve as a model for the other networks you will implement in this assignment, so read through it to make sure you understand the API.

Once you have finished implementing the forward and backward passes of your two-layer net, run the following to test your implementation:

In [5]:
# End-to-end checks for TwoLayerNet: initialization, test-time forward pass,
# training loss, and a numeric gradient check.

torch.set_printoptions(precision=12, threshold=None, edgeitems=None, linewidth=None, profile=None)

# Use the GPU when one is available; otherwise fall back to the CPU so the
# checks also run on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

utils.reset_seed(0)
N, D, H, C = 3, 5, 50, 7
# X and y are replaced with deterministic values further down. The draws are
# kept so the random stream consumed before the model is built is unchanged.
X = torch.randn(N, D, dtype=torch.float64, device=device)
y = torch.randint(C, size=(N,), dtype=torch.int64, device=device)

std = 1e-3
model = TwoLayerNet(
    input_dim=D,
    hidden_dim=H,
    num_classes=C,
    weight_scale=std,
    dtype=torch.float64,
    device=device,
)

print('Testing initialization ... ')
W1_std = torch.abs(model.params['W1'].std() - std)
W2_std = torch.abs(model.params['W2'].std() - std)
assert W1_std < std / 10, 'First layer weights do not seem right'
assert W2_std < std / 10, 'Second layer weights do not seem right'

print('Testing test-time forward pass ... ')
# Overwrite the random weights and inputs with fixed values so the scores can
# be compared against the precomputed reference below.
model.params['W1'] = torch.linspace(-0.7, 0.3, steps=D * H, dtype=torch.float64, device=device).reshape(D, H)
model.params['W2'] = torch.linspace(-0.3, 0.4, steps=H * C, dtype=torch.float64, device=device).reshape(H, C)
X = torch.linspace(-5.5, 4.5, steps=N * D, dtype=torch.float64, device=device).reshape(D, N).t()
scores = model.loss(X)
correct_scores = torch.tensor(
    [[8.56847057,  9.12177260,  9.67507463, 10.22837667, 10.78167870,
      11.33498073, 11.88828277],
     [9.09451046,  9.57617926, 10.05784805, 10.53951685, 11.02118564,
      11.50285444, 11.98452323],
     [9.62055036, 10.03058591, 10.44062147, 10.85065703, 11.26069259,
      11.67072814, 12.08076370]],
    dtype=torch.float64, device=device)
scores_diff = torch.abs(scores - correct_scores).sum()
assert scores_diff < 1e-6, 'Problem with test-time forward pass'

print('Testing training loss (no regularization)')
# Keep the labels on the same device as the inputs and the model.
y = torch.tensor([0, 5, 1], device=device)
loss, grads = model.loss(X, y)
correct_loss = 2.881451052641
assert abs(loss - correct_loss) < 1e-10, 'Problem with training-time loss'

# Errors should be around e-6 or less
print('Running numeric gradient check:')
loss, grads = model.loss(X, y)


def loss_only(_):
    """Return just the loss for the current parameters; the argument is ignored.

    NOTE(review): this relies on utils.compute_numeric_gradient perturbing the
    parameter tensor it is given in place, so that model.loss sees the
    perturbed values -- confirm against utils.
    """
    return model.loss(X, y)[0]


for name in sorted(grads):
    grad_num = utils.compute_numeric_gradient(loss_only, model.params[name])
    print('%s relative error: %.2e' % (name, utils.rel_error(grad_num, grads[name])))

Testing initialization ... 
Testing test-time forward pass ... 
Testing training loss (no regularization)
Running numeric gradient check:
W1 relative error: 1.70e-07
W2 relative error: 2.19e-09
