# Lab 3: Deep Networks

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim

## Create Data

In [2]:
# Draw from a seeded local generator so the synthetic dataset (and every loss
# curve derived from it) is identical on each Restart & Run All.
DATA_SEED = 0
rng = np.random.default_rng(DATA_SEED)

num_samples = 1000
# Three independent standard-normal features.
x1 = rng.normal(0, 1, (num_samples,))
x2 = rng.normal(0, 1, (num_samples,))
x3 = rng.normal(0, 1, (num_samples,))

# Nonlinear target: cubic in x1, quadratic in x2, linear in x3, plus an offset of 3.
y = 3 * x1 ** 3 + 6 * x2 ** 2 + x3 + 3
# Design matrix of shape (num_samples, 4); the trailing column of ones lets the
# last entry of theta act as the intercept.
X = np.stack((x1, x2, x3, np.ones((num_samples,))), axis=1)

In [3]:
# Hold out 100 samples for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=100,
    random_state=10,
)

## Batch Gradient Descent

Let's first try to approximate this function with linear regression using gradient descent

Recall that in linear regression:
$$f(x) = X\theta$$

The $L_{2}$ loss function, given $n$ samples:

$$J(\theta) = \frac{1}{n}\|y - X\theta\|_{2}^{2}$$

The gradient of the loss function (for back propagation):

$$\nabla J(\theta) = \frac{2}{n}X^{T}(X\theta - y)$$

We update parameter $\theta$ by

$$\theta = \theta - \alpha \times \nabla J(\theta)$$

Where $\alpha$ is the step size

In [39]:
### The square loss function
def compute_loss(X, y, theta):
    """
    Given a set of X, y, theta, compute the average square loss for predicting y with X*theta.

    Args:
        X - the feature matrix, 2D numpy array of size (num_samples, num_features)
        y - the label vector, 1D numpy array of size (num_samples)
        theta - the parameter vector, 1D numpy array of size (num_features)

    Returns:
        loss - the average square loss, scalar
    """
    # Residual of the linear prediction for every sample.
    residual = X @ theta - y
    # Mean of squared residuals == (1/n) * ||y - X theta||_2^2.
    loss = np.mean(residual ** 2)
    return loss




### The gradient of the square loss function
def compute_gradient(X, y, theta):
    """
    Compute the gradient of the average square loss with respect to theta.

    Args:
        X - the feature matrix, 2D numpy array of size (num_samples, num_features)
        y - the label vector, 1D numpy array of size (num_samples)
        theta - the parameter vector, 1D numpy array of size (num_features)

    Returns:
        grad - gradient vector, 1D numpy array of size (num_features)
    """
    num_samples = X.shape[0]
    # Residual of the linear prediction for every sample.
    residual = X @ theta - y
    # d/dtheta of (1/n) * ||X theta - y||^2  ==  (2/n) * X^T (X theta - y).
    grad = (2.0 / num_samples) * (X.T @ residual)
    return grad


In [43]:
def batch_grad_descent(X, y, alpha=0.01, num_step=1000):
    """
    Batch gradient descent to minimize the average square loss objective.

    Every step uses the full training set to compute the gradient, updates
    theta, and then records the train and test loss of the updated theta.
    The loss and gradient are evaluated inline so the function is
    self-contained.

    Args:
        X - A tuple containing two elements (X_train, X_test),
            each one of which is a numpy array of size (num_samples, num_features)
        y - A tuple containing two elements (y_train, y_test),
            each one of which is a numpy array of size (num_samples)
        alpha - step size in gradient descent
        num_step - number of steps to run

    Returns:
        best_theta - the set of parameters that achieves smallest loss on test data
                     (the initial all-zero theta if no step improves on infinity,
                     e.g. num_step == 0)
        train_loss_hist - the history of average square loss on training data, 1D numpy array, (num_step)
        test_loss_hist - the history of average square loss on testing data, 1D numpy array, (num_step)
    """
    # Initialization
    X_train, X_test = X
    y_train, y_test = y
    num_samples, num_features = X_train.shape
    theta = np.zeros(num_features)

    # Track performance
    best_test_loss = float('Inf')
    best_theta = theta.copy()  # copy so later updates can never alias the best parameters
    test_loss_hist = np.zeros(num_step)
    train_loss_hist = np.zeros(num_step)

    for step in range(num_step):
        # Gradient of the average square loss: (2/n) * X^T (X theta - y).
        grad = (2.0 / num_samples) * (X_train.T @ (X_train @ theta - y_train))
        theta = theta - alpha * grad

        # Record the losses of the freshly updated parameters.
        train_loss_hist[step] = np.mean((X_train @ theta - y_train) ** 2)
        test_loss = np.mean((X_test @ theta - y_test) ** 2)
        test_loss_hist[step] = test_loss

        # Keep the parameters that generalize best so far.
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            best_theta = theta.copy()

    return (best_theta, train_loss_hist, test_loss_hist)

In [None]:
# Run batch gradient descent and plot the per-step train/test loss curves.
_, train_loss, test_loss = batch_grad_descent((X_train, X_test), (y_train, y_test))
x = np.arange(train_loss.shape[0])

fig, ax = plt.subplots()
ax.plot(x, train_loss, label="Train Loss")
ax.plot(x, test_loss, label="Test Loss")
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("Gradient step")
ax.set_ylabel("Average square loss")
ax.set_title("Batch gradient descent")

ax.legend()
plt.show()

## Minibatch Gradient Descent

Batch gradient descent with very large training data can take a very long time to compute, since it requires looking at each training example to take a single gradient step. Hence in reality, we often use minibatch gradient descent. 

One pass through the training data is called an **epoch**. In each epoch, training data are divided into minibatches after **randomly shuffling**. During training, we sweep through the whole training set one minibatch at a time, and perform a parameter update using one minibatch of data.

In [80]:
#######################################
### Minibatch gradient descent
def minibatch_grad_descent(X, y, batch_size=50, alpha=0.01, num_epoch=50):
    """
    Minibatch gradient descent to minimize the average square loss objective.

    In every epoch the training indices are reshuffled and split into
    num_samples // batch_size minibatches; when num_samples is not a multiple
    of batch_size the trailing samples of that epoch's shuffle are skipped.
    Each minibatch produces one parameter update, after which the loss on the
    full training set and on the test set is recorded.

    Note: this function calls np.random.seed(432), i.e. it reseeds numpy's
    global random state so that the shuffling is reproducible.

    Args:
        X - A tuple containing two elements (X_train, X_test),
            each one of which is a numpy array of size (num_samples, num_features)
        y - A tuple containing two elements (y_train, y_test),
            each one of which is a numpy array of size (num_samples)
        batch_size - minibatch size
        alpha - step size in gradient descent
        num_epoch - number of epochs to go through the whole training set

    Returns:
        best_theta - the set of parameters that achieves smallest loss on test data
        train_loss_hist - the history of average square loss on training data,
                          2D numpy array, (num_epoch, num_batches)
        test_loss_hist - the history of average square loss on testing data,
                         2D numpy array, (num_epoch, num_batches)
    """

    # Initialization
    X_train, X_test = X
    y_train, y_test = y
    num_samples, num_features = X_train.shape
    num_batches = num_samples // batch_size
    theta = np.ones(num_features)
    order = np.arange(num_samples)

    # Track performance
    best_test_loss = float('Inf')
    best_theta = theta.copy()  # copy so later updates can never alias the best parameters
    train_loss_hist = np.zeros((num_epoch, num_batches))
    test_loss_hist = np.zeros((num_epoch, num_batches))

    np.random.seed(432)
    for epoch in range(num_epoch):
        # New random visiting order for this pass over the training data.
        np.random.shuffle(order)

        for batch in range(num_batches):
            batch_idx = order[batch * batch_size:(batch + 1) * batch_size]
            X_batch = X_train[batch_idx]
            y_batch = y_train[batch_idx]

            # Gradient of the average square loss on this minibatch only.
            grad = (2.0 / batch_size) * (X_batch.T @ (X_batch @ theta - y_batch))
            theta = theta - alpha * grad

            # Evaluate the updated parameters on the full train and test sets.
            train_loss_hist[epoch, batch] = np.mean((X_train @ theta - y_train) ** 2)
            test_loss = np.mean((X_test @ theta - y_test) ** 2)
            test_loss_hist[epoch, batch] = test_loss

            # Keep the parameters that generalize best so far.
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                best_theta = theta.copy()

    return (best_theta, train_loss_hist, test_loss_hist)

In [None]:
# Run minibatch gradient descent and plot the loss per epoch,
# averaged over the minibatch updates within each epoch.
_, train_loss, test_loss = minibatch_grad_descent((X_train, X_test), (y_train, y_test))
x = np.arange(train_loss.shape[0])

fig, ax = plt.subplots()
ax.plot(x, np.mean(train_loss, axis=1), label="Train Loss")
ax.plot(x, np.mean(test_loss, axis=1), label="Test Loss")
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("Epoch")
ax.set_ylabel("Average square loss (mean over minibatches)")
ax.set_title("Minibatch gradient descent")

ax.legend()
plt.show()

## PyTorch Implementation

How can you modify minibatch gradient descent to make it work for PyTorch?

In [4]:
################ Your code here ################
# Define network


# Training Loop