In [1]:
%matplotlib inline  
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import colorsys
import sys
import time
from IPython import display
from mpl_toolkits import mplot3d
from sklearn import datasets

In [2]:
def relu(X):
    return np.maximum(X,0)

def relu_derivative(X):
    return 1.0 * (X>0) #.astype(float)

def tanh(X):
    return np.tanh(X)

def tanh_derivative(X):
    return (1.0-tanh(X)**2)

def logistic(X):
    return 1.0/(1.0+np.exp(-X))

def logistic_derivative(X):
    return (logistic(X)*(1.0-logistic(X)))

In [19]:
# create a two-layer neural network
def create_model(X,hidden_nodes,output_dim=2,activation_function='relu'):
    # this will hold a dictionary of layers
    model = {}
    
    # set the model activation function, the eval() function converts a string into a function
    # this way, we can directly call the appropriate activation function and its derative with just the string name
    # and we can avoid writing "if" statements for each activation function and derivatives
    model['activation_function'] = eval(activation_function);
    
    # set the model activation function derative using eval(), same logic as previous line
    model['activation_function_derivative'] = eval(activation_function + '_derivative')
    
    # input dimensionality
    input_dim = X.shape[1]
    
    # first set of weights from input to hidden layer 1
    model['W1'] = np.random.randn(input_dim, hidden_nodes)/np.sqrt(input_dim)
    # set of biases
    model['b1'] = np.zeros((1, hidden_nodes))
    
    # second set of weights from hidden layer 1 to layer 2
    model['W2'] = np.random.randn(hidden_nodes, output_dim)/np.sqrt(hidden_nodes)
    # set of biases for second hidden layer
    model['b2'] = np.zeroes((1, hidden_nodes))
    
    # third set of weights from hidder layer 2 to output
    model['W2'] = np.random.randn(hidden_nodes, output_dim)/np.sqrt(hidden_nodes)
    # set of biases for output layer
    model['b2'] = np.zeros((1, output_dim))
    
    return model

# defines the forward pass given a model and data
def feed_forward(model, x):
    # get weights and biases
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # first layer
    z1 = x.dot(W1) + b1
    
    # activation function
    #a1 = logistic(z1)
    #a1 = tanh(z1)
    a1 = model['activation_function'](z1)
    
    # second layer
    z2 = a1.dot(W2)+b2
    
    # no activation function as this is simply a linear layer!!
    out = z2
    return z1, a1, z2, out

# define the regression loss
def calculate_loss(model,X,y,reg_lambda):
    num_examples = X.shape[0]
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    
    # what are the current predictions
    z1, a1, z2, out = feed_forward(model, X)
    
    # calculate L2 loss
    loss = 0.5*np.sum((out-y)**2)
    
    # add regulatization term to loss
    loss += reg_lambda/2*(np.sum(np.square(W1))+ np.sum(np.square(W2)))
    
    # return per-item loss
    return 1./num_examples * loss

# back-propagation for the two-layer network
def backprop(X,y,model,z1,a1,z2,output,reg_lambda):
    
    # derivative of loss function
    delta3 = (output-y)/X.shape[0]
    # multiply this by activation outputs of hidden layer
    dW2 = (a1.T).dot(delta3)
    # and over all neurons
    db2 = np.sum(delta3, axis =0, keepdims=True) #different because it is not being multiplied by all the weights, only 1
    
    # derivative of activation function
    delta2 = delta3.dot(model['W2'].T)* model['activation_function_derivative'](a1) # call the appropriate activation function derivative
    
    # multiply by input data
    dW1 = (X.T).dot(delta2)
    # and sum over all neurons
    db1 = np.sum(delta2, axis=0)
    
    # add regularization terms on the two weights
    dW2 += reg_lambda * model['W2']
    dW1 += reg_lambda * model['W1']
    
    return dW1, dW2, db1, db2

# simple training loop
def train(model, X, y, num_passes=100000, reg_lambda = 0.1, learning_rate = 0.001):
    # whether to do stochastic gradient descent
    sgd = True
    
    # variable that checks whether we break iteration
    done = False
    
    # keeping track of losses
    previous_loss = float('inf')
    losses = []

    # iteration counter
    i = 0
    while done == False:
        if sgd:
            # choose a random set of points
            randinds = np.random.choice(np.arange(len(y)),30,False) #bad programming because we dont know the number of  data points needed , 30 is random
            # get predictions
            z1,a1,z2,output = feed_forward(model, X[randinds,:])
            # feed this into backprop
            dW1, dW2, db1, db2 = backprop(X[randinds,:],y[randinds],model,z1,a1,z2,output,reg_lambda)
        else:
            # get predictions
            z1,a1,z2,output = feed_forward(model, X)
            # feed this into backprop
            dW1, dW2, db1, db2 = backprop(X,y,model,z1,a1,z2,output,reg_lambda)
            
        # given the results of backprop, update both weights and biases
        model['W1'] -= learning_rate * dW1
        model['b1'] -= learning_rate * db1
        model['W2'] -= learning_rate * dW2
        model['b2'] -= learning_rate * db2
        
        # do some book-keeping every once in a while
        if i % 1000 == 0:
            loss = calculate_loss(model, X, y, reg_lambda)
            losses.append(loss)
            print("Loss after iteration {}: {}".format(i, loss))
            # very crude method to break optimization
            if np.abs((previous_loss-loss)/previous_loss) < 0.001:
                done = True
            previous_loss = loss
        i += 1
        if i>=num_passes:
            done = True
    return model, losses

In [21]:
numDataOne = 15
numData = numDataOne*numDataOne
# create data for regression
xs=np.linspace(-8,8,numDataOne)
ys=np.linspace(-8,8,numDataOne)
counter=0
X=np.zeros((numData,2))
y=np.zeros((numData,1))
for r in np.arange(0,numDataOne):
    for c in np.arange(0,numDataOne):
        X[counter,:]=[xs[r],ys[c]]
        y[counter]=xs[r]**2+ys[c]**2+1
        counter=counter+1

# training set size
num_examples = len(X) 
# input layer dimensionality
nn_input_dim = 2 
# output layer dimensionality
nn_output_dim = 1  
# learning rate for gradient descent
learning_rate = 0.001
# regularization strength
reg_lambda = 0.01 

# create the model
model = create_model(X,10,1) # try changing number of neurons from 10 to 4 to 2 to 3 to 100 (to see how the final 3d image is altered)
print(model)
# train it
model, losses = train(model,X, y, reg_lambda=reg_lambda, learning_rate=learning_rate)

# determine predictions of the trained model
output = feed_forward(model, X)

{'activation_function': <function relu at 0x0000020C87689480>, 'activation_function_derivative': <function relu_derivative at 0x0000020CD7505E10>, 'W1': array([[-0.4699478 ,  0.97385053,  0.78625119, -0.52552253, -0.05038252,
         0.28625191, -0.36754976, -0.07414273,  0.80879682, -0.07854602],
       [ 0.3190304 , -1.64384229, -0.68509709,  0.55594788,  0.01703111,
        -1.04160609, -0.05883318, -0.72216067,  0.3902574 , -0.20642454]]), 'b1': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]), 'W2': array([[ 0.30782197],
       [-0.01210331],
       [-0.341612  ],
       [ 0.23950397],
       [ 0.38028241],
       [ 0.20555401],
       [-0.00240822],
       [ 0.07529288],
       [ 0.01969584],
       [ 0.11260976]]), 'b2': array([[0.]])}
Loss after iteration 0: 1498.6878800906895
Loss after iteration 1000: 25.30862710487808
Loss after iteration 2000: 9.79275786461649
Loss after iteration 3000: 7.173275312638423


  if np.abs((previous_loss-loss)/previous_loss) < 0.001:


Loss after iteration 4000: 5.673610204388251
Loss after iteration 5000: 5.207344884467192
Loss after iteration 6000: 4.862537249340139
Loss after iteration 7000: 4.0757602101152814
Loss after iteration 8000: 3.8069720236146876
Loss after iteration 9000: 3.2283361008171716
Loss after iteration 10000: 3.1573898333662034
Loss after iteration 11000: 3.108284907481881
Loss after iteration 12000: 3.0550558558941234
Loss after iteration 13000: 3.1170783268725377
Loss after iteration 14000: 2.9492997196211497
Loss after iteration 15000: 2.991143840487557
Loss after iteration 16000: 3.0207746040675705
Loss after iteration 17000: 2.861867627714464
Loss after iteration 18000: 3.041911281662467
Loss after iteration 19000: 2.98107900618627
Loss after iteration 20000: 2.9616576115393864
Loss after iteration 21000: 2.9345325276155285
Loss after iteration 22000: 2.843377682660933
Loss after iteration 23000: 2.852296427647476
Loss after iteration 24000: 2.829184118621967
Loss after iteration 25000: 3.1