## Circles data 

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def circles(X):
    """Evaluate the ripple target sin(10 * r) for every row of X.

    X is indexed as a 2-D array whose first two columns are the point
    coordinates; r is the Euclidean distance of each point from the origin.
    Returns a 1-D array with one value per row of X.
    """
    x1, x2 = X[:, 0], X[:, 1]
    radius = np.sqrt(x1 ** 2 + x2 ** 2)
    return np.sin(10 * radius)

def generate_data(y_func,N=1000):
    """Draw N random 2-D inputs and noisy targets for them.

    Inputs are 2 * rand(N, 2) - 1, i.e. spread over the square [-1, 1) x
    [-1, 1). Targets are y_func(X) plus 0.25 * randn(N) noise. Both draws
    come from NumPy's global random state, inputs first and noise second.

    Returns the tuple (X, y).
    """
    inputs = 2 * np.random.rand(N, 2) - 1
    clean = y_func(inputs)
    noise = 0.25 * np.random.randn(N)
    return inputs, clean + noise

def draw_contours(y_func):
    """Plot y_func as filled contours over the square [-1, 1] x [-1, 1].

    y_func is called once with a (900, 2) array holding every point of a
    30 x 30 grid; whatever it returns is reshaped back to (30, 30). The
    plot goes to a new 6 x 6 inch figure with a colorbar and axis labels.
    Returns None.
    """
    plt.figure(figsize=(6, 6))

    # Build the evaluation grid and flatten it into one point per row.
    axis = np.linspace(-1, 1, 30)
    grid_x1, grid_x2 = np.meshgrid(axis, axis)
    points = np.column_stack((grid_x1.ravel(), grid_x2.ravel()))

    # Evaluate, then fold the flat result back onto the grid for plotting.
    surface = y_func(points).reshape(grid_x1.shape)
    plt.contourf(grid_x1, grid_x2, surface, alpha=0.5)
    plt.colorbar()
    plt.xlabel('$x_{1}$')
    plt.ylabel('$x_{2}$')

# Sample a noisy training set from the circles target (N defaults to 1000).
X,y = generate_data(circles)
# Plot the noise-free target as filled contours. draw_contours returns None,
# so `f` only absorbs the return value.
f = draw_contours(circles)
# Overlay the sampled points on the same figure, coloured by their noisy targets.
s = plt.scatter(X[:,0],X[:,1],c=y) 

## Predictions using Keras

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Reference model: two hidden ReLU layers of 16 units and one linear output
# unit, trained on the (X, y) sample with mean-squared error and Adam.
model = Sequential()
# Only the first layer declares the input width (2 features).
model.add(Dense(16, input_dim=2, activation='relu'))
# Later layers take their input size from the layer before them, so no
# input_dim here (this layer's input is the 16 units above, not 2).
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')
# verbose=0 silences the per-epoch log (integer form of the original False).
model.fit(X, y, epochs=256, batch_size=64, verbose=0)

# Plot the fitted model's predictions over the input square.
f = draw_contours(model.predict)

## Computing predictions from scratch

### Initializing a neural network 

In [None]:
def initialize_network(layers, seed=108):
    """Create randomly initialised parameters for a fully connected network.

    Parameters
    ----------
    layers : sequence of int
        Number of units in each layer, input layer first.
    seed : int or None, optional
        Value passed to np.random.seed before drawing, so the result is
        reproducible. Defaults to 108. Pass None to leave NumPy's global
        random state untouched.

    Returns
    -------
    list of dict
        One dict per layer after the input, with keys 'weight' (shape
        (n_out, n_in)) and 'bias' (shape (n_out, 1)), both drawn from a
        standard normal via np.random.randn.
    """
    if seed is not None:
        np.random.seed(seed)

    pars = []  # one parameter dictionary per layer
    for n_in, n_out in zip(layers[:-1], layers[1:]):
        # Draw the weights before the bias so a given seed always yields
        # the same parameters.
        weight = np.random.randn(n_out, n_in)
        bias = np.random.randn(n_out, 1)
        pars.append({'weight': weight, 'bias': bias})
    return pars

# Architecture for the from-scratch network: p inputs, two hidden layers of
# 16 units, and a single output unit.
p = X.shape[1] # set number of features
layers = [p,16,16,1] # set number of units in each layer
# Random starting parameters: one {'weight', 'bias'} dict per non-input layer.
pars = initialize_network(layers)

### Forward pass

In [None]:
def forward(X,pars):
    """Run a forward pass through the network and keep every layer's values.

    Parameters
    ----------
    X : array of shape (m, p)
        One sample per row. It is transposed internally so that samples
        run along columns.
    pars : list of dict
        One dict per layer with 'weight' of shape (n_out, n_in) and 'bias'
        of shape (n_out, 1).

    Returns
    -------
    Zs, Hs : list, list
        Pre-activations and activations of every layer, each of shape
        (n_out, m). Hidden layers use ReLU; the last layer is linear, so
        Hs[-1] is Zs[-1].
    """
    # keep every intermediate value; backpropagation needs them
    Zs, Hs = [], []

    # samples along columns: (p, m)
    H = X.T

    last = len(pars) - 1
    for j, layer in enumerate(pars):
        W = layer['weight']
        b = layer['bias']

        # affine map; the (n_out, 1) bias broadcasts across the m columns
        Z = np.matmul(W, H) + b

        # ReLU on hidden layers, identity on the output layer
        H = Z if j == last else np.maximum(Z, 0)

        Zs.append(Z)
        Hs.append(H)

    return Zs,Hs

def predict(X,pars):
    """Return the network's output for X as a flat 1-D array.

    Runs forward(X, pars) and flattens the activations of the last layer.
    """
    _, activations = forward(X, pars)
    output = activations[-1]
    return output.flatten()

# test plot using the random initialization (parameters not trained yet)
f = draw_contours(lambda X: predict(X,pars))

### Backward pass

In [None]:
def backprop(Zs,Hs,X,y,pars,lamda=0):
    """Compute parameter gradients by backpropagation.

    The gradients are those of the loss
        1/(2m) * sum((H_out - y)**2) + lamda/2 * sum over layers of ||W||**2
    for a network with ReLU hidden layers and a linear output layer.

    Parameters
    ----------
    Zs, Hs : list, list
        Per-layer pre-activations and activations, each of shape (n_out, m).
    X : array of shape (m, p)
        Inputs, one sample per row.
    y : array
        Targets. For a single-output network any shape holding m values is
        accepted, e.g. (m,) or (m, 1). Otherwise y must broadcast against
        the output activations.
    pars : list of dict
        Per-layer parameters with keys 'weight' and 'bias'.
    lamda : float, optional
        L2 penalty strength applied to the weights only; 0 (the default)
        disables it.

    Returns
    -------
    list of dict
        Gradients in the same order and with the same keys and shapes as
        `pars`.
    """
    # gradients are collected from the output layer backwards
    grads = []

    # data size
    m = X.shape[0]

    # For a single-output network, lay the targets out as a (1, m) row so
    # that (H - y) stays (1, m). A column-vector y would otherwise
    # broadcast to an (m, m) matrix without any error.
    target = np.asarray(y)
    if Hs and Hs[-1].shape[0] == 1:
        target = target.reshape(1, -1)

    last = len(pars) - 1
    dH = None  # gradient w.r.t. a layer's activations, set by the layer above

    for j in reversed(range(len(pars))):
        Z = Zs[j]
        H = Hs[j]
        W = pars[j]['weight']

        # gradient w.r.t. pre-activations: residual at the linear output,
        # ReLU mask times the incoming gradient for hidden layers
        dZ = (H - target) if j == last else (Z > 0) * dH

        # input that fed this layer
        H_back = Hs[j-1] if j > 0 else X.T

        # parameter gradients, averaged over the m samples
        dW = np.matmul(dZ, H_back.T) / m + lamda * W
        db = np.sum(dZ,axis=1,keepdims=True) / m

        grads.append({'weight' : dW, 'bias' : db})

        # propagate to the activations of the layer below
        if j > 0:
            dH = np.matmul(W.T, dZ)

    # restore input-to-output order to line up with pars
    return grads[::-1]

### Vanilla gradient descent

In [None]:
# Full-batch gradient descent: every step uses all of (X, y).
np.random.seed(108)
# Start again from fresh random parameters for the [p, 16, 16, 1] network.
pars = initialize_network([p,16,16,1])
learning_rate = 1e-2
n_steps = 10000
for i in range(0,n_steps):
    
    # forward and backward pass
    Zs, Hs = forward(X,pars)
    grads = backprop(Zs,Hs,X,y,pars)

    # parameter updates (in place: the arrays stored in pars are modified)
    for j in range(len(pars)):
        pars[j]['weight'] -= learning_rate * grads[j]['weight']
        pars[j]['bias'] -= learning_rate * grads[j]['bias'] 
        
# plot using optimized parameters 
f = draw_contours(lambda X: predict(X,pars))

### SGD + Adam: TODO (advanced)