In [35]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier

In [2]:
def softmax_stable(Z):
    """
    Compute softmax values for each sets of scores in Z.
    each row of Z is a set of scores.    
    """
    Z-=np.max(Z, axis=1, keepdims=True)
    e_Z = np.exp(Z)
    return e_Z/np.sum(e_Z, axis=1, keepdims=True)

def cross_entropy_loss(y_hat, y):
    """
    y_hat: a numpy array of shape (Npoints, nClasses) -- predicted output 
    y: a numpy array of shape (Npoints) -- ground truth. We don't need to use the one-hot vector here since most of elements are zeros. When programming 
    in numpy, we need to use the corresponding indexes only.
    """
    enum = range(y_hat.shape[0])
    return -np.mean(np.log(y_hat[enum, y]))

def reLU(Z):
    Z[Z<0]=0
    return Z

def reLU_grad(A):
    A[A==0] = 0
    A[A>0] = 1
    return A

In [31]:
def mlp_init(d0, d1, d2):
    """ 
    Initialize W1, b1, W2, b2 
    d0: dimension of input data 
    d1: number of hidden unit 
    d2: number of output unit = number of classes
    """
    W1 = 0.01*np.random.randn(d0, d1)
    b1 = np.zeros(d1)
    W2 = 0.01*np.random.randn(d1, d2)
    b2 = np.zeros(d2)
    return (W1, b1, W2, b2)

def mlp_predict(X, W1, b1, W2, b2):
    """
    Suppose that the network has been trained, predict class of new points. 
    X: data matrix, each ROW is one data point.
    W1, b1, W2, b2: learned weight matrices and biases 
    """
    Z1 = np.dot(X, W1) + b1 # shape (N, d1)
    A1 = reLU(Z1) # shape (N, d1)
    Z2 = np.dot(A1, W2) + b2# shape (N, d2)
    return np.argmax(Z2, axis=1)

def mlp_fit(X, y, W1, b1, W2, b2, lr=0.01, nepoches=10000):
    loss_hist = []
    for i in range(nepoches):
        #Feed Forward
        Z1 = np.dot(X, W1) + b1
        A1 = reLU(Z1)
        Z2 = np.dot(A1, W2) + b2
        y_hat = softmax_stable(Z2)# shape (N, d2)
        
        if i+1%1000==0:
            loss_hist.append(cross_entropy_loss(y_hat, y))
            print('Iteration #{}, loss: {:.5}'.format(i+1, loss_hist[-1]))
            
        # Back Propagaion
        enum=range(len(y_hat))
        y_hat[enum, y]-=1
        E2 = y_hat/len(y_hat)# shape (N, d2)
        dW2 = np.dot(A1.T, E2)
        db2 = np.sum(E2, axis=0)# shape (d2,)
        E1 = np.dot(E2, W2.T) * reLU_grad(Z1)# shape (N, d1)
        dW1 = np.dot(X.T, E1)
        db1 = np.sum(E1, axis=0)# shape (d1,)
        
        # Gradient Descent update
        W1-=lr*dW1
        b1-=lr*db1
        W2-=lr*dW2
        b2-=lr*db2
    return W1, b1, W2, b2, loss_hist

In [37]:
N = 100 # number of points per class
d0 = 2 # dimensionality
C = 3 # number of classes
X = np.zeros((N*C, d0)) # data matrix (each row = single example)
y = np.zeros(N*C, dtype='uint8') # class labels

for j in range(C):
    ix = range(N*j,N*(j+1))
    r = np.linspace(0.0,1,N) # radius
    t = np.linspace(j*4,(j+1)*4,N) + np.random.randn(N)*0.2 # theta
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[ix] = j

In [33]:
d0 = 2
d1 = h = 500 # size of hidden layer
d2 = C = 3
eta = 1 # learning rate
# initialize parameters randomly
(W1, b1, W2, b2) = mlp_init(d0, d1, d2)
(W1, b1, W2, b2, loss_hist) =mlp_fit(X, y, W1, b1, W2, b2, eta)

Iteration #1, loss: 1.0989
Iteration #1001, loss: 0.096582
Iteration #2001, loss: 0.048654
Iteration #3001, loss: 0.035331
Iteration #4001, loss: 0.029042
Iteration #5001, loss: 0.02563
Iteration #6001, loss: 0.023464
Iteration #7001, loss: 0.021925
Iteration #8001, loss: 0.02076
Iteration #9001, loss: 0.019853


In [34]:
y_pred = mlp_predict(X, W1, b1, W2, b2)
acc = 100*np.mean(y_pred == y)
print('training accuracy: %.2f %%' % acc)

training accuracy: 99.33 %


# Sklearn Neural Net

In [52]:
lr = 0.01# regularization parameter
mlp = MLPClassifier(alpha=1e-3, max_iter=10000, hidden_layer_sizes=(d0, d1, d2))
mlp.fit(X, y)

MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2, 500, 3), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [53]:
y_pred = mlp.predict(X) 
acc = 100*np.mean(y_pred == y)
print('training accuracy: %.2f %%' % acc)

training accuracy: 78.00 %


In [44]:
?MLPClassifier

[1;31mInit signature:[0m [0mMLPClassifier[0m[1;33m([0m[0mhidden_layer_sizes[0m[1;33m=[0m[1;33m([0m[1;36m100[0m[1;33m,[0m[1;33m)[0m[1;33m,[0m [0mactivation[0m[1;33m=[0m[1;34m'relu'[0m[1;33m,[0m [0msolver[0m[1;33m=[0m[1;34m'adam'[0m[1;33m,[0m [0malpha[0m[1;33m=[0m[1;36m0.0001[0m[1;33m,[0m [0mbatch_size[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m [0mlearning_rate[0m[1;33m=[0m[1;34m'constant'[0m[1;33m,[0m [0mlearning_rate_init[0m[1;33m=[0m[1;36m0.001[0m[1;33m,[0m [0mpower_t[0m[1;33m=[0m[1;36m0.5[0m[1;33m,[0m [0mmax_iter[0m[1;33m=[0m[1;36m200[0m[1;33m,[0m [0mshuffle[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m [0mrandom_state[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mtol[0m[1;33m=[0m[1;36m0.0001[0m[1;33m,[0m [0mverbose[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mwarm_start[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mmomentum[0m[1;33m=[0m[1;36m0.9[0m[1;33m,[0m [0mnesterovs_momentum[0m