In [217]:
# Vertical
def loss_i(W, x, y):
    """
    Softmax cross-entropy loss of one sample.

    W -- C x D weight matrix (one row of weights per class)
    x -- feature vector of length D (loss_n passes one column of X)
    y -- index of the correct class for x

    Returns -scores[y] + log(sum(exp(scores))) with scores = W.dot(x).
    """
    scores = W.dot(x)
    # Shifting every score by the maximum leaves the loss unchanged but keeps
    # np.exp from overflowing on large scores.
    scores -= np.max(scores)
    log_partition = np.log(np.sum(np.exp(scores)))
    return -scores[y] + log_partition

def loss_n(W, X, y, reg):
    """
    Mean softmax loss over a batch, computed one sample at a time.

    W   -- C x D weight matrix
    X   -- D x N data matrix, one sample per column
    y   -- N class indices, y[k] being the correct class of column k
    reg -- L2 regularisation strength

    Returns the average of loss_i over all columns of X plus
    0.5 * reg * sum(W * W).
    """
    n_points = X.shape[1]
    # Accumulate in column order, starting from 0.
    total = sum(loss_i(W, X[:, point], y[point]) for point in range(n_points))
    data_loss = total / n_points
    return data_loss + 0.5 * reg * np.sum(W * W)

def loss_v(W, X, y, reg):
    """
    Mean softmax loss over a batch, computed with array operations only.

    W   -- C x D weight matrix
    X   -- D x N data matrix, one sample per column
    y   -- N class indices, y[k] being the correct class of column k
    reg -- L2 regularisation strength

    Returns the mean per-sample loss plus 0.5 * reg * sum(W * W).
    """
    n_points = X.shape[1]
    scores = W.dot(X)
    # Subtract each column's maximum so np.exp cannot overflow.
    scores = scores - np.max(scores, axis=0)

    # Score of the correct class for every column.
    correct_scores = scores[y, np.arange(n_points)]
    # log(sum_c exp(score_c)) for every column.
    log_partitions = np.log(np.sum(np.exp(scores), axis=0))

    total = -np.sum(correct_scores) + np.sum(log_partitions)
    return total / n_points + 0.5 * reg * np.sum(W * W)


def gradient_i(W, x, y, p):
    """
    Gradient of the softmax loss of a single sample with respect to W.

    W -- C x D weight matrix
    x -- feature vector of length D
    y -- index of the correct class for x
    p -- index of the sample within its batch; the computation does not
         use it, it is only kept so existing callers keep working

    Returns a C x D array whose row i is softmax(W.dot(x))[i] * x, with x
    additionally subtracted from row y. The result keeps W's dtype when
    that is already floating point and is float64 otherwise, so integer
    weights no longer truncate the gradient.
    """
    f = W.dot(x)
    # Shift by the maximum so np.exp cannot overflow; the softmax is unchanged.
    f -= np.max(f)
    exp_f = np.exp(f)

    # np.zeros_like(W) would inherit an integer dtype from integer weights.
    grad_dtype = W.dtype if np.issubdtype(W.dtype, np.inexact) else np.float64
    dW = np.zeros(W.shape, dtype=grad_dtype)

    # Contribution of the -f[y] term of the loss.
    dW[y] -= x
    # Contribution of log(sum(exp(f))): entry (i, j) is exp(f[i]) * x[j] / sum.
    dW += np.outer(exp_f, x) / np.sum(exp_f)
    return dW
               
def gradient_n(W, X, y, reg):
    """
    Gradient of the regularised mean softmax loss, one sample at a time.

    W   -- C x D weight matrix
    X   -- D x N data matrix, one sample per column
    y   -- N class indices, y[k] being the correct class of column k
    reg -- L2 regularisation strength

    Returns a C x D array: the average of gradient_i over all columns of X
    plus reg * W. The result keeps W's dtype when that is already floating
    point and is float64 otherwise.
    """
    # An accumulator created with np.zeros_like(W) would be integer for
    # integer weights, and the in-place division below cannot be stored
    # back into an integer array.
    grad_dtype = W.dtype if np.issubdtype(W.dtype, np.inexact) else np.float64
    dW = np.zeros(W.shape, dtype=grad_dtype)

    n_points = X.shape[1]
    for point in range(n_points):
        dW += gradient_i(W, X[:, point], y[point], point)
    dW /= n_points

    # Derivative of the 0.5 * reg * sum(W * W) penalty.
    dW += reg * W
    return dW

def gradient_v(W, X, y, reg):
    """
    Gradient of the regularised mean softmax loss, using array operations.

    W   -- C x D weight matrix
    X   -- D x N data matrix, one sample per column
    y   -- N class indices, y[k] being the correct class of column k
    reg -- L2 regularisation strength

    Returns a C x D array equal to (softmax - one_hot(y)).dot(X.T) / N
    plus reg * W. The result keeps W's dtype when that is already floating
    point and is float64 otherwise.
    """
    n_classes = W.shape[0]
    n_points = X.shape[1]

    # Column-wise softmax, shifted by each column's maximum so np.exp
    # cannot overflow.
    F = W.dot(X)
    F = F - np.max(F, axis=0)
    probs = np.exp(F)
    probs /= np.sum(probs, axis=0)

    # C x N mask, True where row i is the label of column k. Subtracting it
    # accounts for the -x contribution of the correct class; labels outside
    # range(C) match no row and therefore contribute nothing.
    one_hot = np.asarray(y) == np.arange(n_classes)[:, None]
    probs -= one_hot.astype(probs.dtype)

    # np.zeros_like(W) would be integer for integer weights and could not
    # hold the floating-point gradient.
    grad_dtype = W.dtype if np.issubdtype(W.dtype, np.inexact) else np.float64
    dW = np.zeros(W.shape, dtype=grad_dtype)
    dW += probs.dot(X.T)
    dW /= n_points

    # Derivative of the 0.5 * reg * sum(W * W) penalty.
    dW += reg * W
    return dW

reg = 0
X = np.array([[101, 55, 88, 33], [27, 155, 37, 189], [200, 245, 88, 99]])
y = np.array([1, 1, 0, 0])

#X = np.array([[101], [27], [200]])
#y = np.array([1])

W = np.array([[ -1.31696361e-04,   1.05243860e-04,   6.73918614e-05],  
              [  1.35716710e-05,  -1.03811763e-04,  -5.01638507e-05]])
 
print "marty loss", loss_v(W, X, y, reg)
print "estimate", -np.log(0.5)

grad_estimate = np.zeros_like(W)
h = 1e-5
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        oldval = W[i, j]
        W[i, j] = oldval + h
        fxph = loss_v(W, X, y, reg)
        W[i, j] = oldval - h
        fxmh = loss_v(W, X, y, reg)
        W[i, j] = oldval
        grad_estimate[i, j] = (fxph - fxmh) / (2 * h)

print "estimated gradient\n", grad_estimate
        
dW = gradient_v(W, X, y, reg)        
print "analytic gradient\n", dW

marty loss 0.695316060767
estimate 0.69314718056
estimated gradient
[[  4.77403092  -4.40043438  33.56160833]
 [ -4.77403092   4.40043438 -33.56160833]]
analytic gradient
[[  4.77403093  -4.40043411  33.56160882]
 [ -4.77403093   4.40043411 -33.56160882]]


In [212]:
X = np.array([[101], [27], [200]])
y = np.array([1])

X = np.array([[101, 55, 88, 33], [27, 155, 37, 189], [200, 245, 88, 99]])
y = np.array([1, 1, 0, 0])

W = np.array([[ -1.31696361e-04,   1.05243860e-04,   6.73918614e-05],  
              [  1.35716710e-05,  -1.03811763e-04,  -5.01638507e-05]])

print "correct\n", gradient_n(W, X, y, 0)

F = W.dot(X)
F -= np.max(F, axis=0)
F_exp_over_sum = np.exp(F) / np.sum(F_exp, axis=0)
dW = F_exp_over_sum.dot(X.T)
dW /= X.shape[1]
print dW


#dW = np.zeros_like(W)
#for p in range(X.shape[1]):
#    for i in range(W.shape[0]):
#        for j in range(W.shape[1]):
#            dW[i, j] += F_exp_over_sum[i, p] * X[j, p]
            
#dW /= X.shape[1]


correct
[[ 35.02403093  52.09956589  80.31160882]
 [ 34.22596907  49.90043411  77.68839118]]
[[ 35.02403093  52.09956589  80.31160882]
 [ 34.22596907  49.90043411  77.68839118]]


In [3]:
import numpy as np

D = 500
N = 1000
C = 2

D = np.random.randn(1000, 500)
hidden_layer_sizes = [500]*10
print hidden_layer_sizes
print ['tanh'] * len(hidden_layer_sizes)


[500, 500, 500, 500, 500, 500, 500, 500, 500, 500]
['tanh', 'tanh', 'tanh', 'tanh', 'tanh', 'tanh', 'tanh', 'tanh', 'tanh', 'tanh']
