In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

In [3]:
def eval_numerical_gradient_array(f, x, df, h=1e-5):
  """
  Evaluate a numeric gradient for a function that accepts a numpy
  array and returns a numpy array.
  """
  grad = np.zeros_like(x)
  it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
  while not it.finished:
    ix = it.multi_index
    
    oldval = x[ix]
    x[ix] = oldval + h
    pos = f(x)
    x[ix] = oldval - h
    neg = f(x)
    x[ix] = oldval
    
    grad[ix] = np.sum((pos - neg) * df) / (2 * h)
    it.iternext()
   
  return grad

In [4]:
def eval_numerical_gradient(f, x, verbose=False, h=0.00001):
  """ 
  a naive implementation of numerical gradient of f at x 
  - f should be a function that takes a single argument
  - x is the point (numpy array) to evaluate the gradient at
  """ 

  fx = f(x) # evaluate function value at original point
  grad = np.zeros_like(x)
  # iterate over all indexes in x
  it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
  while not it.finished:

    # evaluate function at x+h
    ix = it.multi_index
    oldval = x[ix]
    x[ix] = oldval + h # increment by h
    fxph = f(x) # evalute f(x + h)
    x[ix] = oldval - h
    fxmh = f(x) # evaluate f(x - h)
    x[ix] = oldval # restore

    # compute the partial derivative with centered formula
    grad[ix] = (fxph - fxmh) / (2 * h) # the slope
    if verbose:
      print ix, grad[ix]
    it.iternext() # step to next dimension

  return grad

In [5]:
def affine_forward(x, w, b):
  """
  Computes the forward pass for an affine (fully-connected) layer.

  The input x has shape (N, d_1, ..., d_k) where x[i] is the ith input.
  We multiply this against a weight matrix of shape (D, M) where
  D = \prod_i d_i

  Inputs:
  x - Input data, of shape (N, d_1, ..., d_k)
  w - Weights, of shape (D, M)
  b - Biases, of shape (M,)
  
  Returns a tuple of:
  - out: output, of shape (N, M)
  - cache: (x, w, b)
  """
  out = None
  #############################################################################
  # TODO: Implement the affine forward pass. Store the result in out. You     #
  # will need to reshape the input into rows.                                 #
  #############################################################################
  N = x.shape[0]
  D = np.prod(x.shape[1:])
  x2 = np.reshape(x, (N, D))
  out = np.dot(x2, w) + b
  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  cache = (x, w, b)
  return out, cache

In [6]:
def affine_backward(dout, cache):
  """
  Computes the backward pass for an affine layer.

  Inputs:
  - dout: Upstream derivative, of shape (N, M)
  - cache: Tuple of:
    - x: Input data, of shape (N, d_1, ... d_k)
    - w: Weights, of shape (D, M)

  Returns a tuple of:
  - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
  - dw: Gradient with respect to w, of shape (D, M)
  - db: Gradient with respect to b, of shape (M,)
  """
  x, w, b = cache
  dx, dw, db = None, None, None
  #############################################################################
  # TODO: Implement the affine backward pass.                                 #
  #############################################################################
  N = x.shape[0]
  D = x.shape[1]
 

  dx = np.dot(dout, w.T) # N x D
  dw = np.dot(x.T, dout) # D x M
  db = np.dot(dout.T, np.ones(N)) # M x 1


  #############################################################################
  #                             END OF YOUR CODE                              #
  #############################################################################
  return dx, dw, db

In [7]:
x = np.random.randn(1, 3)
w = np.random.randn(3, 2)
b = np.random.randn(2)
dout = np.random.randn(1, 2)

dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)

print dx_num, dx
# The error should be less than 1e-10
print 'Testing affine_backward function:'
print 'dx error: ', rel_error(dx_num, dx)
print 'dw error: ', rel_error(dw_num, dw)
print 'db error: ', rel_error(db_num, db)

[[-1.53455128  0.39314168 -0.82414949]] [[-1.53455128  0.39314168 -0.82414949]]
Testing affine_backward function:
dx error:  6.48711769279e-12
dw error:  1.10010080021e-10
db error:  2.27556342422e-12


In [8]:
def sigmoid_forward(z):
        sigmoidfn = 1.0 / (1.0 + np.exp(-z)) # sigmoid activation function
        return sigmoidfn
    
def sigmoid_backward(dout,fw_activation):
        return np.multiply(np.multiply(fw_activation,1-fw_activation),dout)

In [9]:
x = np.random.randn(1, 3)
w = np.random.randn(3, 2)
b = np.random.randn(2)
dout = np.random.randn(1, 2)

dx_num = eval_numerical_gradient_array(lambda x: sigmoid_forward(affine_forward(x, w, b)[0]), x, dout)
dw_num = eval_numerical_gradient_array(lambda w: sigmoid_forward(affine_forward(x, w, b)[0]), w, dout)
db_num = eval_numerical_gradient_array(lambda b: sigmoid_forward(affine_forward(x, w, b)[0]), b, dout)

y, cache = affine_forward(x, w, b)
sigout = sigmoid_forward(y)
douty = sigmoid_backward(dout,sigout)
dx, dw, db = affine_backward(douty, cache)

print dx_num, dx
# The error should be less than 1e-10
print 'Testing affine_backward function:'
print 'dx error: ', rel_error(dx_num, dx)
print 'dw error: ', rel_error(dw_num, dw)
print 'db error: ', rel_error(db_num, db)

[[-0.05808236  0.03889595 -0.20467111]] [[-0.05808236  0.03889595 -0.20467111]]
Testing affine_backward function:
dx error:  5.52998852762e-11
dw error:  5.96447051052e-11
db error:  1.59729233003e-11


In [10]:
def euclidean_sq_loss(z,t):
    #z is lower layer's activation, t is the target value
    m = z.shape[0] #number of training sample in a batch
    n = z.shape[1] #size of feature

    loss = 0.5* np.dot( z[0,:]-t[0,:],np.transpose(z[0,:]-t[0,:]) )
    dout = (z - t)
    
    return loss,dout

In [11]:
def piano_fw(x,w,b,t):
    loss,_ = euclidean_sq_loss(sigmoid_forward(affine_forward(x, w, b)[0]),t)
    return loss

In [12]:
x = np.random.randn(1, 3)
w = np.random.randn(3, 2)
b = np.random.randn(2)
t = np.array([[0,1]])

y, cache = affine_forward(x, w, b)
sigout = sigmoid_forward(y)
loss,dout = euclidean_sq_loss(sigout,t)
print loss, 0.5* (np.linalg.norm(sigout-t))**2

dw_num = eval_numerical_gradient(lambda w: piano_fw(x,w,b,t), w)
db_num = eval_numerical_gradient(lambda b: piano_fw(x,w,b,t), b)

print 'dw_num:',dw_num,'db_num:',db_num

douty = sigmoid_backward(dout,sigout)
dx, dw, db = affine_backward(douty, cache)
print dw,db
#print dx_num, dx
# The error should be less than 1e-10
#print 'Testing affine_backward function:'
#print 'dx error: ', rel_error(dx_num, dx)
print 'dw error: ', rel_error(dw_num, dw)
print 'db error: ', rel_error(db_num, db)

0.47809586764 0.47809586764
dw_num: [[-0.00084517  0.00261561]
 [ 0.00769501 -0.02381437]
 [ 0.00949257 -0.02937739]] db_num: [ 0.00820025 -0.02537797]
[[-0.00084517  0.00261561]
 [ 0.00769501 -0.02381437]
 [ 0.00949257 -0.02937739]] [ 0.00820025 -0.02537797]
dw error:  7.43419912547e-10
db error:  8.05032862384e-11


In [13]:
class NeuralNet_Piano:
    #class variables
    # num_classes  is the size of the last affine activation
    # input_dim   is the size of input dimension
    # num_hidden_layers is how many hidden layers
    ## (constructor ) Dict model
    ## (constructor) num_layers is how many totol layers = hidden_layers + 2(Softmax cross entropy loss layer)
    ## (Forward) Dict input cache
    ## (Backward) Dict gradient cache 
    #class def
    # constructor to construct the neural network, given the hyper parameter
    # Forward
    # Backward
    # applyGrad
    def __init__(self,model_hyper):
        num_classes = model_hyper['num_classes']
        input_dim  = model_hyper['input_dim']
        self.num_hidden_layers = model_hyper['num_hidden_layers']
        self.num_layers = self.num_hidden_layers + 1
        layer_list = [None] * self.num_hidden_layers
        cur_layer_size = input_dim
        self.learnrate = model_hyper['learnrate']
        self.NN_model = []
        self.NN_grad = []
        self.NN_num_grad = []
        for hid_layer_iter in range(self.num_hidden_layers):  
            layer_model = {}
            next_layer_size = model_hyper['layer_sizes'][hid_layer_iter]
            layer_model['W'] = 0.9 * np.random.randn(cur_layer_size, next_layer_size)
            layer_model['b'] =0.9 * np.random.randn(next_layer_size)
            self.NN_model.append(layer_model)
            cur_layer_size = next_layer_size
        
        layer_model = {}   
        next_layer_size = num_classes
        layer_model['W'] = 0.9 * np.random.randn(cur_layer_size, next_layer_size)
        self.NN_model.append(layer_model)
        self.NN_cache=[]
        self.sigmoid_on = False
        for i in self.NN_model:
            print i['W'].shape
            if 'b' in i:
                print i['b'].shape
            print "__________________"
            
    def affine_backward(self,dout, cache):
        x = cache['x']
        w = cache['w']
        b = cache['b']
        #print x,w,b
        grad_cache = {}
        dx, dw, db = None, None, None
        N = x.shape[0]
        D = np.prod(x.shape[1:])
        x2 = np.reshape(x, (N, D))

        dx2 = np.dot(dout, w.T) # N x D
        dw = np.dot(x2.T, dout) # D x M
        db = np.dot(dout.T, np.ones(N)) # M x 1

        dx = np.reshape(dx2, x.shape)
        grad_cache['dx'] = dx
        grad_cache['dw'] = dw
        #print grad_cache['dw']
        grad_cache['db'] = db
        return grad_cache
    
    def affine_backward2(self,dout, cache):
        x = cache['x']
        w = cache['w']

        grad_cache = {}
        dx, dw = None, None
        N = x.shape[0]
        D = np.prod(x.shape[1:])
        x2 = np.reshape(x, (N, D))

        dx2 = np.dot(dout, w.T) # N x D
        dw = np.dot(x2.T, dout) # D x M       

        dx = np.reshape(dx2, x.shape)
        grad_cache['dx'] = dx
        grad_cache['dw'] = dw
 
        return grad_cache  

    def affine_forward(self,x, w, b): 
        cache = {}
        out = None
        N = x.shape[0]
        D = np.prod(x.shape[1:])
        x2 = np.reshape(x, (N, D))
        out = np.dot(x2, w) + b
        cache['x'] = x
        cache['w'] = w
        cache['b'] = b
        return out, cache    
    
    def affine_forward2(self,x, w): 
        out = None
        cache = {}
        N = x.shape[0]
        D = np.prod(x.shape[1:])
        x2 = np.reshape(x, (N, D))
        out = np.dot(x2, w) 
        cache['x'] = x
        cache['w'] = w
        return out, cache  

    def sigmoid_forward(self,z):
        sigmoidfn = 1.0 / (1.0 + np.exp(-z)) # sigmoid activation function
        return sigmoidfn
    
    def sigmoid_backward(self,dout,fw_activation):
        return np.multiply(np.multiply(fw_activation,1-fw_activation),dout)

    def euclidean_sq_loss(self,z,t):
        #z is lower layer's activation, t is the target value
        m = z.shape[0] #number of training sample in a batch
        n = z.shape[1] #size of feature
        #print z.shape, t.shape
        loss = 0.5* np.dot( z[0,:]-t[0,:],np.transpose(z[0,:]-t[0,:]) )
        dout = (z - t)
    
        return loss,dout
    
    def forward_train(self,x,y):
        activation = x
        #print activation

        for i in self.NN_model:
            if 'b' in i:
                activation, cache = self.affine_forward(activation, i['W'], i['b'])
                self.NN_cache.append(cache)
            else:
                activation, cache = self.affine_forward2(activation, i['W'])
                self.NN_cache.append(cache)
        

        if (self.sigmoid_on == True):
            sigout = self.sigmoid_forward(activation) 
            self.NN_cache.append(sigout)
            loss,dz = self.euclidean_sq_loss(sigout,y)
        else:
            loss,dz = self.euclidean_sq_loss(activation,y)
        #for i in self.NN_cache:
        #    print i['w'].shape,i['x'].shape
        #    if 'b' in i:
        #        print i['b'].shape
        #    print "__________________"   
        return loss,dz
    
    def forward_test(self,x):
        activation = x
        #print activation

        for i in self.NN_model:
            if 'b' in i:
                activation, cache = self.affine_forward(activation, i['W'], i['b'])
                self.NN_cache.append(cache)
            else:
                activation, cache = self.affine_forward2(activation, i['W'])
                self.NN_cache.append(cache)
 
        return self.sigmoid_forward(activation)
    
    def backward_train(self,dz):
        if self.sigmoid_on == True:
            sigactivation = self.NN_cache[-1]
            backgrad = sigmoid_backward(dz,sigactivation)
            for i in reversed(self.NN_cache[:-1]):
                if 'b' in i:
                    grad_cache = self.affine_backward(backgrad,i)
                else:
                    grad_cache = self.affine_backward2(backgrad,i)
                backgrad = grad_cache['dx']
                
            self.NN_grad.insert(0,grad_cache)
        else:
            backgrad = dz
            for i in reversed(self.NN_cache):
                if 'b' in i:
                    grad_cache = self.affine_backward(backgrad,i)
                else:
                    grad_cache = self.affine_backward2(backgrad,i)
                backgrad = grad_cache['dx']
                
            self.NN_grad.insert(0,grad_cache)     

            #self.NN_grad.append(grad_cache)
    def Apply_UpdateW(self):
        for i in range(self.num_layers):
            delta_W = -self.learnrate * self.NN_grad[i]['dw'] 
            print delta_W
            self.NN_model[i]['W'] += delta_W
            print 'Apply Update',i
            if 'db' in self.NN_grad[i] :
                delta_b = -self.learnrate * self.NN_grad[i]['db'] 
                self.NN_model[i]['b'] += delta_b
        self.reset_cache()

    def reset_cache(self):
        del(self.NN_cache[:])
        del(self.NN_grad[:]) 
        del(self.NN_num_grad[:]) 
        
    def get_grad(self):
        return self.NN_grad

    def get_W(self):
        return self.NN_model
    def cal_num_grad(self,w,h,x,y):
        grad = np.zeros_like(w)
        #print "w.shape",w.shape
        # iterate over all indexes in w
        it = np.nditer(w, flags=['multi_index'], op_flags=['readwrite'])
        
        while not it.finished:
            
            # evaluate function at x+h
            ix = it.multi_index
            oldval = w[ix]
            w[ix] = oldval + h # increment by h
            fxph,_ = self.forward_train(x,y) # evalute f(x + h)
            w[ix] = oldval - h
            fxmh,_ = self.forward_train(x,y) # evaluate f(x - h)
            w[ix] = oldval # restore
            # compute the partial derivative with centered formula
            grad[ix] = (fxph - fxmh) / (2 * h) # the slope
            it.iternext()           
        return grad
    def get_numerical_gradient(self):
        return self.NN_num_grad
    def set_numerical_gradient(self,x,y, h=0.0001):        
        for i in self.NN_model:
            num_grad_cache = {}
            num_grad_cache['dw'] = self.cal_num_grad(i['W'],h,x,y)
            
            if 'b' in i:
                num_grad_cache['db'] = self.cal_num_grad(i['b'],h,x,y)
               
            self.NN_num_grad.append(num_grad_cache)
            
        return self.NN_num_grad
    
    def check_grad_numgrad(self,x,y, h=0.0001,verbose = True):       
        self.set_numerical_gradient(x,y, h)  
        if verbose:
            for i in range(self.num_hidden_layers+1):
                print i,"th layer"
                if (rel_error(self.NN_grad[i]['dw'],self.NN_num_grad[i]['dw'])<1e-7):
                    print 'grad W correct',rel_error(self.NN_grad[i]['dw'],self.NN_num_grad[i]['dw'])
                else:
                    print self.NN_grad[i]['dw'].flatten()
                    print self.NN_num_grad[i]['dw'].flatten()
                    print rel_error(self.NN_grad[i]['dw'],self.NN_num_grad[i]['dw'])
                    print np.linalg.norm(self.NN_grad[i]['dw'].flatten() - self.NN_num_grad[i]['dw'].flatten())**2
                    print '==========='
                if 'db' in self.NN_grad[i] :
                    if (rel_error(self.NN_grad[i]['db'],self.NN_num_grad[i]['db'])<1e-7):
                        print 'grad b correct',rel_error(self.NN_grad[i]['db'],self.NN_num_grad[i]['db'])
                    else:
                        print self.NN_grad[i]['db'].flatten()
                        print self.NN_num_grad[i]['db'].flatten()
                        print rel_error(self.NN_grad[i]['db'],self.NN_num_grad[i]['db'])
                        print '==========='

In [14]:
model_hyper = {}
model_hyper['num_classes'] = 4
model_hyper['input_dim']=4
model_hyper['num_hidden_layers']=0
model_hyper['learnrate']=0.0001
layer_size = [None] * model_hyper['num_hidden_layers']
#layer_size[0] = 4
#layer_size[1] = 4
#layer_size[2] = 5
#layer_size[3] = 6
model_hyper['layer_sizes'] = layer_size  

In [15]:
a_nn = NeuralNet_Piano(model_hyper)
w_org = a_nn.get_W()
w_org0 = w_org[0]['W'].copy()
num_inputs = 1
x = np.random.randn(num_inputs, model_hyper['input_dim'])
y = np.random.randint(model_hyper['num_classes'] , size=num_inputs)
y_onehot=np.zeros([num_inputs,model_hyper['num_classes']])
y_onehot[np.arange(num_inputs), y] = 1
y = y_onehot.astype(int)
print 'xshape',x.shape,y.shape
print y,'\n',y_onehot
print '----------'
loss,dz = a_nn.forward_train(x,y)
print 'dz shape',dz.shape
a_nn.backward_train(dz)
grad = a_nn.get_grad()
a_nn.check_grad_numgrad(x,y)
print y.dtype,x.dtype

(4, 4)
__________________
xshape (1, 4) (1, 4)
[[0 0 1 0]] 
[[ 0.  0.  1.  0.]]
----------
dz shape (1, 4)
0 th layer
grad W correct 1.49319282588e-11
int64 float64


In [16]:
w_after = a_nn.get_W()
print w_org[0]['W'] - w_after[0]['W']
grad_w = a_nn.get_grad()
delta_w = -model_hyper['learnrate'] * grad_w [0]['dw'] 
print delta_w

[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
[[  3.00970625e-05  -5.27228171e-04   4.78362477e-04  -6.54646927e-04]
 [  3.01015855e-05  -5.27307403e-04   4.78434365e-04  -6.54745308e-04]
 [ -4.42938892e-05   7.75922440e-04  -7.04006729e-04   9.63444802e-04]
 [  4.55283656e-05  -7.97547499e-04   7.23627488e-04  -9.90296134e-04]]


In [17]:
num_grad = a_nn.get_numerical_gradient()

In [18]:
for i in a_nn.get_W():
    print i['W'].shape
for i in grad:
    print i['dw'].shape
for i in num_grad:
    print i['dw'].shape

(4, 4)
(4, 4)
(4, 4)


In [19]:
print '=====num grad ====='
for i in range(model_hyper['num_hidden_layers']+1):
    print i,"th layer:W error",rel_error(grad[i]['dw'],num_grad[i]['dw'])    
    if 'db' in grad[i] :
        print i,"th layer:b error",rel_error(grad[i]['db'],num_grad[i]['db'])
    print "__________________"  

print grad[0]['dw'].flatten()
print num_grad[0]['dw'].flatten()
print "__________________"  


=====num grad =====
0 th layer:W error 1.49319282588e-11
__________________
[-0.30097063  5.27228171 -4.78362477  6.54646927 -0.30101586  5.27307403
 -4.78434365  6.54745308  0.44293889 -7.7592244   7.04006729 -9.63444802
 -0.45528366  7.97547499 -7.23627488  9.90296134]
[-0.30097063  5.27228171 -4.78362477  6.54646927 -0.30101586  5.27307403
 -4.78434365  6.54745308  0.44293889 -7.7592244   7.04006729 -9.63444802
 -0.45528366  7.97547499 -7.23627488  9.90296134]
__________________


In [20]:
a_nn.Apply_UpdateW()
print y.dtype,x.dtype
loss,dz = a_nn.forward_train(x,y)
print 'dz shape',dz.shape
a_nn.backward_train(dz)
grad = a_nn.get_grad()
a_nn.check_grad_numgrad(x,y_onehot)
num_grad = a_nn.get_numerical_gradient()
print '=====num grad ====='
for i in range(model_hyper['num_hidden_layers']+1):
    print i,"th layer:W error",rel_error(grad[i]['dw'],num_grad[i]['dw'])    
    if 'db' in grad[i] :
        print i,"th layer:b error",rel_error(grad[i]['db'],num_grad[i]['db'])
    print "__________________"  

print grad[0]['dw'].flatten()
print num_grad[0]['dw'].flatten()
print "__________________"  


[[  3.00970625e-05  -5.27228171e-04   4.78362477e-04  -6.54646927e-04]
 [  3.01015855e-05  -5.27307403e-04   4.78434365e-04  -6.54745308e-04]
 [ -4.42938892e-05   7.75922440e-04  -7.04006729e-04   9.63444802e-04]
 [  4.55283656e-05  -7.97547499e-04   7.23627488e-04  -9.90296134e-04]]
Apply Update 0
int64 float64
dz shape (1, 4)
0 th layer
grad W correct 2.3823248661e-11
=====num grad =====
0 th layer:W error 2.3823248661e-11
__________________
[-0.30056661  5.26520431 -4.77720333  6.53768142 -0.30061178  5.26599557
 -4.77792125  6.53866391  0.4423443  -7.74880857  7.03061685 -9.62151493
 -0.45467249  7.96476887 -7.22656105  9.8896678 ]
[-0.30056661  5.26520431 -4.77720333  6.53768142 -0.30061178  5.26599557
 -4.77792125  6.53866391  0.4423443  -7.74880857  7.03061685 -9.62151493
 -0.45467249  7.96476887 -7.22656105  9.8896678 ]
__________________


In [21]:
w_after = a_nn.get_W()
w_after0= w_after[0]['W']
real_delta_w = w_org0 - w_after0

print 'w_after0:\n',w_after0
print 'w_org0:\n',w_org0
print 'real_delta_w:\n',real_delta_w
grad_w = a_nn.get_grad()
delta_w = model_hyper['learnrate'] * grad_w [0]['dw'] 
print 'delta_w:\n',delta_w

w_after0:
[[ 0.38626467  0.37073964 -0.16543359  1.0017585 ]
 [ 1.66452958  0.43935213 -1.69210078  0.59696873]
 [ 0.07950536  0.03226324 -0.18581565 -0.45950859]
 [-1.37405442  1.16940226 -0.01273694  0.57407696]]
w_org0:
[[ 0.38623457  0.37126687 -0.16591196  1.00241315]
 [ 1.66449947  0.43987943 -1.69257921  0.59762347]
 [ 0.07954966  0.03148731 -0.18511165 -0.46047203]
 [-1.37409995  1.17019981 -0.01346057  0.57506726]]
real_delta_w:
[[ -3.00970625e-05   5.27228171e-04  -4.78362477e-04   6.54646927e-04]
 [ -3.01015855e-05   5.27307403e-04  -4.78434365e-04   6.54745308e-04]
 [  4.42938892e-05  -7.75922440e-04   7.04006729e-04  -9.63444802e-04]
 [ -4.55283656e-05   7.97547499e-04  -7.23627488e-04   9.90296134e-04]]
delta_w:
[[ -3.00566608e-05   5.26520431e-04  -4.77720333e-04   6.53768142e-04]
 [ -3.00611778e-05   5.26599557e-04  -4.77792125e-04   6.53866391e-04]
 [  4.42344300e-05  -7.74880857e-04   7.03061685e-04  -9.62151493e-04]
 [ -4.54672493e-05   7.96476887e-04  -7.22656105e-0

In [22]:
a_nn.Apply_UpdateW()
print y.dtype,x.dtype
loss,dz = a_nn.forward_train(x,y)
print 'dz shape',dz.shape
a_nn.backward_train(dz)
grad = a_nn.get_grad()
a_nn.check_grad_numgrad(x,y_onehot)
num_grad = a_nn.get_numerical_gradient()
print '=====num grad ====='
for i in range(model_hyper['num_hidden_layers']+1):
    print i,"th layer:W error",rel_error(grad[i]['dw'],num_grad[i]['dw'])    
    if 'db' in grad[i] :
        print i,"th layer:b error",rel_error(grad[i]['db'],num_grad[i]['db'])
    print "__________________"  

print grad[0]['dw'].flatten()
print num_grad[0]['dw'].flatten()
print "__________________"  


[[  3.00566608e-05  -5.26520431e-04   4.77720333e-04  -6.53768142e-04]
 [  3.00611778e-05  -5.26599557e-04   4.77792125e-04  -6.53866391e-04]
 [ -4.42344300e-05   7.74880857e-04  -7.03061685e-04   9.62151493e-04]
 [  4.54672493e-05  -7.96476887e-04   7.22656105e-04  -9.88966780e-04]]
Apply Update 0
int64 float64
dz shape (1, 4)
0 th layer
grad W correct 7.07169841035e-12
=====num grad =====
0 th layer:W error 7.07169841035e-12
__________________
[-0.30016313  5.2581364  -4.77079051  6.52890537 -0.30020824  5.2589266
 -4.77150747  6.52988654  0.44175051 -7.73840673  7.02117909 -9.60859921
 -0.45406215  7.95407713 -7.21686027  9.87639212]
[-0.30016313  5.2581364  -4.77079051  6.52890537 -0.30020824  5.2589266
 -4.77150747  6.52988654  0.44175051 -7.73840673  7.02117909 -9.60859921
 -0.45406215  7.95407713 -7.21686027  9.87639212]
__________________


In [23]:
w_after = a_nn.get_W()
w_after0= w_after[0]['W']
real_delta_w = w_org0 - w_after0

print 'w_after0:\n',w_after0
print 'w_org0:\n',w_org0
print 'real_delta_w:\n',real_delta_w
grad_w = a_nn.get_grad()
delta_w = model_hyper['learnrate'] * grad_w [0]['dw'] 
print 'delta_w:\n',delta_w

w_after0:
[[ 0.38629473  0.37021312 -0.16495587  1.00110473]
 [ 1.66455964  0.43882553 -1.69162299  0.59631486]
 [ 0.07946113  0.03303812 -0.18651871 -0.45854644]
 [-1.37400896  1.16860578 -0.01201429  0.57308799]]
w_org0:
[[ 0.38623457  0.37126687 -0.16591196  1.00241315]
 [ 1.66449947  0.43987943 -1.69257921  0.59762347]
 [ 0.07954966  0.03148731 -0.18511165 -0.46047203]
 [-1.37409995  1.17019981 -0.01346057  0.57506726]]
real_delta_w:
[[ -6.01537233e-05   1.05374860e-03  -9.56082809e-04   1.30841507e-03]
 [ -6.01627633e-05   1.05390696e-03  -9.56226491e-04   1.30861170e-03]
 [  8.85283191e-05  -1.55080330e-03   1.40706841e-03  -1.92559630e-03]
 [ -9.09956149e-05   1.59402439e-03  -1.44628359e-03   1.97926291e-03]]
delta_w:
[[ -3.00163134e-05   5.25813640e-04  -4.77079051e-04   6.52890537e-04]
 [ -3.00208243e-05   5.25892660e-04  -4.77150747e-04   6.52988654e-04]
 [  4.41750506e-05  -7.73840673e-04   7.02117909e-04  -9.60859921e-04]
 [ -4.54062150e-05   7.95407713e-04  -7.21686027e-0

num_train = 50000
y = np.random.randint(model_hyper['num_classes'] , size=num_train)
y_onehot=np.zeros([num_train,model_hyper['num_classes']])

y_onehot[np.arange(num_train), y] = 1
x = y_onehot
y = y_onehot.astype(int)
model2_hyper = {}
model2_hyper['num_classes'] = 4
model2_hyper['input_dim']=4
model2_hyper['num_hidden_layers']=1
model2_hyper['learnrate']=0.001
layer_size = [None] * model2_hyper['num_hidden_layers']
layer_size[0] = 4
#layer_size[1] = 4
#layer_size[2] = 5
#layer_size[3] = 6
model2_hyper['layer_sizes'] = layer_size  

batch_size =1
num_train_batch = num_train/batch_size
num_rec = num_train
cost_rec = np.zeros(num_rec,dtype=np.float)
a_nn.reset_cache()
for ii in range(10):
    for i in range(num_rec):
        th_batch = i%num_train_batch
        Xbatch = x[th_batch*batch_size:th_batch*batch_size+batch_size,:]
        Ybatch = y[th_batch*batch_size:th_batch*batch_size+batch_size].astype(int)  
        #print Xbatch
        loss,dz = a_nn.forward_train(Xbatch,Ybatch)
        print loss
        a_nn.backward_train(dz)
        #iris_nn.check_grad_numgrad(Xbatch,Ybatch,0.0001,True)
        a_nn.Apply_UpdateW()

In [24]:
Wmodel = a_nn.get_W()
print Wmodel
print len(Wmodel)



[{'W': array([[ 0.38629473,  0.37021312, -0.16495587,  1.00110473],
       [ 1.66455964,  0.43882553, -1.69162299,  0.59631486],
       [ 0.07946113,  0.03303812, -0.18651871, -0.45854644],
       [-1.37400896,  1.16860578, -0.01201429,  0.57308799]])}]
1


In [26]:

DataXy_ = np.loadtxt(open("iris_prep.txt", "rb"), delimiter=",", skiprows=1)
print DataXy_.shape
rand_idx = np.random.permutation(149)
DataXy =  DataXy_[rand_idx,:]
print DataXy[0:4,:]
num_train = 100
num_test = 49
trainX = DataXy[0:num_train,:-1]
meanX = trainX.mean()
#print trainX
trainX = 0.1*(trainX-meanX) 
trainY = DataXy[0:num_train,-1]
trainY_onehot = np.zeros([num_train,3])
trainY_onehot[np.arange(num_train), trainY.astype(int)] = 1
trainY = trainY_onehot
testX = 0.1*(DataXy[num_train:,:-1]-meanX) 
testY = DataXy[num_train:,-1]
testY_onehot = np.zeros([num_test,3])
testY_onehot[np.arange(num_test), testY.astype(int)] = 1
testY = testY_onehot
#print trainX

model_hyper2 = {}
model_hyper2['num_classes'] = 3
model_hyper2['input_dim']=4
model_hyper2['num_hidden_layers']=3
model_hyper2['learnrate']=0.00001
layer_size = [None] * model_hyper2['num_hidden_layers']
layer_size[0] = 5
layer_size[1] = 4
layer_size[2] = 5
#layer_size[3] = 6
model_hyper2['layer_sizes'] = layer_size    

batch_size =1
num_train_batch = num_train/batch_size
num_rec = num_train
cost_rec = np.zeros(num_rec,dtype=np.float)
iris_nn = NeuralNet_Piano(model_hyper2)
#iris_nn.reset_cache()
for i in range(50000):
    th_batch = i%num_train_batch
    Xbatch = trainX[th_batch*batch_size:th_batch*batch_size+batch_size,:]
    #print Xbatch
    Ybatch = trainY[th_batch*batch_size:th_batch*batch_size+batch_size].astype(int)  
    #print Ybatch
    loss_iris,dz_iris = iris_nn.forward_train(Xbatch,Ybatch)
    if i < num_rec:
        cost_rec[i] = loss_iris
    print loss_iris
    iris_nn.backward_train(dz_iris)
    #iris_nn.check_grad_numgrad(Xbatch,Ybatch,0.0001,True)
    iris_nn.Apply_UpdateW()

(149, 5)
[[ 5.   3.5  1.6  0.6  2. ]
 [ 5.   3.6  1.4  0.2  2. ]
 [ 6.7  3.1  5.6  2.4  0. ]
 [ 7.3  2.9  6.3  1.8  0. ]]
(4, 5)
(5,)
__________________
(5, 4)
(4,)
__________________
(4, 5)
(5,)
__________________
(5, 3)
__________________
355.101208438
[[ -1.25059127e-04  -1.90090355e-05   3.49577867e-04   2.61091103e-04
   -1.18717917e-04]
 [ -2.29165940e-06  -3.48333112e-07   6.40587714e-06   4.78439194e-06
   -2.17545922e-06]
 [  1.53213800e-04   2.32885566e-05  -4.28278643e-04  -3.19870775e-04
    1.45444988e-04]
 [  2.35058779e-04   3.57290249e-05  -6.57059970e-04  -4.90741916e-04
    2.23139960e-04]]
Apply Update 0


IndexError: list index out of range

In [None]:
import matplotlib.pyplot as plt
plt.plot(cost_rec[0:100])
plt.show()