# Gradient checking Affine with respect to W

In [1]:
# gradient checking: compare the analytical gradient with the numerical gradient
# taking the affine layer as an example
from gradient_check import eval_numerical_gradient_array
import numpy as np
from layers import *
# Problem sizes used by the affine checks: x is (N, D), w is (D, M), b is (M,).
N, D, M = 5, 4, 3

# Random inputs, parameters and upstream gradient for the check.
x = np.random.normal(size=(N, D))
w = np.random.normal(size=(D, M))
b = np.random.normal(size=(M,))
dout = np.random.normal(size=(N, M))

# Forward pass first; the cache it returns is handed to affine_backward.
out, cache = affine_forward(x, w, b)

# Analytical gradient: element [1] of the (dx, dw, db) result is dw.
grad = affine_backward(dout, cache)[1]


def f(w):
    """Affine output as a function of w only ([0] drops the cache from (output, cache))."""
    return affine_forward(x, w, b)[0]


# Numerical gradient with respect to w from the provided utility function.
ngrad = eval_numerical_gradient_array(f, w, dout)

# The two printed arrays should agree within some small error tolerance.
print(grad, 'this is my affine gradient')
print(ngrad, 'this is their affine gradient')

[[ 1.12937624 -2.84668334  3.61661502]
 [-4.3714214  -0.71148866 -0.51577405]
 [ 2.19399945  1.45658743 -0.40621823]
 [-0.78482181  2.00100755 -0.22594629]] this is my affine gradient
[[ 1.12937624 -2.84668334  3.61661502]
 [-4.3714214  -0.71148866 -0.51577405]
 [ 2.19399945  1.45658743 -0.40621823]
 [-0.78482181  2.00100755 -0.22594629]] this is their affine gradient


# Affine Checking with Respect to X

In [2]:
# Analytical gradient with respect to x: element [0] of affine_backward's
# result, computed from the dout and cache already in the notebook namespace.
grad = affine_backward(dout, cache)[0]


def f(x):
    """Affine output as a function of x only; w and b come from the notebook namespace."""
    return affine_forward(x, w, b)[0]


# Numerical gradient with respect to x from the provided utility function.
ngrad = eval_numerical_gradient_array(f, x, dout)

print(grad, 'this is my affine gradient')
print(ngrad, 'this is their affine gradient')

[[-4.0957136  -0.75567965  0.93638084 -0.16579169]
 [-2.04891708  0.38670672 -0.49181883 -4.26199382]
 [ 2.69022253  0.25111822 -0.02029018 -0.17091953]
 [-2.66378183 -0.63481211  0.82644712  0.46381368]
 [-0.75447122 -0.29284864  0.5660749  -0.32425008]] this is my affine gradient
[[-4.0957136  -0.75567965  0.93638084 -0.16579169]
 [-2.04891708  0.38670672 -0.49181883 -4.26199382]
 [ 2.69022253  0.25111822 -0.02029018 -0.17091953]
 [-2.66378183 -0.63481211  0.82644712  0.46381368]
 [-0.75447122 -0.29284864  0.5660749  -0.32425008]] this is their affine gradient


# Affine Checking with Respect to b

In [3]:
# Analytical gradient with respect to b: element [2] of affine_backward's
# result, computed from the dout and cache already in the notebook namespace.
grad = affine_backward(dout, cache)[2]


def f(b):
    """Affine output as a function of b only; x and w come from the notebook namespace."""
    return affine_forward(x, w, b)[0]


# Numerical gradient with respect to b from the provided utility function.
ngrad = eval_numerical_gradient_array(f, b, dout)

print(grad, 'this is my affine gradient')
print(ngrad, 'this is their affine gradient')

[-5.04544695  4.58685785 -4.01200432] this is my affine gradient
[-5.04544695  4.58685785 -4.01200432] this is their affine gradient


# Checking Relu Gradient

In [4]:
# Upstream gradient for the ReLU check: uniform samples with the same shape as x.
dout_2 = np.random.uniform(size=x.shape)

In [5]:
# Analytical ReLU gradient: forward pass at x, then backward with dout_2.
# Note that this rebinds `out` and `cache` from the affine cells.
out, cache = relu_forward(x)
grad = relu_backward(dout_2, cache)


def f(x):
    """ReLU output as a function of x ([0] drops the cache)."""
    return relu_forward(x)[0]


# Numerical gradient with respect to x from the provided utility function.
ngrad = eval_numerical_gradient_array(f, x, dout_2)

print(grad, 'this is my relu gradient')
print(ngrad, 'this is their gradient')

[[0.         0.46061718 0.         0.88956867]
 [0.94622545 0.88672816 0.66799915 0.19587757]
 [0.         0.         0.3898386  0.55176336]
 [0.52736307 0.27512982 0.         0.        ]
 [0.         0.         0.926261   0.42521301]] this is my relu gradient
[[0.         0.46061718 0.         0.88956867]
 [0.94622545 0.88672816 0.66799915 0.19587757]
 [0.         0.         0.3898386  0.55176336]
 [0.52736307 0.27512982 0.         0.        ]
 [0.         0.         0.926261   0.42521301]] this is their gradient


# Checking Softmax Gradient, Loss

In [4]:
# Gradient check for softmax_loss with respect to its input x.
# The loss is the quantity being differentiated, so the upstream gradient is 1.
# NOTE: this cell rebinds `dout` and `x` used by the earlier affine/ReLU cells.
dout = 1
x = np.random.normal(size=(N, D))
# np.random.randint excludes `high`, so high=D draws labels from 0..D-1.
# The previous high=D-1 could never pick the last column of x as a label
# (assuming softmax_loss treats each column of x as one class score).
y = np.random.randint(low=0, high=D, size=(N, ))

# softmax_loss returns (loss, dx); dx is the analytical gradient, so reuse it
# instead of calling softmax_loss a second time.
loss, dx = softmax_loss(x, y)
grad = dx

# Loss alone ([0]) as a function of x; y is forwarded by keyword below.
f = lambda x, y: softmax_loss(x, y)[0]
ngrad = eval_numerical_gradient_array(f, x, dout, y=y)

print(grad, 'this is my gradient')
print(ngrad, 'this is their gradient')

[[-0.90382577  0.14415576  0.75967   ]
 [-0.90413718  0.05454155  0.84959563]
 [ 0.65070121 -0.67233653  0.02163531]
 [-0.91532487  0.21933155  0.69599332]
 [-0.51555631  0.22196262  0.29359369]] this is my gradient
[[-0.90382577  0.14415576  0.75967   ]
 [-0.90413718  0.05454155  0.84959563]
 [ 0.65070121 -0.67233653  0.02163531]
 [-0.91532487  0.21933155  0.69599332]
 [-0.51555631  0.22196262  0.29359369]] this is their gradient


# Example of Training a Network

In [7]:
from data_utils import load_mds189
from solver import Solver
from classifiers.fc_net import FullyConnectedNet

# TODO: put the path to your 'hw6_mds189', which should contain a 'trainval' and 'test' directory
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where it exists.
path = "/Users/Oscar Ortega/Desktop/CS189/hw6/cs189_hw6/resources/hw6_mds189"

# OPTIONAL: set to True for debugging *only*. Reported results must use debug = False.
debug = False

# Load the dataset into train/validation features and labels.
feat_train, label_train, feat_val, label_val = load_mds189(path, debug)

# Two-Layer Network

In [8]:
# Bundle the loaded train/validation splits for the Solver.
data = {
    'X_train': feat_train,
    'y_train': label_train,
    'X_val': feat_val,
    'y_val': label_val,
}

# TODO: fill out the number of units in your hidden layers
# One entry per hidden layer.
hidden_dim = [150, 150]

# TODO: fill out the hyperparameters
hyperparams = {
    'lr_decay': 0.99,
    'num_epochs': 100,
    'batch_size': 100,
    'learning_rate': 1e-5,
    'weight_scale': 0.01,
}

# Build the network on 75-dimensional inputs with the chosen hidden sizes.
model = FullyConnectedNet(
    input_dim=75,
    hidden_dim=hidden_dim,
    weight_scale=hyperparams['weight_scale'],
)

# Train with plain SGD, forwarding the hyperparameters chosen above.
solver = Solver(
    model,
    data,
    update_rule='sgd',
    optim_config={'learning_rate': hyperparams['learning_rate']},
    lr_decay=hyperparams['lr_decay'],
    num_epochs=hyperparams['num_epochs'],
    batch_size=hyperparams['batch_size'],
    print_every=1000,
)
solver.train()

(Iteration 1 / 3600) loss: 220.758844
(Epoch 0 / 100) train acc: 0.148000; val_acc: 0.129167
(Epoch 1 / 100) train acc: 0.704000; val_acc: 0.678333
(Epoch 2 / 100) train acc: 0.689000; val_acc: 0.677500
(Epoch 3 / 100) train acc: 0.800000; val_acc: 0.795000
(Epoch 4 / 100) train acc: 0.651000; val_acc: 0.676667
(Epoch 5 / 100) train acc: 0.872000; val_acc: 0.840000
(Epoch 6 / 100) train acc: 0.906000; val_acc: 0.863333
(Epoch 7 / 100) train acc: 0.878000; val_acc: 0.845000
(Epoch 8 / 100) train acc: 0.904000; val_acc: 0.876667
(Epoch 9 / 100) train acc: 0.774000; val_acc: 0.780000
(Epoch 10 / 100) train acc: 0.571000; val_acc: 0.562500
(Epoch 11 / 100) train acc: 0.929000; val_acc: 0.883333
(Epoch 12 / 100) train acc: 0.935000; val_acc: 0.890833
(Epoch 13 / 100) train acc: 0.943000; val_acc: 0.896667
(Epoch 14 / 100) train acc: 0.953000; val_acc: 0.895833
(Epoch 15 / 100) train acc: 0.958000; val_acc: 0.900833
(Epoch 16 / 100) train acc: 0.747000; val_acc: 0.745000
(Epoch 17 / 100) tra

In [11]:
# Bundle the loaded train/validation splits for the Solver.
data = {
    'X_train': feat_train,
    'y_train': label_train,
    'X_val': feat_val,
    'y_val': label_val,
}

# TODO: fill out the number of units in your hidden layers
# One entry per hidden layer.
hidden_dim = [30, 20, 20]

# TODO: fill out the hyperparameters
hyperparams = {
    'lr_decay': 0.99,
    'num_epochs': 200,
    'batch_size': 100,
    'learning_rate': 1e-5,
}

# Build the network on 75-dimensional inputs; weight_scale is not passed here,
# so FullyConnectedNet's own default applies.
model = FullyConnectedNet(input_dim=75, hidden_dim=hidden_dim)

# Train with plain SGD, forwarding the hyperparameters chosen above.
solver = Solver(
    model,
    data,
    update_rule='sgd',
    optim_config={'learning_rate': hyperparams['learning_rate']},
    lr_decay=hyperparams['lr_decay'],
    num_epochs=hyperparams['num_epochs'],
    batch_size=hyperparams['batch_size'],
    print_every=1000,
)
solver.train()

(Iteration 1 / 7200) loss: 1030.872534
(Epoch 0 / 200) train acc: 0.194000; val_acc: 0.198333
(Epoch 1 / 200) train acc: 0.275000; val_acc: 0.291667
(Epoch 2 / 200) train acc: 0.421000; val_acc: 0.426667
(Epoch 3 / 200) train acc: 0.480000; val_acc: 0.453333
(Epoch 4 / 200) train acc: 0.537000; val_acc: 0.513333
(Epoch 5 / 200) train acc: 0.547000; val_acc: 0.520833
(Epoch 6 / 200) train acc: 0.543000; val_acc: 0.547500
(Epoch 7 / 200) train acc: 0.599000; val_acc: 0.575833
(Epoch 8 / 200) train acc: 0.589000; val_acc: 0.591667
(Epoch 9 / 200) train acc: 0.632000; val_acc: 0.607500
(Epoch 10 / 200) train acc: 0.619000; val_acc: 0.629167
(Epoch 11 / 200) train acc: 0.635000; val_acc: 0.616667
(Epoch 12 / 200) train acc: 0.650000; val_acc: 0.627500
(Epoch 13 / 200) train acc: 0.654000; val_acc: 0.625833
(Epoch 14 / 200) train acc: 0.681000; val_acc: 0.665000
(Epoch 15 / 200) train acc: 0.701000; val_acc: 0.651667
(Epoch 16 / 200) train acc: 0.670000; val_acc: 0.675000
(Epoch 17 / 200) tr

(Epoch 142 / 200) train acc: 0.959000; val_acc: 0.880833
(Epoch 143 / 200) train acc: 0.960000; val_acc: 0.881667
(Epoch 144 / 200) train acc: 0.968000; val_acc: 0.881667
(Epoch 145 / 200) train acc: 0.960000; val_acc: 0.882500
(Epoch 146 / 200) train acc: 0.960000; val_acc: 0.883333
(Epoch 147 / 200) train acc: 0.966000; val_acc: 0.882500
(Epoch 148 / 200) train acc: 0.960000; val_acc: 0.880833
(Epoch 149 / 200) train acc: 0.945000; val_acc: 0.875833
(Epoch 150 / 200) train acc: 0.969000; val_acc: 0.882500
(Epoch 151 / 200) train acc: 0.965000; val_acc: 0.883333
(Epoch 152 / 200) train acc: 0.966000; val_acc: 0.881667
(Epoch 153 / 200) train acc: 0.949000; val_acc: 0.882500
(Epoch 154 / 200) train acc: 0.964000; val_acc: 0.883333
(Epoch 155 / 200) train acc: 0.966000; val_acc: 0.882500
(Epoch 156 / 200) train acc: 0.950000; val_acc: 0.884167
(Epoch 157 / 200) train acc: 0.958000; val_acc: 0.883333
(Epoch 158 / 200) train acc: 0.964000; val_acc: 0.883333
(Epoch 159 / 200) train acc: 0.