In [1]:
# Extension to reload modules before cell execution
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

# sigmoid

We'll test the sigmoid function, and compare it to scipy's implementation to make sure we're getting the right return values.

In [10]:
from nnScript import sigmoid # our sigmoid
from scipy.special import expit # scipy's sigmoid

In [11]:
# these should be the same
print(sigmoid(1))
print(expit(1))

0.7310585786300049
0.7310585786300049


In [23]:
for i in np.linspace(-10,10,100):
    assert(sigmoid(i) == expit(i)) # this shouldn't fail

In [21]:
# for sanity check
try:
    assert(sigmoid(0.7) == expit(0.71)) # this should fail
except AssertionError as e:
    print("It Successfully Failed :D")

It Successfully Failed :D


In [26]:
# checking vector values
v = np.array([0.75, 0.3, -0.56, 0.01]) # random vector
print(sigmoid(v))
print(expit(v))
print("Are they equal?", sigmoid(v) == expit(v))

[0.6791787  0.57444252 0.36354746 0.50249998]
[0.6791787  0.57444252 0.36354746 0.50249998]
Are they equal? [ True  True  True  True]


# preprocess

Now we'll test the preprocess function

In [133]:
def preprocess():
    """ Input:
     Although this function doesn't have any input, you are required to load
     the MNIST data set from file 'mnist_all.mat'.

     Output:
     train_data: matrix of training set. Each row of train_data contains 
       feature vector of a image
     train_label: vector of label corresponding to each image in the training
       set
     validation_data: matrix of training set. Each row of validation_data 
       contains feature vector of a image
     validation_label: vector of label corresponding to each image in the 
       training set
     test_data: matrix of training set. Each row of test_data contains 
       feature vector of a image
     test_label: vector of label corresponding to each image in the testing
       set

     Some suggestions for preprocessing step:
     - feature selection"""

    mat = loadmat('mnist_all.mat')  # loads the MAT object as a Dictionary

    # Pick a reasonable size for validation data

    # ------------Initialize preprocess arrays----------------------#
    train_preprocess = np.zeros(shape=(50000, 784))
    validation_preprocess = np.zeros(shape=(10000, 784))
    test_preprocess = np.zeros(shape=(10000, 784))
    train_label_preprocess = np.zeros(shape=(50000,))
    validation_label_preprocess = np.zeros(shape=(10000,))
    test_label_preprocess = np.zeros(shape=(10000,))
    # ------------Initialize flag variables----------------------#
    train_len = 0
    validation_len = 0
    test_len = 0
    train_label_len = 0
    validation_label_len = 0
    # ------------Start to split the data set into 6 arrays-----------#
    for key in mat:
        # -----------when the set is training set--------------------#
        if "train" in key:
            label = key[-1]  # record the corresponding label
            tup = mat.get(key)
            sap = range(tup.shape[0])
            tup_perm = np.random.permutation(sap)
            tup_len = len(tup)  # get the length of current training set
            tag_len = tup_len - 1000  # defines the number of examples which will be added into the training set

            # ---------------------adding data to training set-------------------------#
            train_preprocess[train_len:train_len + tag_len] = tup[tup_perm[1000:], :]
            train_len += tag_len

            train_label_preprocess[train_label_len:train_label_len + tag_len] = label
            train_label_len += tag_len

            # ---------------------adding data to validation set-------------------------#
            validation_preprocess[validation_len:validation_len + 1000] = tup[tup_perm[0:1000], :]
            validation_len += 1000

            validation_label_preprocess[validation_label_len:validation_label_len + 1000] = label
            validation_label_len += 1000

            # ---------------------adding data to test set-------------------------#
        elif "test" in key:
            label = key[-1]
            tup = mat.get(key)
            sap = range(tup.shape[0])
            tup_perm = np.random.permutation(sap)
            tup_len = len(tup)
            test_label_preprocess[test_len:test_len + tup_len] = label
            test_preprocess[test_len:test_len + tup_len] = tup[tup_perm]
            test_len += tup_len
            # ---------------------Shuffle,double and normalize-------------------------#
    train_size = range(train_preprocess.shape[0])
    train_perm = np.random.permutation(train_size)
    train_data = train_preprocess[train_perm]
    train_data = np.double(train_data)
    train_data = train_data / 255.0
    train_label = train_label_preprocess[train_perm]

    validation_size = range(validation_preprocess.shape[0])
    vali_perm = np.random.permutation(validation_size)
    validation_data = validation_preprocess[vali_perm]
    validation_data = np.double(validation_data)
    validation_data = validation_data / 255.0
    validation_label = validation_label_preprocess[vali_perm]

    test_size = range(test_preprocess.shape[0])
    test_perm = np.random.permutation(test_size)
    test_data = test_preprocess[test_perm]
    test_data = np.double(test_data)
    test_data = test_data / 255.0
    test_label = test_label_preprocess[test_perm]

    # Feature selection
    # Your code here.

    print('preprocess done')

    return train_data, train_label, validation_data, validation_label, test_data, test_label

In [134]:
train_data, train_label, validation_data, validation_label, test_data, test_label = preprocess()

592
674
595
613
584
542
591
626
585
594
preprocess done


We've now loaded what would normally be the returned data of preprocess() to the above variables.  Now we want to figure out a way to check if a value is the same accross all rows for a given column.

Since the algorthm will only be trained on 'train_data', we should only have to test this on that set of data.

In [76]:
#https://stackoverflow.com/questions/14859458/how-to-check-if-all-values-in-the-columns-of-a-numpy-matrix-are-the-same
(train_data == train_data[0,:])

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [79]:
res = np.all(train_data == train_data[0,:], axis = 0)
print(len(res)) # 784 == 28 x 28
print(res)

784
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True False
 False False False False False False False False False False False False
 False False False False  True  True  True  True  True  True  True  True
 False False False False False False False False False False False False
 False False False False False False False False False False  True  True
  True  True  True  True False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True  True  True  True False False False False False
 False False False False False False False False False False False False
 False False False False False False False  True  True  True  True False
 False False False False False False False False False False False False
 False False False False False False False Fals

The above is a vector of size 784 indicating `True` if the column is the same for all rows and `False` otherwise.  So, any column that is `True` here is giving us the same value accross the every training example.

To make sure we're doing this correctly, we'll perform a similar check in a more intuitive, but disgusting inefficient way.  We see in the above example that the first entry is `True`, meaning that every row should share the same value (either `1` or `0`).

So, we'll loop through every example and check if an entry is the same in every row.

In [114]:
# sanity check

# 0th Entry is True
t0 = train_data[0][0]
for t in train_data:
    if (t0 != t[0]):
        print("Something's wrong") # shouldn't print
        
# 100th Entry is True
t0 = train_data[0][100]
for t in train_data:
    if (t0 != t[100]):
        print("This one's right") # should print
        break

This one's right


In [107]:
# To see the sum of these values
for i in range(784):
    s = 0
    for t in train_data:
        s += t[i]

    print(i, res[i], s)

0 True 0.0
1 True 0.0
2 True 0.0
3 True 0.0
4 True 0.0
5 True 0.0
6 True 0.0
7 True 0.0
8 True 0.0
9 True 0.0
10 True 0.0
11 True 0.0
12 True 0.0
13 True 0.0
14 True 0.0
15 True 0.0
16 True 0.0
17 True 0.0
18 True 0.0
19 True 0.0
20 True 0.0
21 True 0.0
22 True 0.0
23 True 0.0
24 True 0.0
25 True 0.0
26 True 0.0
27 True 0.0
28 True 0.0
29 True 0.0
30 True 0.0
31 True 0.0
32 True 0.0
33 True 0.0
34 True 0.0
35 False 0.37254901960784315
36 False 1.5176470588235293
37 False 1.0588235294117647
38 False 1.0235294117647058
39 False 1.3176470588235294
40 False 1.3607843137254902
41 False 1.3764705882352941
42 False 2.603921568627451
43 False 2.3568627450980393
44 False 3.384313725490196
45 False 4.349019607843138
46 False 2.0470588235294116
47 False 0.6509803921568628
48 False 0.043137254901960784
49 False 0.5764705882352941
50 False 0.7215686274509804
51 False 0.058823529411764705
52 True 0.0
53 True 0.0
54 True 0.0
55 True 0.0
56 True 0.0
57 True 0.0
58 True 0.0
59 True 0.0
60 False 0.60392

331 False 221.85882352941204
332 False 64.04705882352941
333 False 6.572549019607844
334 False 1.5529411764705885
335 True 0.0
336 False 0.2549019607843137
337 False 1.5999999999999999
338 False 6.047058823529412
339 False 17.33333333333333
340 False 62.121568627451026
341 False 185.83137254901976
342 False 438.2941176470599
343 False 827.5058823529433
344 False 1335.4549019607875
345 False 1757.7882352941226
346 False 1921.462745098048
347 False 1802.3803921568676
348 False 1600.6823529411797
349 False 1574.3843137254942
350 False 1754.192156862748
351 False 2002.3450980392251
352 False 2222.886274509815
353 False 2306.6823529411927
354 False 2098.772549019618
355 False 1633.9333333333395
356 False 1158.149019607847
357 False 758.4274509803947
358 False 458.945098039217
359 False 228.15686274509852
360 False 71.82745098039216
361 False 5.780392156862747
362 False 1.3333333333333335
363 True 0.0
364 False 0.12549019607843137
365 False 0.41960784313725485
366 False 2.1176470588235294
36

640 False 19.24313725490196
641 False 3.537254901960784
642 False 0.043137254901960784
643 True 0.0
644 True 0.0
645 True 0.0
646 False 0.8352941176470589
647 False 10.16470588235294
648 False 36.20392156862746
649 False 107.86274509803924
650 False 263.69411764705916
651 False 490.2784313725499
652 False 820.3215686274523
653 False 1169.6980392156902
654 False 1509.9921568627497
655 False 1797.2588235294193
656 False 1961.988235294128
657 False 1946.9843137254993
658 False 1773.239215686278
659 False 1468.6745098039255
660 False 1122.6823529411793
661 False 771.9568627451004
662 False 472.15294117647113
663 False 273.3333333333335
664 False 151.87450980392177
665 False 84.63921568627454
666 False 43.29411764705883
667 False 22.552941176470583
668 False 7.0705882352941165
669 False 1.8627450980392157
670 True 0.0
671 True 0.0
672 True 0.0
673 True 0.0
674 False 0.03529411764705882
675 False 1.525490196078431
676 False 12.51764705882353
677 False 35.43921568627453
678 False 95.184313725

So, the above function works to find these useless features.  We now only need to remove those features from the data sets and note the indices.

In [118]:
res = np.all(train_data == train_data[0,:], axis = 0)
removable_indices = np.where(res)
print(removable_indices) # indices of useless features

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  52,  53,  54,  55,
        56,  57,  58,  59,  82,  83,  84,  85,  86,  87, 110, 111, 112,
       113, 114, 139, 140, 141, 142, 167, 168, 196, 224, 251, 252, 280,
       308, 335, 363, 390, 391, 392, 393, 418, 419, 420, 421, 448, 449,
       476, 504, 532, 559, 560, 561, 587, 588, 589, 615, 616, 617, 643,
       644, 645, 670, 671, 672, 673, 699, 700, 701, 702, 703, 727, 728,
       729, 730, 731, 752, 753, 754, 755, 756, 757, 758, 759, 760, 779,
       780, 781, 782, 783]),)


In [127]:
clean_train_data = np.delete(train_data, removable_indices, 1)
print(train_data.shape)
print(clean_train_data.shape)

(50000, 784)
(50000, 663)


Notice how we removed columns from the matrix.

In [126]:
print(train_data.shape)
print(clean_train_data.shape)

(50000, 784)
(50000, 663)


In [130]:
res_train_data = np.all(train_data == train_data[0,:], axis = 0)
print(res_train_data)

res_clean_data = np.all(clean_train_data == clean_train_data[0,:], axis = 0)
print(res_clean_data)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True False
 False False False False False False False False False False False False
 False False False False  True  True  True  True  True  True  True  True
 False False False False False False False False False False False False
 False False False False False False False False False False  True  True
  True  True  True  True False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True  True  True  True False False False False False
 False False False False False False False False False False False False
 False False False False False False False  True  True  True  True False
 False False False False False False False False False False False False
 False False False False False False False False Fa

In [132]:
print(sum(res_train_data))
print(sum(res_clean_data)) #should be 0

121
0


Now we implement this into the code.

In [3]:
from nnScript import preprocess

In [7]:
train_data, train_label, validation_data, validation_label, test_data, test_label = preprocess()

Used Features (array([ 12,  13,  14,  15,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  58,  59,
        60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,
        73,  74,  75,  76,  77,  78,  79,  80,  81,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 142, 143, 144, 145,
       146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172,
       173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
       186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198,
       199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211,
       212, 213, 214, 215, 216, 217, 218, 219, 22

In [8]:
train_data.shape

(50000, 717)

In [6]:
res = np.all(train_data == train_data[0,:], axis = 0)
print(sum(res))
print(res)

0
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False 

And it seems to work.

# nnObjFunction

In [27]:
from nnScript import preprocess, sigmoid, initializeWeights
from scipy.optimize import minimize

In [28]:
train_data, train_label, validation_data, validation_label, test_data, test_label = preprocess()

#  Train Neural Network

# set the number of nodes in input unit (not including bias unit)
n_input = train_data.shape[1]

# set the number of nodes in hidden unit (not including bias unit)
n_hidden = 50

# set the number of nodes in output unit
n_class = 10

# initialize the weights into some random matrices
initial_w1 = initializeWeights(n_input, n_hidden)
initial_w2 = initializeWeights(n_hidden, n_class)

# unroll 2 weight matrices into single column vector
initialWeights = np.concatenate((initial_w1.flatten(), initial_w2.flatten()), 0)

# set the regularization hyper-parameter
lambdaval = 0

args = (n_input, n_hidden, n_class, train_data, train_label, lambdaval)

# Train Neural Network using fmin_cg or minimize from scipy,optimize module. Check documentation for a working example

opts = {'maxiter': 50}  # Preferred value.

#nn_params = minimize(nnObjFunction, initialWeights, jac=True, args=args, method='CG', options=opts)

Used Features (array([ 12,  13,  14,  15,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  58,  59,
        60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,
        73,  74,  75,  76,  77,  78,  79,  80,  81,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 142, 143, 144, 145,
       146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172,
       173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
       186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198,
       199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211,
       212, 213, 214, 215, 216, 217, 218, 219, 22

Implement the function

In [52]:
def nnObjFunction(params, *args):
    n_input, n_hidden, n_class, training_data, training_label, lambdaval = args

    w1 = params[0:n_hidden * (n_input + 1)].reshape((n_hidden, (n_input + 1)))
    w2 = params[(n_hidden * (n_input + 1)):].reshape((n_class, (n_hidden + 1)))
    obj_val = 0

    # Your code here
    
    # Set Bias
    np.append(np.array([[1,2,3], [1,1,1], [2,2,2]]), np.ones((3,1)), 1)
    
    b1 = np.ones((len(training_data), 1))
    b2 = np.ones((len(training_data), 1))

    # Forward Propagation
    X = np.append(training_data, b1, 1) # append bias
    net1 = X.dot(w1.T)
    o1 = sigmoid(net1)
    
    H = np.append(net1, b2, 1)
    net2 = H.dot(w2.T)
    o2 = sigmoid(net2)
    
    # 1-hot encoding
    y = np.zeros(o2.shape)
    y[np.arange(o2.shape[0]), train_label.astype(int)] = 1
    
    # Error
    E = (y*np.log(o2) + (np.ones(y.shape) - y)*np.log(np.ones(o2.shape) - o2))
    obj_val = -(np.sum(E) / len(training_data))
    
    # Gradients
    grad_w2 = o2*(o2-y)
    grad_w1 = np.zeros(w1.shape)
    
    # Make sure you reshape the gradient matrices to a 1D array. for instance if your gradient matrices are grad_w1 and grad_w2
    # you would use code similar to the one below to create a flat array
    obj_grad = np.concatenate((grad_w1.flatten(), grad_w2.flatten()),0)
    print(grad_w2.shape)
    print(grad_w1.shape)
    print(obj_grad.shape)
    #obj_grad = np.array([])

    return (obj_val, obj_grad)

In [53]:
nnObjFunction(initialWeights, *args)

(10, 51)
(50, 713)
(36160,)


(8.213019694139337, array([0. , 0. , 0. , ..., 0.1, 0.1, 0.1]))

In [54]:
print(initial_w2.shape)
print(initial_w1.shape)

(10, 51)
(50, 713)


Test it

In [55]:
nn_params = minimize(nnObjFunction, initialWeights, jac=True, args=args, method='CG', options=opts)

(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(36160,)
(10, 51)
(50, 713)
(

In [57]:
nn_params

     fun: 8.213019694139337
     jac: array([0. , 0. , 0. , ..., 0.1, 0.1, 0.1])
 message: 'Desired error not necessarily achieved due to precision loss.'
    nfev: 95
     nit: 0
    njev: 83
  status: 2
 success: False
       x: array([-0.07195695,  0.08258197,  0.07292078, ...,  0.04886368,
       -0.21037462,  0.29325883])

In [51]:
initialWeights

array([-0.07195695,  0.08258197,  0.07292078, ...,  0.04886368,
       -0.21037462,  0.29325883])