In [4]:
from __future__ import print_function

import keras
from keras.datasets import mnist

import numpy as np
import pickle
import matplotlib.pyplot as plt #For plotting
np.random.seed(0) #For repeatability of the experiment
from sklearn import cross_validation
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import warnings 
warnings.filterwarnings("ignore")

In [5]:
# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [6]:
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print(x_train.shape, 'train samples')
print(x_test.shape, 'test samples')



(60000, 784) train samples
(10000, 784) test samples


In [7]:
print(y_train.shape, 'train lables')
print(y_test.shape, 'test lables')

(60000,) train lables
(10000,) test lables


In [14]:
# Feedforward neural net model
np.random.seed(0)
D = x_train.shape[1] #Number of features
K = max(y_train)+1 #Number of classes assuming class index starts from 0
# Start with an initialize parameters randomly
h = 100 # size of hidden layer
W = 0.05 * np.random.randn(D,h)
b = np.zeros((1,h))
W_cv=W
b_cv=b

W2 = 0.05 * np.random.randn(h,K)
b2 = np.zeros((1,K))
W2_cv=W2
b2_cv=b2

In [15]:
# gradient descent loop for relu
def grad_descent_relu(xtrain, xtest, ytrain, ytest,W,W2,b,b2,reg,step_size,num_examples,iterations):
    
    for i in range(iterations):

          # evaluate class scores, [N x K]
        hidden_layer = np.maximum(0, np.dot(xtrain, W) + b) # note, ReLU activation
        scores = np.dot(hidden_layer, W2) + b2

        # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

        # compute the loss: average cross-entropy loss and regularization
        corect_logprobs = -np.log(probs[range(num_examples),ytrain])
        data_loss = np.sum(corect_logprobs)/num_examples
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
#         if i % 500 == 0:
#              print("iteration %d: loss %f" % (i, loss))

        # compute the gradient on scores
        dscores = probs
        dscores[range(num_examples),ytrain] -= 1
        dscores /= num_examples

          # backpropate the gradient to the parameters
        # first backprop into parameters W2 and b2
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
        # next backprop into hidden layer
        dhidden = np.dot(dscores, W2.T)
        # backprop the ReLU non-linearity
        dhidden[hidden_layer <= 0] = 0
        # finally into W,b
        dW = np.dot(xtrain.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)

        # add regularization gradient contribution
        dW2 += reg * W2
        dW += reg * W

        # perform a parameter update
        W += -step_size * dW
        b += -step_size * db
        W2 += -step_size * dW2
        b2 += -step_size * db2


    hidden_layer = np.maximum(0, np.dot(xtest, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    Accuracy=np.mean(predicted_class == ytest)
#     print('Accuracy: %.2f' % Accuracy)
    
           
    return W, W2,b,b2,Accuracy
    


In [16]:
# Initial values from hyperparameter
reg = 1e-4 # regularization strength
#For simplicity we will take the batch size to be the same as number of examples
num_examples = x_train.shape[0]
#Initial value for the Gradient Descent Parameter
step_size = 1e-1 #Also called learning rate
iterations=1000
W,W2,b,b2,Test_Accuracy = grad_descent_relu(x_train, x_test, y_train, y_test,W,W2,b,b2,reg,step_size,num_examples,iterations)
print('Test Accuracy: %.2f' % Test_Accuracy)

Test Accuracy: 0.93


In [13]:
D = x_train.shape[1] #Number of features
h = 100
K = max(y_train)+1
reg = 1e-4 # regularization strength
#Initial value for the Gradient Descent Parameter
step_size = 1e-1
iterations=1000
kfold = KFold(3)
validation_accuracy=[]
factor=[0.001,0.01,0.05,0.5,1]
np.random.seed(0)
for item in factor:
    W = item*np.random.randn(D,h)
    b = np.zeros((1,h))
    W2= item * np.random.randn(h,K)
    b2=np.zeros((1,K))
    for k,(train,test) in enumerate(kfold.split(x_train,y_train)):
        w_cv=W
        B_cv=b
        B2_cv=b2
        w2_cv=W2
        num_examples=x_train[train].shape[0]
        w,w2,b,b2,accuracy=grad_descent_relu(x_train[train], x_train[test], y_train[train], y_train[test],w_cv,w2_cv,B_cv,B2_cv,reg,step_size,num_examples,iterations)
        validation_accuracy.append(accuracy)
        #print('Validation Accuracy for fold %d: %.2f' % (k, accuracy))
        #print('done processing for k: %d' % k)
    
    #Avg_validation_accuracy.append(np.mean(validation_accuracy))
    print(item)
    print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(validation_accuracy), np.std(validation_accuracy) * 2))

iteration 0: loss 2.301610
iteration 500: loss 0.341049
Accuracy: 0.92
iteration 0: loss 0.275764
iteration 500: loss 0.229687
Accuracy: 0.94
iteration 0: loss 0.213695


KeyboardInterrupt: 