In [19]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt 
from sklearn.metrics import confusion_matrix
# Load the Kaggle digit-recognizer CSVs: the labelled training table and the
# competition's test table.
data = pd.read_csv(filepath_or_buffer='../input/digit-recognizer/train.csv')
data_test = pd.read_csv(filepath_or_buffer='../input/digit-recognizer/test.csv')

In [20]:
# Print the competition test table's dimensions first. A bare expression is
# only rendered by the notebook when it is the LAST statement of the cell, so
# `data.head()` has to come last for the preview to actually be displayed.
print(data_test.shape)
data.head()

In [21]:
# --- Split configuration (kept together so it is easy to tune) ---------------
SEED = 42            # fixes the global NumPy RNG so Restart & Run All is reproducible
DEV_SIZE = 1000      # number of shuffled rows held out for evaluation
TRAIN_START = 20500  # training uses rows TRAIN_START..m only (a subset of the non-held-out rows)

data = np.array(data)
m, n = data.shape  # m rows (one per example), n columns (label column + feature columns)
assert DEV_SIZE <= TRAIN_START < m, "train slice must not overlap the held-out rows"

# Seed before shuffling: the held-out/train split (and any later np.random
# draws, e.g. weight initialisation) are then identical from run to run.
np.random.seed(SEED)
np.random.shuffle(data)  # shuffles the rows in place

# Held-out split: the first DEV_SIZE shuffled rows. Transposed so that every
# column is one example. Note these come from train.csv, not from data_test.
data_dev = data[0:DEV_SIZE].T
Y_test = data_dev[0]      # first row after the transpose = labels
X_test = data_dev[1:n]    # remaining rows = features
X_test = X_test / 255     # scale raw values into [0, 1]

# Shape of everything that is not held out, then of the slice actually used.
print('1_DATASHAPEEEE')
print(data[DEV_SIZE:m].shape)
data_train = data[TRAIN_START:m].T
print('2_dataShape')
print(data_train.shape)

# Labels live in the first row of the transposed training slice.
Y_train = data_train[0]

# Everything after the label row is feature data; scale it like X_test.
X_train = data_train[1:n]
X_train = X_train / 255


In [22]:

def init(hidden_units_num, input_units_num=784, output_units_num=10):
    """Create randomly initialised parameters for a one-hidden-layer network.

    Every entry is drawn uniformly from [-0.5, 0.5).

    Args:
        hidden_units_num: number of units in the hidden layer.
        input_units_num: number of input features per example (default 784,
            the value that used to be hard-coded).
        output_units_num: number of output units / classes (default 10, the
            value that used to be hard-coded).

    Returns:
        Tuple ``(input_to_hidden_weight_matrix, b1,
        hidden_to_output_weight_matrix, b2)`` with shapes
        ``(hidden, input)``, ``(hidden, 1)``, ``(output, hidden)`` and
        ``(output, 1)``.
    """
    # The draw order (W1, b1, W2, b2) is unchanged, so a seeded call with the
    # default sizes produces exactly the same values as before.
    input_to_hidden_weight_matrix = np.random.rand(hidden_units_num, input_units_num) - 0.5
    b1 = np.random.rand(hidden_units_num, 1) - 0.5
    hidden_to_output_weight_matrix = np.random.rand(output_units_num, hidden_units_num) - 0.5
    b2 = np.random.rand(output_units_num, 1) - 0.5
    return input_to_hidden_weight_matrix, b1, hidden_to_output_weight_matrix, b2


def relu(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and 0."""
    rectified = np.maximum(Z, 0)
    return rectified


def softmax(Z):
    """Convert each column of ``Z`` into a probability vector.

    Normalisation runs along axis 0, so for a 2-D ``Z`` of shape
    ``(classes, examples)`` every column of the result sums to 1.

    Args:
        Z: array of raw scores (logits).

    Returns:
        Float array with the same shape as ``Z``.
    """
    Z = np.asarray(Z, dtype=float)
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (inf / inf = NaN)
    # when the logits are large.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)


def prop_forward(input_to_hidden_weight_matrix , b1, hidden_to_output_weight_matrix, b2, X, hidden_units_num):
    """Run one forward pass through the two-layer network.

    Args:
        input_to_hidden_weight_matrix, b1: weights and bias of the hidden layer.
        hidden_to_output_weight_matrix, b2: weights and bias of the output layer.
        X: input matrix that the hidden-layer weights are multiplied with.
        hidden_units_num: not used by this function; kept so existing callers
            keep working.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: hidden weighted sum, hidden activation
        (ReLU of Z1), output weighted sum, output activation (softmax of Z2).
    """
    # Hidden layer: weighted sum, then ReLU.
    hidden_weighted_sum = np.dot(input_to_hidden_weight_matrix, X) + b1
    hidden_activation = relu(hidden_weighted_sum)

    # Output layer: weighted sum of the hidden activations, then softmax.
    output_weighted_sum = np.dot(hidden_to_output_weight_matrix, hidden_activation) + b2
    output_activation = softmax(output_weighted_sum)

    return hidden_weighted_sum, hidden_activation, output_weighted_sum, output_activation


def deriv_relu(Z):
    """Derivative of ReLU used in back-propagation: True where ``Z`` > 0, else False."""
    is_positive = Z > 0
    return is_positive

#create binary vector
def one_hot(Y, num_classes=None):
    """One-hot encode integer labels, one column per example.

    Args:
        Y: array of non-negative integer class labels.
        num_classes: number of rows (classes) in the result. When omitted it
            is inferred as ``Y.max() + 1`` (the previous behaviour); pass it
            explicitly whenever the batch might not contain the highest class,
            otherwise the result would have too few rows.

    Returns:
        Float array of shape ``(num_classes, Y.size)`` holding a single 1 per
        column, in the row given by that example's label.
    """
    Y = np.asarray(Y)
    if num_classes is None:
        num_classes = int(Y.max()) + 1
    one_hot_Y = np.zeros((Y.size, num_classes))
    one_hot_Y[np.arange(Y.size), Y] = 1
    # Transpose so examples run along the columns.
    return one_hot_Y.T

#backward propagation;

def back_prop(Z1, input_layer, Z2, output_layer, input_to_hidden_weight_matrix , hidden_to_output_weight_matrix, X, Y):
    '''
    Compute the gradients of the softmax cross-entropy loss for one batch.

    args:
        Z1, Z2: weighted sums of the hidden and output layers (Z2 is not used
            here; kept for interface compatibility).
        input_layer: activations of the hidden layer (ReLU of Z1).
        output_layer: softmax output, shape (classes, batch).
        input_to_hidden_weight_matrix: not used here; kept for interface
            compatibility.
        hidden_to_output_weight_matrix: output-layer weights, used to push the
            error back to the hidden layer.
        X: batch inputs, shape (features, batch).
        Y: integer labels, one per example in the batch.
    retval: (dinput_to_hidden_weight_matrix, db1,
             dhidden_to_output_weight_matrix, db2); the bias gradients are
             column vectors with one entry per unit.
    '''
    Y = np.asarray(Y)
    # Average over the examples actually in this batch, not over a global
    # dataset size, so the gradient scale is correct for any X/Y passed in.
    batch_size = Y.size

    # One-hot targets shaped exactly like the network output. Sizing it from
    # output_layer (rather than from Y.max()) keeps the shapes aligned even
    # when the batch does not contain the highest class.
    one_hot_Y = np.zeros(output_layer.shape)
    one_hot_Y[Y, np.arange(batch_size)] = 1

    # Output layer error: predictions minus targets.
    dZ2 = output_layer - one_hot_Y
    dhidden_to_output_weight_matrix = dZ2.dot(input_layer.T) / batch_size
    # Sum over examples only (axis=1). Summing every element would give a
    # single scalar that is ~0 for softmax, so the bias would never learn.
    db2 = np.sum(dZ2, axis=1, keepdims=True) / batch_size

    # Hidden layer error: apply the output weights in reverse and gate by the
    # ReLU derivative (1 where Z1 > 0, else 0).
    dZ1 = hidden_to_output_weight_matrix.T.dot(dZ2) * (Z1 > 0)
    dinput_to_hidden_weight_matrix = dZ1.dot(X.T) / batch_size
    db1 = np.sum(dZ1, axis=1, keepdims=True) / batch_size

    return dinput_to_hidden_weight_matrix, db1, dhidden_to_output_weight_matrix, db2


def update(input_to_hidden_weight_matrix , b1, hidden_to_output_weight_matrix, b2, dinput_to_hidden_weight_matrix , db1, dhidden_to_output_weight_matrix, db2, learn_rate, momentum, prev_input_to_hidden_weight_matrix , prev_hidden_to_output_weight_matrix, is_1st_epoch):
    """Apply one gradient-descent step; weights use momentum, biases do not.

    The weight step is ``learn_rate * gradient`` and, except on the first
    epoch, additionally ``momentum * previous step``. The previous steps are
    not read at all when ``is_1st_epoch`` is true.

    Returns:
        Tuple of the updated ``(W1, b1, W2, b2)`` followed by the two weight
        steps just applied, which the caller feeds back in as the "prev"
        arguments on the next call.
    """
    # Plain gradient part of each weight step.
    step_hidden = learn_rate * dinput_to_hidden_weight_matrix
    step_output = learn_rate * dhidden_to_output_weight_matrix

    # Momentum part, skipped on the very first epoch.
    if not is_1st_epoch:
        step_hidden = step_hidden + (momentum * prev_input_to_hidden_weight_matrix)
        step_output = step_output + (momentum * prev_hidden_to_output_weight_matrix)

    new_hidden_weights = input_to_hidden_weight_matrix - step_hidden
    new_b1 = b1 - learn_rate * db1
    new_output_weights = hidden_to_output_weight_matrix - step_output
    new_b2 = b2 - learn_rate * db2

    return new_hidden_weights, new_b1, new_output_weights, new_b2, step_hidden, step_output

In [23]:

def get_predictions(output_layer):
    """Return, for every column of ``output_layer``, the row index of its largest value."""
    predicted_classes = np.argmax(output_layer, axis=0)
    return predicted_classes


def get_accuracy(predictions, Y):
    """Fraction of entries in ``predictions`` that equal the matching entry of ``Y``."""
    is_correct = predictions == Y
    correct_count = np.sum(is_correct)
    return correct_count / Y.size

def trainNN(X, Y, learn_rate, momentum, iterations, hidden_units_num,X_test,Y_test):
    '''
    Train the two-layer network with full-batch gradient descent (momentum on
    the weights) and report train/test accuracy per epoch.

    Flow of each epoch:
        forward propagate:
            input goes through input_to_hidden_weight_matrix
            hidden activations go through hid_to_output_matrix (second weight matrix)
        evaluate:
            accuracy on (X, Y) and, forward pass only, on (X_test, Y_test)
        back propagate:
            gradients for both weight matrices and biases, then one update step

    The test data is only ever used for forward passes; it never contributes
    to a gradient or a parameter update.

    args:
        X, Y: training inputs (one column per example) and integer labels.
        learn_rate, momentum: step size and momentum coefficient for update().
        iterations: number of epochs (full passes over X).
        hidden_units_num: size of the hidden layer.
        X_test, Y_test: held-out inputs and labels used for evaluation only.
    retval: the trained parameters
        (input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2).
    '''
    epochs = []
    train_acc_array = []
    acc_array = []

    input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2 = init(hidden_units_num)
    # Previous weight steps for the momentum term; zero before the first update.
    prev_input_to_hidden_weight_matrix = np.zeros_like(input_to_hidden_weight_matrix)
    prev_hid_to_output_matrix = np.zeros_like(hid_to_output_matrix)

    for i in range(iterations):
        Z1, input_layer, Z2, output_layer = prop_forward(
            input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2, X, hidden_units_num)

        # Both accuracies are measured with the same (pre-update) parameters
        # so the two curves are directly comparable.
        train_acc = get_accuracy(get_predictions(output_layer), Y)
        _, _, _, test_output = prop_forward(
            input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2, X_test, hidden_units_num)
        test_accs = get_accuracy(get_predictions(test_output), Y_test)

        print("epoch... ", i)
        print("train accuracy:", train_acc)
        print("test accuracy:", test_accs)
        epochs.append(i)
        train_acc_array.append(train_acc)
        acc_array.append(test_accs)

        # Learn from the training batch only.
        dinput_to_hidden_weight_matrix, db1, dhid_to_output_matrix, db2 = back_prop(
            Z1, input_layer, Z2, output_layer,
            input_to_hidden_weight_matrix, hid_to_output_matrix, X, Y)
        # update() skips the momentum term when told this is the first epoch.
        (input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2,
         prev_input_to_hidden_weight_matrix, prev_hid_to_output_matrix) = update(
            input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2,
            dinput_to_hidden_weight_matrix, db1, dhid_to_output_matrix, db2,
            learn_rate, momentum,
            prev_input_to_hidden_weight_matrix, prev_hid_to_output_matrix,
            i == 0)

    # Final evaluation with the fully trained parameters. Done outside the
    # loop so it is also defined when iterations == 0.
    _, _, _, test_output = prop_forward(
        input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2, X_test, hidden_units_num)
    test_predictions = get_predictions(test_output)

    print('ACCARRAY')
    print(acc_array)
    # sklearn expects (y_true, y_pred): rows are true labels, columns predictions.
    conf_mat = confusion_matrix(Y_test, test_predictions)
    print('TEST CONFUSION MATRIX')
    print(conf_mat)

    plt.plot(epochs, train_acc_array, label='train')
    plt.plot(epochs, acc_array, label='test')
    plt.title('accuracies over epochs')
    plt.xlabel('epochs')
    plt.ylabel('accuracies')
    plt.legend()
    plt.show()
    # Return the trained weights themselves, not the last momentum step.
    return input_to_hidden_weight_matrix, b1, hid_to_output_matrix, b2
    

In [24]:
# Train on the training split and evaluate against the held-out split.
# Keyword arguments spell out what each hyper-parameter is.
input_to_hidden_weight_matrix, b1, hidden_to_output_weight_matrix, b2 = trainNN(
    X=X_train,
    Y=Y_train,
    learn_rate=0.10,
    momentum=.5,
    iterations=50,
    hidden_units_num=100,
    X_test=X_test,
    Y_test=Y_test,
)