In [2]:
import numpy as np
import idx2numpy

In [3]:
# Read the image file and the label file with the same idx2numpy loader;
# the first path feeds `data`, the second feeds `labels`.
data, labels = (
    idx2numpy.convert_from_file(idx_path)
    for idx_path in ('data/train.idx3-ubyte', 'data/labels.idx1-ubyte')
)

In [4]:
# Sizes of the first three axes of `data`: B indexes samples, W and H are
# the two per-sample axes that get flattened below.
B = data.shape[0]
W = data.shape[1]
H = data.shape[2]

# Divide by the largest value found anywhere in the array (for non-negative
# input this maps values into [0, 1]), then flatten every sample into a
# single row of length W * H.
data = (data / data.max().item()).reshape(B, W * H)

In [5]:
def train_dev_split(data,label,train,dev):
    """Shuffle `data`/`label` together and cut them into train and dev sets.

    Rows are reordered with a random permutation, so every row appears at
    most once across the returned subsets and the train and dev subsets
    never share a row.

    Parameters
    ----------
    data : array indexed along axis 0 by sample.
    label : array indexed along axis 0 by sample, aligned with `data`.
    train : fraction of the rows assigned to the training subset.
    dev : fraction of the rows assigned to the dev subset.

    Returns
    -------
    (Xtr, Ytr, Xdev, Ydev) : training rows and labels, then dev rows and
    labels. Subset sizes are `int(n_rows * fraction)`, i.e. rounded down.
    """
    n_rows = data.shape[0]
    Ntr = int(n_rows * train)
    Ndev = int(n_rows * dev)

    # A permutation visits every index exactly once. Sampling indices with
    # replacement (e.g. randint) would duplicate some rows, drop others, and
    # let the same row land in both the train and the dev subset.
    ind = np.random.permutation(n_rows)
    data_shuffled = data[ind]
    label_shuffled = label[ind]

    Xtr = data_shuffled[:Ntr]
    Ytr = label_shuffled[:Ntr]
    Xdev = data_shuffled[Ntr:Ntr+Ndev]
    Ydev = label_shuffled[Ntr:Ntr+Ndev]
    return Xtr,Ytr,Xdev,Ydev
# 90% of the rows go to the training subset, 10% to the dev subset.
Xtr, Ytr, Xdev, Ydev = train_dev_split(data, labels, train=0.9, dev=0.1)
    

In [383]:
# Layer widths: two hidden layers of 100 units and a 10-unit output layer,
# fed by flattened inputs of length W*H.
number_neurons_layer1 = 100
number_neurons_layer2 = 100
number_neurons_layer3 = 10
number_inputs = W*H

# Weights use Glorot/Xavier uniform initialisation: each matrix is drawn from
# U(-limit, limit) with limit = sqrt(6 / (fan_in + fan_out)).
# Biases start at exactly zero; np.zeros states that directly instead of
# drawing random numbers and multiplying them by 0.
limit1 = np.sqrt(6/(number_inputs + number_neurons_layer1))
W1 = np.random.uniform(-limit1, limit1, size=(number_inputs,number_neurons_layer1))
b1 = np.zeros(number_neurons_layer1)

limit2 = np.sqrt(6 / (number_neurons_layer1 + number_neurons_layer2))
W2 = np.random.uniform(-limit2,limit2, size=(number_neurons_layer1,number_neurons_layer2))
b2 = np.zeros(number_neurons_layer2)

limit3 = np.sqrt(6 / (number_neurons_layer2 + number_neurons_layer3))
W3 = np.random.uniform(-limit3,limit3, size=(number_neurons_layer2,number_neurons_layer3))
b3 = np.zeros(number_neurons_layer3)

# Flat list in the order W1, b1, W2, b2, W3, b3.
parameters = [W1,b1,W2,b2,W3,b3]

In [384]:
# Hyperparameters do not change between iterations, so they are set once
# here instead of being reassigned on every pass through the loop.
batch_size = 128
lr = 1e-2
epsilon = 1e-9  # not read anywhere in this loop; kept in case other cells use it

for i in range(1000):
    #Mini-Batching: draw `batch_size` training rows uniformly, with replacement
    ind = np.random.randint(low=0,high=Xtr.shape[0],size=(batch_size,))
    mini_batch = Xtr[ind]
    correct_class_labels = Ytr[ind]
    batch_rows = np.arange(batch_size)
    if i == 0:
        print(f'Mini batch mean: {np.mean(mini_batch)} std: {np.std(mini_batch)}')

    #Forward pass: affine -> tanh -> affine -> tanh -> affine -> softmax
    layer1 = mini_batch @ W1 + b1
    if i % 100 == 0:
        print(f'Layer 1 mean: {np.mean(layer1)} std: {np.std(layer1)}')
    tanhlayer = np.tanh(layer1)
    layer2 = tanhlayer @ W2 + b2
    if i % 100 == 0:
        print(f'Layer 2 mean: {np.mean(layer2)} std: {np.std(layer2)}')
    tanhlayer2 = np.tanh(layer2)
    layer3 = tanhlayer2 @ W3 + b3
    if i % 100 == 0:
        print(f'Layer 3 mean: {np.mean(layer3)} std: {np.std(layer3)}')

    # Numerically stable softmax: subtracting each row's maximum leaves the
    # probabilities unchanged but keeps np.exp from overflowing.
    shifted_logits = layer3 - layer3.max(axis=1, keepdims=True)
    exp_logits = np.exp(shifted_logits)
    exp_sum = exp_logits.sum(axis=1, keepdims=True)
    softmax = exp_logits / exp_sum
    # Log-probabilities are computed straight from the logits so the loss
    # never takes log() of a probability that has underflowed to 0.
    log_softmax = shifted_logits - np.log(exp_sum)

    # One-hot targets, filled with a single vectorised assignment.
    one_hot = np.zeros_like(softmax)
    one_hot[batch_rows, correct_class_labels] = 1

    # Mean negative log-likelihood of the correct class.
    correct_logprobs = -log_softmax[batch_rows, correct_class_labels]
    cross_entropy_loss = np.mean(correct_logprobs)
    print(f'Loss = {cross_entropy_loss}')

    #Backward pass
    # Gradient of the mean cross-entropy with respect to the logits.
    dlayer3 = (softmax - one_hot) / batch_size
    dtanhlayer2 = dlayer3 @ W3.T #layer3 = tanhlayer2 @ W3 + b3
    dW3 = tanhlayer2.T @ dlayer3
    db3 = dlayer3.sum(0)
    dlayer2 = (1-tanhlayer2**2) * dtanhlayer2 #tanhlayer2 = np.tanh(layer2)
    dtanhlayer = dlayer2 @ W2.T #layer2 = tanhlayer @ W2 + b2
    dW2 = tanhlayer.T @ dlayer2
    db2 = dlayer2.sum(0)
    dlayer1 = (1-tanhlayer** 2) * dtanhlayer #tanhlayer = np.tanh(layer1)
    dW1 = mini_batch.T @ dlayer1  #layer1 = mini_batch @ W1 + b1
    db1 = dlayer1.sum(0)
    gradient_params = [dW1,db1,dW2,db2,dW3,db3]

    # Plain SGD step. `parameters` is updated as a list, then the individual
    # names are rebound so the next forward pass uses the new values.
    parameters[:] = [
        param - lr * grad
        for param, grad in zip(parameters, gradient_params)
    ]
    W1,b1,W2,b2,W3,b3 = parameters

Mini batch mean: 0.13374169980492195 std: 0.310971277240233
Layer 1 mean: -0.0030163679184008734 std: 0.4293240829955881
Layer 2 mean: 0.01593556605080713 std: 0.3702864400635133
Layer 3 mean: -0.1700615625251002 std: 0.5146610162819465
Loss = 2.4814417206961457
Loss = 2.4035057631897305
Loss = 2.4169340611114873
Loss = 2.520697950125048
Loss = 2.377714461737032
Loss = 2.417370707726943
Loss = 2.4446521576450486
Loss = 2.4213434684113913
Loss = 2.3340700456473864
Loss = 2.3726148224268293
Loss = 2.3393229658312933
Loss = 2.361924682467492
Loss = 2.403560246142213
Loss = 2.2846209015206487
Loss = 2.329555257248063
Loss = 2.347235192254936
Loss = 2.355646963774813
Loss = 2.3486701524819273
Loss = 2.3405225450445806
Loss = 2.340950413511094
Loss = 2.3872900023490873
Loss = 2.3572338775262454
Loss = 2.339902603554858
Loss = 2.274477187410847
Loss = 2.311858238273072
Loss = 2.366262323606522
Loss = 2.3369364401181585
Loss = 2.323490939245259
Loss = 2.345984062905555
Loss = 2.328937205754888

KeyboardInterrupt: 