In [119]:
import numpy as np
import pandas as pd

In [284]:
class Network:
    
    def __init__(self, network_structure):
        #network_structure -> [2,3,4...5], each element represents the number of nodes
        # in the layer, and the length of the array is the number of layers
        self.number_of_layers = len(network_structure)
        self.biases = []
        self.weights = []
        #initial biases and weights are all zeros
        for num_of_nodes in network_structure[1:]:
            layer_bias = np.zeros((num_of_nodes,1))
            self.biases.append(layer_bias)
        self.biases = np.array(self.biases)
        
        for previous_layer_num_nodes, current_layer_num_nodes in zip(network_structure[:-1],network_structure[1:]):
            layer_weights = np.zeros((current_layer_num_nodes, previous_layer_num_nodes))
            self.weights.append(layer_weights)
        self.weights = np.array(self.weights)
        
    def gradient_descent(self, original_value, learning_rate, gradients):
        #this function returns the new value after gradient descent
        output = original_value
        for index, gradient in enumerate(gradients):
            print("Gradient descent index: {0}").format(index)
            output = output - learning_rate*gradient/self.number_of_training_examples
        return 
    
    def sigmoid(self, z):
        return 1/(1+np.exp(-z))
    
    def sigmoid_prime(self, z):
        return self.sigmoid(z)*(1-self.sigmoid(z))
    
    def layer_output(self, input_matrix, layer):
        layer_weights = self.weights[layer-1]
        layer_biases = self.biases[layer-1]
        z = (np.dot(layer_weights, input_matrix.transpose()) + layer_biases).transpose()
        print("Layer output Input shape: {0}").format(input_matrix.shape)
        print("Layer Output Input Transpose shape: {0}").format(input_matrix.transpose().shape)
        print("Layer Output Weights shape: {0}").format(layer_weights.shape)
        print("Layer Output Z shape: {0}").format(z.shape)
        output_matrix = self.sigmoid(z)
        print("Layer Output Output shape {0}").format(output_matrix.shape)
        return (np.array(z), np.array(output_matrix))
    
    def cost_function_derivative(self, predicted_output, output_matrix):
        return (predicted_output - output_matrix)
    
    def feed_forward(self, input_matrix):
        current_input = input_matrix
        zs = []
        activations = [current_input]
        print("Feed forward Initial Activations: {0}").format(activations)
        #Here we have feeded the input matrix into the network, computed all the activation values
        for layer in xrange(1, self.number_of_layers):
            (z, output) = self.layer_output(current_input, layer)
            print("Feed forward Output shape: {0}").format(output.shape)
            activations.append(output)
            print("Feed forward New activation: {0}").format(output)
            print("Feed forward Activations: {0}").format(activations)
            zs.append(z)
            current_input = output
            
        #print("Feed forward Activations shape: {0}").format(activations.shape)
        return (zs, activations)
    
    def back_propagation(self, input_matrix, output_matrix, zs, activations):
        #try np.zeros(self.biases.shape)
        delta_b = [np.zeros(self.biases.shape) for bias in self.biases]
        #try np.zeros(self.weights.shape)
        delta_w = [np.zeros(self.weights.shape) for weights in self.weights]
        delta = self.cost_function_derivative(activations[-1], output_matrix) * self.sigmoid_prime(zs[-1])
        
        for l in xrange(2,self.number_of_layers):
            #l represents the layers when we are coming back from output to input
            z = zs[-l]
            #weights(l-1) -> weights of (l-1) layer
            weights_l_1 = self.weights[-l+1]
            delta_l_1 = delta[-l+1]
            sigmoid_prime_z = self.sigmoid_prime(z)
            delta = np.dot(weights_l_1.transpose(), delta_l_1)*sigmoid_prime_z
            delta_b[-l] = delta
            #print("Back Propagation Activations shape: {0}, Delta Shape: {1}").format(activations.transpose().shape, delta.shape)
            delta_w[-l] = np.dot(activations[-l-1].transpose(), delta)
        
        #print("Back Propagation Shape DW: {0}").format(delta_w.shape)
        return (delta_b,delta_w)
    
    def fit(self, train_data, learning_rate, epochs, mini_batch_size, test_data):
        
        self.epochs = epochs
        feature_train_matrix = train_data[0]
        label_train_matrix = train_data[1]  
        feature_test_matrix = test_data[0]
        label_test_matrix = test_data[1]
        self.number_of_training_examples = len(feature_train_matrix)
        self.epochs = epochs
            
        for epoch in xrange(self.epochs):
            mini_batch_inputs = [feature_train_matrix[k:k+mini_batch_size] for k in xrange(0,len(feature_train_matrix),mini_batch_size)]
            mini_batch_outputs = [label_train_matrix[k:k+mini_batch_size] for k in xrange(0, len(label_train_matrix), mini_batch_size)]
            
            for mini_batch_input, mini_batch_output in zip(mini_batch_inputs, mini_batch_outputs):
                zs, activations = self.feed_forward(mini_batch_input)
                delta_b, delta_w = self.back_propagation(mini_batch_input, mini_batch_output, zs, activations)
                self.biases = [self.gradient_descent(bias, learning_rate, db) for db, bias in zip(delta_b, self.biases)]
                self.weights = [self.gradient_descent(weight, learning_rate, dw) for dw, weight in zip(delta_w, self.weights)]
            
            correctly_predicted = 0
            for index, o in enumerate(label_test_matrix):
                i = feature_test_matrix[index]
                predicted_output = self.feed_forward(i)
                predicted = True
                for p,a in zip(predicted_output, o):
                    if p != a:
                        predicted = False
                if predicted == True:
                    correctly_predicted = correctly_predicted + 1
                print("Epoch {0}: {1}/{2}").format(epoch, correctly_predicted,len(feature_train_matrix))

In [96]:
mnist = pd.read_csv('train.csv')

In [98]:
len(mnist)

42000

In [100]:
train_data = mnist[:40000]
test_data = mnist[40000:42000]
train_data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
train_data.tail()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
39995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39999,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
def convert_to_one_hot(input_dataframe):
    labels = input_dataframe.iloc[:,:1].values.ravel()
    unique_values = np.unique(labels)
    output = []
    for label in labels:
        one_hot_vector = np.zeros((len(unique_values),1))
        one_hot_vector[label] = 1
        output.append(one_hot_vector)
    return output

In [256]:
nn_train_features = train_data.iloc[:,1:].values
nn_train_labels = convert_to_one_hot(train_data)
nn_test_features = test_data.iloc[:,1:].values
nn_test_labels = convert_to_one_hot(test_data)

In [257]:
nn_train = (nn_train_features, [label.ravel() for label in nn_train_labels])
nn_test = (nn_test_features, [label.ravel() for label in nn_test_labels])

In [258]:
nn_train_labels[0].ravel()

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [285]:
network = Network([784,30,10])
network.fit(nn_train, 0.01, 10, 200, nn_test)

Feed forward Initial Activations: [array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])]
Layer output Input shape: (200, 784)
Layer Output Input Transpose shape: (784, 200)
Layer Output Weights shape: (30, 784)
Layer Output Z shape: (200, 30)
Layer Output Output shape (200, 30)
Feed forward Output shape: (200, 30)
Feed forward New activation: [[ 0.5  0.5  0.5 ...,  0.5  0.5  0.5]
 [ 0.5  0.5  0.5 ...,  0.5  0.5  0.5]
 [ 0.5  0.5  0.5 ...,  0.5  0.5  0.5]
 ..., 
 [ 0.5  0.5  0.5 ...,  0.5  0.5  0.5]
 [ 0.5  0.5  0.5 ...,  0.5  0.5  0.5]
 [ 0.5  0.5  0.5 ...,  0.5  0.5  0.5]]
Feed forward Activations: [array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), array([[ 0.5,  0.5,  0.5, ...,  

ValueError: operands could not be broadcast together with shapes (30,784) (30,) 