In [1]:
import numpy as np
import pandas as pd

In [2]:
class Network:
    
    def __init__(self, network_structure):
        #network_structure -> [2,3,4...5], each element represents the number of nodes
        # in the layer, and the length of the array is the number of layers
        self.number_of_layers = len(network_structure)
        self.biases = []
        self.weights = []
        #initial biases and weights are all zeros
        for num_of_nodes in network_structure[1:]:
            layer_bias = np.zeros((num_of_nodes,1))
            self.biases.append(layer_bias)
        
        for previous_layer_num_nodes, current_layer_num_nodes in zip(network_structure[:-1],network_structure[1:]):
            layer_weights = np.zeros((current_layer_num_nodes, previous_layer_num_nodes))
            self.weights.append(layer_weights)
        
    def gradient_descent(self, original_value, learning_rate, gradients):
        #this function returns the new value after gradient descent
        return [original_value - learning_rate*gradient/self.number_of_training_examples for gradient in gradients]
    
    def sigmoid(self, z):
        return 1/(1+np.exp(-z))
    
    def sigmoid_prime(self, z):
        return self.sigmoid(z)*(1-self.sigmoid(z))
    
    def layer_output(self, input_matrix, layer):
        layer_weights = self.weights[layer-1]
        layer_biases = self.biases[layer-1]
        z = (np.dot(layer_weights, input_matrix.transpose()) + layer_biases).transpose()
        output_matrix = self.sigmoid(z)
        return (z, output_matrix)
    
    def cost_function_derivative(self, predicted_output, output_matrix):
        return (predicted_output - output_matrix)
    
    def feed_forward(self, input_matrix):
        current_input = input_matrix
        zs = []
        activations = [np.array(current_input)]
        #Here we have feeded the input matrix into the network, computed all the activation values
        for layer in xrange(1, self.number_of_layers):
            (z, output) = self.layer_output(current_input, layer)
            activations.append(output)
            zs.append(z)
            current_input = output
        return (zs, activations)
    
    def back_propagation(self, input_matrix, output_matrix, zs, activations):
        #try np.zeros(self.biases.shape)
        delta_b = [np.zeros(bias.shape) for bias in self.biases]
        #try np.zeros(self.weights.shape)
        delta_w = [np.zeros(weights.shape) for weights in self.weights]
        delta = self.cost_function_derivative(activations[-1], output_matrix)*self.sigmoid_prime(zs[-1])
        
        for l in xrange(2,self.number_of_layers):
            #l represents the layers when we are coming back from output to input
            z = zs[-l]
            #weights(l-1) -> weights of (l-1) layer
            weights_l_1 = self.weights[-l+1]
            delta_l_1 = delta[-l+1]
            sigmoid_prime_z = self.sigmoid_prime(z)
            delta = np.dot(weights_l_1.transpose(), delta_l_1)*sigmoid_prime_z
            delta_b[-l] = delta
            delta_w[-l] = np.dot(activations[-l-1].transpose(), delta)
        
        return (delta_b,delta_w)
    
    def fit(self, train_data, learning_rate, epochs, mini_batch_size, test_data):
        
        self.epochs = epochs
        feature_train_matrix = train_data[0]
        label_train_matrix = train_data[1]  
        feature_test_matrix = test_data[0]
        label_test_matrix = test_data[1]
        self.number_of_training_examples = len(feature_train_matrix)
        self.epochs = epochs
            
        for epoch in xrange(self.epochs):
            mini_batch_inputs = [feature_train_matrix[k:k+mini_batch_size] for k in xrange(0,len(feature_train_matrix),mini_batch_size)]
            mini_batch_outputs = [label_train_matrix[k:k+mini_batch_size] for k in xrange(0, len(label_train_matrix), mini_batch_size)]
            
            for mini_batch_input, mini_batch_output in zip(mini_batch_inputs, mini_batch_outputs):
                zs, activations = self.feed_forward(mini_batch_input)
                delta_b, delta_w = self.back_propagation(mini_batch_input, mini_batch_output, zs, activations)
                self.biases = [self.gradient_descent(bias, learning_rate, db) for bias, db in zip(self.biases, delta_b)]
                for weight, dw in zip(self.weights, delta_w):
                    
                self.weights = [self.gradient_descent(weight, learning_rate, dw) for weight, dw in zip(self.weights, delta_w)]
            
            correctly_predicted = 0
            for index, o in enumerate(label_test_matrix):
                i = feature_test_matrix[index]
                predicted_output = self.feed_forward(i)
                predicted = True
                for p,a in zip(predicted_output, o):
                    if p != a:
                        predicted = False
                if predicted == True:
                    correctly_predicted = correctly_predicted + 1
                print("Epoch {0}: {1}/{2}").format(epoch, correctly_predicted,len(feature_train_matrix))

In [3]:
mnist = pd.read_csv('train.csv')

In [6]:
len(mnist)

42000

In [4]:
train_data = mnist[:40000]
test_data = mnist[40000:42000]
train_data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
train_data.tail()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
39995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39999,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def convert_to_one_hot(input_dataframe):
    labels = input_dataframe.iloc[:,:1].values.ravel()
    unique_values = np.unique(labels)
    output = []
    for label in labels:
        one_hot_vector = np.zeros((len(unique_values),1))
        one_hot_vector[label] = 1
        output.append(one_hot_vector)
    return output

In [7]:
nn_train_features = train_data.iloc[:,1:].values
nn_train_labels = convert_to_one_hot(train_data)
nn_test_features = test_data.iloc[:,1:].values
nn_test_labels = convert_to_one_hot(test_data)

In [8]:
nn_train = (nn_train_features, [label.ravel() for label in nn_train_labels])
nn_test = (nn_test_features, [label.ravel() for label in nn_test_labels])

In [9]:
nn_train_labels[0].ravel()

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [9]:
network = Network([784,30,10])
network.fit(nn_train, 0.01, 10, 200, nn_test)

(200, 10)


ValueError: operands could not be broadcast together with shapes (30,784) (30,) 