In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
import numpy as np
import pandas as pd

class HyperParameters:
    def __init__(self, learning_rate=0.01, epochs=10, mini_batch_size=None, beta=.9, layers=None, beta1=.9, beta2=.998, lambd=0):
        if layers is None:
            layers = [10, 20, 10]
        self.layers = layers
        self.no_l = len(layers)
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = mini_batch_size
        self.beta = beta
        self.beta1 = beta1
        self.beta2 = beta2
        self.lambd = lambd

'''        
Class that is responsbile for initialising weights and biases. 
'''
class WeightAndBias: 
    def __init__(self, number_features, layers, initialisation_type="random"):
        
        self.initialisation_type = initialisation_type
        self.layers = [number_features] + layers
        self.weights = [pd.DataFrame()] + [np.random.randn(self.layers[i+1], self.layers[i]) * 0.01 for i in range(len(self.layers)-1)]
        self.biases = [pd.DataFrame()] + [np.zeros([self.layers[i+1], 1]) for i in range(len(self.layers)-1)]
                
    ''' 
    method to update learning parameters    
    '''
    def update_learning_parameters(self, no_l, hp_obj, dW, db, m_training) :
        for l in range(1, no_l+1):
            self.biases[l] =  self.biases[l] - hp_obj.learning_rate * db[l]
            self.weights[l] = (1 - (hp_obj.lambd *  hp_obj.learning_rate)/m_training) * self.weights[l] - hp_obj.learning_rate * dW[l]

'''            
ActivationFunctions that takes layers and list of activation functions to be used for each of the layers.
'''
class ActivationFunctions:
    def __init__(self, layers, activation_functions=None) :
        if activation_functions is None: 
            activation_functions= ['tanh'] * (len(layers) - 1) + ['softmax']
            
        self.activation_functions = [None] + [eval(f'ActivationFunctions.{activation_function}') 
                                     for activation_function in activation_functions]
        
        self.derivative_functions = [None] + [eval(f'ActivationFunctions.{activation_function}_derivative') 
                                     for activation_function in activation_functions]
        
    @staticmethod
    def sigmoid(z) :
        return 1 / (1 + np.exp( -z ))
    
    @staticmethod
    def relu(z) : 
        return np.where(z>0, z, 0.0001 * z )
    
    @staticmethod
    def tanh(z) :
        # return np.tanh(z
        z = np.clip(z, -20, 20)
        return (np.exp(z) - np.exp(-z))/ (np.exp(z) + np.exp(-z))
    
    @staticmethod
    def softmax(z):
        z = np.clip(z, -20, 20)
        return np.exp(z) / np.sum(np.exp(z), axis=0) 
    
    @staticmethod
    def softmax_derivative(y, a) :
        return a - y
    
    @staticmethod
    def sigmoid_derivative(y, a) :
        return a - y
    
    @staticmethod
    def tanh_derivative(z) :
        return (1 - np.tanh(z) ** 2)
    
    @staticmethod
    def relu_derivative(z) :
        return (z > 0) * 1
    
    @staticmethod
    def calculate_loss(a, y, m, hp, lp) :
            return (-1/m * np.sum(np.multiply(y, np.log(a))), 
                    -1/m * np.sum(np.multiply(y, np.log(a))) + hp.lambd/(2 *m ) * sum(np.sum(np.square(lp.weights[i]))
                                                                                    for i in range(1, hp.no_l+1)))
    
'''
NeuralNetwork class where the magic happens, Forward prop and Backprop happens.
'''
class NeuralNetwork: 
    def __init__(self, X_train, y_train, HyperParameters, activation_functions=None) :
        
        self.X_train, self.y_train = X_train, y_train
        self.n, self.m = X_train.shape
        
        print(f"number of training examples: {self.m}\nnumber of features: {self.n}"
              f"\nshape of y_train {self.y_train.shape}")

        #hp --> hyperparameters
        self.hp = HyperParameters 
        self.layers = self.hp.layers
        self.no_l = self.hp.no_l
        
        self.act_function_obj = ActivationFunctions(self.layers, activation_functions=activation_functions)
 
        #lp --> learning parameters -> weights and biases
        self.lp = WeightAndBias(self.n, self.layers) 
    
        if self.hp.batch_size is None:
            self.hp.batch_size = self.m
            

    def forward_propagation(self, X_batch) :
        self.Z, self.A = [0] + [None] * self.no_l, [X_batch ] + [None] * self.no_l
        activation_functions = self.act_function_obj.activation_functions

        for l in range(1, self.no_l + 1):
            self.Z[l] = np.dot(self.lp.weights[l], self.A[l-1]) + self.lp.biases[l]
            self.A[l] = activation_functions[l](self.Z[l])      

    def back_propagation(self, y_batch) :

        derivative_functions = self.act_function_obj.derivative_functions
        batch_size = y_batch.shape[1]
        
        self.dZ =[None] +  [None] * self.no_l
        self.dW =[None] +  [None] * self.no_l
        self.db =[None] +  [None] * self.no_l

        self.dZ[self.no_l] = derivative_functions[self.no_l](y_batch, self.A[self.no_l])
        self.dW[self.no_l] = 1/batch_size * np.dot(self.dZ[self.no_l] , self.A[self.no_l - 1].T)
        self.db[self.no_l] = 1/batch_size * np.sum(self.dZ[self.no_l], axis=1, keepdims=True)

        assert self.dZ[self.no_l].shape == self.Z[self.no_l].shape
        assert self.db[self.no_l].shape == self.lp.biases[self.no_l].shape        
        assert self.dW[self.no_l].shape == self.lp.weights[self.no_l].shape

        for l in range(self.no_l - 1, 0, -1) : 

            self.dZ[l] = np.dot(self.lp.weights[l+1].T, self.dZ[l+1] )* derivative_functions[l](self.Z[l])
            self.dW[l] = 1/batch_size * np.dot(self.dZ[l], self.A[l-1].T)
            self.db[l] = 1/batch_size * np.sum(self.dZ[l], axis=1, keepdims=True)

            assert self.dZ[l].shape == self.Z[l].shape
            assert self.dW[l].shape == self.lp.weights[l].shape
            assert self.db[l].shape == self.lp.biases[l].shape  
            
            
    def train_nn(self, verbose=False, per_epoch_log=100) :
        for epoch in range(self.hp.epochs): 
            for batch_s in range(0, self.m, self.hp.batch_size) :
                
                batch_e = min(batch_s + self.hp.batch_size, self.m)
                
                X_batch = self.X_train[:, batch_s: batch_e]
                y_batch = self.y_train[:, batch_s: batch_e]
                m_batch_size = batch_e - batch_s

                self.forward_propagation(X_batch)
                self.back_propagation(y_batch)
                self.lp.update_learning_parameters(self.no_l, self.hp,  self.dW, self.db, m_batch_size)

            if verbose and epoch % per_epoch_log == 0: 
                print(f"epochs {epoch} loss: ",ActivationFunctions.calculate_loss(self.A[self.no_l], y_batch, m_batch_size, self.hp,  
                                                                                  self.lp))

    def predict(self, X_test):
        self.forward_propagation(X_test)
        preds=  self.A[self.no_l].T
        return (preds == preds.max(axis=1)[:,None]).astype(int)

def one_hot_encoding_y(train_data) :
    a = train_data.label
    b = np.zeros((a.size, 10))
    b[np.arange(a.size),a] = 1
    return b

In [7]:
train_data = pd.read_csv('train.csv')

In [8]:
m = train_data.shape[0]
X = train_data.drop('label', axis=1).iloc[0:m].to_numpy() / 255
y  = one_hot_encoding_y(train_data)[:m]
y = np.reshape(y, (m, 10))

layers=[256, 10]
activation_functions = ['relu'] * (len(layers) - 1) + ['softmax']
hp = HyperParameters(layers=layers, learning_rate=.5, epochs=500, mini_batch_size=2048, lambd=.1)

nn = NeuralNetwork(X.T, y.T, hp, activation_functions=activation_functions)

nn.train_nn( verbose=True, per_epoch_log=10)

number of training examples: 42000
number of features: 784
shape of y_train (10, 42000)
epochs 0 loss:  (1.183773680744079, 1.185490161822964)
epochs 10 loss:  (0.21547382936325504, 0.21985174022926463)
epochs 20 loss:  (0.1408202103371239, 0.14668417831298255)
epochs 30 loss:  (0.0982197614891936, 0.10538648778049989)
epochs 40 loss:  (0.07105613454820482, 0.07938684893561114)
epochs 50 loss:  (0.05317434554385173, 0.06256163777776504)
epochs 60 loss:  (0.0413090525415835, 0.05166543438159734)
epochs 70 loss:  (0.032918313007260794, 0.04417391137296511)
epochs 80 loss:  (0.026738097130996993, 0.03883247257436905)
epochs 90 loss:  (0.022206989611361187, 0.03508689923889535)
epochs 100 loss:  (0.018844447584489384, 0.03246152583234133)
epochs 110 loss:  (0.016319240052911053, 0.030629577854840946)
epochs 120 loss:  (0.014385596010992657, 0.02934818896295907)
epochs 130 loss:  (0.012804411360465742, 0.028380596765228937)
epochs 140 loss:  (0.011545497639632247, 0.027698876140105536)
epoc

In [9]:
prob_preds = lambda preds: (preds == preds.max(axis=1)[:,None]).astype(int)
preds = nn.predict(X.T)
r = np.sum(np.argmax(y, axis=1) == np.argmax(preds, axis=1))
w = np.sum(np.argmax(y, axis=1) != np.argmax(preds, axis=1))
print(f"total number of examples: {m}\nnumber of right predictions: {r}\nnumber of wrong predictions: {w}\n"
          f"accuracy on train: {r/m * 100}%")

total number of examples: 42000
number of right predictions: 42000
number of wrong predictions: 0
accuracy on train: 100.0%


In [10]:
test_data = pd.read_csv('test.csv')

In [11]:
X_test = test_data.to_numpy() / 255
preds = nn.predict(X_test.T)
preds = np.argmax(preds, axis=1)

In [12]:
sub_df = pd.DataFrame(preds, columns=['Label'])
sub_df.index.name= 'ImageId'
sub_df.index = sub_df.index + 1
sub_df.reset_index().to_csv('mnsit_submission.csv',index=False)