## Implement basic fully connect neural network
### The usage step:
1. create a network by calling network = fullyConnect(layers, batch_size, learning_rate)
    - layers: a list of number of neurons per layers, e.g [2,3,2,1] means 4 layers network, train data has 2 features, 1st hidden layer has 3 neurons, 2nd hidden layer has 2 neurons, and output layer has 1 neuron
    - batch_size
    - learning_rate
2. train the network by calling network.train(train_data, labels, steps)
3. predict with input data by calling network.predict(input_data)

### Tips and findings for training a network to solve XOR problem:
  - batch_size is an important hyper parameter, only when batch_size is 1 or 2 works
  - when trains the network, if picks the train data sequently, the gradient will saturate in early stage, therefore the training fails. So training data has to be picked randomly for training steps

In [1]:
import numpy as np

In [2]:
# not used any more
def softmax(x):
    x = np.exp(x)
    exp_sum = np.sum(x, axis = 1, keepdims = True)
    return x/exp_sum
    

In [6]:
class fullyConnect():
    def __init__(self, layers, batch_size, learning_rate):
        
        #self.activation = self.__tanh
        #self.derative = self.__tanh_derative
        self.activation = self.__relu
        self.derative = self.__relu_derative
        self.output_derative = self.__sigmoid_derative
        self.output_func = self.__sigmoid
        #self.output_func = self.__tanh
        #self.output_derative = self.__tanh_derative
        self.layers = layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        
        # Set the seed for the random number generator
        # Ensures same random numbers are produced every time the program is run
        np.random.seed(42)
        self.A = []
        self.Z = []
        self.W = [0]
        self.B = [0]

        # Initialize the parameter W and B for each layer
        for i in range(len(layers)-1):
            weights = np.random.rand(layers[i+1], layers[i])*2-1
            bias = np.zeros((layers[i+1], 1))
            self.W.append(weights)
            self.B.append(bias)
            
    def __relu(self,x):
        x[x<0] = 0
        return x
    def __relu_derative(self, x):
        y = x
        y[y>0] = 1
        y[y<=0] = 0
        return y
    def __tanh(self,x):
        return np.tanh(x)
    
    def __tanh_derative(self, x):      
        return 1.0 - self.__tanh(x)**2
    
    def __sigmoid(self,x):
        y = np.exp(-x)
        one = np.ones(x.shape)
        return 1/(one + y)    
    
    def __sigmoid_derative(self,x):
        sig_value = self.__sigmoid(x)
        result = sig_value*(1-sig_value)
        return result
    
    def __cost_func(self,a, y):        

        #loss = -(y*np.transpose(np.log(a))+(1-y)*np.transpose(np.log(1-a)))
        
        loss_1 = np.dot(y, np.transpose(np.log(a)))        
        loss_2 = np.dot(1-y, np.transpose(np.log(1-a)))
        loss = - (loss_1+loss_2) 
        #loss = np.sum((y-a)*(y-a))/2 #this loss function also works
        #loss = a-y
        cost = loss/self.batch_size
        return cost
    
    def __cost_derative(self, A, Y):
        dA= np.sum((1-Y)/(1-A)-Y/A)/Y.shape[1]
        #dA = np.sum((A-Y))/Y.shape[1] #this loss function also works
        
        return dA
    
    def propogate(self, X, mode,Y=None):
        self.A=[]
        self.A.append(X)
        last_layer = len(self.layers)
        for i in range(last_layer-1):
            z_value = np.dot(self.W[i+1], self.A[i])+self.B[i+1]
            self.Z.append(z_value)
            if i==last_layer-2:
                self.A.append(self.output_func(z_value))
            else:
                self.A.append(self.activation(z_value))
        if mode==1:
            cost = self.__cost_func(self.A[-1], Y)
            return cost
        else:
            return self.A[-1]

    def back_propogate(self, Y):

        last_layer = len(self.layers) - 1 #layer from 1 to last_layer   

        dA= self.__cost_derative(self.A[-1],Y)
        
        for i in reversed(range(1,len(self.layers))):
            if i== last_layer:
                dZ = dA*self.output_derative(self.Z[i])
            else:
                dZ = dA*self.derative(self.Z[i])        
            dW = np.dot(dZ,np.transpose(self.A[i-1]))/self.batch_size  
            dB= np.sum(dZ, axis=1, keepdims = True)/self.batch_size               
            dA = np.dot(np.transpose(self.W[i]), dZ)
            self.W[i] = self.W[i] - self.learning_rate*dW
            
            self.B[i] = self.B[i] - self.learning_rate*dB
        



    def train(self, X, Y, steps):
        # X.shape=(dimension of one single sample data, batch_size)
        # Y.shape: (dimension of output data, batch_size)
        self.A = []
        self.Z = []
        self.Z.append(X)
        
        start = 0

        for step in range(steps):      
            # Random pick one of batch can reduce the gradient saturation
            start = np.random.randint(X.shape[1],high=None)
           
            if (start + batch_size) <= X.shape[1]:
                data_set = X[:,start:start+self.batch_size]
                label = Y[:, start:start+self.batch_size]
                start = (start + self.batch_size) % X.shape[1]

            else:
                data_set = np.hstack((X[:, start: X.shape[1]], X[:, 0:self.batch_size-(X.shape[1]-start)]))
                label = np.hstack((Y[:, start:X.shape[1]], Y[:, 0:self.batch_size-(X.shape[1]-start)]))
                start = (self.batch_size-(X.shape[1]-start)) % X.shape[1]
            
            cost = self.propogate(data_set,1,label)

            
            if step < steps -1:
                self.back_propogate(label)
            if (step % 1000 == 0):    
                print("step: ", step, "cost: ",cost)            
        self.A = []
        self.Z = []
            
        return cost

    def predict(self, test_data):
        result = self.propogate(test_data, 0, Y=None)
        return result



In [20]:
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
Y = np.array([[0, 1, 1, 0]]).T
Y = Y.reshape(1,X.shape[1])

layers = [X.shape[0],3,2,1]  
batch_size = 1
learning_rate = 0.2
steps = 22000

network = fullyConnect(layers, batch_size, learning_rate)
cost = network.train(X, Y, steps)


step:  0 cost:  [[0.69314718]]
step:  1000 cost:  [[0.37196612]]
step:  2000 cost:  [[0.53218323]]
step:  3000 cost:  [[0.01933986]]
step:  4000 cost:  [[0.13162396]]
step:  5000 cost:  [[0.25839508]]
step:  6000 cost:  [[0.0455034]]
step:  7000 cost:  [[0.10264321]]
step:  8000 cost:  [[0.06950828]]
step:  9000 cost:  [[0.09878183]]
step:  10000 cost:  [[0.04611934]]
step:  11000 cost:  [[1.55380216e-06]]
step:  12000 cost:  [[0.06639188]]
step:  13000 cost:  [[6.04676326e-08]]
step:  14000 cost:  [[2.62396108e-08]]
step:  15000 cost:  [[3.47378584e-08]]
step:  16000 cost:  [[0.04544249]]
step:  17000 cost:  [[0.00164739]]
step:  18000 cost:  [[0.00026714]]
step:  19000 cost:  [[0.00045645]]
step:  20000 cost:  [[2.75316214e-10]]
step:  21000 cost:  [[2.57296406e-11]]


In [21]:
input=np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
output = network.predict(input)
print("input: ", input)
print("output: ", output)
print("W: ", network.W)
print("B:", network.B)

input:  [[0 0 1 1]
 [0 1 0 1]]
output:  [[0.00361658 0.99691532 1.         0.00361658]]
W:  [0, array([[-0.25091976,  0.90142861],
       [ 0.46398788,  0.19731697],
       [-0.68796272, -0.68801096]]), array([[-0.88383278,  0.73235229,  0.20223002],
       [ 0.41614516, -0.95883101,  0.9398197 ]]), array([[95.40497891, 61.29580576]])]
B: [0, array([[0.],
       [0.],
       [0.]]), array([[0.],
       [0.]]), array([[-5.61860446]])]


In [22]:
layers = [X.shape[0],2,2,1]  # after reducing the number of neurons on 1st hidden layer from 3 to 2, it doesn't work well
batch_size = 1
learning_rate = 0.2
steps = 22000

network = fullyConnect(layers, batch_size, learning_rate)
cost = network.train(X, Y, steps)
input=np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
output = network.predict(input)
print("input: ", input)
print("output: ", output)
print("W: ", network.W)
print("B:", network.B)

step:  0 cost:  [[0.69314718]]
step:  1000 cost:  [[0.24069847]]
step:  2000 cost:  [[0.0455723]]
step:  3000 cost:  [[0.96778194]]
step:  4000 cost:  [[0.4629813]]
step:  5000 cost:  [[0.00086015]]
step:  6000 cost:  [[0.42312116]]
step:  7000 cost:  [[0.55206004]]
step:  8000 cost:  [[0.50420067]]
step:  9000 cost:  [[1.15818855e-06]]
step:  10000 cost:  [[0.89142695]]
step:  11000 cost:  [[1.01492483e-07]]
step:  12000 cost:  [[2.65838803e-08]]
step:  13000 cost:  [[6.78885772e-09]]
step:  14000 cost:  [[1.28765465e-09]]
step:  15000 cost:  [[0.4059906]]
step:  16000 cost:  [[0.56052839]]
step:  17000 cost:  [[0.35030652]]
step:  18000 cost:  [[1.57259486]]
step:  19000 cost:  [[7.01438907e-13]]
step:  20000 cost:  [[2.8399505e-13]]
step:  21000 cost:  [[6.30606678e-14]]
input:  [[0 0 1 1]
 [0 1 0 1]]
output:  [[0.56440166 0.56440166 1.         0.56440166]]
W:  [0, array([[-0.25091976,  0.90142861],
       [ 0.46398788,  0.19731697]]), array([[-0.68796272, -0.68801096],
       [-0.8

In [29]:
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
Y = np.array([[0, 1, 1, 0]]).T
Y = Y.reshape(1,X.shape[1])

layers = [X.shape[0],3,2,1]  
batch_size = 3 # increase bathc_size to 3 or 4, it doesn't work at all
learning_rate = 0.2
steps = 8000

network = fullyConnect(layers, batch_size, learning_rate)
cost = network.train(X, Y, steps)
input=np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
output = network.predict(input)
print("input: ", input)
print("output: ", output)
print("W: ", network.W)
print("B:", network.B)

step:  0 cost:  [[0.71145214]]
step:  1000 cost:  [[0.63787695]]
step:  2000 cost:  [[0.70598654]]
step:  3000 cost:  [[0.68100681]]
step:  4000 cost:  [[0.80615593]]
step:  5000 cost:  [[0.69866118]]
step:  6000 cost:  [[0.72618465]]
step:  7000 cost:  [[0.67023916]]
input:  [[0 0 1 1]
 [0 1 0 1]]
output:  [[0.48746667 0.4848391  0.64568725 0.65477434]]
W:  [0, array([[-0.63472111,  0.79100249],
       [ 6.45394514,  0.51814401],
       [-0.68796272, -0.68801096]]), array([[-0.88383278,  0.73235229,  0.20223002],
       [ 0.42951861, -6.44852819,  0.9398197 ]]), array([[ 0.13757974, -0.4382871 ]])]
B: [0, array([[-0.05554361],
       [ 0.10439359],
       [ 0.        ]]), array([[ 0.        ],
       [-0.00595413]]), array([[-0.06066221]])]


In [28]:
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
Y = np.array([[0, 1, 1, 0]]).T
Y = Y.reshape(1,X.shape[1])

layers = [X.shape[0],3,2,1]  
batch_size = 2 # increase bathc_size to 2, it works but not as good as when batch_size is 1
learning_rate = 0.2
steps = 22000

network = fullyConnect(layers, batch_size, learning_rate)
cost = network.train(X, Y, steps)
input=np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
output = network.predict(input)
print("input: ", input)
print("output: ", output)
print("W: ", network.W)
print("B:", network.B)

step:  0 cost:  [[0.69314718]]
step:  1000 cost:  [[0.61623889]]
step:  2000 cost:  [[0.39742342]]
step:  3000 cost:  [[0.2412631]]
step:  4000 cost:  [[0.29788379]]
step:  5000 cost:  [[0.30789416]]
step:  6000 cost:  [[0.3909768]]
step:  7000 cost:  [[0.14938316]]
step:  8000 cost:  [[0.17329874]]
step:  9000 cost:  [[0.21560454]]
step:  10000 cost:  [[0.20853821]]
step:  11000 cost:  [[0.05357006]]
step:  12000 cost:  [[0.1589372]]
step:  13000 cost:  [[0.03093324]]
step:  14000 cost:  [[0.06694393]]
step:  15000 cost:  [[0.03436084]]
step:  16000 cost:  [[0.07330538]]
step:  17000 cost:  [[0.14002454]]
step:  18000 cost:  [[0.10915363]]
step:  19000 cost:  [[0.08110672]]
step:  20000 cost:  [[0.0183464]]
step:  21000 cost:  [[0.0265849]]
input:  [[0 0 1 1]
 [0 1 0 1]]
output:  [[0.0586183  0.96418148 0.99999979 0.0586183 ]]
W:  [0, array([[-0.25091976,  0.90142861],
       [ 0.46398788,  0.19731697],
       [-0.68796272, -0.68801096]]), array([[-0.88383278,  0.73235229,  0.20223002