# N-layer Binary Classifier
#### Implementation from scratch using Python

---

# Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
class NN():
    """N-layer fully connected binary classifier implemented with NumPy.

    Every hidden layer uses a ReLU activation and the output layer uses a
    sigmoid; the model is trained with full-batch gradient descent on the
    binary cross-entropy cost.

    Data layout: examples are stored column-wise, i.e. X has shape
    (n_features, m) and the network output AL has shape (1, m).
    Labels Y may be given with any shape holding exactly m values
    (for instance (1, m) or (m, 1)); they are reshaped to match AL.
    """

    # Sigmoid outputs are clipped to [_EPS, 1 - _EPS] before taking logs or
    # dividing, so a saturated output cannot produce NaN/inf in the cost or
    # in the gradients. Values strictly inside that interval are unchanged.
    _EPS = 1e-15

    def __init__(self, layers_dimensions):
        """
        Arguments:
        layers_dimensions -- list with the size of each layer, input layer
                             first and output layer (size 1) last
        """
        self.layers_dimensions = layers_dimensions
        # Filled in by train(); None means the model has not been trained.
        self.parameters = None

    def initialize_parameters(self, layer_dims):
        """
        Create the weight matrices and bias vectors for every layer.
        Arguments:
        layer_dims -- list with the size of each layer (input layer first)
        Returns:
        parameters -- dict with keys 'W1', 'b1', ..., 'WL', 'bL'; Wl has shape
                      (layer_dims[l], layer_dims[l-1]) and is drawn from a
                      standard normal scaled by 0.01, bl is a zero column of
                      shape (layer_dims[l], 1)
        """
        parameters = {}
        for l in range(1, len(layer_dims)):
            fan_out, fan_in = layer_dims[l], layer_dims[l - 1]
            parameters['W' + str(l)] = np.random.randn(fan_out, fan_in) * 0.01
            parameters['b' + str(l)] = np.zeros((fan_out, 1))

            assert parameters['W' + str(l)].shape == (fan_out, fan_in)
            assert parameters['b' + str(l)].shape == (fan_out, 1)
        return parameters

    def sigmoid(self, Z):
        """
        Apply the element-wise sigmoid.
        Returns:
        A -- 1 / (1 + exp(-Z)), same shape as Z
        cache -- Z, kept for the backward pass
        """
        A = 1 / (1 + np.exp(-Z))
        return A, Z

    def sigmoid_backpropagation(self, dA, cache):
        """
        Implement the backward propagation for a single SIGMOID unit.
        Arguments:
        dA -- post-activation gradient, of any shape
        cache -- 'Z' where we store for computing backward propagation efficiently
        Returns:
        dZ -- Gradient of the cost with respect to Z
        """
        Z = cache
        s, _ = self.sigmoid(Z)
        dZ = dA * s * (1 - s)
        assert dZ.shape == Z.shape
        return dZ

    def relu(self, Z):
        """
        Apply the element-wise ReLU.
        Returns:
        A -- max(0, Z), same shape as Z
        cache -- Z, kept for the backward pass
        """
        A = np.maximum(0, Z)
        assert A.shape == Z.shape
        return A, Z

    def relu_backpropagation(self, dA, cache):
        """
        Implement the backward propagation for a single RELU unit.
        Arguments:
        dA -- post-activation gradient, of any shape
        cache -- 'Z' where we store for computing backward propagation efficiently
        Returns:
        dZ -- Gradient of the cost with respect to Z
        """
        Z = cache
        # Work on a copy so the caller's dA is not modified.
        dZ = np.array(dA, copy=True)
        # The gradient does not flow through units whose input was <= 0.
        dZ[Z <= 0] = 0

        assert dZ.shape == Z.shape
        return dZ

    def forward_propagation_activation(self, A_prev, W, b, activation):
        """
        Forward pass of one layer: linear step followed by the activation.
        Arguments:
        A_prev -- activations of the previous layer, shape (n_prev, m)
        W -- weight matrix, shape (n, n_prev)
        b -- bias vector, shape (n, 1)
        activation -- "sigmoid" or "relu"
        Returns:
        A -- activations of this layer, shape (n, m)
        cache -- ((A_prev, W, b), Z), kept for the backward pass
        Raises:
        ValueError -- if activation is not "sigmoid" or "relu"
        """
        Z = np.dot(W, A_prev) + b
        assert Z.shape == (W.shape[0], A_prev.shape[1])

        if activation == "sigmoid":
            A, activation_cache = self.sigmoid(Z)
        elif activation == "relu":
            A, activation_cache = self.relu(Z)
        else:
            raise ValueError("Unsupported activation: %r" % (activation,))

        assert A.shape == (W.shape[0], A_prev.shape[1])
        return A, ((A_prev, W, b), activation_cache)

    def forward_propagation(self, X, parameters):
        """
        Full forward pass: ReLU for layers 1..L-1, sigmoid for layer L.
        Arguments:
        X -- input data, shape (n_features, m)
        parameters -- dict as produced by initialize_parameters
        Returns:
        AL -- output of the last layer, shape (1, m)
        caches -- list with one cache per layer (index 0 is layer 1)
        """
        caches = []
        A = X
        # Each layer contributes one 'W' and one 'b' entry.
        L = len(parameters) // 2

        for l in range(1, L):
            A, cache = self.forward_propagation_activation(
                A, parameters['W' + str(l)], parameters['b' + str(l)], 'relu')
            caches.append(cache)

        AL, cache = self.forward_propagation_activation(
            A, parameters['W' + str(L)], parameters['b' + str(L)], 'sigmoid')
        caches.append(cache)

        assert AL.shape == (1, X.shape[1])
        return AL, caches

    def compute_cost(self, AL, Y):
        """
        Binary cross-entropy cost averaged over the m examples.
        Arguments:
        AL -- predicted probabilities, shape (1, m)
        Y -- true labels (0 or 1); any shape holding m values
        Returns:
        cost -- scalar cross-entropy cost (shape ())
        """
        # Align Y with AL (as backpropagation does) so a column-shaped Y does
        # not silently broadcast into an (m, m) matrix.
        Y = Y.reshape(AL.shape)
        m = AL.shape[1]
        # Keep probabilities away from exactly 0 and 1 so log() stays finite.
        AL_safe = np.clip(AL, self._EPS, 1 - self._EPS)
        losses = Y * np.log(AL_safe) + (1 - Y) * np.log(1 - AL_safe)
        cost = np.squeeze((-1 / m) * np.sum(losses))

        assert cost.shape == ()
        return cost

    def backpropagation_activation(self, dA, cache, activation):
        """
        Backward pass of one layer (activation followed by linear step).
        Arguments:
        dA -- gradient of the cost with respect to this layer's activations
        cache -- ((A_prev, W, b), Z) stored during the forward pass
        activation -- "sigmoid" or "relu"
        Returns:
        dA_prev -- gradient with respect to the previous layer's activations
        dW -- gradient with respect to W, same shape as W
        db -- gradient with respect to b, same shape as b
        Raises:
        ValueError -- if activation is not "sigmoid" or "relu"
        """
        linear_cache, activation_cache = cache
        if activation == "relu":
            dZ = self.relu_backpropagation(dA, activation_cache)
        elif activation == "sigmoid":
            dZ = self.sigmoid_backpropagation(dA, activation_cache)
        else:
            raise ValueError("Unsupported activation: %r" % (activation,))

        # The linear part is the same whatever the activation was.
        A_prev, W, b = linear_cache
        m = A_prev.shape[1]
        dW = (1 / m) * np.dot(dZ, A_prev.T)
        db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
        dA_prev = np.dot(W.T, dZ)

        assert dA_prev.shape == A_prev.shape
        assert dW.shape == W.shape
        assert db.shape == b.shape
        return dA_prev, dW, db

    def backpropagation(self, AL, Y, caches):
        """
        Full backward pass through the network.
        Arguments:
        AL -- output of forward_propagation, shape (1, m)
        Y -- true labels (0 or 1); any shape holding m values
        caches -- list of per-layer caches returned by forward_propagation
        Returns:
        grads -- dict with 'dA{l-1}', 'dW{l}' and 'db{l}' for l = 1..L
        """
        grads = {}
        L = len(caches)
        Y = Y.reshape(AL.shape)

        # Derivative of the cross-entropy with respect to AL; clipping avoids
        # division by zero when the sigmoid output saturates.
        AL_safe = np.clip(AL, self._EPS, 1 - self._EPS)
        dAL = -(np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))

        # Output layer (sigmoid).
        (grads["dA" + str(L - 1)],
         grads["dW" + str(L)],
         grads["db" + str(L)]) = self.backpropagation_activation(dAL, caches[L - 1], 'sigmoid')

        # Hidden layers (ReLU), from layer L-1 down to layer 1.
        for l in reversed(range(L - 1)):
            dA_prev, dW, db = self.backpropagation_activation(
                grads['dA' + str(l + 1)], caches[l], 'relu')
            grads["dA" + str(l)] = dA_prev
            grads["dW" + str(l + 1)] = dW
            grads["db" + str(l + 1)] = db
        return grads

    def update_parameters(self, parameters, grads, learning_rate):
        """
        Apply one gradient-descent step.
        Arguments:
        parameters -- dict of 'W{l}' / 'b{l}' arrays; its entries are replaced
        grads -- dict holding the matching 'dW{l}' / 'db{l}' gradients
        learning_rate -- step size
        Returns:
        parameters -- the same dict, holding the updated arrays
        """
        L = len(parameters) // 2
        for l in range(1, L + 1):
            w_key, b_key = "W" + str(l), "b" + str(l)
            parameters[w_key] = parameters[w_key] - learning_rate * grads['dW' + str(l)]
            parameters[b_key] = parameters[b_key] - learning_rate * grads['db' + str(l)]
        return parameters

    def plot_cost(self, costs, learning_rate):
        """
        Plot the recorded costs with matplotlib.
        Arguments:
        costs -- sequence of costs, one per recorded iteration
        learning_rate -- learning rate shown in the plot title
        """
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        # train() records one cost every 100 iterations.
        plt.xlabel('iterations (per hundreds)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    def train(self, X, Y, learning_rate=0.0075, num_iterations=3000, print_cost=False): #lr was 0.009
        """
        Train the network with full-batch gradient descent.
        Arguments:
        X -- training data, shape (n_features, m)
        Y -- true labels (0 or 1); any shape holding m values
        learning_rate -- step size of the gradient-descent update
        num_iterations -- number of gradient-descent iterations
        print_cost -- if True, print and record the cost every 100 iterations
                      and plot the recorded costs at the end
        Returns:
        parameters -- the learned parameters (also stored in self.parameters)
        """
        costs = []
        parameters = self.initialize_parameters(self.layers_dimensions)
        for i in range(num_iterations):
            AL, caches = self.forward_propagation(X, parameters)
            cost = self.compute_cost(AL, Y)
            grads = self.backpropagation(AL, Y, caches)
            parameters = self.update_parameters(parameters, grads, learning_rate)

            if print_cost and i % 100 == 0:
                costs.append(cost)
                print ("Cost after iteration %i: %f" % (i, cost))

        # Store the result before plotting so a plotting failure cannot lose it.
        self.parameters = parameters
        # Costs are only recorded when print_cost is set; skip the empty plot.
        if costs:
            self.plot_cost(costs, learning_rate)
        return parameters

    def predict(self, X, Y=None):
        """
        Run a forward pass with the trained parameters.
        Arguments:
        X -- input data, shape (n_features, m)
        Y -- unused; accepted only so existing calls predict(X, Y) keep working
        Returns:
        prediction -- the tuple (AL, caches) returned by forward_propagation;
                      AL holds the predicted probabilities, shape (1, m)
        Raises:
        RuntimeError -- if the model has not been trained yet
        """
        if self.parameters is None:
            raise RuntimeError("Model has no parameters; call train() first.")
        prediction = self.forward_propagation(X, self.parameters)
        return prediction

---

# Results

## Credit Card Fraud Dataset

In [3]:
#fraud_data = pd.read_csv('./data/fraud_dataset.csv')
#fraud_data.head()

In [4]:
# No need for missing data imputation, since no data is missing at all.
#fraud_data.isnull().values.any()

In [5]:
#Y = np.array(fraud_data['Class'])
#X = np.array(fraud_data.drop(columns=['Class']))

In [6]:
#X.shape

In [7]:
#Y = Y.reshape(Y.shape[0], 1)
#Y.shape

In [8]:
#train_X, test_X, train_Y, test_Y = train_test_split(X, Y)
#print("X_train shape:", train_X.shape)
#print("Y_train shape:", train_Y.shape)
#print("X_test shape:", test_X.shape)
#print("Y_test shape:", test_Y.shape)

In [9]:
#train_X = train_X.T
#test_X = test_X.T
#print("X_train shape:", train_X.shape)
#print("X_test shape:", test_X.shape)

In [10]:
#layers_dimensions = [train_X.shape[0], 20, 7, 5, 1] 
#model = NN(layers_dimensions)

In [11]:
#model.train(train_X, train_Y, learning_rate=0.001, num_iterations=5000, print_cost=True)

---

# Conclusion