In [1]:
import numpy as np
import pandas as pd
import math

# Author: Pavan Pandurangi
# Last Modified: 7/29/19
# This program roughly sketches a three layer neural network (30-30-1) that
# attempts to classify a breast cancer tumor as malignant or benign based on 30 features.
# The neural network uses sigmoid as its activation function. Theta is the variable
# denoted for weights and lambda (lambd) is used for regularization. X is commonly used
# to denote features while y is commonly used to denote the labels. z is used to denote
# values of a layer in the pre-activation stage, while a is used to denote values of a
# layer in the post-activation stage (other than the input layer).

# note: this roughly created ANN does not have an implementation
# using cross-validation data yet. This will be incorporated later.
# For now, the values of the parameters have been chosen by hand.

# this function creates a dataframe using the data.csv file
# which contains the breast cancer training and testing data.
# It then gets rid of unwanted columns (unnecessary for implementation)
# and separates the dataset into training and testing data (random examples
# in both datasets)
def clean_data(csv_file_name, training_percent, testing_percent, random_state=None):
    """Load the tumor CSV, scale it, shuffle it and split it into two frames.

    Steps:
      1. Read ``csv_file_name`` and drop the ``id`` / ``Unnamed: 32`` columns
         when they are present.
      2. Encode the ``diagnosis`` label as 1.0 (``"M"``) or 0.0 (``"B"``).
      3. Min-max scale every remaining (feature) column to the range [0, 1].
      4. Shuffle the rows and take the first ``training_percent`` percent as
         the training set and the last ``testing_percent`` percent as the
         testing set.

    Parameters
    ----------
    csv_file_name : str or path-like
        Location of the CSV file; it must contain a ``diagnosis`` column.
    training_percent, testing_percent : number
        Size of each split as a percentage (0-100) of the number of rows.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        ``None`` (the default) keeps the shuffle random.

    Returns
    -------
    (training_set, testing_set) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If a percentage is negative, if the two requested splits would share
        rows, or if ``diagnosis`` holds values that are neither "M"/"B" nor
        numeric.
    """
    if training_percent < 0 or testing_percent < 0:
        raise ValueError("training_percent and testing_percent must be non-negative")

    data_set = pd.read_csv(csv_file_name)

    # errors="ignore" keeps this working on files that were already stripped
    # of the bookkeeping columns.
    data_set = data_set.drop(columns=["id", "Unnamed: 32"], errors="ignore")

    # Explicit mapping instead of Series.replace: replace() on an object
    # column relies on silent downcasting, which newer pandas deprecates.
    # Values other than "M"/"B" are passed through and must already be numeric.
    label_codes = {"M": 1, "B": 0}  # Malignant is 1, Benign is 0
    labels = data_set["diagnosis"].map(lambda value: label_codes.get(value, value))
    labels = pd.to_numeric(labels).astype(float)

    # Scale only the features; the label is already 0/1 and must not be
    # distorted when the file happens to contain a single class.
    features = data_set.drop(columns=["diagnosis"])
    feature_min = features.min()
    # A constant column has a range of 0; dividing by 1 instead maps it to 0
    # rather than producing NaN (0 / 0).
    feature_range = (features.max() - feature_min).replace(0, 1)
    normalized = (features - feature_min) / feature_range
    normalized["diagnosis"] = labels
    data_set = normalized[data_set.columns]  # restore the original column order

    data_set = data_set.sample(frac=1, random_state=random_state)

    total_rows = len(data_set)
    training_rows = int(total_rows * (training_percent / 100))
    testing_rows = int(total_rows * (testing_percent / 100))
    if training_rows + testing_rows > total_rows:
        raise ValueError(
            "training_percent and testing_percent select overlapping rows; "
            "their sum must not exceed 100"
        )

    training_set = data_set.head(training_rows)
    testing_set = data_set.tail(testing_rows)

    return training_set, testing_set

# this function is the core function for training the ANN (it calculates the proper
# weights that correlate the training features with the labels). First, the design
# (feature) matrix, X, is created along with the testing feature matrix. The training
# and testing label vectors are created as well. The weight matrices, Theta1 and Theta2,
# are created, where Theta1 represents the weights applied from the input layer to the
# hidden layer, and Theta2 represents the weights applied from the hidden layer to the
# output layer. Gradient descent is then run to calculate optimal values for Theta1 and Theta2
# based on the training set. The testing set is used to calculate the accuracy of the algorithm.
def train_algorithm(training_set, testing_set, num_iters, learning_rate, lambd, hidden_units=30):
    """Train the three-layer network and score it on the testing set.

    Parameters
    ----------
    training_set, testing_set : pandas.DataFrame
        Frames containing a ``diagnosis`` label column; every other column is
        used as a feature.
    num_iters : int
        Number of gradient descent iterations.
    learning_rate : float
        Step size used by gradient descent.
    lambd : float
        Regularization parameter (lambda).
    hidden_units : int, optional
        Number of units in the hidden layer. Defaults to 30, the size that
        was previously hard-coded.

    Returns
    -------
    (hypothesis, accuracy)
        The rounded predictions for the testing set and the fraction of
        testing examples that were predicted correctly.
    """
    # m = number of examples, n = number of feature columns.
    X = np.array(training_set.drop(["diagnosis"], axis=1), dtype=float)  # m_train x n
    X = np.hstack((np.ones((len(X), 1)), X))  # bias column added: m_train x (n + 1)
    X_test = np.array(testing_set.drop(["diagnosis"], axis=1), dtype=float)  # m_test x n
    X_test = np.hstack((np.ones((len(X_test), 1)), X_test))  # m_test x (n + 1)
    y = np.reshape(np.array(training_set["diagnosis"], dtype=float), (-1, 1))  # m_train x 1
    y_test = np.reshape(np.array(testing_set["diagnosis"], dtype=float), (-1, 1))  # m_test x 1

    # Weight shapes follow the data instead of assuming exactly 30 features.
    # Small random values in [-INIT_EPSILON, INIT_EPSILON) break the symmetry
    # between hidden units.
    INIT_EPSILON = 0.001
    input_units = X.shape[1]  # n features + 1 bias
    Theta1 = np.random.rand(hidden_units, input_units) * 2 * INIT_EPSILON - INIT_EPSILON
    Theta2 = np.random.rand(1, hidden_units + 1) * 2 * INIT_EPSILON - INIT_EPSILON

    Theta1, Theta2 = perform_gradient_descent(X, y, num_iters, Theta1, Theta2, learning_rate, lambd)

    hypothesis, accuracy = feedforward(X_test, y_test, Theta1, Theta2, True)
    return hypothesis, accuracy
    
# this function runs the gradient descent algorithm on the training set X for num_iters
# iterations. The starting values of the weight matrices are specified as init_theta1
# and init_theta2. The learning rate and the regularization parameter lambda (lambd) are
# also specified.
def perform_gradient_descent(X, y, num_iters, init_theta1, init_theta2, learning_rate, lambd):
    """Run ``num_iters`` steps of batch gradient descent on both weight matrices.

    Starts from ``init_theta1`` / ``init_theta2`` and, on every iteration,
    moves each matrix against its gradient scaled by ``learning_rate``.
    The input matrices are never modified in place. Returns the final
    ``(theta1, theta2)`` pair.
    """
    weights_hidden, weights_output = init_theta1, init_theta2
    for _ in range(num_iters):
        # Both gradients come from the same (old) weights, so the two updates
        # below form one simultaneous step.
        grad_hidden, grad_output = getCostGradients(X, y, weights_hidden, weights_output, lambd)
        weights_hidden = weights_hidden - learning_rate * grad_hidden
        weights_output = weights_output - learning_rate * grad_output
    return weights_hidden, weights_output

# this function calculates the gradients needed for each iteration of
# gradient descent by feeding forward through the neural network and
# backpropagating.
def getCostGradients(X, y, theta1, theta2, lambd):
    """Return ``(theta1_grad, theta2_grad)`` for one gradient descent step.

    Performs a forward pass in training mode to obtain the layer values and
    hands them to ``backprop`` together with the weights and ``lambd``.
    """
    # Training mode (testing=False) yields every intermediate layer value.
    layer_values = feedforward(X, y, theta1, theta2, False)
    a1, z2, a2, _z3, a3 = layer_values  # z3 is not needed for the gradients
    return backprop(y, a3, z2, a2, a1, theta1, theta2, lambd)

# this function uses an input X and calculates the hypothesis (prediction)
# utilizing the weights. If the user is testing the accuracy of the algorithm,
# the accuracy is returned along with the prediction.
def feedforward(X, y, theta1, theta2, testing):
    """Propagate ``X`` through the network.

    Parameters
    ----------
    X : array, m x (n + 1)
        Input examples with the bias column already prepended.
    y : array
        Labels for ``X``. Only read when ``testing`` is true; a 1-D vector is
        accepted and treated as a column.
    theta1, theta2 : array
        Weights input -> hidden and hidden -> output (bias weight in column 0).
    testing : bool
        Selects the return value, see below.

    Returns
    -------
    testing false : ``(a1, z2, a2, z3, a3)``
        The layer values needed by backpropagation.
    testing true : ``(predictions, accuracy)``
        The output activations rounded with ``np.round`` and the fraction of
        rows whose prediction equals the label. ``accuracy`` is NaN when
        ``X`` has no rows.
    """
    a1 = X  # m x (n + 1)
    z2 = np.matmul(a1, theta1.T)  # m x h, h = number of hidden units
    a2 = np.hstack((np.ones((len(z2), 1)), sigmoid(z2)))  # bias unit added: m x (h + 1)
    z3 = np.matmul(a2, theta2.T)
    a3 = sigmoid(z3)
    if not testing:
        return a1, z2, a2, z3, a3

    predictions = np.round(a3)
    # Force a column vector so a 1-D y cannot broadcast into an m x m comparison.
    labels = np.reshape(y, (-1, 1))
    num_examples = len(X)
    if num_examples == 0:
        # Nothing to score; avoid dividing by zero.
        return predictions, float("nan")
    accuracy = np.count_nonzero(predictions == labels) / num_examples
    return predictions, accuracy

# this function backpropagates through the neural network starting
# at the output layer and calculates the gradients for each weight
# matrix. The bias elements are dealt with to allow for the vectorized
# calculations to run without error.
def backprop(y, a3, z2, a2, a1, theta1, theta2, lambd):
    """Compute the regularized gradients of both weight matrices.

    Works backwards from the output error ``a3 - y``. The bias column
    (column 0) of each weight matrix is left out when the error is pushed
    back to the hidden layer and is replaced by zeros in the regularization
    term, so bias weights are never penalized.

    Returns ``(theta1_grad, theta2_grad)``, each with the same shape as the
    corresponding weight matrix.
    """
    def without_bias(theta):
        # Every column except the leading bias column.
        return np.asarray(theta)[:, 1:]

    def with_zeroed_bias(weights):
        # Put a zero column back in front so the shape matches theta again.
        return np.hstack((np.zeros((len(weights), 1)), weights))

    num_examples = len(y)
    output_error = a3 - y

    theta2_weights = without_bias(theta2)
    hidden_error = (output_error @ theta2_weights) * sigmoid_gradient(z2)

    # Accumulated (unscaled) gradients over all examples.
    accumulated1 = hidden_error.T @ a1
    accumulated2 = output_error.T @ a2

    theta1_weights = without_bias(theta1)
    penalty1 = lambd * with_zeroed_bias(theta1_weights)
    penalty2 = lambd * with_zeroed_bias(theta2_weights)

    scale = 1 / num_examples
    theta1_grad = scale * (accumulated1 + penalty1)
    theta2_grad = scale * (accumulated2 + penalty2)
    return theta1_grad, theta2_grad

# this function is the activation function used for each layer of the neural network
def sigmoid(z):
    """Logistic function ``1 / (1 + e^(-z))``, evaluated without overflow.

    ``z`` may be a scalar or an array-like. Arrays are processed element-wise
    and returned as an ndarray of the same shape; a scalar input yields a
    Python float.

    The textbook formula raises ``OverflowError`` for large negative Python
    floats and produces ``inf`` plus a RuntimeWarning for arrays. Only
    ``exp(-|z|)`` is computed here, which always lies in (0, 1]:

        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))
    """
    values = np.asarray(z)
    if not np.issubdtype(values.dtype, np.floating):
        # Integer / object input: switch to floats before exponentiating.
        values = values.astype(float)

    decay = np.exp(-np.abs(values))
    result = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))

    if result.ndim == 0:
        return float(result)
    return result

# this function is the gradient of the activation function and is needed during
# backpropagation to calculate the weight gradients.
def sigmoid_gradient(z):
    """Derivative of the sigmoid at ``z``: ``sigmoid(z) * (1 - sigmoid(z))``."""
    activation = sigmoid(z)
    return activation * (1 - activation)

In [2]:
# Driver cell: load the data, train the network and report the result.
# Uses a 70 / 30 training / testing split for now; this will probably change
# to 60 / 20 / 20 once cross-validation is added.
# clean_data arguments: CSV path, training percent, testing percent.
training_set, testing_set = clean_data("data.csv", 70, 30)
# train_algorithm arguments after the two sets: 10000 gradient descent
# iterations, learning rate 0.1, regularization parameter lambda 0.5.
hypothesis, accuracy = train_algorithm(training_set, testing_set, 10000, 0.1, 0.5)
# hypothesis holds the rounded (0. / 1.) prediction for every testing example.
print(hypothesis)
# accuracy is printed as a fraction of correct predictions, not as a percentage.
print(accuracy) # author's note: 97.0% - 98.88% accuracy on test set (varies with the random shuffle and initial weights).

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.