# Neural Network from Scratch
## STAT 7900 - Python for Data Science
## Spring 2021
### Connor Armstrong
### May 5th, 2021
This notebook contains code for a class which can be used to train a neural network with one or two hidden layers to predict binary data (labels, result) from binary descriptors (features). The class contains several functions with inputs that can be selected by the user. The following is a definition of the inputs to class initialization and functions:

#### Initialization
&nbsp; "features" - 2-dimensional numpy array of features data with column of 1's in the first column<br>
&nbsp; "result" - 1-dimensional numpy array of label data<br>
&nbsp; "batchsize" - integer, amount of observations to be evaluated at a time within the "fit" function<br>
&nbsp; "h1" - number of nodes in the hidden layer (1), second layer if num_hidden_layers = 2<br>
&nbsp; "h2" - if two hidden layers are specified by the user, the number of nodes in the first hidden layer (2). Default = 5<br>
&nbsp; "num_hidden_layers" - User specifies 1 or 2 for one hidden layer or two hidden layers. Default = 1<br>

#### "fit" function
&nbsp; "num_iterations" - integer, number of passes through the model<br>
&nbsp; "learning_rate" - float, learning rate variable multiplied to weight gradients at each iteration<br>
&nbsp; "out_freq" - integer, frequency at which model performance statistics are printed. Default = 10 <br>
&nbsp; "stifle" - boolean, user specifies whether printed output is desired. Helpful when iterating through the class many times. Default = False<br>

#### <br>Libraries

In [4]:
import numpy as np                             #array manipulation
import collections                             #frequency counter
from sklearn import svm                        #import a1a dataset
from sklearn.datasets import load_svmlight_file#import a1a dataset
from sklearn.utils import shuffle              #for batching 
import pandas as pd                            #for model performance and grid search dataframes
import random                                  #for random sample
import math                                    #floor
from sklearn.model_selection import train_test_split

#### <br>Define Neural Network Class and Functions

In [5]:
#Sigmoid Function
def sig(x):
    """Return the logistic sigmoid 1/(1+exp(-x)) of ``x``, element-wise.

    The exponential is only ever taken of a non-positive number, so large
    negative inputs do not overflow ``np.exp`` (the naive formula does and
    emits a RuntimeWarning). A scalar input gives a scalar result.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  #always in (0, 1], cannot overflow
    #x >= 0: 1/(1+e^-x)   x < 0: e^x/(1+e^x)   (the same function, written stably)
    out = np.where(x >= 0, 1.0, decay) / (1.0 + decay)
    return out[()]  #unwraps a 0-d result to a scalar, leaves arrays untouched


#Neural Network class with one or two hidden layers
class neural_network:
    """Feed-forward network with one or two sigmoid hidden layers and a sigmoid output.

    Trained with mini-batch gradient descent on a squared-error loss.

    Weights (a column of ones is prepended to every hidden activation as bias):
        W2 -- features -> first hidden layer, shape (features.shape[1], h2);
              only present when num_hidden_layers == 2
        W1 -- shape (h2 + 1, h1) with two hidden layers,
              shape (features.shape[1], h1) with one hidden layer
        W0 -- last hidden layer -> output, shape (h1 + 1,)

    Parameters
    ----------
    features : 2-D array whose first column is expected to be all ones
    result : label array; squeezed to 1-D
    batchsize : number of observations per mini-batch in ``fit``
    h1 : nodes in the hidden layer that feeds the output
    h2 : nodes in the first hidden layer (two-layer networks only)
    num_hidden_layers : 1 or 2

    Raises
    ------
    ValueError
        If ``num_hidden_layers`` is neither 1 nor 2.
    """

    def __init__(self, features, result, batchsize, h1, h2=5, num_hidden_layers=1):
        #fail here rather than with an unrelated error deep inside fit/predict
        if num_hidden_layers not in (1, 2):
            raise ValueError("num_hidden_layers must be 1 or 2")
        self.inputx = features
        self.inputy = np.squeeze(result)
        self.batchsize1 = batchsize
        self.num_hidden_layers1 = num_hidden_layers
        self.names = ["Step", "Number Correct", "MSE", "Accuracy"]#model performance dataframe column names

        n_inputs = self.inputx.shape[1]#features includes column of ones
        #the random draws are made in the order W2, W1, W0
        if num_hidden_layers == 2:
            self.W2 = np.random.standard_normal((n_inputs, h2))#dimensions are (# features + 1, h2)
            self.W1 = np.random.standard_normal((h2 + 1, h1))#dimensions are (h2 + 1, h1)
        else:
            self.W1 = np.random.standard_normal((n_inputs, h1)) * ((2 / batchsize) ** 0.5)
        self.W0 = np.random.standard_normal(h1 + 1) * ((2 / (batchsize + 1)) ** 0.5)#dim (h1+1,)

    def _layer_label(self):
        #text fragment shared by every status message
        return "1 Hidden-Layer" if self.num_hidden_layers1 == 1 else "2 Hidden-Layers"

    def _forward(self, x):
        #forward pass; returns (first-layer pre-activation, Y2, Y1, Yhat), Y2 is None for one hidden layer
        if self.num_hidden_layers1 == 2:
            pre = np.matmul(x, self.W2)
            ones = np.ones(pre.shape[0])
            Y2 = np.c_[ones, sig(pre)]
            Y1 = np.c_[ones, sig(np.matmul(Y2, self.W1))]
        else:
            pre = np.matmul(x, self.W1)
            Y2 = None
            Y1 = np.c_[np.ones(pre.shape[0]), sig(pre)]
        Yhat = sig(np.matmul(Y1, self.W0))
        return pre, Y2, Y1, Yhat

    def _evaluate(self, x, y):
        #store predictions and performance statistics for (x, y) on self; returns the number of labels
        y = np.ravel(y)#accept column-shaped labels as well as 1-D ones
        n = len(y)
        self.X, _, _, self.Yhat = self._forward(x)
        rounded = np.round(self.Yhat)
        self.MSE = (1 / (2 * n)) * np.sum((self.Yhat - y) ** 2)#note the 1/2 factor
        self.MAE = np.sum(np.abs(self.Yhat - y)) / n
        self.number_predicted = np.sum(rounded)
        self.number_correct = np.sum(rounded == y)
        self.accuracy = 100 * self.number_correct / n
        return n

    def _print_metrics(self, n):
        #metric lines shared by fit and predict
        print("MSE: ","{:.3f}".format(self.MSE))
        print("Mean Absolute Error: ","{:.3f}".format(self.MAE))
        print(self.number_correct,"/",n," predicted correctly -> ","{:.1f}".format(self.accuracy),"% accuracy")

    def fit(self, num_iterations, learning_rate, out_freq = 10, stifle = False):
        """Train the weights and store training-set statistics on the instance.

        Every iteration shuffles the training data, walks through it in
        batches of ``batchsize`` and applies one gradient step per batch.
        Afterwards ``Yhat``, ``MSE``, ``MAE``, ``number_predicted``,
        ``number_correct`` and ``accuracy`` are set from the full training set.

        Parameters
        ----------
        num_iterations : number of passes through the training data
        learning_rate : factor applied to the gradients at each update
        out_freq : print progress every ``out_freq`` iterations (never at iteration 0)
        stifle : if true, print nothing
        """
        #initial user feedback
        if not stifle:
            print("Training Neural Network with " + self._layer_label() + "\n")

        #begin training model
        for step in range(num_iterations):
            shuffle_x, shuffle_y = shuffle(self.inputx, self.inputy, random_state=None)
            for start in range(0, shuffle_x.shape[0], self.batchsize1):
                stop = start + self.batchsize1
                batch_x = shuffle_x[start:stop]
                batch_y = np.squeeze(shuffle_y[start:stop])
                _, Y2, Y1, Yhat = self._forward(batch_x)

                #error signal at the output node (squared error through the output sigmoid)
                out_delta = (Yhat - batch_y) * Yhat * (1 - Yhat)
                delta_W0 = 2 * np.dot(out_delta, Y1)
                #error signal at the last hidden layer; the bias column carries no gradient backwards
                hidden_element = out_delta[:, None] * (Y1[:, 1:] * (1 - Y1[:, 1:])) * self.W0[1:]

                if self.num_hidden_layers1 == 2:
                    delta_W1 = 2 * np.matmul(Y2.T, hidden_element)
                    first_delta = np.matmul(hidden_element, self.W1[1:].T) * (Y2[:, 1:] * (1 - Y2[:, 1:]))
                    delta_W2 = 2 * np.matmul(batch_x.T, first_delta)
                    self.W2 = self.W2 - learning_rate * delta_W2
                else:
                    delta_W1 = 2 * np.matmul(batch_x.T, hidden_element)
                self.W0 = self.W0 - learning_rate * delta_W0
                self.W1 = self.W1 - learning_rate * delta_W1

            #to print model performance statistics while fitting model
            if not stifle and step != 0 and step % out_freq == 0:
                _, _, _, Yhat1 = self._forward(self.inputx)
                MSE = (1 / (2 * len(self.inputy))) * np.sum((Yhat1 - self.inputy) ** 2)
                number_correct = np.sum(np.round(Yhat1) == self.inputy)
                accuracy = 100 * number_correct / len(self.inputy)
                mod_perf_list = [(step, number_correct, MSE, accuracy)]
                print(pd.DataFrame(mod_perf_list, columns = self.names))
                print()

        #calculate final performance statistics; stored so they can be read without printing
        n = self._evaluate(self.inputx, self.inputy)

        #model_eval
        if not stifle:
            print("\nNeural Network with " + self._layer_label() + " Complete\n")
            print("Final Performance Metrics after", num_iterations, "iterations:\n")
            self._print_metrics(n)

    def predict(self, test_x, test_y, stifle = False):
        """Predict on ``test_x`` and return the predicted probabilities.

        The stored statistics (``Yhat``, ``MSE``, ``MAE``, ``number_predicted``,
        ``number_correct``, ``accuracy``) are overwritten with values computed
        against ``test_y``. Metrics are printed unless ``stifle`` is true.
        """
        n = self._evaluate(test_x, test_y)
        if not stifle:
            print("\nNeural Network with " + self._layer_label() + " Prediction\n")
            self._print_metrics(n)
        return self.Yhat

    def score(self, test_x, test_y, stifle = False):
        """Return 1 - SSE/SSM of the predicted probabilities against ``test_y``.

        Calls ``predict``, so the stored statistics are overwritten and, unless
        ``stifle`` is true, the prediction metrics are printed.
        """
        y = np.ravel(test_y)
        residual = self.predict(test_x, y, stifle = stifle) - y
        SSE = np.sum(residual ** 2)
        SSM = np.sum((y - np.mean(y)) ** 2)
        return 1 - SSE/SSM

    def coef(self):
        """Print the weight arrays: W0, W1 and, for two hidden layers, W2."""
        print("W0", self.W0, "\n")
        print("W1", self.W1, "\n")
        if self.num_hidden_layers1 == 2:
            print("W2", self.W2, "\n")
#method returns 2 data objects for input and output, to import a1a data
def get_data(datafile):
    """Load ``datafile`` with ``load_svmlight_file`` and return ``(features, labels)``."""
    features, labels = load_svmlight_file(datafile)
    return features, labels

#### <br>Import and Prepare a1a Dataset

In [6]:
X, Z = get_data('C:/Users/conno/OneDrive/Desktop/STAT 7900 - Python for Data Science/Project/a1a.txt')

#X is sparse matrix; convert it to a dense array for the network
denseX = X.todense()
arrayX = np.squeeze(np.asarray(denseX))

#Convert -1's to 0's
#np.where builds a new array, so Z keeps its original coding (the old loop rewrote Z in place through an alias)
Z_01 = np.where(Z == -1, 0, Z)

#define inputs to model: prepend the column of ones that the network expects as first feature
a1a_x = np.hstack((np.ones((arrayX.shape[0],1)),arrayX))
a1a_y = Z_01

#### <br>Simple One-Hidden-Layer Demonstration

In [83]:
#One hidden layer with 13 nodes, mini-batches of 20, trained on the full a1a data
test = neural_network(
    features=a1a_x,
    result=a1a_y,
    batchsize=20,
    h1=13,
    num_hidden_layers=1,
)
test.fit(num_iterations=100, learning_rate=0.01)

Training Neural Network with 1 Hidden-Layer


Neural Network with 1 Hidden-Layer Complete

Final Performance Metrics after 100 iterations:

MSE:  0.049
Mean Absolute Error:  0.005
1400 / 1605  predicted correctly ->  87.2 % accuracy


#### <br>Simple Two-Hidden-Layer Demonstration

In [84]:
#Two hidden layers (h2=5 nodes, then h1=13 nodes), mini-batches of 20, trained on the full a1a data
test2 = neural_network(
    features=a1a_x,
    result=a1a_y,
    batchsize=20,
    h1=13,
    h2=5,
    num_hidden_layers=2,
)
test2.fit(num_iterations=100, learning_rate=0.01)

Training Neural Network with 2 Hidden-Layers


Neural Network with 2 Hidden-Layers Complete

Final Performance Metrics after 100 iterations:

MSE:  0.048
Mean Absolute Error:  0.000
1397 / 1605  predicted correctly ->  87.0 % accuracy


#### <br>Demonstrate "stifle" and Calling Parameters Manually

In [85]:
#Train silently, then read the stored statistics straight from the model
test2.fit(num_iterations=100, learning_rate=0.01, stifle=True)
print(test2.accuracy, test2.number_correct, sep="\n")

88.1619937694704
1415


#### <br>Demonstrate use of user-defined print interval

In [82]:
#Report progress every 20 iterations instead of the default 10
test2.fit(
    num_iterations=100,
    learning_rate=0.01,
    out_freq=20,
)

Training Neural Network with 2 Hidden-Layers

   Step  Number Correct       MSE  Accuracy
0    20            1452  0.038905  90.46729

   Step  Number Correct       MSE   Accuracy
0    40            1458  0.036938  90.841121

   Step  Number Correct       MSE   Accuracy
0    60            1468  0.035446  91.464174

   Step  Number Correct       MSE   Accuracy
0    80            1464  0.034593  91.214953


Neural Network with 2 Hidden-Layers Complete

Final Performance Metrics after 100 iterations:

MSE:  0.033
Mean Absolute Error:  0.014
1478 / 1605  predicted correctly ->  92.1 % accuracy


#### <br>Define Train and Test Data

In [7]:
#Hold out 20% of the observations (rounded down) as the test set
holdout_fraction = 0.20
test_number = math.floor(a1a_x.shape[0]*holdout_fraction)
split = train_test_split(a1a_x, a1a_y, test_size=test_number)
a1a_x_train, a1a_x_test, a1a_y_train, a1a_y_test = split
#shapes of x_train, x_test, y_train, y_test, in that order
for part in split:
    print(part.shape)

(1284, 120)
(321, 120)
(1284,)
(321,)


#### <br>Train and Test Neural Network with 1 Hidden-Layer

In [94]:
#Fit a one-hidden-layer network on the training split, then evaluate it on the test split
train = neural_network(
    features=a1a_x_train,
    result=a1a_y_train,
    batchsize=20,
    h1=13,
    num_hidden_layers=1,
)
train.fit(num_iterations=100, learning_rate=0.01, stifle=True)
train.predict(a1a_x_test, a1a_y_test)


Neural Network with 1 Hidden-Layer Prediction

MSE:  0.057
Mean Absolute Error:  0.006
267 / 321  predicted correctly ->  83.2 % accuracy


#### <br>Train and Test Neural Network with 2 Hidden-Layers

In [8]:
#Fit a two-hidden-layer network on the training split, then evaluate it on the test split
train2 = neural_network(
    features=a1a_x_train,
    result=a1a_y_train,
    batchsize=20,
    h1=13,
    h2=5,
    num_hidden_layers=2,
)
train2.fit(num_iterations=100, learning_rate=0.01, stifle=True)
train2.predict(a1a_x_test, a1a_y_test)


Neural Network with 2 Hidden-Layers Prediction

MSE:  0.053
Mean Absolute Error:  0.216
269 / 321  predicted correctly ->  83.8 % accuracy


array([0.05946836, 0.06530213, 0.52400793, 0.02573309, 0.05977396,
       0.03285443, 0.03523158, 0.16419985, 0.20845284, 0.02759085,
       0.28250217, 0.04652   , 0.09688343, 0.02936442, 0.80070116,
       0.39837617, 0.30744115, 0.41437255, 0.02670354, 0.03266903,
       0.62019784, 0.19352364, 0.06059123, 0.03046259, 0.37986079,
       0.23768813, 0.04272216, 0.15435943, 0.02668232, 0.28563277,
       0.28173459, 0.08614673, 0.19054602, 0.04601018, 0.07170828,
       0.1911843 , 0.03825208, 0.33709658, 0.19436484, 0.03773154,
       0.02946099, 0.03170569, 0.02623537, 0.03551721, 0.2411004 ,
       0.03399372, 0.04057315, 0.03940577, 0.11254694, 0.39556085,
       0.06151516, 0.04619945, 0.62069425, 0.04095179, 0.07723804,
       0.20966108, 0.29178431, 0.8558893 , 0.87279071, 0.55570535,
       0.52014021, 0.08755416, 0.264792  , 0.33032978, 0.60655453,
       0.09893384, 0.06812367, 0.623975  , 0.07071352, 0.02695892,
       0.09535574, 0.71846294, 0.76557469, 0.44661018, 0.03042

#### <br>Demonstrate Grid Search Implementation with One-Layer Neural Network

In [101]:
bs_vector = np.arange(20,120,5)#batch sizes 20 to 115 by 5's, 20 of them
ni_vector = np.arange(50,350,100)#iterations 50 to 250 by 100's, 3 of them
lr_vector = np.asarray((0.001,0.002, 0.004, 0.006, 0.008, 0.01,0.012, 0.014, 0.016, 0.018, 0.1))#11 of them
nn_vector = np.asarray((10,15,20))#hidden-layer sizes, 3 of them
#20*3*11*3=1980 models
hyper_grid_list = []

#fit one model per hyperparameter combination and record its test-set performance
for bs in bs_vector:
    for ni in ni_vector:
        for lr in lr_vector:
            for nn in nn_vector:
                model = neural_network(a1a_x_train, a1a_y_train, batchsize = bs, h1 = nn)
                model.fit(num_iterations = ni, learning_rate = lr, stifle = True)
                model.predict(a1a_x_test, a1a_y_test, stifle = True)
                hyper_grid_list.append([bs, ni, lr, nn, model.number_correct, model.accuracy])

names = ["Batchsize", "# Iterations", "Learning Rate","# Hidden Layer Nodes" ,"# Correct", "Accuracy"]
hyper_grid = pd.DataFrame(hyper_grid_list, columns = names)
best_hyper_grid = hyper_grid.sort_values(by=['Accuracy'], ascending=False)
best_hyper_grid.head(10)

Unnamed: 0,Batchsize,# Iterations,Learning Rate,# Hidden Layer Nodes,# Correct,Accuracy
1079,70,250,0.014,20,275,85.669782
1813,110,50,0.1,15,275,85.669782
787,55,250,0.018,15,275,85.669782
61,20,150,0.018,15,274,85.358255
1051,70,150,0.018,15,274,85.358255
1713,105,50,0.1,10,274,85.358255
1471,90,250,0.012,15,274,85.358255
1149,75,150,0.018,10,274,85.358255
1842,110,150,0.018,10,274,85.358255
980,65,250,0.014,20,274,85.358255


#### <br>Refine Grid Search with One-Layer Neural Network

In [100]:
bs_vector = np.arange(40,80,10)#batch sizes 40 to 70 by 10's, 4 of them
ni_vector = np.arange(25,125,25)#iterations 25 to 100 by 25's, 4 of them
lr_vector = np.asarray((0.005,0.01,0.015))#3 of them
nn_vector = np.asarray((10,15,20,25,30))#hidden-layer sizes, 5 of them
#4*4*3*5=240 models
hyper_grid_list = []

#fit one model per hyperparameter combination and record its test-set performance
for bs in bs_vector:
    for ni in ni_vector:
        for lr in lr_vector:
            for nn in nn_vector:
                model = neural_network(a1a_x_train, a1a_y_train, batchsize = bs, h1 = nn)
                model.fit(num_iterations = ni, learning_rate = lr, stifle = True)
                model.predict(a1a_x_test, a1a_y_test, stifle = True)
                hyper_grid_list.append([bs, ni, lr, nn, model.number_correct, model.accuracy])

names = ["Batchsize", "# Iterations", "Learning Rate","# Hidden Layer Nodes" ,"# Correct", "Accuracy"]
hyper_grid = pd.DataFrame(hyper_grid_list, columns = names)
best_hyper_grid = hyper_grid.sort_values(by=['Accuracy'], ascending=False)
best_hyper_grid.head(10)

Unnamed: 0,Batchsize,# Iterations,Learning Rate,# Hidden Layer Nodes,MSE,Accuracy
74,50,25,0.015,30,0.056147,85.669782
156,60,75,0.01,15,0.055004,85.046729
140,60,50,0.01,10,0.056808,85.046729
90,50,75,0.005,10,0.057617,85.046729
202,70,50,0.01,20,0.056439,84.735202
237,70,100,0.015,20,0.055702,84.735202
24,40,50,0.01,30,0.056338,84.735202
25,40,50,0.015,10,0.056508,84.735202
31,40,75,0.005,15,0.05566,84.735202
38,40,75,0.01,25,0.054937,84.735202


#### <br>Demonstrate Grid Search Implementation with Two-Layer Neural Network

In [8]:
bs_vector = np.arange(20,140,40)#batch sizes 20 to 100 by 40's, 3 of them
ni_vector = np.arange(50,250,50)#iterations 50 to 200 by 50's, 4 of them
lr_vector = np.asarray((0.001, 0.01, 0.1))#3 of them
nn1_vector = np.arange(8,23,3)#h1 values 8 to 20 by 3's, 5 of them
nn2_vector = np.arange(8,23,3)#h2 values 8 to 20 by 3's, 5 of them
#3*4*3*5*5=900 models
hyper_grid_list = []

#fit one two-layer model per hyperparameter combination and record its test-set performance
for bs in bs_vector:
    for ni in ni_vector:
        for lr in lr_vector:
            for nn1 in nn1_vector:
                for nn2 in nn2_vector:
                    model = neural_network(a1a_x_train,
                                           a1a_y_train,
                                           batchsize = bs,
                                           h1 = nn1,
                                           h2 = nn2,
                                           num_hidden_layers = 2)

                    model.fit(num_iterations = ni, learning_rate = lr, stifle = True)
                    model.predict(a1a_x_test, a1a_y_test, stifle = True)
                    hyper_grid_list.append([bs, ni, lr, nn1, nn2, model.number_correct, model.accuracy])

#the two "# Hidden Layers" columns hold the node counts passed as h1 and h2
names = ["Batchsize", "# Iterations", "Learning Rate","# Hidden Layers (1)","# Hidden Layers (2)","# Correct", "Accuracy"]
hyper_grid = pd.DataFrame(hyper_grid_list, columns = names)
best_hyper_grid = hyper_grid.sort_values(by=['Accuracy'], ascending=False)
best_hyper_grid.head(20)

Unnamed: 0,Batchsize,# Iterations,Learning Rate,# Hidden Layers (1),# Hidden Layers (2),# Correct,Accuracy
721,100,100,0.01,20,11,288,89.719626
707,100,100,0.01,11,14,288,89.719626
709,100,100,0.01,11,20,288,89.719626
414,60,100,0.01,14,20,288,89.719626
712,100,100,0.01,14,14,288,89.719626
409,60,100,0.01,11,20,287,89.4081
674,100,50,0.1,20,20,287,89.4081
722,100,100,0.01,20,14,287,89.4081
492,60,150,0.01,17,14,286,89.096573
779,100,150,0.01,8,20,286,89.096573


#### <br>Demonstrate Score Method

In [29]:
#score() returns 1 - SSE/SSM on the given data; it calls predict(), which by default also prints the prediction metrics
train2.score(test_x=a1a_x_test,test_y=a1a_y_test)


Neural Network with 2 Hidden-Layers Prediction

MSE:  0.067
Mean Absolute Error:  0.250
257 / 321  predicted correctly ->  80.1 % accuracy


0.3293290309317157

#### <br>Demonstrate Coefficients Method

In [12]:
#Print the weight arrays of the two-layer model (W0, W1 and W2)
train2.coef()

W0 [ 0.11582949 -0.61591208  0.22982466  0.59985338 -1.17224137  1.17458366
  0.49430813 -0.57496316 -0.65857368  0.57571618  1.78339198 -2.63364594
 -1.46736565  0.22604573] 

W1 [[-0.64138555  1.07071366 -0.45420362  0.00888695  0.49716639  0.70526263
  -0.43396605  0.27494407  0.85630112  0.81498079  0.6291566  -2.3740532
  -0.86687969]
 [ 1.04966597 -1.7104155  -1.24695801 -0.75354514  0.17208467  0.60085087
   0.62751832 -0.3786915  -1.20164854 -3.10616017 -0.17842662  0.2281082
  -0.43174588]
 [ 0.66502049  0.49928211 -0.02244262 -0.40924094 -0.24265066  0.58764316
  -0.16523566 -2.00064669 -0.60104784  0.11797181 -1.45436345  2.03429809
  -2.0703829 ]
 [ 0.21993944 -0.93703406 -0.14210281 -0.84200558  0.08476769  1.25085856
  -2.48502369 -1.93949214 -1.66005505  2.34292042 -1.84101317 -0.96986635
   0.35724802]
 [-0.32293952  0.13049623 -1.5481438   0.31587853 -1.75552459  0.94540566
  -0.44612431  1.0742524  -1.37375964  0.97475448  2.40698834  1.99219374
   0.12445939]
 [ 1.71