In [46]:
import random
import sys
from itertools import combinations, permutations, product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Print numpy arrays in full: the threshold is raised to sys.maxsize so that
# arrays are never summarised with "..." when printed.
np.set_printoptions(threshold=sys.maxsize)

# Execute the pre-processing notebook inside this kernel's namespace.
# NOTE(review): presumably this is where k_x_2_cross_validation and Standardize
# (called by RunExperiment) are defined - confirm against DataPreProcessing.ipynb.
%run DataPreProcessing.ipynb

# Activation and Loss Functions

In [27]:
def sigmoid(inp):
    '''
    Logistic (sigmoid) activation, 1 / (1 + e^(-inp)), applied element-wise.

    @param inp: value to squash (can be a single number or a numpy array)
    @return: the sigmoid of inp, with the same shape as inp
    '''
    denominator = 1 + np.exp(-inp)
    return 1/denominator

def Loss(true,pred,task_type):
    '''
    Calculates the loss used to monitor gradient descent.

    @param true: the target values (one-hot rows for classification)
    @param pred: the predicted values (array or nested list, same shape as true)
    @param task_type: 'Classification' -> summed cross entropy,
                      'Regression' -> mean squared error
    @return: the scalar loss, or None when task_type is neither of the two values above
    '''
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    #Calculates and returns cross entropy loss for classification tasks
    if task_type == 'Classification':
        #A prediction of exactly 0 would give log(0) = -inf, and 0 * -inf = nan would
        #poison the whole sum, so predictions are floored at a tiny positive value first
        min_probability = 1e-15
        safe_pred = np.clip(pred, min_probability, None)
        cross_entropy_loss = -np.sum(true * np.log(safe_pred))
        return cross_entropy_loss

    #Calculates and returns mean squared error for regression tasks
    if task_type == 'Regression':
        MSE = np.square(np.subtract(true,pred)).mean()
        return MSE
    
    
def Accuracy(pred,true):
    '''
    Fraction of rows whose highest-scoring predicted column matches the
    highest-scoring column of the target.

    @param pred: 2-D array of per-class scores, one row per sample
    @param true: 2-D array of targets with the same row layout (e.g. one-hot rows)
    @return: mean of the row-wise argmax matches (value between 0 and 1)
    '''
    predicted_classes = pred.argmax(axis=1)
    true_classes = true.argmax(axis=1)
    return (predicted_classes == true_classes).mean()

# Linear Networks

In [28]:
def LogisticRegression(dataset, class_label,train=False,test=False,trained_weights=None):
    '''
    Multinomial logistic regression trained with batch gradient descent. Every class gets
    one weight per feature plus a bias; the softmax of weights*inputs is the prediction.
    In training mode the weights are updated for a fixed number of iterations and the
    loss curve is plotted. In testing mode a single forward pass is made.

    @param dataset: dataframe
    @param class_label: name of class column
    @param train: true if model is being trained
    @param test: true if model is being tested
    @param trained_weights: the weights array from trained model(only needed when test == True)
    @return: training -> weights array of shape (num_classes, num_features+1), column 0 is the bias
             testing  -> [predictions, one hot encoded targets]
    @raises ValueError: if neither train nor test is set, or test is set without trained_weights
    '''
    if not train and not test:
        raise ValueError("LogisticRegression needs train=True or test=True")
    if test and trained_weights is None:
        raise ValueError("trained_weights must be provided when test=True")

    #Initialize the target columns and features columns, as well as the individual classes
    target = dataset[class_label].to_numpy()
    features = dataset.drop(columns = [class_label]).to_numpy()
    #Classes are sorted so that class i maps to the same one hot column and the same weight
    #row in the training call and in the testing call. unique() alone returns order of
    #first appearance, which differs from fold to fold
    classes = sorted(dataset[class_label].unique())
    num_classes = len(classes)
    num_features = len(features[0])

    #Initialize the learning rate to be used in the weight update as well as the number of iterations
    step_size = (1/len(target))*0.01
    iterations = 100

    #Add biases column to features
    biases = np.ones((len(target),1))
    features = np.hstack((biases,features))

    #Perform one hot encoding of class label column in order to be used for the target predicted comparison
    target_encoded=np.zeros((len(target),num_classes))
    for i in range(0,len(target)):
        class_index = classes.index(target[i])
        target_encoded[i][class_index]=1

    #Initialize every weight (bias included) randomly between -0.01 and 0.01
    if train:
        weights = np.zeros((num_classes,num_features+1))
        for i in range(0,num_classes):
            for j in range(0,num_features+1):
                weights[i][j]=random.uniform(-0.01,0.01)

    #Use trained weights as weights during testing
    if test:
        weights=trained_weights

    #Losses are collected in a plain list; the dataframe for the plot is built once at the end
    loss_history = []

    #Begin loop to make predictions and update weights
    for _ in range(iterations):
        final_predictions = []

        #Initialize the change in weights to 0 during each iteration
        delta_weights = np.zeros((num_classes,num_features+1))

        #Loop through each row in features
        for row_index, row in enumerate(features):
            print('')
            print('Weight*Input Calculation')
            #Calculate the output value for each input row by multiplying the weights with the feature values
            outputs = [0]*num_classes
            for i in range(0,num_classes):
                for j in range(0,num_features+1):
                    outputs[i]=outputs[i]+weights[i][j]*row[j]
            print(outputs)
            print('Softmax Activation Calculation')
            #Softmax of the outputs is the prediction of that row. The largest output is
            #subtracted first: the result is mathematically identical but exp() cannot overflow
            exps = np.exp(np.asarray(outputs) - np.max(outputs))
            exp_sum = np.sum(exps)
            predictions = [exps[i]/exp_sum for i in range(0,num_classes)]
            print(predictions)
            #Add the predictions to a list to keep track of the predictions for each input vector
            final_predictions.append(predictions)

            #Accumulate the change in weights: (target - predicted) * feature values
            for i in range(0,num_classes):
                delta_weights[i] += (target_encoded[row_index][i]-predictions[i])*row

        #When testing, return the predicted and target arrays as the output after 1 iteration
        if test:
            return [np.asarray(final_predictions),target_encoded]

        loss_history.append(Loss(target_encoded,final_predictions,'Classification'))
        print('Weights Before Update')
        print(weights)
        #Update the weights by adding the change in weights*the learning rate to each of the existing weights
        print('Gradient Calculation')
        for i in range(0,num_classes):
            for j in range(0,num_features+1):
                print(delta_weights[i][j])
                weights[i][j]=weights[i][j]+delta_weights[i][j]*step_size
        print('Weights After Update')
        print(weights)
        print('')

    #Plot the loss and return the final weights
    loss_df = pd.DataFrame({'Loss': loss_history})
    loss_df.Loss.plot(title='Loss')
    return weights
        

In [29]:
def LinearRegressionNetwork(dataset,class_label,train=False,test=False,trained_weights=None):
    '''
    Linear regression network: the prediction is the dot product of the weights and the
    features (plus an intercept). In training mode the weights are updated by batch gradient
    descent for a fixed number of iterations and the loss curve is plotted. In testing mode
    a single forward pass is made.

    @param dataset: dataframe
    @param class_label: name of the target column
    @param train: true if model is being trained
    @param test: true if model is being tested
    @param trained_weights: the weights array from trained model(only needed when test == True)
    @return: training -> weights array of length num_features+1, index 0 is the intercept
             testing  -> [predictions, targets]
    @raises ValueError: if neither train nor test is set, or test is set without trained_weights
    '''
    if not train and not test:
        raise ValueError("LinearRegressionNetwork needs train=True or test=True")
    if test and trained_weights is None:
        raise ValueError("trained_weights must be provided when test=True")

    #Initialize the target column and features columns
    target = dataset[class_label].to_numpy()
    sample_size = len(target)
    features = dataset.drop(columns = [class_label]).to_numpy()
    num_features = len(features[0])

    #Initialize the learning rate to be used in the weight update as well as the number of iterations
    learning_rate = 0.01
    iterations = 100

    #Add intercepts column to features
    beta_0 = np.ones((sample_size,1))
    features = np.hstack((beta_0,features))

    #Initialize every weight (intercept included) randomly between -0.01 and 0.01
    if train:
        weights = np.array([random.uniform(-0.01,0.01) for _ in range(num_features+1)])

    #Use trained weights as weights during testing
    if test:
        weights=trained_weights

    #Losses are collected in a plain list; the dataframe for the plot is built once at the end
    loss_history = []

    #Begin loop to make predictions and update weights
    for _ in range(iterations):

        #Calculate the prediction by taking the dot product of the features and weights
        print('')
        print('Weight*Input Calculation')
        pred = features.dot(weights)
        print(pred)
        print('')
        target = target.reshape(pred.shape)

        #When testing, return the predicted and target arrays as the output after 1 iteration
        if test:
            return [pred,target]

        loss_history.append(Loss(target,pred,'Regression'))

        #Calculate change in weights by taking the differences of the target and predictions and
        #multiplying by the features and dividing by sample size
        delta_weights = (((target-pred).T).dot(features))/sample_size

        print('Weights Before Update')
        print(weights)
        #Update the weights by adding the change in weights*the learning rate to each of the existing weights
        weights = weights + delta_weights*learning_rate
        print('Weights After Update')
        print(weights)

    #Plot the loss and return the final weights
    loss_df = pd.DataFrame({'Loss': loss_history})
    loss_df.Loss.plot(title='Loss')
    return weights
   

# Hidden Layer Tuning

In [30]:
def GetHiddenCombos(num_hidden_layers,num_features):
    '''
    Gets all possible combos of hidden layer node sizes

    @param num_hidden_layers: the number of hidden layers in the network
    @param num_features: number of features the input has
    @return: list of tuples of length num_hidden_layers; every entry is a node count
             between 1 and num_features-1 (empty list when num_features <= 1)
    '''
    #gets the possible node size values from 1 to 1 less than the number of features since the hidden layer
    #nodes must be smaller than the input layer
    features_range = list(range(1,num_features))

    #combos whose node sizes are all different come first, in permutations order
    combos = list(permutations(features_range,num_hidden_layers))

    #permutations never repeats a value, so combos such as (1,1) or (1,1,2) are missing.
    #product enumerates every combo of the right length; the ones not already listed
    #are appended, so the result is correct for any number of layers (for 2 layers
    #this appends exactly (1,1), (2,2), ... in that order)
    already_listed = set(combos)
    for combo in product(features_range,repeat=num_hidden_layers):
        if combo not in already_listed:
            combos.append(combo)

    return combos

In [31]:
def TuneHidden(model_type,task_type, dataset, class_label, num_features):
    '''
    Tunes the hidden layer node sizes: every combo from GetHiddenCombos is used to train the
    requested network and to test it on the same dataset, and the combo with the lowest
    error is returned.

    @param model_type: 'FF' for the 2 hidden layer feedforward network, 'Autoencoder' for the autoencoder network
    @param task_type: indicates the dataset as a classification or regression dataset
    @param dataset: input dataframe
    @param class_label: name of class column
    @param num_features: number of features in the dataset
    @return: the best performing combo as a list of 2 node sizes
    @raises ValueError: if model_type is unknown or there are no combos to try
    '''
    #both networks share the same call signature, so the tuning loop only needs to know which one to call
    networks = {'FF': FeedForwardNN_2H, 'Autoencoder': Autoencoder}
    if model_type not in networks:
        raise ValueError("model_type must be 'FF' or 'Autoencoder', got " + repr(model_type))
    network = networks[model_type]

    #gets all possible node size combos for 2 hidden layer networks based on the
    #number of features using the GetHiddenCombos function
    combos = GetHiddenCombos(2,num_features)
    if not combos:
        raise ValueError("no hidden layer combos available for num_features=" + str(num_features))

    #dictionary that tracks the error of each combo
    performance = {}

    #iterates through all the combos and for each combo, trains the network, tests it,
    #and stores the error in the dictionary
    for combo in combos:
        weights = network(dataset,class_label,list(combo),task_type,train=True,tuning=True)
        #model is [predictions, targets]
        model = network(dataset,class_label,list(combo),task_type,test=True,trained_weights=weights,tuning=True)
        evaluation = Evaluate(task_type,model[1],model[0])

        #Evaluate returns (accuracy, error rate) for classification and a single MSE value for
        #regression. Only the error may be minimised: comparing the whole tuple would rank
        #by accuracy first and select the LEAST accurate combo
        if isinstance(evaluation, tuple):
            error = evaluation[1]
        else:
            error = evaluation
        performance[combo]= error

    #gets the best performing combo by getting the key of the lowest error value in the dict
    best_combo = min(performance, key=performance.get)

    #returns the best combo to use
    return list(best_combo)
            


# Feedforward NN with 2 Hidden Layers

In [32]:
def FeedForwardNN_2H(dataset,class_label,nodes_sizes:list,task_type,train=False,test=False,trained_weights=None,tuning=False):
    '''
    Feed forward neural network with 2 hidden layers. Moves forward through the layers using
    the weights at each layer (sigmoid activation everywhere), then backpropagates to update the
    weights. In training mode this is repeated for a fixed number of iterations. In testing mode
    a single forward pass is made.

    @param dataset: dataframe
    @param class_label: name of class column
    @param nodes_sizes: a size 2 list that indicates the number of nodes of each of the hidden layers
    @param task_type: indicates whether dataset is a 'Classification' or 'Regression' dataset
    @param train: true if model is being trained
    @param test: true if model is being tested
    @param trained_weights: the weights list from trained model(only needed when test == True)
    @param tuning: if True, indicates that tuning of the hidden layers is being performed (no loss plot)
    @return: training -> [weights_inp_h1, weights_h1_h2, weights_h2_out]
             testing  -> [predictions, targets] (targets are one hot encoded for classification)
    @raises ValueError: if neither train nor test is set, or training is requested with an unknown task_type
    '''
    if not train and not test:
        raise ValueError("FeedForwardNN_2H needs train=True or test=True")

    #Initialize the target column and features columns
    target = dataset[class_label].to_numpy()
    features = dataset.drop(columns = [class_label]).to_numpy()
    num_features = len(features[0])

    #Initialize the learning rate to be used in the weight update as well as the number of iterations
    learning_rate = 0.1
    iterations = 100
    #taken before one hot encoding, so this is the number of rows
    sample_size = target.size

    #Performs one hot encoding of class column for classification tasks in order to be used for the target predicted comparison
    if task_type == 'Classification':
        #Classes are sorted so that class i maps to the same output node in the training call
        #and in the testing call. unique() alone returns order of first appearance, which
        #differs from fold to fold
        classes = sorted(dataset[class_label].unique())
        output_size=len(classes)
        target_encoded=np.zeros((len(target),output_size))
        for i in range(0,len(target)):
            class_index = classes.index(target[i])
            target_encoded[i][class_index]=1
        target=target_encoded
    #Sets the output size to 1 for regression tasks
    #NOTE(review): the output layer is a sigmoid, so regression predictions are confined to (0,1)
    elif task_type == 'Regression':
        output_size = 1
    else:
        output_size = None

    #Initializes weights for the input to 1st hidden layer, 1st hidden layer to 2nd hidden layer,
    #and the second hidden layer to the output layer
    if train:
        if output_size is None:
            raise ValueError("task_type must be 'Classification' or 'Regression', got " + repr(task_type))
        weights_inp_h1= np.random.uniform(-0.01,0.01,(num_features, nodes_sizes[0]))
        weights_h1_h2= np.random.uniform(-0.01,0.01,(nodes_sizes[0],nodes_sizes[1]))
        weights_h2_out= np.random.uniform(-0.01,0.01,(nodes_sizes[1],output_size))

    #Use trained weights as weights during testing
    if test:
        weights_inp_h1= trained_weights[0]
        weights_h1_h2= trained_weights[1]
        weights_h2_out= trained_weights[2]

    #Losses are collected in a plain list; the dataframe for the plot is built once at the end
    loss_history = []

    #Begin loop to make predictions and update weights
    for _ in range(iterations):

        #FEEDFORWARD

        #Implementing feedforward propagation to calculate 1st hidden layer using the logistic activation function
        print('Hidden Layer 1 Calculation: sigmoid(weights*features)')
        z_h1 = np.dot(features, weights_inp_h1)
        hidden_1 = sigmoid(z_h1)
        print(hidden_1)

        #Implementing feedforward propagation to get 2nd hidden layer
        print('Hidden Layer 2 Calculation: sigmoid(weights*hidden layer 1)')
        z_h2 = np.dot(hidden_1, weights_h1_h2)
        hidden_2 = sigmoid(z_h2)

        #Implementing feedforward propagation to get output layer
        print('Output Layer Calculation: sigmoid(weights*hidden layer 2)')
        z_output = np.dot(hidden_2, weights_h2_out)
        prediction = sigmoid(z_output)
        print(prediction)
        target=target.reshape(prediction.shape)

        #When testing, return the predicted and target arrays as the output after 1 iteration
        if test:
            return [prediction,target]

        loss_history.append(Loss(target,prediction,task_type))

        #BACKPROPAGATION
        print('')
        print('Backpropagation')
        #error signal at the output layer: (prediction - target) * sigmoid derivative
        weights_out_h2 = (prediction - target) * prediction * (1 - prediction)
        #error signal propagated back to hidden layer 2
        weights_h2_h1 = (np.dot(weights_out_h2, weights_h2_out.T)) * hidden_2 * (1 - hidden_2)
        #error signal propagated back to hidden layer 1
        weights_h1_inp = (np.dot(weights_h2_h1, weights_h1_h2.T)) * hidden_1 * (1 - hidden_1)

        #Gradients: activations feeding a layer * that layer's error signal, averaged over the samples
        delta_weights_h2_out = np.dot(hidden_2.T,weights_out_h2)/sample_size
        delta_weights_h1_h2 = np.dot(hidden_1.T, weights_h2_h1) / sample_size
        delta_weights_inp_h1 = np.dot(features.T, weights_h1_inp) / sample_size

        print('Weights Before Update')
        print('Input to Hidden 1')
        print(weights_inp_h1)
        print('Hidden 1 to Hidden 2')
        print(weights_h1_h2)
        print('Hidden 2 to Output')
        print(weights_h2_out)

        #Update weights by subtracting learning rate * gradient from current weight values
        weights_h2_out = weights_h2_out - learning_rate * delta_weights_h2_out
        weights_h1_h2 = weights_h1_h2 - learning_rate * delta_weights_h1_h2
        weights_inp_h1 = weights_inp_h1 - learning_rate * delta_weights_inp_h1

        print('')
        print('Weights After Update')
        print('Input to Hidden 1')
        print(weights_inp_h1)
        print('Hidden 1 to Hidden 2')
        print(weights_h1_h2)
        print('Hidden 2 to Output')
        print(weights_h2_out)
        print('')
        print('------------------------')
        print('')

    #Doesn't produce loss graph when tuning hidden layers
    if tuning == False:
        loss_df = pd.DataFrame({'Loss': loss_history})
        loss_df.Loss.plot(title='Loss')

    #returns the final trained weight values
    return [weights_inp_h1,weights_h1_h2,weights_h2_out]

# NN with Autoencoder

In [33]:
def Autoencoder(dataset,class_label,nodes_sizes,task_type,train=False,test=False,trained_weights=None,tuning=False):
    '''
    Autoencoder network. Stage 1 trains a 1 hidden layer network whose output has the size of the
    input vector (it learns to reconstruct the features). The output layer is then dropped and the
    hidden layer is kept as the encoder. Stage 2 trains a second hidden layer and an output
    classification/regression layer on top of the (frozen) encoder activations.

    In testing mode nothing is trained: the features are encoded with the trained encoder weights
    and pushed through the trained stage 2 weights once.

    @param dataset: dataframe
    @param class_label: name of class column
    @param nodes_sizes: a size 2 list that indicates the number of nodes of each of the hidden layers
    @param task_type: indicates whether dataset is a 'Classification' or 'Regression' dataset
    @param train: true if model is being trained
    @param test: true if model is being tested
    @param trained_weights: the 5 element weights list from trained model(only needed when test == True)
    @param tuning: if True, indicates that tuning of the hidden layers is being performed (no loss plots)
    @return: training -> [weights_inp_encoder, weights_encoder_out, weights_inp_h1, weights_h1_h2, weights_h2_out]
             testing  -> [predictions, targets] (targets are one hot encoded for classification)
    @raises ValueError: if neither train nor test is set, or training is requested with an unknown task_type
    '''
    if not train and not test:
        raise ValueError("Autoencoder needs train=True or test=True")

    #Initialize the target column and features columns
    target = dataset[class_label].to_numpy()
    features = dataset.drop(columns = [class_label]).to_numpy()
    num_features = len(features[0])

    #Initialize the learning rate to be used in the weight update as well as the number of iterations
    learning_rate = 0.1
    iterations = 100
    #taken before one hot encoding, so this is the number of rows
    sample_size = target.size

    #STAGE 1: ENCODER

    #Initializes weights for the input to hidden layer, and the hidden layer to the output layer
    if train:
        weights_inp_encoder= np.random.uniform(-0.01,0.01,(num_features, nodes_sizes[0]))
        weights_encoder_out= np.random.uniform(-0.01,0.01,(nodes_sizes[0],num_features))

    #Use trained weights as weights during testing
    if test:
        weights_inp_encoder= trained_weights[0]
        weights_encoder_out = trained_weights[1]

    #Losses are collected in a plain list; the dataframe for the plot is built once at the end
    encoder_loss_history = []

    #The encoder is only fitted while training. Running this loop in testing mode would keep
    #adjusting the trained encoder weights on the test data
    if not test:
        for _ in range(iterations):

            #FEEDFORWARD
            #Same process as two hidden layer feedforward network except with just one hidden layer
            print('Encoder Layer Calculation: sigmoid(weights*features)')
            z_encoder = np.dot(features, weights_inp_encoder)
            encoder = sigmoid(z_encoder)
            print(encoder)

            z_output = np.dot(encoder, weights_encoder_out)
            encoder_output = sigmoid(z_output)
            print(encoder_output)

            #Reconstruction quality is always a squared error between the features and their
            #reconstruction, whatever the task type of the final model is
            encoder_loss_history.append(Loss(features,encoder_output,'Regression'))

            print('')
            print('Backpropagation')
            #BACKPROPAGATION
            weights_out_encoder = (encoder_output - features) * encoder_output * (1 - encoder_output)
            weights_encoder_inp = (np.dot(weights_out_encoder, weights_encoder_out.T)) * encoder * (1 - encoder)

            #Gradients averaged over the samples
            delta_weights_encoder_out = np.dot(encoder.T,weights_out_encoder)/sample_size
            delta_weights_inp_encoder = np.dot(features.T, weights_encoder_inp) / sample_size

            print('Weights Before Update')
            print('Input to Encoder')
            print(weights_inp_encoder)
            print('Encoder to Output')
            print(weights_encoder_out)

            weights_encoder_out = weights_encoder_out - learning_rate * delta_weights_encoder_out
            weights_inp_encoder = weights_inp_encoder - learning_rate * delta_weights_inp_encoder

            print('Weights After Update')
            print('Input to Encoder')
            print(weights_inp_encoder)
            print('Encoder to Output')
            print(weights_encoder_out)
            print('')
            print('------------------------')
            print('')

    #The hidden layer computed with the final encoder weights is saved as "encoder". It is
    #computed here (not taken from the last loop pass) so that training and testing encode
    #the features with exactly the weights that are returned
    encoder = sigmoid(np.dot(features, weights_inp_encoder))

    #STAGE 2: FINAL MODEL ON TOP OF THE ENCODER

    #Performs one hot encoding of class column for classification tasks in order to be used for the target predicted comparison
    if task_type == 'Classification':
        #Classes are sorted so that class i maps to the same output node in the training call
        #and in the testing call. unique() alone returns order of first appearance, which
        #differs from fold to fold
        classes = sorted(dataset[class_label].unique())
        output_size=len(classes)
        target_encoded=np.zeros((len(target),output_size))
        for i in range(0,len(target)):
            class_index = classes.index(target[i])
            target_encoded[i][class_index]=1
        target=target_encoded
    #Sets the output size to 1 for regression tasks
    elif task_type == 'Regression':
        output_size = 1
    else:
        output_size = None

    #Initializes weights for the 1st hidden layer to 2nd hidden layer, and the second hidden layer
    #to the output layer. weights_inp_h1 takes no part in the forward pass (the encoder activations
    #are the first hidden layer); it is kept so the returned list keeps its 5 entries
    if train:
        if output_size is None:
            raise ValueError("task_type must be 'Classification' or 'Regression', got " + repr(task_type))
        weights_inp_h1= np.random.uniform(-0.01,0.01,(num_features, nodes_sizes[0]))
        weights_h1_h2 = np.random.uniform(-0.01,0.01,(nodes_sizes[0], nodes_sizes[1]))
        weights_h2_out= np.random.uniform(-0.01,0.01,(nodes_sizes[1],output_size))

    #Use trained weights as weights during testing
    if test:
        weights_inp_h1= trained_weights[2]
        weights_h1_h2 = trained_weights[3]
        weights_h2_out= trained_weights[4]

    loss_history = []

    #Begin loop to make predictions and update weights
    print(encoder)
    #the first hidden layer is the trained encoder and stays fixed during stage 2
    hidden_1 = encoder
    for _ in range(iterations):

        #FEEDFORWARD
        print('1st Hidden layer in autoencoder')
        print(encoder)
        print('')

        z_h2 = np.dot(hidden_1, weights_h1_h2)
        hidden_2 = sigmoid(z_h2)

        z_output = np.dot(hidden_2, weights_h2_out)
        prediction = sigmoid(z_output)
        target=target.reshape(prediction.shape)

        #When testing, return the predicted and target arrays as the output after 1 iteration
        if test:
            return [prediction,target]

        loss_history.append(Loss(target,prediction,task_type))

        #BACKPROPAGATION
        #error signal at the output layer, then propagated back to hidden layer 2
        weights_out_h2 = (prediction - target) * prediction * (1 - prediction)
        weights_h2_h1 = (np.dot(weights_out_h2, weights_h2_out.T)) * hidden_2 * (1 - hidden_2)

        #Gradients averaged over the samples
        delta_weights_h2_out = np.dot(hidden_2.T,weights_out_h2)/sample_size
        delta_weights_h1_h2 = np.dot(hidden_1.T, weights_h2_h1) / sample_size

        #Apply the gradients; without this step the final model would never learn anything
        weights_h2_out = weights_h2_out - learning_rate * delta_weights_h2_out
        weights_h1_h2 = weights_h1_h2 - learning_rate * delta_weights_h1_h2

    #Doesn't produce loss graph when tuning hidden layers
    if tuning == False:
        encoder_loss_df = pd.DataFrame({'Loss': encoder_loss_history})
        loss_df = pd.DataFrame({'Loss': loss_history})
        fig, axes = plt.subplots(nrows=1, ncols=2)
        encoder_loss_df.Loss.plot(title='Encoder Training Loss',ax=axes[0])
        loss_df.Loss.plot(title='Final Model Loss',ax=axes[1])

    #returns the final weight trained values from the encoder training as well as the final model training
    return [weights_inp_encoder,weights_encoder_out, weights_inp_h1,weights_h1_h2, weights_h2_out]

# Evaluation

In [None]:
def Evaluate(task_type, true_values, predicted_values):
    '''
    Scores a set of predictions with the metric that belongs to the task type.

    @param task_type: specifies whether the task is 'Classification' or 'Regression'
    @param true_values: actual values of the data
    @param predicted_values: values predicted by the model
    @return: 'Regression' -> mean squared error
             'Classification' -> tuple (classification accuracy, error rate)
             any other task_type -> None
    '''
    #Classification: accuracy from the Accuracy function, error rate is its complement
    if task_type == 'Classification':
        classification_accuracy = Accuracy(predicted_values,true_values)
        return classification_accuracy, 1-classification_accuracy

    #Regression: mean of the squared differences between true and predicted values
    if task_type == 'Regression':
        residuals = np.subtract(true_values,predicted_values)
        return np.square(residuals).mean()

In [37]:
def RunExperiment(dataset, task_type, model_type='linear', k = None, class_label = None):
    '''
    Runs the experiments using the processed data: tunes the hidden layers when the model has any,
    performs k x 2 cross validation to get testing and training data, standardizes the data,
    trains and tests the model on every split, and prints the evaluation of every split as well
    as the average over all splits.

    @param dataset: pre-processed dataframe
    @param task_type: specifies whether the task is 'Classification' or 'Regression' (any letter case)
    @param model_type: 'linear', 'FF' or 'Autoencoder'
    @param k: k value passed on to k_x_2_cross_validation
    @param class_label: name of class column
    @raises ValueError: if task_type or model_type is not one of the supported values
    '''
    #Every later comparison expects the capitalized spelling, so 'classification' is normalized once here
    task_type = task_type.capitalize()
    if task_type not in ('Classification', 'Regression'):
        raise ValueError("task_type must be 'Classification' or 'Regression', got " + repr(task_type))
    if model_type not in ('linear', 'FF', 'Autoencoder'):
        raise ValueError("model_type must be 'linear', 'FF' or 'Autoencoder', got " + repr(model_type))

    classification = task_type == 'Classification'

    #Running totals of the metrics and the number of experiments they were collected from
    total_accuracy = 0
    total_error = 0
    total_MSE = 0
    num_experiments = 0

    if model_type == 'FF' or model_type == 'Autoencoder':
        nodes_sizes = TuneHidden(model_type,task_type,dataset, class_label,len(dataset.columns)-1)

    #Performs k x 2 cross validation with parameter data
    #NOTE(review): k_x_2_cross_validation and Standardize are not defined in this notebook;
    #presumably they come from DataPreProcessing.ipynb - confirm
    validation_output = k_x_2_cross_validation(dataset, k, classification, class_label)

    #The loop variable has its own name so the k parameter is not overwritten
    for fold_key in validation_output:
        print('K = ' + str(fold_key))
        for num in validation_output[fold_key]:
            print('  Experiment ' + str(num))

            #Gets test and training data for each k
            test_data = validation_output[fold_key][num]['test']
            train_data = validation_output[fold_key][num]['train']

            #Standardizes test and training data
            standardized_data = Standardize(train_data,test_data)
            train_data = standardized_data[0]
            test_data = standardized_data[1]

            print('   Test Fold Size: ' + str(len(test_data)))
            print('   Train Fold Size: ' + str(len(train_data)))

            #Gets the prediction value from the models
            if model_type == 'linear':
                if task_type == 'Classification':
                    trained_weights = LogisticRegression(train_data,class_label,train=True)
                    model_output = LogisticRegression(test_data,class_label,test=True,trained_weights=trained_weights)
                if task_type == 'Regression':
                    trained_weights = LinearRegressionNetwork(train_data,class_label,train=True)
                    model_output = LinearRegressionNetwork(test_data,class_label,test=True,trained_weights=trained_weights)
            if model_type == 'FF':
                trained_weights=FeedForwardNN_2H(train_data,class_label,nodes_sizes,task_type,train=True)
                model_output=FeedForwardNN_2H(test_data,class_label,nodes_sizes,task_type,test=True,trained_weights=trained_weights)
            if model_type == 'Autoencoder':
                trained_weights=Autoencoder(train_data,class_label,nodes_sizes, task_type,train=True)
                model_output= Autoencoder(test_data,class_label,nodes_sizes, task_type,test=True,trained_weights=trained_weights)

            #every model returns [predictions, targets] in testing mode
            predicted_values = model_output[0]
            true_values = model_output[1]

            #Uses the true values from the test set, and predicted values from the model to evaluate accuracy/error
            evaluation = Evaluate(task_type,true_values,predicted_values)
            num_experiments = num_experiments + 1

            #Gets corresponding evaluation metric depending on task type and prints to console
            if task_type == 'Classification':
                accuracy = evaluation[0]
                error = evaluation[1]
                total_accuracy = total_accuracy + accuracy
                total_error = total_error + error
                print('    Classification Accuracy: '+ str(accuracy))
                print('    Error: ' + str(error))
            elif task_type == 'Regression':
                MSE = evaluation
                total_MSE = total_MSE + MSE
                print('    Mean Squared Error: ' + str(MSE))
            print('')

    #Nothing to average when the cross validation produced no splits
    if num_experiments == 0:
        print('No experiments were run')
        print('')
        return

    #Gets the average of the evaluation metric over all experiments actually run and prints it
    if task_type == 'Classification':
        print('Average Classification Accuracy: '+ str(total_accuracy/num_experiments))
        print('Average Error: ' + str(total_error/num_experiments))
    elif task_type == 'Regression':
        print('Average Mean Squared Error: ' + str(total_MSE/num_experiments))
    print('')
        