In [5]:
import copy
import itertools
import math
from heapq import nsmallest

import numpy as np
import pandas as pd
from scipy.stats import mode

%run DataPreProcessing.ipynb

# Distance Functions

In [6]:
def EuclideanDistance(query_point_vector, comparison_vector):
    '''
    Straight-line (L2) distance between two numeric vectors

    @param query_point_vector: test input data vector
    @param comparison_vector: train vector to compare with
    @return: square root of the summed squared element-wise differences

    '''

    # Element-wise gap between the two points
    offsets = query_point_vector - comparison_vector

    # Sum the squared gaps, then take the root
    squared_total = np.sum(offsets ** 2)
    return np.sqrt(squared_total)
        

In [7]:
def RBF(distances, bandwidth):
    '''

    Averages the Gaussian (Radial Basis Function) kernel response over a set of distances

    @param distances: numpy array of the k nearest distances from the KNN function
    @param bandwidth: bandwidth value(1/sigma) for the kernel function
    @return: mean of exp(-bandwidth * distance^2) taken across the supplied distances

    '''
    # Gaussian kernel response of every distance
    squared = distances ** 2
    kernel_weights = np.exp(-bandwidth * squared)

    # Collapse the individual responses into a single value
    return kernel_weights.mean()


In [8]:
def ValueDifferenceMetric(dataset,class_label):
    '''

    Calculates the value difference metric for categorical data,
    and returns a dictionary of the value differences between each
    pair of nominal feature values

    For a pair of values (a, b) of one feature the distance is the sum over
    all classes of |P(class | a) - P(class | b)|, where the probabilities are
    the observed class frequencies in the dataset.

    @param dataset: The complete dataset
    @param class_label: The label of the class column in the dataset
    @return: [vdm_tables, features] where vdm_tables maps
             feature name -> {(value_1, value_2): distance} and features is
             the list of every column other than class_label. The pair keys
             hold the values exactly as they appear in the dataset.

    '''
    #Initializes the final vdm dict to be returned
    vdm_tables = {}

    #Gets all the unique class and all feature names
    labels = dataset[class_label]
    classes = labels.unique()
    features = list((dataset.drop(columns=class_label)).columns)

    #Iterates through each of the features
    for feature in features:
        column = dataset[feature]
        #Gets all the individual categorical values of each feature
        feature_vals = column.unique()
        feature_distances = {}

        #A feature with fewer than two values has no pairs to compare
        if len(feature_vals) >= 2:
            #Class frequencies are computed once per value instead of once per
            #pair; the per-pair loop below then only does cheap arithmetic
            value_totals = column.value_counts()
            class_rates = {}
            for value in feature_vals:
                has_value = column == value
                total = value_totals[value]
                class_rates[value] = [
                    int((has_value & (labels == class_val)).sum()) / total
                    for class_val in classes
                ]

            #Sums the per-class frequency gaps for every pair of values
            for combo in itertools.combinations(feature_vals,2):
                vdm = 0
                for rate_1, rate_2 in zip(class_rates[combo[0]], class_rates[combo[1]]):
                    vdm = vdm + np.abs(rate_1 - rate_2)
                feature_distances[combo]=vdm

        #Adds the dictionary of all the combos of one feature to the complete
        #VDM dict for all the features
        vdm_tables[feature]=feature_distances

    return [vdm_tables,features]
            

In [9]:
def CategoricalDistances(query_point_vector,comparison_vector,features,vdm_table):
    '''
    Calculate distance between two categorical vectors using the Value Difference Metric

    The two vectors are walked feature by feature; whenever their values differ
    the pair's VDM distance is looked up and added to the total. Features that
    have no entry in vdm_table are ignored.

    @param query_point_vector: test input data vector
    @param comparison_vector: train vector to compare with
    @param features: categorical feature names, in the same order as the vector entries
    @param vdm_table: dictionary of all the feature value combos
    and their corresponding VDM(result of the previous Value Difference Metric function)
    @return: summed VDM distance (0 when the vectors agree on every listed feature)
    @raise ValueError: if a vector does not have exactly one entry per feature
    @raise KeyError: if a differing pair of values has no entry in the feature's VDM table

    '''

    #Each vector must line up one-to-one with the feature names
    if len(query_point_vector) != len(features) or len(comparison_vector) != len(features):
        raise ValueError('Both vectors must have exactly one value per feature')

    #Initializes the distance to 0
    dist = 0

    #Iterates through each feature together with the two values to compare
    #(plain iteration replaces the temporary DataFrame and the removed iteritems())
    for feature, query_val, comparison_val in zip(features, query_point_vector, comparison_vector):
        if feature not in vdm_table:
            continue
        if query_val == comparison_val:
            continue

        #Gets the feature VDM dictionary from the overall VDM dict input
        feature_vdm = vdm_table.get(feature)

        #A pair is stored in one order only, keyed by string or by raw value,
        #so every spelling is tried, with the string forms first
        candidate_keys = (
            (str(comparison_val), str(query_val)),
            (str(query_val), str(comparison_val)),
            (comparison_val, query_val),
            (query_val, comparison_val),
        )
        curr_dist = None
        for combo in candidate_keys:
            if combo in feature_vdm:
                curr_dist = feature_vdm[combo]
                break
        if curr_dist is None:
            raise KeyError(
                'No VDM entry for values %r and %r of feature %r'
                % (query_val, comparison_val, feature)
            )

        #All individual feature distances summed together for final distance
        dist = dist+float(curr_dist)
    return dist


# Tuning

In [10]:
def getThresholds(target):
    '''
    Builds the candidate error thresholds used during tuning

    @param target: target vector
    @return: list of three tolerances, (1 - 0.9), (1 - 0.95) and (1 - 0.99)
             times the spread (max - min) of the target values

    '''
    #Spread of the target values
    spread = np.max(target) - np.min(target)

    #One tolerance per accuracy level, as a fraction of the spread
    return [(1 - level) * spread for level in (0.9, 0.95, 0.99)]
            
def getAllTuneVals(dataset,task_type,class_label,categorical=False):
    '''
    Tunes the number of neighbors and, for regression, the bandwidth and error threshold

    Every candidate setting is scored with RunExperiment and the setting with
    the smallest returned score wins.

    @param dataset: the 20% of dataset for tuning
    @param task_type: 'Classification' or 'Regression'
    @param class_label: class label
    @param categorical: boolean value indicating if data is categorical
    @return: the best number of neighbors for classification;
             [neighbors, bandwidth, error threshold] for regression;
             None for any other task_type
    '''

    k_list = list(range(1,11))

    if task_type=='Classification':
        errors_dict = {}
        for num in k_list:
            class_error = RunExperiment(dataset, task_type, k = 5, class_label = class_label, tuning=True, knn=num, error=None, bandwidth=None, categorical=categorical)
            errors_dict[num]=class_error
        return min(errors_dict, key=errors_dict.get)
    if task_type == 'Regression':
        sigmas = [.1,1,10,100]
        bandwidths = [1/sigma for sigma in sigmas]
        thresholds = getThresholds(dataset[class_label].to_numpy())

        errors_dict = {}

        #Each combination is unpacked by name so the bandwidth and the error
        #threshold cannot be handed to the wrong RunExperiment argument
        for knn, bandwidth, threshold in itertools.product(k_list, bandwidths, thresholds):
            mse = RunExperiment(dataset, task_type, k = 5, knn_type='knn',class_label = class_label, tuning=True, knn = knn, error=threshold, bandwidth=bandwidth,categorical=categorical)
            errors_dict[(knn, bandwidth, threshold)]=mse
        return list(min(errors_dict, key=errors_dict.get))

# KNN

In [11]:
#Function to calculate KNN
def KNN(train, test, k, class_label,task_type,bandwidth=None,categorical=False,vdm_dict=None,nominal_features=None,verbose=True):
    '''
    Runs the K nearest neighbors algorithm and returns one prediction per test row

    Classification predicts the most common label among the k nearest training
    rows (ties go to the smallest label). Regression predicts the average of the
    k nearest training targets, weighted by a Gaussian (RBF) kernel of their
    distances to the query row.

    @param train: training data (includes the class_label column)
    @param test: testing data (includes the class_label column)
    @param k: number of nearest neighbors
    @param class_label: class label
    @param task_type: indicates classification or regression ('Classification' or 'Regression')
    @param bandwidth: tuned bandwidth for the RBF kernel (needed for regression)
    @param categorical: boolean value indicating if data is categorical
    @param vdm_dict: value difference metric mapping table (only needed if categorical is True)
    @param nominal_features: list of nominal features (only needed if categorical is True)
    @param verbose: when True (the default) the per-query diagnostics are printed
    @return: list of predictions in test-row order (empty for any other task_type)
    @raise ValueError: if there are rows to predict but no training rows

    '''

    train_data = (train.drop(columns=[class_label])).to_numpy()
    train_target=train[class_label].to_numpy()
    test_data = (test.drop(columns=[class_label])).to_numpy()

    predicted_values = []

    classify = task_type == 'Classification'
    if not classify and task_type != 'Regression':
        return predicted_values
    if len(test_data) > 0 and len(train_data) == 0:
        raise ValueError('KNN needs at least one training row to make predictions')

    #The Value Difference Metric is only used when classifying;
    #regression always measures Euclidean distance
    use_vdm = classify and bool(categorical)

    #Loop through the Datapoints to be predicted
    for query_row in test_data:

        #Distance from the query row to every training row
        if use_vdm:
            distances = [CategoricalDistances(query_row, train_row, nominal_features, vdm_dict) for train_row in train_data]
        else:
            distances = [EuclideanDistance(query_row, train_row) for train_row in train_data]
        distances = np.array(distances)

        #Row positions of the k closest training rows
        #(a stable sort keeps tie-breaking reproducible)
        k_nearest_indices = np.argsort(distances, kind='stable')[:k]

        if classify:
            #Labels of the K datapoints from above
            k_nearest_classes = train_target[k_nearest_indices]

            if verbose:
                print('Query Row')
                print(query_row)
                #Output kept as before: this label is followed by the row
                #positions of the neighbors, not by their distances
                print('K Nearest Distances')
                print(k_nearest_indices)

                print('K Nearest Classes')
                print(k_nearest_classes)

            #Majority voting; np.unique sorts the labels, so argmax
            #resolves ties in favour of the smallest label
            labels, votes = np.unique(k_nearest_classes, return_counts=True)
            prediction = labels[np.argmax(votes)]
        else:
            #Kernel-weighted average of the neighbors' targets. The smallest
            #squared distance is subtracted before exponentiating: this rescales
            #every weight by the same factor (leaving the average unchanged) and
            #guarantees the closest neighbor a weight of 1, so the weights can
            #never all underflow to zero
            squared = distances[k_nearest_indices] ** 2
            weights = np.exp(-bandwidth * (squared - squared.min()))
            prediction = np.dot(weights, train_target[k_nearest_indices]) / weights.sum()

            if verbose:
                print('Query Row')
                print(query_row)
                #Output kept as before: row positions of the neighbors
                print('K Nearest Distances')
                print(k_nearest_indices)

                print('Prediction')
                print(prediction)

        predicted_values.append(prediction)

    return predicted_values

In [12]:
#Function to calculate Edited KNN
def EditedKNN(train, test, k, class_label,task_type,bandwidth=None,error=None, categorical=False,vdm_dict=None,nominal_features=None,prev_accuracy=0):
    '''
    Runs the edited nearest neighbors algorithm and returns the edited training data

    Each pass predicts every test row from its single nearest training row and
    marks that training row for removal when the prediction is wrong (a label
    mismatch for classification, an absolute error above `error` for
    regression). The edited set is then scored with KNN + Evaluate. Passes
    repeat until a pass removes nothing or the score drops below the previous
    pass's score.

    @param train: training data
    @param test: testing data used to decide which training rows to remove
    @param k: number of nearest neighbors used when scoring the edited set with KNN
    @param class_label: class label
    @param task_type: indicates classification or regression
    @param bandwidth: tuned bandwidth for the RBF kernel
    @param error: largest absolute error still counted as correct (regression only)
    @param categorical: boolean value indicating if data is categorical
    @param vdm_dict: value difference metric mapping table (only needed if categorical is True)
    @param nominal_features: list of nominal features (only needed if categorical is True)
    @param prev_accuracy: score the first edited set has to reach to be accepted
    @return: the final edited training set. If an edit makes the score drop, the
             set from before that edit is returned. None for any other task_type.

    '''
    classify = task_type == 'Classification'
    if not classify and task_type != 'Regression':
        return None

    test_data = (test.drop(columns=[class_label])).to_numpy()
    test_target = test[class_label].to_numpy()
    edit_knn=1

    #VDM distances are only used when classifying; regression is always Euclidean
    use_vdm = classify and bool(categorical)

    #Iterating replaces the earlier self-call whose result was thrown away
    current_train = train
    while True:
        #With no training rows left there is nothing to edit
        if len(current_train) == 0:
            return current_train

        train_data = (current_train.drop(columns=[class_label])).to_numpy()
        train_target = current_train[class_label].to_numpy()
        remove_list = []

        #Loop through the Datapoints to be predicted
        for idx in range(0,len(test_data)):
            #Distance from this test row to every training row
            if use_vdm:
                distances = [CategoricalDistances(test_data[idx], train_row, nominal_features, vdm_dict) for train_row in train_data]
            else:
                distances = [EuclideanDistance(test_data[idx], train_row) for train_row in train_data]
            distances = np.array(distances)

            #Row positions of the closest training rows
            nearest_indices = np.argsort(distances, kind='stable')[:edit_knn]
            nearest_targets = train_target[nearest_indices]

            if classify:
                #Majority voting (ties go to the smallest label)
                labels, votes = np.unique(nearest_targets, return_counts=True)
                prediction = labels[np.argmax(votes)]
                mistaken = prediction != test_target[idx]
            else:
                #Kernel-weighted average of the neighbors' targets; shifting by
                #the smallest squared distance leaves the average unchanged and
                #keeps the weights from all underflowing to zero
                squared = distances[nearest_indices] ** 2
                weights = np.exp(-bandwidth * (squared - squared.min()))
                prediction = np.dot(weights, nearest_targets) / weights.sum()
                mistaken = np.abs(prediction - test_target[idx])>error

            #Adds the neighbor behind an incorrect prediction to the list of rows to remove
            if mistaken:
                remove_list.append(nearest_indices[0])

        #Removes the marked rows by position, so repeated marks and
        #repeated index labels are both harmless
        keep_mask = np.ones(len(current_train), dtype=bool)
        keep_mask[np.array(remove_list, dtype=int)] = False
        edited_train = current_train[keep_mask]

        if classify:
            print('To Remove')
            print(remove_list)
            print(len(remove_list))
            print('Before Editing')
            print(len(current_train))
            print('After Editing')
            print(len(edited_train))

        #An edit that deletes every row cannot be scored; keep the last usable set
        if len(edited_train) == 0:
            return current_train

        new_predicted = KNN(edited_train, test, k, class_label,task_type,bandwidth=bandwidth,categorical=categorical,vdm_dict=vdm_dict, nominal_features=nominal_features)
        #NOTE(review): element 0 of Evaluate's result is treated as a
        #higher-is-better score, as in the original comparison; confirm for regression
        if classify:
            new_accuracy = Evaluate(task_type,test_target,new_predicted)[0]
        else:
            new_accuracy = Evaluate(task_type,test_target,new_predicted,error=error)[0]

        #A drop in the score rejects this edit and ends the editing
        if new_accuracy<prev_accuracy:
            return current_train
        #Nothing was removed, so further passes cannot change anything
        if len(edited_train)==len(current_train):
            return edited_train

        #Otherwise another pass is run on the new edited train
        current_train = edited_train
        prev_accuracy = new_accuracy


In [13]:
def CondensedKNN(condensed_train, test, k, class_label,task_type,bandwidth=None,error=None, categorical=False,vdm_dict=None,nominal_features=None):
    '''
    Runs the condensed nearest neighbors algorithm and returns the condensed training data

    Each pass predicts every remaining candidate row from its single nearest row
    in the condensed set. Candidates that are predicted wrongly (a label
    mismatch for classification, an absolute error above `error` for
    regression) are moved into the condensed set at the end of the pass. Passes
    repeat until a pass moves nothing.

    @param condensed_train: rows the condensed set starts from
    @param test: candidate rows that may be moved into the condensed set
    @param k: number of nearest neighbors (not used while condensing, which always uses one neighbor)
    @param class_label: class label
    @param task_type: indicates classification or regression
    @param bandwidth: tuned bandwidth for the RBF kernel
    @param error: largest absolute error still counted as correct (regression only)
    @param categorical: boolean value indicating if data is categorical
    @param vdm_dict: value difference metric mapping table (only needed if categorical is True)
    @param nominal_features: list of nominal features (only needed if categorical is True)
    @return: the final condensed train set (None for any other task_type)
    @raise ValueError: if there are candidate rows but the condensed set is empty

    '''
    classify = task_type == 'Classification'
    if not classify and task_type != 'Regression':
        return None

    condense_knn=1

    #Candidate rows that have not been moved into the condensed set yet
    remaining = test

    #Iterating replaces the earlier self-call whose result was thrown away
    while True:
        train_data = (condensed_train.drop(columns=[class_label])).to_numpy()
        train_target = condensed_train[class_label].to_numpy()
        remaining_data = (remaining.drop(columns=[class_label])).to_numpy()
        remaining_target = remaining[class_label].to_numpy()

        if len(remaining_data) > 0 and len(train_data) == 0:
            raise ValueError('CondensedKNN needs at least one row in condensed_train to start from')

        add_to_condensed_list = []

        #Loop through the candidate rows
        for idx in range(0,len(remaining_data)):
            #Distance from this candidate to every row of the condensed set
            if categorical:
                distances = [CategoricalDistances(remaining_data[idx], train_row, nominal_features, vdm_dict) for train_row in train_data]
            else:
                distances = [EuclideanDistance(remaining_data[idx], train_row) for train_row in train_data]
            distances = np.array(distances)

            #Row positions of the closest condensed rows
            nearest_indices = np.argsort(distances, kind='stable')[:condense_knn]
            nearest_targets = train_target[nearest_indices]

            if classify:
                #Majority voting (ties go to the smallest label)
                labels, votes = np.unique(nearest_targets, return_counts=True)
                prediction = labels[np.argmax(votes)]
                mistaken = prediction != remaining_target[idx]
            else:
                #Kernel-weighted average of the neighbors' targets; shifting by
                #the smallest squared distance leaves the average unchanged and
                #keeps the weights from all underflowing to zero
                squared = distances[nearest_indices] ** 2
                weights = np.exp(-bandwidth * (squared - squared.min()))
                prediction = np.dot(weights, nearest_targets) / weights.sum()
                mistaken = np.abs(prediction - remaining_target[idx])>error

            #Adds the wrongly predicted candidate to the list of rows to move to the condensed set
            if mistaken:
                add_to_condensed_list.append(idx)

        if classify:
            print('To Move to Condensed')
            print(add_to_condensed_list)
            print(len(add_to_condensed_list))
            print('Before Condensing')
            print(len(condensed_train))

        #If there are no points to add to the condensed list, the list is complete
        if len(add_to_condensed_list) == 0:
            return condensed_train

        #Moves the marked rows into the condensed set (pd.concat replaces the
        #removed DataFrame.append) and drops them from the candidates by
        #position, which stays correct when index labels repeat
        add_df = remaining.iloc[add_to_condensed_list]
        condensed_train = pd.concat([condensed_train, add_df])
        keep_mask = np.ones(len(remaining), dtype=bool)
        keep_mask[np.array(add_to_condensed_list, dtype=int)] = False
        remaining = remaining[keep_mask]

        if classify:
            print('After Condensing')
            print(len(condensed_train))
            