In [14]:
import numpy as np

from scipy.io import loadmat

import pandas as pd

from numpy.linalg import eig

from scipy.stats import mode

from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import StratifiedKFold

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import warnings

warnings.filterwarnings("ignore")

from sklearn.utils import shuffle

import scipy.stats as stats

# PCA

In [15]:
class PCA :
    """
    Principal component analysis based on the eigen-decomposition of the
    feature correlation matrix.

    Note: the eigenvectors come from the correlation matrix, while
    `transformation` only mean-centres X before projecting. The two are
    consistent when X has already been standardised to unit variance
    (the notebook z-scores the data before using this class).
    """

    # ---------------eigenvalues and eigenvectors of the correlation matrix

    def eig_vector( self, X ) :
        """
        Eigen-decompose the correlation matrix of the columns of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Data matrix; each column is treated as one variable.

        Returns
        -------
        e : ndarray, shape (n_features,)
            Real eigenvalues sorted in descending order.
        V : ndarray, shape (n_features, n_features)
            Eigenvectors stored as columns, in the same order as `e`.
        """

        X = np.asarray( X )

        # correlation between the columns of X; np.corrcoef removes the mean
        # itself, so no explicit centring is required here.
        # atleast_2d keeps the single-feature case (0-d result) decomposable.
        X_corr = np.atleast_2d( np.corrcoef( X, rowvar = False ) )

        # the correlation matrix is symmetric: eigh guarantees real eigenvalues
        # and orthonormal eigenvectors, whereas the general solver eig can
        # return complex values because of round-off
        e, V = np.linalg.eigh( X_corr )

        # eigh returns the eigenvalues in ascending order; reorder to descending
        idx = np.argsort( -e )

        return e[idx], V[:, idx]

    # ----------------projection of X--------------------

    def transformation( self, X, no_of_components ) :
        """
        Project X onto its leading principal components.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        no_of_components : int
            Number of leading eigenvectors (largest eigenvalues) to keep.

        Returns
        -------
        ndarray, shape (n_samples, no_of_components)
            Mean-centred data expressed in the retained eigenvector basis.
        """

        X = np.asarray( X )

        e, V = self.eig_vector( X )

        # columns of V are sorted by decreasing eigenvalue
        p = V[:, : no_of_components ]

        # centre the data and project it onto the retained directions
        X_stand = X - np.mean( X, 0 )

        return np.dot( X_stand, p )

# Conditional Number    ---    keep the components whose eigenvalue satisfies  lambda > max( lambda ) / 10

In [16]:
# function returning the number of principal components to keep

def conditional_number( X, ratio = 10 ) :
    """
    Number of principal components selected by the eigenvalue-ratio rule
    lambda >= max( lambda ) / ratio.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data passed to PCA.eig_vector.
    ratio : float, default 10
        Divisor applied to the largest eigenvalue to obtain the threshold.

    Returns
    -------
    int
        Count of eigenvalues that reach the threshold, never less than 1.
        When every eigenvalue reaches it, all components are kept.
    """

    pca = PCA()

    e, V = pca.eig_vector( X )

    # eigenvalues arrive sorted in descending order, so e[0] is the largest
    condition = e[0] / ratio

    # Count the eigenvalues that reach the threshold. Looking for the first
    # index below it with np.argmax would yield 0 when no eigenvalue is below
    # the threshold, which is indistinguishable from "keep nothing".
    no_of_components = int( np.count_nonzero( e >= condition ) )

    # always keep at least one component
    return max( no_of_components, 1 )

# KNN 

In [17]:
# K Nearest Neighbors Classification

class K_Nearest_Neighbors_Classifier() :
    """
    K nearest neighbours classifier using the Euclidean distance and a
    majority vote among the K closest training examples.

    Labels may be given as a 1-D array or as a single-column 2-D array.
    Predictions are returned as a float array, so labels must be numeric.
    """

    def __init__( self, K ) :
        """
        Parameters
        ----------
        K : int
            Number of neighbours taking part in the vote.
        """

        self.K = K

    # Function to store training set

    def fit( self, X_train, Y_train ) :
        """
        Memorise the training set; no model is fitted.

        Parameters
        ----------
        X_train : array-like, shape (n_train, n_features)
        Y_train : array-like, shape (n_train,) or (n_train, 1)
        """

        self.X_train = np.asarray( X_train )

        self.Y_train = np.asarray( Y_train )

        # no_of_training_examples, no_of_features
        self.m, self.n = self.X_train.shape

    # Function for prediction

    def predict( self, X_test ) :
        """
        Predict the label of every row of X_test.

        Parameters
        ----------
        X_test : array-like, shape (n_test, n_features)

        Returns
        -------
        ndarray of float, shape (n_test,)
            Most frequent label among the K nearest training examples;
            ties are resolved in favour of the smallest label.
        """

        self.X_test = np.asarray( X_test )

        # no_of_test_examples
        self.m_test = self.X_test.shape[0]

        Y_predict = np.zeros( self.m_test )

        for i in range( self.m_test ) :

            # labels of the K nearest neighbours, flattened so that 1-D and
            # single-column label arrays are handled the same way
            neighbors = np.ravel( self.find_neighbors( self.X_test[i] ) )

            # Majority vote. np.unique returns the labels sorted, so argmax
            # picks the smallest label on ties. This avoids indexing the
            # result of scipy.stats.mode, whose shape differs between SciPy
            # versions for 1-D input.
            labels, counts = np.unique( neighbors, return_counts = True )

            Y_predict[i] = labels[ np.argmax( counts ) ]

        return Y_predict

    # Function to find the K nearest neighbors to current test example

    def find_neighbors( self, x ) :
        """
        Return the labels of the K training examples closest to x.

        Parameters
        ----------
        x : ndarray, shape (n_features,)

        Returns
        -------
        ndarray
            The first K entries of Y_train ordered by increasing distance
            to x (fewer if the training set has less than K rows).
        """

        # distances from x to every training row in one vectorised call
        euclidean_distances = self.euclidean( x, self.X_train )

        # stable sort: equidistant examples keep their training-set order
        inds = np.argsort( euclidean_distances, kind = "stable" )

        return self.Y_train[ inds[ : self.K ] ]

    # Function to calculate euclidean distance

    def euclidean( self, x, x_train ) :
        """
        Euclidean distance between x and x_train.

        x_train may be a single vector (a scalar is returned) or a matrix
        with one example per row (one distance per row is returned).
        """

        return np.sqrt( np.sum( np.square( x - x_train ), axis = -1 ) )

# Banknote Dataset 

In [18]:
# read the Banknote dataset from the MATLAB file
data = loadmat( 'Databases/Banknote.mat' )

# feature matrix, standardised column by column (zero mean, unit variance)
X = stats.zscore( data['X'] )

# class labels, kept with the shape stored in the file
y = data['Y']

# sanity check of the array dimensions
print( X.shape )

print( y.shape )

(1372, 4)
(1372, 1)


In [19]:
# choose how many principal components to keep with the eigenvalue-ratio rule
no_of_components_conditonal_number = conditional_number( X )

print( "Conditional number : ", no_of_components_conditonal_number )
    
# project the standardised data onto that many leading principal components
pca = PCA()

X_conditional_number = pca.transformation( X, no_of_components_conditonal_number )

Conditional number :  3


In [20]:
# return the average balanced accuracy over repeated stratified k-fold cross-validation

def magic( X, y, model, n_repeats = 10, n_splits = 10, random_state = None ) :
    """
    Mean balanced accuracy of `model` over repeated stratified k-fold
    cross-validation.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
    y : ndarray, shape (n_samples,) or (n_samples, 1)
        Class labels; passed to the model with the shape given here.
    model : object
        Must provide fit( X_train, y_train ) and predict( X_test ).
    n_repeats : int, default 10
        Number of times the whole cross-validation is repeated.
    n_splits : int, default 10
        Number of folds of each cross-validation.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator driving every shuffle. None uses numpy's global
        random state.

    Returns
    -------
    float
        Mean over the repeats of the per-repeat mean balanced accuracy.
    """

    # A single generator is shared by all shuffles, so a fixed seed reproduces
    # the whole experiment while each repeat still gets different folds.
    if random_state is None or isinstance( random_state, np.random.RandomState ) :

        rng = random_state

    else :

        rng = np.random.RandomState( random_state )

    balanced_accuracies = []

    # outer loop: repeat the cross-validation
    for _ in range( n_repeats ) :

        # sklearn.utils.shuffle returns shuffled copies and leaves its inputs
        # untouched, so the result has to be captured to have any effect
        X_shuffled, y_shuffled = shuffle( X, y, random_state = rng )

        skfold = StratifiedKFold( n_splits = n_splits, shuffle = True, random_state = rng )

        balanced_accuracy_fold = []

        # inner loop: stratified k-fold cross-validation
        for train_index, test_index in skfold.split( X_shuffled, y_shuffled ) :

            X_train, X_test = X_shuffled[train_index], X_shuffled[test_index]

            y_train, y_test = y_shuffled[train_index], y_shuffled[test_index]

            model.fit( X_train, y_train )

            balanced_accuracy_fold.append( balanced_accuracy_score( y_test, model.predict( X_test ) ) )

        balanced_accuracies.append( np.mean( balanced_accuracy_fold ) )

    return np.mean( balanced_accuracies )

In [21]:
# NOTE(review): this initial value is not read by the loop below; K is rebound to range( 1, 30 ) in a later cell
K = 1

# mean balanced accuracy returned by magic() for each neighbourhood size, in the order K = 1 .. 29
balanced_accuracy_K = []

for i in range( 1, 30 ) :    
    
    # a fresh classifier with i neighbours, evaluated on the PCA-projected data
    knn = K_Nearest_Neighbors_Classifier( K = i )

    balanced_accuracy_K.append( magic( X_conditional_number, y, knn ) )    

In [22]:
# neighbourhood sizes; must stay identical to the range used by the sweep loop that filled balanced_accuracy_K
K = range( 1, 30 )

In [23]:
# pair every K with its mean balanced accuracy (both sequences are in the same order)
Neighbors = { 'K' : K, 'balanced_accuracy' : balanced_accuracy_K }

In [24]:
# turn the dict into a table with one row per K (rebinds Neighbors from dict to DataFrame)
Neighbors = pd.DataFrame( Neighbors )

In [25]:
# save the results to Neighbors.csv in the current working directory (the row index is written too, pandas' default)
Neighbors.to_csv( "Neighbors.csv" )