In [15]:
import numpy as np

from scipy.io import loadmat

import pandas as pd

from numpy.linalg import eig

import matplotlib.pyplot as plt

from scipy.stats import mode

# Model KNN

In [16]:
# K Nearest Neighbors Classification

class K_Nearest_Neighbors_Classifier() :
    """K-nearest-neighbours classifier (Euclidean distance, majority vote).

    The model is "lazy": fit() only stores the training data and all the
    work happens in predict(). Labels may be given as a 1-D array of
    length m or as an (m, 1) column vector.
    """

    def __init__( self, K ) :

        # number of neighbours consulted for every prediction
        self.K = K

    # Function to store training set

    def fit( self, X_train, Y_train ) :
        """Remember the training examples X_train (m, n) and their labels Y_train."""

        self.X_train = X_train

        self.Y_train = Y_train

        # no_of_training_examples, no_of_features

        self.m, self.n = X_train.shape

    # Function for prediction

    def predict( self, X_test ) :
        """Return a float array with one predicted label per row of X_test.

        Each prediction is the most frequent label among the K nearest
        training examples; ties go to the smallest label value.

        Raises ValueError if X_test does not have the same number of
        features as the training set.
        """

        self.X_test = X_test

        # no_of_test_examples, no_of_features

        self.m_test, n_test = X_test.shape

        if n_test != self.n :

            raise ValueError( "X_test has %d features, but the model was fitted with %d"
                              % ( n_test, self.n ) )

        # initialize Y_predict

        Y_predict = np.zeros( self.m_test )

        for i in range( self.m_test ) :

            # labels of the K nearest neighbors, flattened so that both
            # 1-D and column-vector label arrays are handled the same way

            neighbors = np.ravel( self.find_neighbors( self.X_test[i] ) )

            # most frequent class in K neighbors; np.unique returns the
            # labels sorted, so argmax picks the smallest label on a tie

            labels, counts = np.unique( neighbors, return_counts = True )

            Y_predict[i] = labels[ np.argmax( counts ) ]

        return Y_predict

    # Function to find the K nearest neighbors to current test example

    def find_neighbors( self, x ) :
        """Return the labels of the K training examples closest to x.

        The result keeps the layout of Y_train (shape (K,) or (K, 1)).
        """

        # euclidean distances between x and every row of X_train in one
        # vectorised step instead of a Python loop

        euclidean_distances = np.sqrt( np.sum( np.square( self.X_train - x ), axis = 1 ) )

        # sort Y_train by distance; a stable sort keeps equally distant
        # examples in their training-set order

        inds = euclidean_distances.argsort( kind = "stable" )

        Y_train_sorted = self.Y_train[inds]

        return Y_train_sorted[:self.K]

    # Function to calculate euclidean distance

    def euclidean( self, x, x_train ) :
        """Euclidean distance between two feature vectors."""

        return np.sqrt( np.sum( np.square( x - x_train ) ) )

# Model Logistic Regression

In [6]:
# # Logistic Regression

class LogitRegression() :
    """Binary logistic regression fitted with batch gradient descent."""

    def __init__( self, learning_rate, iterations ) :

        # step size of each gradient-descent update
        self.learning_rate = learning_rate

        # number of gradient-descent passes run by fit()
        self.iterations = iterations

    def _sigmoid( self, z ) :
        """Logistic function 1 / (1 + e^-z), applied element-wise."""

        return 1 / ( 1 + np.exp( - z ) )

    # Function for model training

    def fit( self, X, Y ) :
        """Train on X (m, n) with labels Y and return the fitted model."""

        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape

        # weights and bias start at zero
        self.W = np.zeros( self.n )
        self.b = 0

        self.X, self.Y = X, Y

        # gradient descent learning
        for _ in range( self.iterations ) :
            self.update_weights()

        return self

    # Helper function to update weights in gradient descent

    def update_weights( self ) :
        """Perform one gradient-descent step on W and b; returns self."""

        activations = self._sigmoid( self.X.dot( self.W ) + self.b )

        # prediction error per example, flattened to length m so that
        # column-vector labels work as well as 1-D labels
        residual = np.reshape( activations - self.Y.T, self.m )

        # gradients of the loss with respect to weights and bias
        grad_W = np.dot( self.X.T, residual ) / self.m
        grad_b = np.sum( residual ) / self.m

        # update weights
        self.W = self.W - self.learning_rate * grad_W
        self.b = self.b - self.learning_rate * grad_b

        return self

    # Hypothetical function  h( x )

    def predict( self, X ) :
        """Return 1 where the predicted probability exceeds 0.5, otherwise 0."""

        probabilities = self._sigmoid( X.dot( self.W ) + self.b )

        return np.where( probabilities > 0.5, 1, 0 )

# Dataset set up

## Breast Cancer 

In [17]:
# NOTE(review): the same file is loaded again, verbatim, in the next cell; one of the two loads is redundant.
data = loadmat( 'Databases/breastCancer.mat' )

In [18]:
# Load the .mat file and pull the 'X' and 'Y' entries out of the returned mapping.
data = loadmat( 'Databases/breastCancer.mat' )

X = data['X']

y = data['Y']

print( X.shape )

print( y.shape )

In [19]:
# NOTE(review): repeats the shape printout at the end of the previous cell.
print( X.shape )

print( y.shape )

(569, 30)
(569, 1)


# PCA and selection methods

In [20]:
def eig_vector( X ) :
    """Eigen-decompose the covariance matrix of X.

    X holds one sample per row and one feature per column. Returns
    ( eigenvalues, eigenvectors ) ordered from largest to smallest
    eigenvalue; column j of the eigenvector matrix belongs to
    eigenvalue j.
    """

    # centre every feature on its mean
    X_centered = X - np.mean( X, axis = 0 )

    # np.cov expects one variable per row, hence the transpose
    covariance = np.cov( X_centered.T )

    eigenvalues, eigenvectors = eig( covariance )

    # indices that put the eigenvalues in descending order
    order = np.argsort( -eigenvalues )

    return eigenvalues[order], eigenvectors[:, order]



# projection of X

def transformation( X, no_of_components, V ) :
    """Project X onto the first no_of_components columns of V.

    X is centred on its own column means before the projection, so the
    result has one row per sample and no_of_components columns.
    """

    # leading columns of V form the projection basis
    basis = V[:, : no_of_components ]

    # centre the data, then project the original dataset
    centered = X - np.mean( X, axis = 0 )

    return np.dot( centered, basis )

# Conditional Number

## Keep the components with $\lambda_{\max} / \lambda_i < 10$

In [59]:
# function return number of components 

def conditional_number( X, max_ratio = 10 ) :
    """Count the principal components kept by the conditional-number rule.

    A component is kept when the largest eigenvalue divided by its own
    eigenvalue is below max_ratio, i.e. when its eigenvalue is greater
    than ( largest eigenvalue / max_ratio ).

    X         : data matrix, one sample per row
    max_ratio : largest allowed ratio lambda_max / lambda (default 10)

    Returns the number of components as an int (not the transformed data).
    """

    # eigenvalues arrive sorted in descending order, so e[0] is the largest
    e, V = eig_vector( X )

    threshold = e[0] / max_ratio

    return int( np.count_nonzero( e > threshold ) )

# kaiser rule  
## Keep the components with $\lambda_i > 1$

In [22]:
# function return transformed X after apply PCA-kaiser rule

def kaiser_rule( X ) :
    """Return X projected onto the components whose eigenvalue is greater than 1."""

    eigenvalues, eigenvectors = eig_vector( X )

    # number of eigenvalues that pass the Kaiser threshold
    keep = np.count_nonzero( eigenvalues > 1 )

    return transformation( X, keep, eigenvectors )

In [23]:
# broken stick rule

In [24]:
# function return transformed X after apply PCA-broken stick rule

def broken_stick( X ) :
    """Return X projected onto the components kept by the broken-stick rule.

    Component k (1-based) is compared against the expected length of the
    k-th longest piece of a unit stick broken at random into p pieces,
    g_k = ( 1 / p ) * sum_{ i = k..p } 1 / i. Leading components are kept
    until the first one whose share of the variance falls below g_k.

    If even the first component is below its expectation, no component is
    kept and the result has zero columns.
    """

    e, V = eig_vector( X )

    # Calculate the proportional variance

    propvar = e / np.sum( e )

    # calculate the the expected length of the k-th longest segment:
    # a reversed cumulative sum of 1/1, 1/2, ..., 1/p gives the tail sums

    p = np.size( e )

    harmonic_terms = 1.0 / np.arange( 1, p + 1 )

    g = np.cumsum( harmonic_terms[::-1] )[::-1] / p

    # Find the first component whose variance is not larger than chance;
    # its 0-based index equals the number of components kept before it

    below_chance = np.flatnonzero( propvar < g )

    no_of_components = int( below_chance[0] ) if below_chance.size else p

    return transformation( X, no_of_components, V)

In [25]:
# e, V = eig_vector( X )

# p = np.size( e )
 
# g = np.zeros( ( p ) )
    
# k = 0
    
# while( k < p ) :
    
  #  i = k
        
   # while( i < p ) :
        
    #    g[k] = g[k] + ( 1 / ( i + 1 ) )
        
     #   i = i + 1
                               
    # k = k + 1

# g = g / p                  

In [26]:
# g

In [27]:
# propvar = e / sum( e )

In [28]:
# g

# propvar < g

In [29]:
# inds = np.count_nonzero( propvar < g )

# inds - 1

In [30]:
# propvar

# Modelling after PCA Kaiser 

## KNN, Logit & sklearn LDA (I compared sklearn's LDA with my own LDA implementation, which is less optimized, so I went with sklearn's)

In [31]:
# Project X onto the principal components retained by the Kaiser rule.
X_transform_kaiser = kaiser_rule( X )

print( "Selected Components by kaiser rule : ", X_transform_kaiser.shape[1] )

from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing.
# NOTE(review): no random_state is passed, so the split (and every score below) changes between runs.
X_train, X_test, y_train, y_test = train_test_split( X_transform_kaiser, y, test_size = 0.25 )

Selected Components by kaiser rule :  7


In [32]:
# KNN classifier defined earlier in this notebook, using 5 neighbours.
model1 = K_Nearest_Neighbors_Classifier( K = 5 )

In [33]:
# fit() only stores the training set; the neighbour search happens in predict()
model1.fit( X_train, y_train )

In [34]:
y_pred1 = model1.predict( X_test )

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
# accuracy of the KNN predictions on the held-out test split
print( "KNN kaiser : ", accuracy_score( y_test, y_pred1 ) )

KNN kaiser :  0.9440559440559441


In [310]:
# Logistic regression defined earlier in this notebook: 100 gradient-descent steps with step size 0.01.
model2 = LogitRegression( learning_rate = 0.01, iterations = 100 )

In [311]:
# fit() returns the model itself, which is why the cell echoes the object repr
model2.fit( X_train, y_train )



<__main__.LogitRegression at 0x7f09e9a870d0>

In [312]:
# 0/1 predictions for the held-out test split
y_pred2 = model2.predict( X_test )

In [313]:
# accuracy of the logistic-regression predictions on the test split
print( "LR kaiser : ", accuracy_score( y_test, y_pred2 ) )

LR kaiser :  0.9230769230769231


In [314]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [315]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# scikit-learn's LDA on the Kaiser-rule components
model3 = LinearDiscriminantAnalysis()

# y_train is a column vector here; flatten it to the 1-D label array that
# scikit-learn estimators expect, instead of relying on implicit conversion
model3.fit( X_train, y_train.ravel() )

y_pred3 = model3.predict( X_test )

  return f(**kwargs)


In [316]:
# accuracy of the LDA predictions on the held-out test split
print( "Linear Discriminant kaiser : ", accuracy_score( y_test, y_pred3 ) )

Linear Discriminant kaiser :  0.9370629370629371


# Modelling after PCA - conditional number selection

In [317]:
# conditional_number() returns only the number of components to keep,
# so the projection of X has to be done explicitly afterwards
no_of_components = conditional_number( X )

e, V = eig_vector( X )

X_transform_conditional = transformation( X, no_of_components, V )

print( "Selected Components by conditional number rule : ", X_transform_conditional.shape[1] )

from sklearn.model_selection import train_test_split

# hold out 25% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split( X_transform_conditional, y, test_size = 0.25 )

Selected Components by conditional number rule :  1


In [318]:
# Model KNN

In [319]:
# Same 5-neighbour KNN as before, now trained on the current train/test split.
model1 = K_Nearest_Neighbors_Classifier( K = 5 )

model1.fit( X_train, y_train )

y_pred1 = model1.predict( X_test )

# accuracy on the held-out test split
print( "KNN : ", accuracy_score( y_test, y_pred1 ) )

KNN :  0.9020979020979021


In [320]:
# Model Logistic Regression: same hyper-parameters as before, trained on the current train/test split

model2 = LogitRegression( learning_rate = 0.01, iterations = 100 )

model2.fit( X_train, y_train )

y_pred2 = model2.predict( X_test )

# accuracy on the held-out test split
print( "Logistic Regression : ", accuracy_score( y_test, y_pred2 ) )

Logistic Regression :  0.916083916083916




In [322]:
# Model LDA

model3 = LinearDiscriminantAnalysis()

# y_train is a column vector here; flatten it to the 1-D label array that
# scikit-learn estimators expect, instead of relying on implicit conversion
model3.fit( X_train, y_train.ravel() )

y_pred3 = model3.predict( X_test )

# label the score with the model that produced it (this is LDA, not logistic regression)
print( "Linear Discriminant : ", accuracy_score( y_test, y_pred3 ) )

Logistic Regression :  0.9090909090909091


  return f(**kwargs)


# Performance Metrics

In [53]:
# Reload the dataset and take the 'X' and 'Y' entries again; this rebinds the
# X and y used by the earlier sections of the notebook.

data = loadmat( 'Databases/breastCancer.mat' )

X = data['X']

y = data['Y']

print( X.shape )

print( y.shape )

# number of samples (rows of y)
population = y.shape[0]

(569, 30)
(569, 1)


In [54]:
# reshape y into a 1-d numpy array of length `population`
# NOTE(review): this rebinds y in place, so re-running earlier cells afterwards sees the 1-d version

y = np.reshape( y, ( population ) )

In [57]:
# unique value counts (class balance of the labels)
# the top-level pd.value_counts() helper is deprecated; the Series method is its replacement

pd.Series( y ).value_counts()

0    357
1    212
dtype: int64

In [62]:
from sklearn.decomposition import PCA

In [63]:
# apply PCA: keep as many components as the conditional-number rule selects

no_of_components = conditional_number( X )

# NOTE(review): the name `model` is reused for a different estimator in a later cell
model = PCA( no_of_components )

# fit the PCA on X and project X in one step
X_transform_conditional = model.fit_transform( X )

# Model Validation

In [64]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()