In [1]:
import os

import numpy as np

from scipy.io import loadmat

from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import StratifiedKFold

import warnings

warnings.filterwarnings("ignore")

In [2]:
# # Logistic Regression

class LogitRegression() :
    
    
    
    def __init__( self, learning_rate, iterations, threshold ) :        
        
        self.learning_rate = learning_rate        
        
        self.iterations = iterations
        
        self.threshold = threshold
        
        
          
    # Function for model training   
    
    def fit( self, X, Y ) :        
        
        # no_of_training_examples, no_of_features        
        
        self.m, self.n = X.shape        
        
        # weight initialization        
        
        self.W = np.zeros( self.n )        
        
        self.b = 0        
        
        self.X = X        
        
        self.Y = Y
          
        # gradient descent learning
                  
        for i in range( self.iterations ) :            
            
            self.update_weights()            
        
        return self
      
    
    
    # Helper function to update weights in gradient descent
      
    def update_weights( self ) :           
        
        A = 1 / ( 1 + np.exp( - ( self.X.dot( self.W ) + self.b ) ) )
          
        # calculate gradients        
        
        tmp = ( A - self.Y.T )        
        
        tmp = np.reshape( tmp, self.m )        
        
        dW = np.dot( self.X.T, tmp ) / self.m         
        
        db = np.sum( tmp ) / self.m 
          
        # update weights    
        
        self.W = self.W - self.learning_rate * dW    
        
        self.b = self.b - self.learning_rate * db
          
        return self
      
    
    
    # Hypothetical function  h( x ) 
      
    def predict( self, X ) :    
        
        Z = 1 / ( 1 + np.exp( - ( X.dot( self.W ) + self.b ) ) )        
        
        Y = np.where( Z > self.threshold, 1, 0 )        
        
        return Y

In [3]:
def fun( X, y, model ) :
    
    skfold = StratifiedKFold( n_splits = 10, shuffle = False )
    
    fold_no = 1

    balanced_accuracies = []
        
    # inner loop for 10 fold stratified cross validation

    for train_index, test_index in skfold.split( X, y ) :
        
        X_train, X_test = X[train_index], X[test_index]
    
        y_train, y_test = y[train_index], y[test_index]
         
        model.fit( X_train, y_train )
    
        balanced_accuracy = balanced_accuracy_score( y_test, model.predict( X_test ) )
    
        balanced_accuracies.append( balanced_accuracy )

        fold_no += 1

    return np.mean( balanced_accuracies )

In [4]:
def threshold_selection( X, y ) :

    maximum = ( 0, 0.1 )   # ( accuracy, threshold )
    
    i = 0

    while( i < 1 ) :
        
        learning_rate = 0.1
        
        iterations = 500
        
        model = LogitRegression( learning_rate, iterations, i )
    
        tmp = fun( X, y, model )
    
        if( maximum[0] < tmp ) :
            
            maximum = ( tmp, i )
        
        i = i + 0.1
        
    return maximum

In [None]:
files = os.listdir('./Databases/')

Dataset = []

Threshold = []

Balanced_Accuracy = []

for file in files:
    
    data = loadmat( './Databases/' + file )
    
    X = data['X']

    y = data['Y']
    
    try :
        
        ( accuracy, threshold ) = threshold_selection( X, y )
        
        Threshold.append( threshold )
        
        Balanced_Accuracy.append( accuracy )
        
        Dataset.append( file )
    
        print( "Highest Balanced accuracy achieved for dataset %s is %f at threshold %f" % ( file, accuracy, threshold  ) )
        
    except :
        
        continue

Highest Balanced accuracy achieved for dataset minboone.mat is 0.769275 at threshold 0.000000
Highest Balanced accuracy achieved for dataset spect.mat is 0.775693 at threshold 0.800000
Highest Balanced accuracy achieved for dataset Banknote.mat is 0.989348 at threshold 0.400000
Highest Balanced accuracy achieved for dataset liver.mat is 0.540568 at threshold 1.000000
Highest Balanced accuracy achieved for dataset vertebral.mat is 0.768333 at threshold 0.100000
Highest Balanced accuracy achieved for dataset skin.mat is 0.750062 at threshold 0.300000
Highest Balanced accuracy achieved for dataset diabetic.mat is 0.605073 at threshold 1.000000
Highest Balanced accuracy achieved for dataset sonar.mat is 0.635581 at threshold 0.500000
Highest Balanced accuracy achieved for dataset spectf.mat is 0.525455 at threshold 0.000000
Highest Balanced accuracy achieved for dataset Musk2.mat is 0.728990 at threshold 0.000000
Highest Balanced accuracy achieved for dataset Musk.mat is 0.662829 at thresh