## Kernel Density Estimation

In [1]:
%matplotlib inline
import os
import math
import random
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
np.random.seed(24)

## Method 1: Replace True Labels with Probabilities from KDE + Fitting LR
#### y = Score_Positive / (Score_positive + Score_negative)


## Method 2: Generate double points from KDE + LR fitting

### Logistic Regression Model

In [2]:
class LR:
    def __init__(self, learnRate = 0.001, nIter = 1000, use_intercept = True, smoothed = False):
        self.learnRate = learnRate
        self.nIter = nIter
        self.intercept = use_intercept
        self.smooth = smoothed
        
    def crossEntropy(self, P, Y):
        return (-Y * np.log(P) - (1 - Y + 1e-9) * np.log(1 - P + 1e-9)).mean()
    
    def sigmoid(self, Z):
        return 1 / (1 + np.exp(-Z))
    
    def SGD(self, X_train, y_train, X_test, y_test, y_train_smoothed = np.zeros(1)):
        if self.smooth == False:
            y_train_smoothed = np.zeros(len(y_train))
        #Add Intercept
        if self.intercept == True:
            X_train = np.concatenate((np.ones((X_train.shape[0], 1)), X_train), axis=1)
            X_test = np.concatenate((np.ones((X_test.shape[0], 1)), X_test), axis=1)
        #Initialize Weights by zeros
        self.Ws = np.zeros(X_train.shape[1])
        oldWs = np.zeros(X_train.shape[1])
        CETEST = 1
        self.converge = self.nIter
        #Update weights for n Iterations
        for i in range(self.nIter):
            if i % 5000 == 0 and i != 0:
                print('Finished ', i, ' iterations --> Test CE:', CETEST)
            #Shuffle indeces
            p = np.random.permutation(len(X_train))
            X_train = X_train[p]
            y_train = y_train[p]
            y_train_smoothed = y_train_smoothed[p]
            for row,y_t, y_t_s in zip(X_train, y_train, y_train_smoothed):
                Z = np.dot(row, self.Ws)
                y_p = self.sigmoid(Z)
                if self.smooth:
                    gradient = np.dot(row.T, (y_p - y_t_s))
                else:
                    gradient = np.dot(row.T, (y_p - y_t))
                self.Ws -= self.learnRate * gradient
            
            if i % 50 == 0:
                diff = np.absolute(oldWs - self.Ws).sum() / len(self.Ws)
                #print(diff, oldWs, self.Ws)
                if diff <= 0.001:
                    self.converge = i
                    break
                oldWs = self.Ws
    
    def predict_prob(self, X):
        #Add Intercept
        if self.intercept == True:
            intercept = np.ones((X.shape[0], 1))
            X = np.concatenate((intercept, X), axis=1)
        return self.sigmoid(np.dot(X, self.Ws))
    
    def predict(self, X, threshold = 0.5):
        return self.predict_prob(X) >= threshold

# Evaluation Experiments

In [3]:
files = []
dataset_names = []
# r=root, d=directories, f = files
for r, d, f in os.walk('Datasets\\'):
    for file in f:
        if '.csv' in file:
            dataset_names.append(file)
            files.append(os.path.join(r, file))

In [4]:
res = {'Name':[], 'Instances':[], 'Features':[], 'PosClassRatio':[], 
        'NTestCE':[], 'M1TestCE':[], 'M2TestCE':[], 'PTestCE':[], 'SKTestCE':[], 
        'NTrainCE':[], 'M1TrainCE':[], 'M2TrainCE':[], 'PTrainCE':[], 'SKTrainCE':[], 
        'NTestAcc':[], 'M1TestAcc':[], 'M2TestAcc':[], 'PTestAcc':[], 'SKTestAcc':[],
        'NTrainAcc':[], 'M1TrainAcc':[], 'M2TrainAcc':[], 'PTrainAcc':[], 'SKTrainAcc':[],
       'NConverge':[], 'M1Converge':[], 'M2Converge':[], 'PConverge':[]
      }
results = pd.DataFrame(res)

### Define Hyper-parameters

In [5]:
LRate = 0.0001
nIter = 50000

### Data statistics

In [6]:
def meta_features(data):
    n_rows = len(data)
    n_feats = len(data.columns) - 1
    pos_class_ratio = (data['class'] == 1).sum() / n_rows
    return n_rows, n_feats, pos_class_ratio

### Normal LR Results

In [7]:
def normal_results(X_train, y_train, X_test, y_test):
    model = LR(learnRate = LRate, nIter = nIter)
    model.SGD(X_train.values, y_train.values, X_test.values, y_test.values)
    
    y_pred_test  = model.predict_prob(X_test)
    y_pred_train = model.predict_prob(X_train)
    
    CE_test  = model.crossEntropy(y_pred_test,  y_test)
    CE_train = model.crossEntropy(y_pred_train, y_train)
    
    y_pred_test_labels  = model.predict(X_test)
    y_pred_train_labels = model.predict(X_train)
    
    acc_test  = accuracy_score(y_test,  y_pred_test_labels ) * 100
    acc_train = accuracy_score(y_train, y_pred_train_labels) * 100
    
    return acc_test, CE_test, acc_train, CE_train, model.converge, model

### Platt Logistic Regression

In [8]:
def platt_results(X_train, y_train, X_test, y_test):
    #smoothed Labels
    NP = (y_train > 0.5).sum()
    NN = len(y_train) - NP
    y_train_smoothed = np.zeros(len(y_train))
    y_tmp = y_train.values
    for i in range(len(y_train)):
        if y_tmp[i] > 0:
            y_train_smoothed[i] = 1 - 1 / (NP + 2)
        else:
            y_train_smoothed[i] = 1 / (NN + 2)
        
    model = LR(learnRate = LRate, nIter = nIter, smoothed = True)
    model.SGD(X_train.values, y_train.values, X_test.values, y_test.values, y_train_smoothed)
    
    y_pred_test  = model.predict_prob(X_test)
    y_pred_train = model.predict_prob(X_train)
    
    CE_test  = model.crossEntropy(y_pred_test,  y_test)
    CE_train = model.crossEntropy(y_pred_train, y_train)
    
    y_pred_test_labels  = model.predict(X_test)
    y_pred_train_labels = model.predict(X_train)
    
    acc_test  = accuracy_score(y_test,  y_pred_test_labels ) * 100
    acc_train = accuracy_score(y_train, y_pred_train_labels) * 100
    
    return acc_test, CE_test, acc_train, CE_train, model.converge

### Sklearn Results

In [9]:
def sklearn_results(X_train, y_train, X_test, y_test, model):
    model_sklearn = LogisticRegression(random_state=24, solver='lbfgs').fit(X_train, y_train)
    y_pred_test  = model_sklearn.predict_proba(X_test)
    y_pred_train = model_sklearn.predict_proba(X_train)
    
    CE_test  = model.crossEntropy(y_pred_test[:,1] , y_test)
    CE_train = model.crossEntropy(y_pred_train[:,1], y_train)
    
    y_pred_test_labels  = model_sklearn.predict(X_test)
    y_pred_train_labels = model_sklearn.predict(X_train)
    
    acc_test  = accuracy_score(y_test,  y_pred_test_labels ) * 100
    acc_train = accuracy_score(y_train, y_pred_train_labels) * 100
    
    return acc_test, CE_test, acc_train, CE_train

### Create Kernels of positive and negative classes

In [10]:
def kernels(X_train, y_train):
    #split training set into positive and negative instances (S1, S0)
    poss = []
    negs = []
    y_tmp = y_train.values
    for i in range(len(y_train)):
        if y_tmp[i] == 0:
            negs.append(i)
        else:
            poss.append(i)
    S0 = (X_train.iloc[negs,:]).T.values
    S1 = (X_train.iloc[poss,:]).T.values
    #Create positive and negative classes kernels
    neg_kernel = stats.gaussian_kde(S0)
    pos_kernel = stats.gaussian_kde(S1)
    
    #Calculate smoothed labels based on values from the kernels
    P_S0 = pos_kernel.evaluate(S0) / (pos_kernel.evaluate(S0) + neg_kernel.evaluate(S0))
    P_S1 = pos_kernel.evaluate(S1) / (pos_kernel.evaluate(S1) + neg_kernel.evaluate(S1))
    y_train_prob = np.zeros(len(y_train))
    neg_index = 0
    pos_index = 0
    for i in range(len(y_train)):
        if y_tmp[i] == 0:
            y_train_prob[i] = P_S0[neg_index]
            neg_index += 1
        else:
            y_train_prob[i] = P_S1[pos_index]
            pos_index += 1
            
    return pos_kernel, neg_kernel, y_train_prob

### Method 1

In [11]:
def M1_results(X_train, y_train, X_test, y_test, y_train_prob):
    #Train the LR model with the smoothed labels
    model = LR(learnRate = LRate, nIter = nIter, smoothed = True)
    model.SGD(X_train.values, y_train.values, X_test.values, y_test.values, y_train_prob)
    
    y_pred_test  = model.predict_prob(X_test)
    y_pred_train = model.predict_prob(X_train)
    
    CE_test  = model.crossEntropy(y_pred_test,  y_test)
    CE_train = model.crossEntropy(y_pred_train, y_train)
    
    y_pred_test_labels  = model.predict(X_test)
    y_pred_train_labels = model.predict(X_train)
    
    acc_test  = accuracy_score(y_test,  y_pred_test_labels ) * 100
    acc_train = accuracy_score(y_train, y_pred_train_labels) * 100
    
    return acc_test, CE_test, acc_train, CE_train, model.converge

### Method 2

In [12]:
def M2_results(X_train, y_train, X_test, y_test, pos_kernel, neg_kernel):
    N0_2 = int((y_train == 0).sum() * 2)
    N1_2 = int((y_train == 1).sum() * 2)
    neg2 = neg_kernel.resample(N0_2)
    pos2 = pos_kernel.resample(N1_2)

    X_train2 = np.array(np.concatenate((neg2.T, pos2.T), axis=0) )
    y_train2 = np.asarray([0] * N0_2 + [1] * N1_2)
    
    #Train the LR model with Method2
    model = LR(learnRate = LRate, nIter = nIter)
    model.SGD(X_train2, y_train2, X_test, y_test)
    
    y_pred_test  = model.predict_prob(X_test)
    y_pred_train = model.predict_prob(X_train)
    
    CE_test  = model.crossEntropy(y_pred_test,  y_test)
    CE_train = model.crossEntropy(y_pred_train, y_train)
    
    y_pred_test_labels  = model.predict(X_test)
    y_pred_train_labels = model.predict(X_train)
    
    acc_test  = accuracy_score(y_test,  y_pred_test_labels ) * 100
    acc_train = accuracy_score(y_train, y_pred_train_labels) * 100
    
    return acc_test, CE_test, acc_train, CE_train, model.converge

### Collect Results

In [13]:
for f in files:
    # Read Dataset
    print('Dataset Name:', f)
    data = pd.read_csv(f)
    rows, cols, ratio = meta_features(data)
    X_train, X_test, y_train, y_test = train_test_split(data.drop('class', axis=1), data['class'], 
                                                        test_size=0.3, random_state=24)
    X_train = (X_train - X_train.mean()) / X_train.std()
    X_test = (X_test - X_train.mean()) / X_train.std()
    res['Name'] = f; res['Instances'] = rows; res['Features'] = cols; res['PosClassRatio'] = ratio;
   
    #Normal LR
    print('Start Normal:')
    ac1, ce1, ac2, ce2, converge, model = normal_results(X_train, y_train, X_test, y_test)
    res['NTestCE'] = ce1; res['NTestAcc'] = ac1; res['NTrainCE'] = ce2; res['NTrainAcc'] = ac2; res['NConverge'] = converge
    
    #Platt LR
    print('Start Platt:')
    ac1, ce1, ac2, ce2, converge = platt_results(X_train, y_train, X_test, y_test)
    res['PTestCE'] = ce1; res['PTestAcc'] = ac1; res['PTrainCE'] = ce2; res['PTrainAcc'] = ac2; res['PConverge'] = converge
    
    #Sklearn Results
    print('Start SKLearn:')
    ac1, ce1, ac2, ce2 = sklearn_results(X_train, y_train, X_test, y_test, model)
    res['SKTestCE'] = ce1; res['SKTestAcc'] = ac1; res['SKTrainCE'] = ce2; res['SKTrainAcc'] = ac2;
     
    #KDE Kernels
    pos_kernel, neg_kernel, y_train_prob = kernels(X_train, y_train)
    #Method 1
    print('Start Method1:')
    ac1, ce1, ac2, ce2, converge = M1_results(X_train, y_train, X_test, y_test, y_train_prob)
    res['M1TestCE'] = ce1; res['M1TestAcc'] = ac1; res['M1TrainCE'] = ce2; res['M1TrainAcc'] = ac2; res['M1Converge'] = converge
    #Method 2
    print('Start Method2:')
    ac1, ce1, ac2, ce2, converge = M2_results(X_train, y_train, X_test, y_test, pos_kernel, neg_kernel)
    res['M2TestCE'] = ce1; res['M2TestAcc'] = ac1; res['M2TrainCE'] = ce2; res['M2TrainAcc'] = ac2; res['M2Converge'] = converge
    
    results = results.append(res, ignore_index=True)
    print(res, '\n########################################\n')

Dataset Name: Datasets\aecoli.csv
Start Normal:
Start Platt:
Start SKLearn:
Start Method1:
Start Method2:
{'Name': 'Datasets\\aecoli.csv', 'Instances': 336, 'Features': 5, 'PosClassRatio': 0.4255952380952381, 'NTestCE': 0.9783244490579448, 'M1TestCE': 0.9203804579897958, 'M2TestCE': 1.1671208990533848, 'PTestCE': 0.9601715913251816, 'SKTestCE': 1.4090697931398577, 'NTrainCE': 0.1810728469896361, 'M1TrainCE': 0.18728417561358843, 'M2TrainCE': 0.15422011183586795, 'PTrainCE': 0.18515596817991994, 'SKTrainCE': 0.11386393770760568, 'NTestAcc': 54.45544554455446, 'M1TestAcc': 54.45544554455446, 'M2TestAcc': 54.45544554455446, 'PTestAcc': 54.45544554455446, 'SKTestAcc': 54.45544554455446, 'NTrainAcc': 95.74468085106383, 'M1TrainAcc': 96.17021276595744, 'M2TrainAcc': 95.31914893617022, 'PTrainAcc': 96.17021276595744, 'SKTrainAcc': 96.59574468085106, 'NConverge': 500, 'M1Converge': 500, 'M2Converge': 500, 'PConverge': 500} 
########################################

Dataset Name: Datasets\ballo

  if sys.path[0] == '':
  if __name__ == '__main__':


Start Platt:
Start SKLearn:


  np.exp(prob, prob)
  if __name__ == '__main__':


Start Method1:


  if sys.path[0] == '':
  if __name__ == '__main__':


Start Method2:
{'Name': 'Datasets\\eegeyestate.csv', 'Instances': 14980, 'Features': 14, 'PosClassRatio': 0.4487983978638184, 'NTestCE': 11.56776114225859, 'M1TestCE': 11.567758930711454, 'M2TestCE': 11.56776114225859, 'PTestCE': 11.56776114225859, 'SKTestCE': 11.56776114225859, 'NTrainCE': 0.6458341029950391, 'M1TrainCE': 0.6476193609024022, 'M2TrainCE': 0.649001980481579, 'PTrainCE': 0.6458342204233267, 'SKTrainCE': 0.6444785270318015, 'NTestAcc': 44.192256341789054, 'M1TestAcc': 44.192256341789054, 'M2TestAcc': 44.192256341789054, 'PTestAcc': 44.192256341789054, 'SKTestAcc': 44.192256341789054, 'NTrainAcc': 62.94106427617776, 'M1TrainAcc': 62.254434484074004, 'M2TrainAcc': 63.169940873545684, 'PTrainAcc': 62.90291817661644, 'SKTrainAcc': 63.21762349799733, 'NConverge': 500, 'M1Converge': 500, 'M2Converge': 500, 'PConverge': 500} 
########################################

Dataset Name: Datasets\fri_c0_1000_5.csv
Start Normal:
Start Platt:
Start SKLearn:
Start Method1:
Start Method2:


  if sys.path[0] == '':
  if __name__ == '__main__':


Start Platt:
Start SKLearn:


  np.exp(prob, prob)
  if __name__ == '__main__':


Start Method1:


  if sys.path[0] == '':
  if __name__ == '__main__':


Start Method2:
{'Name': 'Datasets\\house_8L.csv', 'Instances': 22784, 'Features': 8, 'PosClassRatio': 0.704002808988764, 'NTestCE': inf, 'M1TestCE': inf, 'M2TestCE': inf, 'PTestCE': inf, 'SKTestCE': inf, 'NTrainCE': 0.3823840584863952, 'M1TrainCE': 0.4609979205225054, 'M2TrainCE': 0.4141500298944101, 'PTrainCE': 0.3823858643746239, 'SKTrainCE': 0.3823838042872227, 'NTestAcc': 29.03744880046811, 'M1TestAcc': 29.03744880046811, 'M2TestAcc': 29.008191925102402, 'PTestAcc': 29.03744880046811, 'SKTestAcc': 29.03744880046811, 'NTrainAcc': 85.2771507399047, 'M1TrainAcc': 81.77200902934537, 'M2TrainAcc': 83.75344870830197, 'PTrainAcc': 85.29596187609731, 'SKTrainAcc': 85.2771507399047, 'NConverge': 500, 'M1Converge': 500, 'M2Converge': 500, 'PConverge': 500} 
########################################

Dataset Name: Datasets\MagicTelescope.csv
Start Normal:
Start Platt:
Start SKLearn:
Start Method1:
Start Method2:
{'Name': 'Datasets\\MagicTelescope.csv', 'Instances': 19020, 'Features': 10, 'PosC

Start Platt:
Start SKLearn:
Start Method1:
Start Method2:
{'Name': 'Datasets\\visualizing_galaxy.csv', 'Instances': 323, 'Features': 4, 'PosClassRatio': 0.541795665634675, 'NTestCE': 0.6741489110858303, 'M1TestCE': 0.9209374580551051, 'M2TestCE': 0.7954474192154593, 'PTestCE': 0.6617848446062807, 'SKTestCE': 0.44747251136396327, 'NTrainCE': 0.2974157381219094, 'M1TrainCE': 0.30615493719078335, 'M2TrainCE': 0.27189688367804854, 'PTrainCE': 0.3019108819137248, 'SKTrainCE': 0.16371367112357893, 'NTestAcc': 89.69072164948454, 'M1TestAcc': 88.65979381443299, 'M2TestAcc': 90.72164948453609, 'PTestAcc': 89.69072164948454, 'SKTestAcc': 94.84536082474226, 'NTrainAcc': 95.57522123893806, 'M1TrainAcc': 94.69026548672566, 'M2TrainAcc': 93.80530973451327, 'PTrainAcc': 95.57522123893806, 'SKTrainAcc': 96.01769911504425, 'NConverge': 500, 'M1Converge': 500, 'M2Converge': 500, 'PConverge': 500} 
########################################

Dataset Name: Datasets\vowel.csv
Start Normal:
Start Platt:
Start

In [14]:
results.head()

Unnamed: 0,Name,Instances,Features,PosClassRatio,NTestCE,M1TestCE,M2TestCE,PTestCE,SKTestCE,NTrainCE,...,SKTestAcc,NTrainAcc,M1TrainAcc,M2TrainAcc,PTrainAcc,SKTrainAcc,NConverge,M1Converge,M2Converge,PConverge
0,Datasets\aecoli.csv,336.0,5.0,0.425595,0.978324,0.92038,1.167121,0.960172,1.40907,0.181073,...,54.455446,95.744681,96.170213,95.319149,96.170213,96.595745,500.0,500.0,500.0,500.0
1,Datasets\balloon.csv,2001.0,1.0,0.24088,1.489491,1.109552,1.423428,1.479263,1.511052,0.358236,...,72.379368,85.428571,76.071429,85.428571,85.428571,85.428571,500.0,500.0,500.0,500.0
2,Datasets\banana.csv,5300.0,2.0,0.448302,0.691276,0.689653,0.690614,0.691274,0.691263,0.681572,...,54.591195,56.576819,51.509434,58.544474,56.576819,56.603774,500.0,500.0,500.0,500.0
3,Datasets\blood transfusion center.csv,748.0,4.0,0.237968,16.486509,16.486509,16.486509,16.486509,16.486509,0.481733,...,20.444444,76.481836,77.437859,76.099426,76.481836,77.246654,500.0,500.0,500.0,500.0
4,Datasets\chscase_vine2.csv,468.0,2.0,0.547009,0.693963,0.694219,29.194423,0.694403,12.4316,0.692962,...,51.77305,57.492355,57.492355,57.492355,57.492355,57.492355,0.0,0.0,500.0,0.0


In [15]:
results.to_csv('Evaluation.csv', index=False)