In [1]:
import pandas as pd
import scipy as sp
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
NumDataPerClass = 200
# Two-class problem, distinct means, equal covariance matrices
#
m1 = [[0, 3]]
m2 = [[3, 2.5]]
C = [[2, 1], [1, 2]]
# Set up the data by generating isotropic Guassians and
# rotating them accordingly
#
A = np.linalg.cholesky(C)
U1 = np.random.randn(NumDataPerClass,2)
X1 = U1 @ A.T + m1
U2 = np.random.randn(NumDataPerClass,2)
X2 = U2 @ A.T + m2

In [3]:
X = np.concatenate((X1, X2), axis=0)

In [4]:
labelPos = np.ones(NumDataPerClass)
labelNeg = 0 * np.ones(NumDataPerClass)
y = np.concatenate((labelPos, labelNeg))

In [5]:
rIndex = np.random.permutation(2*NumDataPerClass)
Xr = X[rIndex,]
yr = y[rIndex]

# Training and test sets (half half)
#
xtrain = Xr[0:NumDataPerClass]
ytrain = yr[0:NumDataPerClass]
xtest = Xr[NumDataPerClass:2*NumDataPerClass]
ytest = yr[NumDataPerClass:2*NumDataPerClass]
print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

Ntrain = NumDataPerClass;
Ntest = NumDataPerClass;

(200, 2) (200,) (200, 2) (200,)


In [6]:
def mahalanobis(x=None, data=None, cov=None):
    """Compute the Mahalanobis Distance between each row of x and the data  
    x    : vector or matrix of data with, say, p columns.
    data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.
    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
    """
    x_minus_mu = x - np.mean(data)
    if not cov:
        cov = np.cov(data.T)
    inv_covmat = sp.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    return mahal.diagonal()

In [12]:
class MahalanobisBinaryClassifier():
    def __init__(self, xtrain, ytrain):
        self.xtrain_pos = xtrain[ytrain == 1, :]
        self.xtrain_neg = xtrain[ytrain == 0, :]

    def predict_proba(self, xtest):
        pos_neg_dists = [(p,n) for p, n in zip(mahalanobis(xtest, self.xtrain_pos), mahalanobis(xtest, self.xtrain_neg))]
        return np.array([(1-n/(p+n), 1-p/(p+n)) for p,n in pos_neg_dists])

    def predict(self, xtest):
        return np.array([np.argmax(row) for row in self.predict_proba(xtest)])


clf = MahalanobisBinaryClassifier(xtrain, ytrain)        
pred_probs = clf.predict_proba(xtest)
pred_class = clf.predict(xtest)

# Pred and Truth
pred_actuals = pd.DataFrame([(pred, act) for pred, act in zip(pred_class, ytest)], columns=['pred', 'true'])
print(pred_actuals[:50])  

    pred  true
0      1   1.0
1      1   1.0
2      1   1.0
3      0   0.0
4      1   1.0
5      1   0.0
6      1   1.0
7      0   1.0
8      1   1.0
9      1   0.0
10     0   0.0
11     1   0.0
12     1   0.0
13     1   1.0
14     1   1.0
15     1   1.0
16     1   1.0
17     1   1.0
18     1   1.0
19     1   1.0
20     0   0.0
21     0   0.0
22     0   0.0
23     0   0.0
24     1   0.0
25     1   1.0
26     1   1.0
27     1   0.0
28     1   1.0
29     1   0.0
30     0   0.0
31     0   0.0
32     1   1.0
33     1   0.0
34     0   0.0
35     1   1.0
36     1   1.0
37     0   0.0
38     1   1.0
39     1   1.0
40     1   1.0
41     1   1.0
42     0   0.0
43     0   0.0
44     0   0.0
45     1   1.0
46     1   0.0
47     0   0.0
48     1   1.0
49     0   0.0


In [8]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
truth = pred_actuals.loc[:, 'true']
pred = pred_actuals.loc[:, 'pred']
scores = np.array(pred_probs)[:, 1]
print('AUROC: ', roc_auc_score(truth, scores))
print('\nConfusion Matrix: \n', confusion_matrix(truth, pred))
print('\nAccuracy Score: ', accuracy_score(truth, pred))
print('\nClassification Report: \n', classification_report(truth, pred))

AUROC:  0.665959595959596

Confusion Matrix: 
 [[65 45]
 [11 79]]

Accuracy Score:  0.72

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.86      0.59      0.70       110
         1.0       0.64      0.88      0.74        90

    accuracy                           0.72       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.76      0.72      0.72       200



In [9]:
def euclidean(x=None, data=None, cov=None):
    """Compute the Mahalanobis Distance between each row of x and the data  
    x    : vector or matrix of data with, say, p columns.
    data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.
    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
    """
    return np.linalg.norm(x-np.mean(data), axis=1)

In [10]:
class EuclideanBinaryClassifier():
    def __init__(self, xtrain, ytrain):
        self.xtrain_pos = xtrain[ytrain == 1, :]
        self.xtrain_neg = xtrain[ytrain == 0, :]

    def predict_proba(self, xtest):
        pos_neg_dists = [(p,n) for p, n in zip(euclidean(xtest, self.xtrain_pos), euclidean(xtest, self.xtrain_neg))]
        return np.array([(1-n/(p+n), 1-p/(p+n)) for p,n in pos_neg_dists])

    def predict(self, xtest):
        return np.array([np.argmax(row) for row in self.predict_proba(xtest)])


clf = EuclideanBinaryClassifier(xtrain, ytrain)        
pred_probs = clf.predict_proba(xtest)
pred_class = clf.predict(xtest)

# Pred and Truth
pred_actuals = pd.DataFrame([(pred, act) for pred, act in zip(pred_class, ytest)], columns=['pred', 'true'])
print(pred_actuals[:50])  

    pred  true
0      1   1.0
1      0   1.0
2      1   1.0
3      0   0.0
4      0   1.0
5      0   0.0
6      0   1.0
7      0   1.0
8      1   1.0
9      1   0.0
10     0   0.0
11     0   0.0
12     1   0.0
13     1   1.0
14     1   1.0
15     1   1.0
16     1   1.0
17     1   1.0
18     1   1.0
19     1   1.0
20     0   0.0
21     0   0.0
22     0   0.0
23     0   0.0
24     1   0.0
25     0   1.0
26     1   1.0
27     1   0.0
28     1   1.0
29     1   0.0
30     0   0.0
31     0   0.0
32     0   1.0
33     1   0.0
34     0   0.0
35     1   1.0
36     1   1.0
37     0   0.0
38     1   1.0
39     1   1.0
40     0   1.0
41     1   1.0
42     0   0.0
43     0   0.0
44     0   0.0
45     1   1.0
46     1   0.0
47     0   0.0
48     1   1.0
49     0   0.0


In [11]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
truth = pred_actuals.loc[:, 'true']
pred = pred_actuals.loc[:, 'pred']
scores = np.array(pred_probs)[:, 1]
print('AUROC: ', roc_auc_score(truth, scores))
print('\nConfusion Matrix: \n', confusion_matrix(truth, pred))
print('\nAccuracy Score: ', accuracy_score(truth, pred))
print('\nClassification Report: \n', classification_report(truth, pred))

AUROC:  0.6536363636363637

Confusion Matrix: 
 [[70 40]
 [24 66]]

Accuracy Score:  0.68

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.74      0.64      0.69       110
         1.0       0.62      0.73      0.67        90

    accuracy                           0.68       200
   macro avg       0.68      0.68      0.68       200
weighted avg       0.69      0.68      0.68       200

