In [83]:
import numpy as np
import sklearn.metrics as metrics
import scipy.spatial.distance as distance
import sklearn.datasets as datasets
import matplotlib.pyplot as plt

### Simple test data

In [77]:
a = np.array([[0, 0], [0, 2], [2, 0], [2, 2], [4, 0], [4, 2]])
b = a / 2
c = a * 2
d = np.concatenate((a, np.array([[0, -2], [2, -2], [4, -2]])), axis=0) 

### 1. Covariance matrix determinant

In [8]:
a_t = a.T

In [9]:
a_t_cov = np.cov(a_t)

In [10]:
a_t_cov

array([[3.2, 0. ],
       [0. , 1.2]])

In [12]:
a_t_det = np.linalg.det(a_t_cov)

In [13]:
a_t_det

3.8400000000000003

In [17]:
b_t = b.T

In [18]:
b_t_cov = np.cov(b_t)

In [19]:
b_t_cov

array([[0.8, 0. ],
       [0. , 0.3]])

In [20]:
b_t_det = np.linalg.det(b_t_cov)

In [21]:
b_t_det

0.24000000000000005

In [22]:
c_t = c.T
c_t_cov = np.cov(c_t)
c_t_det = np.linalg.det(c_t_cov)
c_t_det

61.44

In [33]:
d_t = d.T
d_t_cov = np.cov(d_t)
d_t_det = np.linalg.det(d_t_cov)
d_t_det

9.000000000000002

In [39]:
def get_cov_metric(data):
    return np.linalg.det(np.cov(data.T))

In [40]:
get_cov_metric(a)

3.8400000000000003

### 2. Cluster cohesion
implied that data is one cluster

In [80]:
def get_cohesion(data):
    avg_point = data.mean(axis=0)
    return sum([np.linalg.norm(point-avg_point) for point in data])

In [81]:
print('a cohesion', get_cohesion(a))
print('b cohesion', get_cohesion(b))
print('c cohesion', get_cohesion(c))

a cohesion 10.94427190999916
b cohesion 5.47213595499958
c cohesion 21.88854381999832


### 3. C-index

In [78]:
def get_c_index(data):
    pdists = distance.pdist(data)
    s = sum(pdists)
    s_min = min(pdists)
    s_max = max(pdists)

    return (s - s_min) / (s_max - s_min)

In [79]:
print('a c-index', get_c_index(a))
print('b c-index', get_c_index(b))
print('c c-index', get_c_index(c))
print('d c-index', get_c_index(d))

a c-index 16.284695155040843
b c-index 16.284695155040843
c c-index 16.284695155040843
d c-index 31.64419614290455


### Test on blobs

In [131]:
def test_metric(metric, epochs=1, centers=3, n_features=2, n_samples=100):
    error_count = 0
    for _ in range(epochs):
        X, y = datasets.make_blobs(n_samples=n_samples, centers=centers, n_features=n_features, random_state=0)
        
        X_first_class = X[y==0]
        X_second_class = X[y==1]
        X_third_class = X[y==2]
        
        s1 = metric(X_first_class[np.random.choice(X_first_class.shape[0], X_first_class.shape[0] // 3)])
        s2 = metric(X_second_class[np.random.choice(X_second_class.shape[0], X_second_class.shape[0] // 3)])
        s3 = metric(X_third_class[np.random.choice(X_third_class.shape[0], X_third_class.shape[0] // 3)])
        
        d = metric(X[np.random.choice(X.shape[0], X.shape[0] // 9)])
        
        if d <= s1 or d <= s2 and d <= s3:
            error_count += 1
            
    print('run', epochs, 'tests,', epochs - error_count, 'success,', error_count, 'wrong')
            

#### Test covariance determinant metric

In [123]:
test_metric(get_cov_metric, epochs=100000)

run 100000 tests, 98946 success, 1054 wrong


#### Test cohesion

In [133]:
test_metric(get_cohesion, epochs=100000)

run 100000 tests, 99337 success, 663 wrong


#### Test c-index

In [125]:
test_metric(get_c_index, epochs=100000)

run 100000 tests, 59773 success, 40227 wrong


### Test on full size vectors

#### Test covariance determinant matrix

In [149]:
test_metric(get_cov_metric, epochs=100, centers=3, n_features=1324, n_samples=300)

run 100 tests, 0 success, 100 wrong


#### Test cohesion

In [143]:
test_metric(get_cohesion, epochs=1000, centers=3, n_features=1324, n_samples=300)

run 1000 tests, 1000 success, 0 wrong


#### Test c-index

In [144]:
test_metric(get_c_index, epochs=1000, centers=3, n_features=1324, n_samples=300)

run 1000 tests, 0 success, 1000 wrong


### Test on 50 classes

#### Test covariance determinant matrix

In [148]:
test_metric(get_cov_metric, epochs=10, centers=50, n_features=1324, n_samples=5000)

run 10 tests, 0 success, 10 wrong


#### Test cohesion

In [147]:
test_metric(get_cohesion, epochs=100, centers=50, n_features=1324, n_samples=5000)

run 100 tests, 100 success, 0 wrong
