In [1]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import calinski_harabaz_score

  from ._conv import register_converters as _register_converters


In [2]:
def sample(parameters, N=1000):
    """
    Samples N latent variables from the mixture described by parameters.

    Each draw picks one of the k components uniformly at random, then
    returns mu_q + epsilon * sigma_q with epsilon ~ N(0, I) of length n_z
    (i.e. sigma_q is applied element-wise, one scale per dimension).

    Parameters
    ----------
    parameters : dict
        Must contain 'k' (number of components), 'n_z' (latent dimension)
        and, for every q in range(k), 'mu_<q>' and 'sigma_<q>' entries that
        broadcast against a length-n_z vector.
    N : int
        Number of samples to draw.

    Returns
    -------
    samples : ndarray of shape (N, n_z)
    labels : ndarray of shape (N,)
        Index of the component each sample was drawn from (stored as floats).
    """
    k = parameters['k']
    n_z = parameters['n_z']
    samples = np.zeros((N, n_z))
    labels = np.zeros(N)
    for i in range(N):
        # Uniform choice of mixture component.
        q = np.random.randint(k)
        mu = parameters['mu_' + str(q)]
        sigma = parameters['sigma_' + str(q)]
        # Noise must have the latent dimension; a hard-coded 2 only worked
        # when n_z happened to be 2.
        epsilon = np.random.randn(n_z)
        samples[i] = mu + epsilon * sigma
        labels[i] = q
    return samples, labels

In [3]:
def calinksi_harabaz_index(parameters):
    """
    Closed-form Calinski Harabaz Index of the mixture in the limit N -> inf.

    Returns (between-cluster dispersion) / (within-cluster dispersion), where
    every component contributes with equal weight:
      * within  = sum over components of sum(sigma_q ** 2)
      * between = sum over components of ||mu_q - mean(mu)||^2
    The finite-sample factor (N-k)/(k-1) is left out because N -> inf.
    http://scikit-learn.org/stable/modules/clustering.html#calinski-harabaz-index
    """
    n_components = parameters['k']
    component_ids = [str(idx) for idx in range(n_components)]

    # Single pass: accumulate the within-cluster term and the sum of centres.
    within = 0
    centre_sum = np.zeros(parameters['n_z'])
    for cid in component_ids:
        within += np.sum(parameters['sigma_' + cid]**2)
        centre_sum += parameters['mu_' + cid]
    centre_sum /= n_components  # now holds the mean of the component centres

    # Squared distance of every centre from the overall mean.
    between = 0
    for cid in component_ids:
        offset = parameters['mu_' + cid] - centre_sum
        between += offset.T.dot(offset)

    return between/within

In [4]:
# Problem sizes shared by the cells below:
#   k   - number of mixture components / clusters
#   n_z - dimension of the latent samples
#   n_x - not referenced in the visible cells; presumably the observed-data
#         dimension (TODO confirm)
k, n_x, n_z = 2, 2, 2

In [5]:
# Two components, each with a per-dimension mean (mu_q) and a per-dimension
# scale (sigma_q) that sample() multiplies element-wise with standard normal noise.
mu_0, mu_1 = np.array([-2,-2]), np.array([3,3])
sigma_0, sigma_1 = np.array([2,2]), np.array([3,3])

# Parameter dictionary consumed by sample() and calinksi_harabaz_index().
parameters = {
    'n_z': 2,
    'k': 2,
    'mu_0': mu_0,
    'mu_1': mu_1,
    'sigma_0': sigma_0,
    'sigma_1': sigma_1,
}

In [9]:
# Experiment to see that my algorithm gives approximately the same result as
# sklearn. sklearn's score includes the finite-sample factor (N-k)/(k-1), so it
# is multiplied by (k-1)/(N-k) to make it comparable with the N -> inf index.
N = 10000
# Seed NumPy's global RNG (used by sample) so the empirical score printed
# below is reproducible when the cell or the whole notebook is re-run.
np.random.seed(0)
samples, labels = sample(parameters,N)
sklearn_score = (k-1)/(N-k)*calinski_harabaz_score(samples, labels)
print('sklearn: ' + str(sklearn_score))
# Closed-form value; depends only on parameters, not on the drawn samples.
my_score = calinksi_harabaz_index(parameters)
print('my: ' + str(my_score))

sklearn: 0.9522406651796289
my: 0.9615384615384616
