In [None]:
import numpy as np
import pyreadr
import matplotlib.pyplot as plt

# Seed numpy's global RNG so the stochastic parts of the notebook are repeatable.
np.random.seed(0)

# load dataset

classic3 = pyreadr.read_r('data/classic3.RData')['classic3']

X_raw = classic3.to_numpy()
# The last column holds the class label; the first column is dropped as well
# (presumably a document identifier -- TODO confirm against the .RData file).
labels = X_raw[:,-1]
X_raw = X_raw[:,1:-1].astype(dtype=np.float32)

# Take the shape only AFTER stripping the non-feature columns, so that D_raw
# really is the number of features (it was previously over-counted by 2).
N, D_raw = X_raw.shape

# tfn scheme - normalized term frequency-inverse document frequency

# document frequency: number of documents containing word j = 1, ..., D
Nj = np.sum(X_raw > 0, axis=0)
# indices of words that occur at least once (dead features are dropped)
idx = np.flatnonzero(Nj > 0)
D = idx.size

sub_sample_features = True
if sub_sample_features:
    # keep only the D_max words with the highest document frequency
    D_max = 200
    D = np.minimum(D, D_max)
    idx__ = np.argsort(Nj)
    idx = idx__[-D:]

# restrict the counts and the document frequencies to the selected words
X_raw = X_raw[:, idx]
Nj = Nj[idx]

# inverse document frequency weight per word
gj = np.log(N / Nj)
# per-document scale that normalizes each idf-weighted row to unit Euclidean norm
# NOTE(review): a document containing none of the selected words has zero norm
# here, which gives si = inf and a NaN row in X -- confirm this cannot happen
# for the chosen D_max.
si = 1. / np.sqrt(np.square(X_raw * gj[np.newaxis, :]).sum(axis=1))

# X[i, j] = X_raw[i, j] * si[i] * gj[j]
X = X_raw * (si[:, np.newaxis] * gj[np.newaxis, :])

# Histogram of the fraction of zero entries per document (after feature selection).
plt.hist(np.mean(X_raw==0., axis=1), density=True)
plt.xlabel('sparsity of vectors')  # label typo fixed (was 'sparisty')
plt.ylabel('rel. frequency in dataset')
plt.show()

# Report how many features were kept.
print('\n')
print(f'selecting D={D} features out of {D_raw} features in full dataset.')
print('\n')

In [None]:
# compute 'true' parameters using known labels
from sklearn.metrics import confusion_matrix

from vMFne.negentropy import gradΨ
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
from vMFne.moVMF import posterior_marginal_vMF_mixture_Φ

ls = np.unique(labels)

# per-class sample means of the normalized documents (one row per class)
μs_true = np.stack([np.mean(X[labels == label], axis=0) for label in ls], axis=0)
# empirical class frequencies, used as mixture weights
w_true = np.array([np.sum(labels == label) for label in ls]) / len(labels)
w_true = w_true / w_true.sum()
print(w_true)

# Gram matrix of the class means, raw and normalized to cosine similarities
μs_norm = np.linalg.norm(μs_true,axis=1)
print(μs_true.dot(μs_true.T))
print(μs_true.dot(μs_true.T) / np.outer(μs_norm,μs_norm))

# A single call yields both outputs; previously the same function was
# evaluated twice with identical arguments, once for each return value.
ph_x_μ_true, px_true_Ψ = posterior_marginal_vMF_mixture_Ψ(X,w_true,μs_true)
LL_true_Ψ = np.log(px_true_Ψ).sum()

# integer-valued class index per document, in the order of `ls`
class_true = sum([ (1.*i) * (labels==ls[i]) for i in range(len(ls))])
# hard assignment: column with the largest value in each row of ph_x_μ_true
class_est_μ_true = np.argmax(ph_x_μ_true,axis=1)
M_μ_true = confusion_matrix(class_true, class_est_μ_true)

# same reference model, expressed through ηs_true = gradΨ(μs_true)
ηs_true = gradΨ(μs_true,D=D)
_, px_true_Φ = posterior_marginal_vMF_mixture_Φ(X,w_true,ηs_true)
LL_true_Φ = np.log(px_true_Φ).sum() # may differ from LL_true_Ψ by a constant offset

plt.plot(μs_true.T)
plt.title('mean parameters per class')
plt.xlabel('# of feature')
plt.ylabel('μ[# of feature]')
plt.show()


In [None]:
from vMFne.moVMF import moVMF, posterior_marginal_vMF_mixture_Φ

# Fit the mixture with moVMF, initialized at the label-derived parameters.
ηs, w, LL = moVMF(
    X,
    K=3,
    max_iter=200,
    verbose=False,
    ηs_init=ηs_true,
    w_init=w_true,
)
ph_x, px = posterior_marginal_vMF_mixture_Φ(X, w, ηs)

# Hard assignments and confusion matrix against the known classes.
class_est = np.argmax(ph_x, axis=1)
M = confusion_matrix(class_true, class_est)

# Trace returned by the fit, with the label-derived reference as a dashed line.
plt.plot(LL)
plt.plot([0, len(LL)], [LL_true_Φ] * 2, 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

# Left panel: fitted model; right panel: model built from the true labels.
plt.subplot(121)
plt.imshow(M)
plt.colorbar()
plt.title('learned model')

plt.subplot(122)
plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model')


In [None]:
from vMFne.bregman_clustering import softBregmanClustering_vMF
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ

# Fit the mixture in mean parameterization, initialized at the label-derived parameters.
μs, w, LL = softBregmanClustering_vMF(
    X,
    K=3,
    max_iter=200,
    verbose=False,
    μs_init=μs_true,
    w_init=w_true,
)

# Trace returned by the fit, with the label-derived reference as a dashed line.
plt.plot(LL)
plt.plot([0, len(LL)], [LL_true_Ψ] * 2, 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, w, μs)

# Fitted weights, then the Gram matrix of the fitted means (raw and cosine-normalized).
norm_μs = np.linalg.norm(μs, axis=1)
gram_μs = μs.dot(μs.T)
print('w', w)
print(gram_μs)
print(gram_μs / np.outer(norm_μs, norm_μs))

# Hard assignments and confusion matrix against the known classes.
class_est = np.argmax(ph_x, axis=1)
M = confusion_matrix(class_true, class_est)

# Left panel: fitted model; right panel: model built from the true labels.
plt.subplot(121)
plt.imshow(M)
plt.colorbar()
plt.title('learned model')

plt.subplot(122)
plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model')


In [None]:
# Grid of parameter vectors sharing the direction (1, ..., 1)/sqrt(D),
# with norms κ running from 10 to 500 in 50 steps.
# Note: this rebinds the notebook-global D used by earlier cells.
D = 20
κs = np.linspace(10, 500, 50)
ηs = np.outer(κs, np.ones(D) / np.sqrt(D))

def grad_Φ(ηs):
    """Map parameter vectors η to μ = η/κ · I_{D/2}(κ) / I_{D/2-1}(κ), with κ = ||η||.

    Parameters
    ----------
    ηs : array_like, shape (K, D) or (D,)
        One parameter vector per row; a 1-D input is treated as a single row.

    Returns
    -------
    μs : ndarray, shape (K, D)
        Each row of `ηs` rescaled by the Bessel-function ratio divided by κ.
        Rows with κ = 0 map to the zero vector.
    """
    # Local import: the notebook does not import scipy before this function is
    # first called, so relying on a global `scipy` raises NameError on a fresh kernel.
    import scipy.special

    ηs = np.atleast_2d(ηs)
    K,D = ηs.shape
    κs = np.linalg.norm(ηs,axis=-1)

    # Exponentially scaled Bessel functions: the scaling factor cancels in the
    # ratio, and large κ does not overflow.
    scale = np.zeros_like(κs)
    pos = κs > 0  # κ = 0 would give 0/0; the corresponding μ is 0 because η is 0
    IeD2 = scipy.special.ive(D/2.,κs[pos])
    IeD2_1 = scipy.special.ive(D/2.-1.,κs[pos])
    scale[pos] = (IeD2/IeD2_1) / κs[pos]

    μs = ηs * scale.reshape(-1,1)
    return μs

# scipy must be bound before grad_Φ is called: the function looks up
# `scipy.special` at call time (the import used to come after the call).
import scipy.special
import scipy.stats

from vMFne.negentropy import Ψ

μs = grad_Φ(ηs)
# NOTE(review): vMF_entropy_Φ is neither defined nor imported anywhere in the
# visible notebook, so this line raises NameError on a fresh kernel. Import it
# from the module that provides it (location not visible here -- TODO confirm).
H_Φ = vMF_entropy_Φ(ηs)

# Reference entropies from scipy for the same direction and concentrations.
mu_dir = np.ones(D)/np.sqrt(D)  # loop-invariant unit direction
H_vmf = np.zeros_like(κs)
for i,κ in enumerate(κs):
    H_vmf[i] = scipy.stats.vonmises_fisher.entropy(kappa=κ, mu=mu_dir)

# Both curves are plotted against H_Φ.
plt.plot(H_Φ.flatten(), -Ψ(μs, D=D), label='-Ψ(μ)')
plt.plot(H_Φ.flatten(), H_vmf, label='scipy.stats.vonmises_fisher.entropy')
plt.xlabel('H_Φ')
plt.legend()
plt.show()


# towards mean-parameterized (Hyper-)spherical VAEs
Quick idea to make something out of mean parameterization for hyperspherical VAEs:
- hyperspherical VAEs are defined by von Mises-Fisher p(z), q(z|x) and general (typically Gaussian) p(x|z).
- as such they require the reparametrization trick to get training gradients for q(z|x) from the ELBO
- reparametrization for von Mises-Fisher latents is known, but is i) cumbersome and ii) formulated in natural parameterization (one samples a univariate $\omega \sim p(\omega \ | \ \kappa = ||\eta||, D)$).
- we here try a quick idea for $D=2$ and $D=3$, in which one only approximately samples from $q(z|x)$ by drawing $\tilde{z} \sim \mathcal{N}(\tilde{z} \ | \ \mu(x), \sigma_\mu^2)$, where $\mu(x)$ is the mean parameter of the vMF $q(z|x)$, and then setting $z = \tilde{z}/||\tilde{z}||$, which is differentiable almost surely. The open question is which variance function $\sigma^2_\mu$, i.e. which function of $\mu(x)$ (or, more sensibly, of $||\mu(x)||$), gives the best approximation.
- for $D=2,3$, it seems that $\sigma^2_\mu = \frac{1-||\mu||^{8-2D}}{\sqrt{2\pi}}$ works quite well.
- generalization to $D > 3$ currently unclear.

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import vonmises  
from matplotlib.pyplot import cm
from vMFne.utils_angular import cart2spherical, spherical_rotMat
from vMFne.sample import sample_vMF_Ulrich

# Dimension and number of random draws per setting used in this cell.
# Note: both names rebind notebook globals that earlier cells also use.
D, N = 3, 1_000_000

def sigma2(norm_mu):
    """Variance used for the Gaussian draws, as a function of ||μ||.

    Evaluates (1 - norm_mu**(2*(4 - D))) / sqrt(2π). `D` is not a parameter:
    it is read from the enclosing (notebook-global) scope at call time.
    """
    exponent = (4 - D) * 2
    return 1. / np.sqrt(2 * np.pi) * (1 - norm_mu**exponent)

# gradΨ was only in scope if an earlier cell had been run; import it here so
# that this cell works together with its own import header.
from vMFne.negentropy import gradΨ

# Unit direction shared by all mean parameters (last coordinate axis).
mu_base = np.array([0., 0.0, 1.0])[-D:].reshape(1,D)
mu_base = mu_base / np.sqrt( (mu_base**2).sum() )
mu_norms = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

n_rows = int(np.ceil(len(mu_norms)/2.))  # two panels per row

plt.figure(figsize=(8,16))
for jj, mu_norm in enumerate(mu_norms):
    mu = mu_norm * mu_base
    norm_mu = np.sqrt( (mu**2).sum() )

    # kappa = ||gradΨ(mu)||
    kappa = np.linalg.norm(gradΨ(μ=mu,D=D))

    # sample from Gaussian proposal
    renorm = sigma2(norm_mu)
    x = mu + np.random.normal(size=[N,D]) * np.sqrt(renorm)
    # projection of the draws onto the unit sphere; only referenced by the
    # disabled 3D plotting snippet at the end of the loop body
    x_norm = x / np.sqrt((x**2).sum(axis=-1)).reshape(-1,1)

    phi_x = cart2spherical(x.T)

    plt.subplot(n_rows, 2, jj+1)
    if D == 2:
        xx = np.linspace(-np.pi, np.pi, 100)
        # phi_x is reused here (it was previously recomputed from the same x)
        h_x,bins_x = np.histogram(phi_x, bins=xx, density=True)
        phi_mu = np.arctan2(mu[...,1], mu[...,0])
        # von Mises draws shifted by the mean angle and wrapped to [-pi, pi)
        phi_vmf = np.mod(vonmises.rvs(kappa, size=N) + phi_mu + np.pi, 2*np.pi) - np.pi
        plt.hist(phi_vmf, bins=xx, density=True)
        plt.plot(bins_x[:-1]+np.diff(bins_x[:2])[0]/2, h_x)
    elif D == 3:
        xx = np.linspace(0, np.pi, 50)
        x_vmf = sample_vMF_Ulrich(N=N, m=mu.flatten()/norm_mu, kappa=kappa)
        phi_vmf = np.mod(cart2spherical(x_vmf.T) + np.pi, 2*np.pi) - np.pi
        h_vmf,_ = np.histogram(phi_vmf[0], xx, density=True)
        h_x,_   = np.histogram(phi_x[0], xx, density=True)
        plt.plot(xx[:-1] + (xx[1]-xx[0])/2., h_x, label='angles of Gaussian draws')
        plt.plot(xx[:-1] + (xx[1]-xx[0])/2., h_vmf, label='von Mises-Fisher distribution')
    # raw strings throughout: '\k' in a normal string is an invalid escape sequence
    plt.title(r'$||\mu||=' + "{:10.2f}".format(norm_mu) + r', \kappa=' + "{:10.2f}".format(kappa) + '$')
    if jj == 0:
        plt.ylabel('radial profiles of angles')
        plt.legend()

    # The two string literals below are disabled code kept for reference; they
    # are evaluated as no-op expressions on every iteration.
    # NOTE(review): inside the disabled snippet, myheatmap_x is built from
    # WW_vmf rather than WW_x -- fix before re-enabling.
    """
    For 3D plotting (plotting on S^2 in 3D plots), code adapted from
    https://stackoverflow.com/questions/22128909/plotting-the-temperature-distribution-on-a-sphere-with-python
    """
    """
    from mpl_toolkits.mplot3d import Axes3D
    from sklearn.metrics import pairwise

    if D == 3:
        fig = plt.figure()

        u = np.linspace( 0, 2 * np.pi, 120)
        v = np.linspace( 0, np.pi, 60 )

        # create the sphere surface
        XX = np.outer( np.cos( u ), np.sin( v ) )
        YY = np.outer( np.sin( u ), np.sin( v ) )
        ZZ = np.outer( np.ones( np.size( u ) ), np.cos( v ) )
        locs = np.stack([XX.flatten(),YY.flatten(),ZZ.flatten()], axis=-1)

        d0 = 0.1
        WW_vmf = (pairwise.pairwise_distances(locs, x_vmf.T)<d0).sum(axis=-1)
        myheatmap_vmf = WW_vmf.reshape(len(u), len(v)) / WW_vmf.max()
        WW_x = (pairwise.pairwise_distances(locs, x_norm.T)<d0).sum(axis=-1)
        myheatmap_x = WW_vmf.reshape(len(u), len(v)) / WW_x.max()

        # ~ ax.scatter( *zip( *pointList ), color='#dd00dd' )
        ax = fig.add_subplot( 1, 2, 1, projection='3d')
        ax.plot_surface( XX, YY,  ZZ, cstride=1, rstride=1, facecolors=cm.jet( myheatmap_x ) )
        plt.title('angles of Gaussian')

        ax = fig.add_subplot( 1, 2, 2, projection='3d')
        ax.plot_surface( XX, YY,  ZZ, cstride=1, rstride=1, facecolors=cm.jet( myheatmap_vmf ) )
        plt.title('von Mises-Fisher')
        plt.show() 
    """

# keep panel titles from overlapping and render the figure explicitly
plt.tight_layout()
plt.show()
