# Load classic3 dataset

In [None]:
import numpy as np
import pyreadr
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so that randomised steps further down are reproducible.
np.random.seed(0)

# ---- load dataset ----------------------------------------------------------
# Two copies of classic3 are supported; both branches define
#   X_raw     : document-by-word count matrix (dense),
#   labels    : class label of every document,
#   word_lens : number of characters of every word (one entry per column of X_raw).

datasource = 'cclust_package'
if datasource not in ('ZILGM', 'cclust_package'):
    # explicit exception instead of `assert`, which is stripped under `python -O`
    raise ValueError('unknown datasource: ' + repr(datasource))

if datasource == 'ZILGM':
    # retrieved from https://rdrr.io/github/bbeomjin/ZILGM/man/classic3.html
    # N = 3890, D = 5896
    classic3 = pyreadr.read_r('data/classic3.RData')['classic3']
    X_raw = classic3.to_numpy()
    labels = X_raw[:, -1]                              # last column holds the labels
    X_raw = X_raw[:, 1:-1].astype(dtype=np.float32)    # first and last column are not features
    # column names of the feature columns are the words themselves
    word_lens = np.array([len(word) for word in classic3.keys()[1:-1]])

elif datasource == 'cclust_package':
    # retrieved from https://github.com/franrole/cclust_package/blob/master/datasets/classic3.mat
    # N = 3891, D = 4303
    import scipy.io

    classic3 = scipy.io.loadmat('data/classic3.mat')
    X_raw = classic3['A'].toarray()
    labels = classic3['labels']
    terms = classic3['ms']
    word_lens = np.array([len(terms[i, 0][0]) for i in range(terms.shape[0])])

# keep only words with more than 3 characters (drops 1-, 2- and 3-letter words)
idx = np.where(word_lens > 3)[0]
X = X_raw[:, idx]
N, D_raw = X_raw.shape

# remove dead features
Nj = (X > 0).sum(axis=0)  # number of documents containing word j = 1, ..., D
idx = np.where(Nj > 0)[0]
sub_sample_features = False
if sub_sample_features:
    # Optionally keep only the D_max words that occur in the most documents.
    # The cap is derived from the number of live features; the previous code
    # read `D` here before it had ever been assigned (NameError).
    D_max = 200
    D = min(len(idx), D_max)
    # argsort is ascending, so the last D entries are the most frequent words;
    # since D <= number of live features, none of them is a dead feature.
    idx = np.argsort(Nj)[len(Nj) - D:]
D = len(idx)
X, Nj = X[:, idx], Nj[idx]

# tfn scheme - normalized term frequency-inverse document frequency
gj = np.log(N / Nj)  # inverse document frequency weight of word j
# per-document scale that gives every idf-weighted row unit Euclidean norm
# NOTE: a document without any surviving word has zero norm, so its scale is
# infinite and its row of X becomes NaN.
si = 1. / np.sqrt(((X * gj.reshape(1, D))**2).sum(axis=1))

X = X * np.outer(si, gj)

# histogram of the fraction of zero entries per raw document vector
plt.hist(np.mean(X_raw == 0., axis=1), density=True)
plt.xlabel('sparisty of vectors')
plt.ylabel('rel. frequency in dataset')
plt.show()

print('\n')
print('selecting D=' + str(D) + ' features out of ' + str(D_raw) + ' features in full dataset.')
print('\n')

In [None]:
# Two-element setting handed to the vMFne routines below through their `Ψ0=` keyword.
# NOTE(review): the meaning of the two entries is defined inside vMFne - confirm there.
Ψ0 = [0.0, 1.0e-4]

In [None]:
# compute 'true' parameters using known labels:
# per-class mean vectors and class proportions are estimated from the labelled
# data and serve as the supervised reference for the clustering runs below.
from sklearn.metrics import confusion_matrix

from vMFne.negentropy import gradΨ
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
from vMFne.moVMF import posterior_marginal_vMF_mixture_Φ

# The loaders may hand back labels as a 2-D array (scipy.io.loadmat does so by
# default), so flatten once and work with a 1-D view from here on.
labels_flat = np.ravel(labels)
# ls: sorted distinct labels; class_true[i]: index into ls of document i's label
ls, class_true = np.unique(labels_flat, return_inverse=True)
n_classes = len(ls)

# mean feature vector of every class, one row per class
μs_true = np.stack([X[class_true == k].mean(axis=0) for k in range(n_classes)], axis=0)
# empirical class proportions
w_true = np.bincount(class_true, minlength=n_classes) / len(labels_flat)
w_true = w_true / w_true.sum()
print(w_true)
μs_norm = np.linalg.norm(μs_true, axis=1)
print(μs_true.dot(μs_true.T))                                # inner products between class means
print(μs_true.dot(μs_true.T) / np.outer(μs_norm, μs_norm))   # cosine similarities

# One call yields both outputs (the original evaluated it twice with the same
# arguments): the first output is arg-maxed over axis 1 to assign a class,
# the second is log-summed into the log-likelihood.
ph_x_μ_true, px_true_Ψ = posterior_marginal_vMF_mixture_Ψ(X, w_true, μs_true, Ψ0=Ψ0)
LL_true_Ψ = np.log(px_true_Ψ).sum()

class_est_μ_true = np.argmax(ph_x_μ_true, axis=1)
M_μ_true = confusion_matrix(class_true, class_est_μ_true)

# same model evaluated through the Φ parametrisation, using ηs = gradΨ(μs)
ηs_true = gradΨ(μs_true, D=D)
_, px_true_Φ = posterior_marginal_vMF_mixture_Φ(X, w_true, ηs_true)
LL_true_Φ = np.log(px_true_Φ).sum()  # may differ from LL_true_Ψ by a constant offset

plt.subplot(1, 2, 1)
plt.plot(μs_true.T)
plt.title('mean parameters per class')
plt.xlabel('# of feature')
plt.ylabel('μ[# of feature]')

plt.subplot(1, 2, 2)

plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model confusion matrix')
plt.show()

In [None]:
from vMFne.bregman_clustering import (
    posterior_marginal_vMF_mixture_Ψ,
    softBregmanClustering_vMF,
)

# Fit the 3-component mixture several times and keep every run's
# mean parameters, weights and log-likelihood trace.
n_repets = 10
all_μs, all_w, all_LL = [], [], []

for _ in range(n_repets):
    # (initialisation left disabled in the original call: μs_init = μs_true, w_init = w_true)
    μs, w, LL = softBregmanClustering_vMF(X, K=3, max_iter=25, Ψ0=Ψ0, verbose=True)
    all_μs.append(μs)
    all_w.append(w)
    all_LL.append(LL)

# Everything below inspects the LAST run only (μs, w, LL still hold its results).

# log-likelihood trace against the supervised reference level (dashed line)
plt.plot(LL)
plt.plot([0, len(LL)], 2 * [LL_true_Ψ], 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)

# mixture weights, inner products of the learned means and their cosine similarities
print('w', w)
gram_μs = μs.dot(μs.T)
print(gram_μs)
norm_μs = np.linalg.norm(μs, axis=1)
print(gram_μs / np.outer(norm_μs, norm_μs))

# hard assignment per document and the resulting confusion matrix
class_est = np.argmax(ph_x, axis=1)
M = confusion_matrix(class_true, class_est)

# left panel: learned model, right panel: supervised reference
plt.subplot(1, 2, 1)
plt.imshow(M)
plt.colorbar()
plt.title('learned model')

plt.subplot(1, 2, 2)
plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model')

print('M', M)

In [None]:
# One confusion-matrix figure per stored restart, titled with the last entry
# of that restart's log-likelihood trace.
for w, μs, LL in zip(all_w, all_μs, all_LL):
    ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)
    class_est = np.argmax(ph_x, axis=1)  # hard assignment per document
    M = confusion_matrix(class_true, class_est)

    plt.imshow(M)
    plt.colorbar()
    plt.title(f'learned model - LL= {LL[-1]!s}')
    plt.show()

In [None]:
# Overlay the log-likelihood trace of every restart.
# Traces are drawn one by one rather than stacked into a single array, so the
# plot also works when restarts produced traces of different lengths.
for LL_run in all_LL:
    plt.plot(LL_run)
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

In [None]:
# Show the documentation of sklearn's confusion_matrix.
# `help()` is used instead of the IPython-only `name?` syntax so the cell is
# also valid plain Python.
help(confusion_matrix)

In [None]:
from vMFne.moVMF import moVMF, posterior_marginal_vMF_mixture_Φ

# Fit the 3-component model with moVMF, initialised at the supervised
# parameters (ηs_true, w_true) computed from the known labels.
ηs, w, LL = moVMF(
    X,
    K=3,
    max_iter=20,
    verbose=False,
    ηs_init=ηs_true,
    w_init=w_true,
)
ph_x, px = posterior_marginal_vMF_mixture_Φ(X, w, ηs)

# log-likelihood trace against the supervised reference level (dashed line)
plt.plot(LL)
plt.plot([0, len(LL)], 2 * [LL_true_Φ], 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

# hard assignment per document and the resulting confusion matrix
class_est = np.argmax(ph_x, axis=1)
M = confusion_matrix(class_true, class_est)

# left panel: learned model, right panel: supervised reference
plt.subplot(1, 2, 1)
plt.imshow(M)
plt.colorbar()
plt.title('learned model')

plt.subplot(1, 2, 2)
plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model')
