# Load classic3 dataset

In [None]:
import numpy as np
import pyreadr
import matplotlib.pyplot as plt

from vMFne.utils_text import filter_features_against_stopwords, tfn

np.random.seed(0)

# load dataset

# Which copy of classic3 to load; the two sources differ in size and vocabulary.
datasource = 'cclust_package'
assert datasource in ['ZILGM', 'cclust_package']

if datasource == 'ZILGM':
    # retrieved from https://rdrr.io/github/bbeomjin/ZILGM/man/classic3.html
    # N = 3890, D = 5896, but highly redundant, e.g. 'cir', 'circul', 'circular', 'circulatori'
    classic3 = pyreadr.read_r('data/classic3.RData')['classic3']
    X_raw = classic3.to_numpy()
    labels = X_raw[:,-1]
    # The first column is dropped and the last column holds the labels;
    # the words are the column names in between.
    words = classic3.keys()[1:-1]
    X_raw = X_raw[:,1:-1].astype(dtype=np.float32)
    # Count feature columns only (after stripping the two non-feature columns).
    D_raw = X_raw.shape[1]
    word_lens = np.array([ len(word) for word in words ])
    # Build a dictionary so the stop-word filter below has something to work on.
    # The layout mirrors classic3['ms'] of the other source (dictionary[i,0][0] is the word).
    # NOTE(review): confirm this is the format filter_features_against_stopwords expects.
    dictionary = np.empty((len(words), 1), dtype=object)
    for j, word in enumerate(words):
        dictionary[j, 0] = np.array([word])

elif datasource == 'cclust_package':
    # retrieved from https://github.com/franrole/cclust_package/blob/master/datasets/classic3.mat
    # N = 3891, D = 4303, a bunch of which are 2-letter (not even words), but otherwise seems sensible
    import scipy.io

    classic3 = scipy.io.loadmat('data/classic3.mat')
    X_raw = classic3['A'].toarray()
    D_raw = X_raw.shape[1]
    labels = classic3['labels']
    word_lens = np.array([ len(classic3['ms'][i,0][0]) for i in range(classic3['ms'].shape[0]) ])
    dictionary = classic3['ms']

    # remove 2-letter words
    idx = word_lens > 2
    X_raw, dictionary = X_raw[:,idx], dictionary[idx]
    word_lens = word_lens[idx]

X_raw, dictionary = filter_features_against_stopwords(X_raw, dictionary)
X = tfn(X_raw, remove_dead_features=True)

# Take the dimensions from the matrix that is actually used downstream, so that
# N and D stay correct if the two preprocessing calls above drop any features.
N, D = X.shape

# Fraction of exactly-zero entries per row of X.
plt.hist(np.mean(X==0., axis=1), density=True)
plt.xlabel('sparsity of vectors')
plt.ylabel('rel. frequency in dataset')
plt.show()

print('\n')
print('selecting D=' + str(D) + ' features out of ' + str(D_raw) + ' features in full dataset.')
print('\n')

In [None]:
# Passed unchanged as the `Ψ0=` keyword to the Ψ-based vMFne routines used in this notebook.
# NOTE(review): the meaning of the two entries is defined inside vMFne — confirm there.
Ψ0 = [None, 0]

In [None]:
# compute 'true' parameters using known labels
from vMFne.negentropy import gradΨ
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
from vMFne.moVMF import posterior_marginal_vMF_mixture_Φ
import scipy

# Distinct label values; one mixture component is built per value.
ls = np.unique(labels)

# Component mean: average of the rows of X whose label equals that value.
μs_true = np.stack(
    [np.mean(X[np.nonzero(labels == lab)[0]], axis=0) for lab in ls],
    axis=0,
)

# Component weight: relative label frequency, renormalised to sum to one.
w_true = np.array([np.sum(labels == lab) for lab in ls]) / len(labels)
w_true /= w_true.sum()
print(w_true)

# Inner products between the class means, raw and then cosine-normalised.
μs_norm = np.linalg.norm(μs_true, axis=1)
G_true = μs_true @ μs_true.T
print(G_true)
print(G_true / np.outer(μs_norm, μs_norm))

# Only the second return value (per-sample term) is needed for the summed score.
log_px_true_Ψ = posterior_marginal_vMF_mixture_Ψ(X, w_true, μs_true, Ψ0=Ψ0)[1]
LL_true_Ψ = log_px_true_Ψ.sum()

from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Calculate the MI for the two clusterings
def mi(class_true, class_est):
    """Return the mutual information between two labelings.

    The contingency table is built in sparse form, cast to float64 and handed
    to ``mutual_info_score`` through its ``contingency`` argument.
    """
    table = metrics.cluster.contingency_matrix(class_true, class_est, sparse=True)
    table = table.astype(np.float64, copy=False)
    return metrics.cluster.mutual_info_score(class_true, class_est, contingency=table)

# Index (as float) of each sample's label within `ls`; keeps the shape of `labels`.
class_true = sum([ (1.*i) * (labels==ls[i]) for i in range(len(ls))])
ph_x_μ_true_Ψ, _ = posterior_marginal_vMF_mixture_Ψ(X,w_true,μs_true, Ψ0=Ψ0)
# Hard assignment: component with the largest value in each row.
class_est_μ_true = np.argmax(ph_x_μ_true_Ψ,axis=1)
M_μ_true = confusion_matrix(class_true, class_est_μ_true)
print(M_μ_true)

# Use the width of μs_true itself as the dimension, so it cannot disagree
# with a stale global D (e.g. one computed before feature filtering).
ηs_true = gradΨ(μs_true,D=μs_true.shape[1])
ph_x_μ_true_Φ, log_px_true_Φ = posterior_marginal_vMF_mixture_Φ(X,w_true,ηs_true)
LL_true_Φ = log_px_true_Φ.sum() # may differ from LL_true_Ψ by a constant offset


plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.plot(μs_true.T)
plt.title('mean parameters per class')
plt.xlabel('# of feature')
plt.ylabel('μ[# of feature]')

plt.subplot(1,2,2)

plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model confusion matrix')
# Flatten both label vectors and pass them as (true, estimate), matching the
# other mi(...) calls in this notebook; mutual information is symmetric.
plt.ylabel('MI='+str(mi(np.ravel(class_true), np.ravel(class_est_μ_true))))
plt.show()

In [None]:
from vMFne.moVMF import moVMF, posterior_marginal_vMF_mixture_Φ

# Imported explicitly: a bare `import scipy` does not load the optimize
# submodule on older SciPy versions.
import scipy.optimize

all_ηs, all_w, all_LL = [], [], []

# Fit the mixture several times and keep the result of every run.
n_repets = 3
K = 3
for ii in range(n_repets):
    ηs, w, LL = moVMF(X=X, K=K, max_iter=100, verbose=True)
    all_ηs.append(ηs)
    all_w.append(w)
    all_LL.append(LL)

# One mutual-information score per run, against the known classes.
MIs = np.zeros(len(all_LL))
for i, (w, ηs, LL) in enumerate(zip(all_w, all_ηs, all_LL)):
    ph_x, px = posterior_marginal_vMF_mixture_Φ(X,w,ηs)
    class_est = np.argmax(ph_x,axis=1)
    M = confusion_matrix(class_true, class_est)
    # Cluster indices are arbitrary: match every estimated cluster (row of M.T)
    # to a true class (column) so that the matched counts are maximal.
    _, idx_class_align = scipy.optimize.linear_sum_assignment(-M.T)
    class_est_aligned = idx_class_align[class_est]
    M = confusion_matrix(class_true, class_est_aligned)
    plt.imshow(M)
    plt.colorbar()
    plt.title('learned model - LL= ' + str(LL[-1]))
    # Mutual information does not depend on the labelling, so the unaligned estimate is fine.
    MIs[i] = mi(class_true.flatten(), class_est.flatten())
    plt.ylabel('MI='+str(MIs[i]))
    plt.show()

# Last LL entry of each run vs. its MI. Taking the last entry per run
# (instead of stacking the traces) also works when traces differ in length.
plt.plot([LL_run[-1] for LL_run in all_LL], MIs, 'o')
plt.show()

In [None]:
from vMFne.bregman_clustering import softBregmanClustering_vMF, spherical_kmeans
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
import scipy

# Imported explicitly: a bare `import scipy` does not load the optimize
# submodule on older SciPy versions.
import scipy.optimize

all_μs, all_w, all_LL = [], [], []

n_repets = 3
K = 3
kmeans_iters = 20
for ii in range(n_repets):
    _, w, c = spherical_kmeans(X=X, K=K, max_iter=kmeans_iters, verbose=False)
    # Mean of the rows assigned to each cluster.
    μs = np.stack([X[c==k].mean(axis=0) for k in range(K)],axis=0)
    all_μs.append(μs)
    all_w.append(w)
    # No likelihood trace is collected for k-means here; a constant placeholder
    # keeps the plotting code below identical to the other cells.
    all_LL.append(np.ones(kmeans_iters))

# One mutual-information score per run, against the known classes.
MIs = np.zeros(len(all_LL))
for i, (w, μs, LL) in enumerate(zip(all_w, all_μs, all_LL)):
    ph_x, px = posterior_marginal_vMF_mixture_Ψ(X,w,μs, Ψ0=Ψ0)
    class_est = np.argmax(ph_x,axis=1)
    M = confusion_matrix(class_true, class_est)
    # Cluster indices are arbitrary: match every estimated cluster (row of M.T)
    # to a true class (column) so that the matched counts are maximal.
    _, idx_class_align = scipy.optimize.linear_sum_assignment(-M.T)
    class_est_aligned = idx_class_align[class_est]
    M = confusion_matrix(class_true, class_est_aligned)
    plt.imshow(M)
    plt.colorbar()
    plt.title('learned model - LL= ' + str(LL[-1]))
    # Mutual information does not depend on the labelling, so the unaligned estimate is fine.
    MIs[i] = mi(class_true.flatten(), class_est.flatten())
    plt.ylabel('MI='+str(MIs[i]))
    plt.show()

# Last (placeholder) LL entry of each run vs. its MI.
plt.plot([LL_run[-1] for LL_run in all_LL], MIs, 'o')
plt.show()

In [None]:
from vMFne.bregman_clustering import softBregmanClustering_vMF, spherical_kmeans
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ

all_μs, all_w, all_LL = [], [], []

n_repets = 3
K = 3
for ii in range(n_repets):
    # Stage 1: hard partition from spherical k-means; per-cluster row means of X
    # become the initial means.
    _, w, c = spherical_kmeans(X=X, K=K, max_iter=20, verbose=False)
    μs = np.stack([X[c == k].mean(axis=0) for k in range(K)], axis=0)

    # Stage 2: soft Bregman clustering started from that initialisation.
    μs, w, LL = softBregmanClustering_vMF(
        X,
        K=K,
        max_iter=100,
        Ψ0=Ψ0,
        verbose=False,
        μs_init=μs,
        w_init=w,
    )

    all_μs.append(μs)
    all_w.append(w)
    all_LL.append(LL)

# Trace of the last run, with the supervised reference as a dashed line.
plt.plot(LL)
plt.plot([0, len(LL)], [LL_true_Ψ] * 2, 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)

# Weights and inner products of the last run's means, raw and cosine-normalised.
print('w', w)
G = μs @ μs.T
print(G)
norm_μs = np.linalg.norm(μs, axis=1)
print(G / np.outer(norm_μs, norm_μs))

In [None]:
import scipy

# Imported explicitly: a bare `import scipy` does not load the optimize
# submodule on older SciPy versions.
import scipy.optimize

# One mutual-information score per stored run, against the known classes.
MIs = np.zeros(len(all_LL))
for i, (w, μs, LL) in enumerate(zip(all_w, all_μs, all_LL)):
    ph_x, px = posterior_marginal_vMF_mixture_Ψ(X,w,μs, Ψ0=Ψ0)
    class_est = np.argmax(ph_x,axis=1)
    M = confusion_matrix(class_true, class_est)
    # Cluster indices are arbitrary: match every estimated cluster (row of M.T)
    # to a true class (column) so that the matched counts are maximal.
    _, idx_class_align = scipy.optimize.linear_sum_assignment(-M.T)
    class_est_aligned = idx_class_align[class_est]
    M = confusion_matrix(class_true, class_est_aligned)
    plt.imshow(M)
    plt.colorbar()
    plt.title('learned model - LL= ' + str(LL[-1]))
    # Mutual information does not depend on the labelling, so the unaligned estimate is fine.
    MIs[i] = mi(class_true.flatten(), class_est.flatten())
    plt.ylabel('MI='+str(MIs[i]))
    plt.show()

# Last LL entry of each run vs. its MI. Taking the last entry per run
# (instead of stacking the traces) also works when traces differ in length.
plt.plot([LL_run[-1] for LL_run in all_LL], MIs, 'o')
plt.show()

In [None]:
from vMFne.moVMF import moVMF, posterior_marginal_vMF_mixture_Φ

# Run moVMF starting from the supervised parameters (ηs_true, w_true).
ηs, w, LL = moVMF(X, K=3, max_iter=20, verbose=False, ηs_init = ηs_true, w_init = w_true)
ph_x, px = posterior_marginal_vMF_mixture_Φ(X,w,ηs)

# Trace of the fit, with the supervised reference as a dashed line.
plt.plot(LL)
plt.plot([0, len(LL)], [LL_true_Φ,LL_true_Φ], 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

# Left: confusion matrix of the refined model. No cluster alignment is applied.
plt.subplot(1,2,1)
class_est = np.argmax(ph_x,axis=1)
M = confusion_matrix(class_true, class_est)
plt.imshow(M)
plt.colorbar()
plt.title('learned model')

# Right: confusion matrix of the supervised model, for comparison.
plt.subplot(1,2,2)
plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model')
# Show the two-panel figure explicitly, as every other cell does.
plt.show()
