# Load news20 dataset

MC toolkit: https://www.cs.utexas.edu/users/dml/software/mc/

In [None]:
import scipy.sparse
import numpy as np
import matplotlib.pyplot as plt

from vMFne.utils_text import filter_features_against_stopwords, tfn

only_train_data = True  # False: append the test split below the training split


def _load_counts(path):
    """Read a three-column integer file (row id, column id, value; ids are
    1-based) and return it as a dense array with 0-based indices."""
    triplets = np.loadtxt(path, dtype=int)
    rows, cols, values = triplets[:, 0] - 1, triplets[:, 1] - 1, triplets[:, 2]
    return scipy.sparse.coo_array((values, (rows, cols))).todense()


data_train = _load_counts('data/20news_preprocessed/train.data')

if only_train_data:
    data = data_train
    labels = np.loadtxt('data/20news_preprocessed/train.label', dtype=int)
else:
    data_test = _load_counts('data/20news_preprocessed/test.data')
    # The two splits can differ in their highest column index. Zero-pad
    # whichever one is narrower so they can be stacked; padding only the
    # training split would fail when the test split has fewer columns.
    n_features = max(data_train.shape[1], data_test.shape[1])
    data_train = np.pad(data_train, ((0, 0), (0, n_features - data_train.shape[1])))
    data_test = np.pad(data_test, ((0, 0), (0, n_features - data_test.shape[1])))
    data = np.concatenate([data_train, data_test], axis=0)
    labels = np.concatenate([np.loadtxt('data/20news_preprocessed/train.label', dtype=int),
                             np.loadtxt('data/20news_preprocessed/test.label', dtype=int)], axis=0)

N, D_raw = data.shape

# The vocabulary file has more entries than there are columns in `data`
# (the training split alone does not contain the whole dictionary), so keep
# only the first D_raw words.
dictionary = np.loadtxt('data/20news_preprocessed/vocabulary.txt', dtype=str)[:D_raw]

data, dictionary = filter_features_against_stopwords(data, dictionary)

# Drop rows that are all-zero after stopword filtering (in this dataset: the
# one document whose only occurring features were the stopwords 'more',
# 'say' and 'need'), keeping labels and data aligned.
has_counts = data.sum(axis=1) > 0
labels, data = labels[has_counts], data[has_counts]

X = tfn(data, remove_dead_features=True, dtype=np.float32)
N, D = X.shape

# Histogram over rows of X of the fraction of entries that are exactly zero.
plt.hist(np.mean(X == 0., axis=1), density=True)
plt.xlabel('sparsity of vectors')  # label typo ('sparisty') corrected
plt.ylabel('rel. frequency in dataset')
plt.show()

# Report how many feature columns remain in X relative to the raw matrix.
print('\n')
print(f'selecting D={D} features out of {D_raw} features in full dataset.')
print('\n')

In [None]:
# Ψ0 is passed unchanged to the vMFne clustering / posterior routines below.
# [None, 0.] means: compute the correct Ψ(0), but don't numerically integrate
# Ψ'(μ); use an approximation instead.
Ψ0 = [None, 0.]

In [None]:
# compute 'true' parameters using known labels
from vMFne.negentropy import gradΨ
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
from vMFne.moVMF import posterior_marginal_vMF_mixture_Φ

# One mixture component per distinct label: component mean = average of the
# rows of X carrying that label, component weight = relative label frequency.
ls, class_counts = np.unique(labels, return_counts=True)
μs_true = np.stack([X[labels == label].mean(axis=0) for label in ls], axis=0)
w_true = class_counts / len(labels)
w_true = w_true / w_true.sum()
print(w_true)

# Inner products between the class means, raw and normalised by their norms.
μs_norm = np.linalg.norm(μs_true, axis=1)
gram_true = μs_true @ μs_true.T
print(gram_true)
print(gram_true / np.outer(μs_norm, μs_norm))

# The second return value is summed into the log-likelihood of the
# label-derived model.
_, log_px_true_Ψ = posterior_marginal_vMF_mixture_Ψ(X, w_true, μs_true, Ψ0=Ψ0)
LL_true_Ψ = log_px_true_Ψ.sum()

from sklearn import metrics
from sklearn.metrics import confusion_matrix

def mi(class_true, class_est):
    """Return the mutual information between two label assignments.

    The sparse contingency table of the two labelings is built once, cast to
    float64 and handed to sklearn's ``mutual_info_score``.
    """
    contingency = metrics.cluster.contingency_matrix(class_true, class_est, sparse=True)
    contingency = contingency.astype(np.float64, copy=False)
    return metrics.cluster.mutual_info_score(class_true, class_est, contingency=contingency)

# Map every label to its position in the sorted unique label array `ls`,
# stored as floats (0., 1., ...).
class_true = np.searchsorted(ls, labels).astype(np.float64)

# Hard assignments under the label-derived model: argmax over components of
# the first return value.
ph_x_μ_true, _ = posterior_marginal_vMF_mixture_Ψ(X, w_true, μs_true, Ψ0=Ψ0)
class_est_μ_true = ph_x_μ_true.argmax(axis=1)
M_μ_true = confusion_matrix(class_true, class_est_μ_true)

# Same model evaluated through gradΨ of the means and the Φ-based routine.
ηs_true = gradΨ(μs_true, D=D)
_, log_px_true_Φ = posterior_marginal_vMF_mixture_Φ(X, w_true, ηs_true)
LL_true_Φ = log_px_true_Φ.sum()  # may differ from LL_true_Ψ by a constant offset

plt.figure(figsize=(12, 6))

# left panel: one curve per class mean vector
plt.subplot(1, 2, 1)
plt.plot(μs_true.T)
plt.title('mean parameters per class')
plt.xlabel('# of feature')
plt.ylabel('μ[# of feature]')

# right panel: confusion matrix of the label-derived model
plt.subplot(1, 2, 2)
plt.imshow(M_μ_true)
plt.colorbar()
plt.title('supervised model confusion matrix')
mi_supervised = mi(class_est_μ_true, class_true)
plt.ylabel(f'MI={mi_supervised!s}')
plt.show()

In [None]:
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
from vMFne.bregman_clustering import softBregmanClustering_vMF

K = 20

# Run the soft clustering starting from the label-derived parameters.
μs, w, LL = softBregmanClustering_vMF(
    X,
    K=K,
    max_iter=50,
    Ψ0=Ψ0,
    verbose=True,
    μs_init=μs_true,
    w_init=w_true,
)

# Confusion matrix of the hard (argmax) assignments against the true classes.
ph_x, _ = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)
class_est = ph_x.argmax(axis=1)
M = confusion_matrix(class_true, class_est)
plt.imshow(M)
plt.ylabel(f'MI={mi(class_est, class_true)!s}')
plt.colorbar()
plt.title(f'learned model - LL= {LL[-1]!s}')
plt.show()

# Log-likelihood of the label-derived model, drawn as a dashed reference line.
_, log_px_ref = posterior_marginal_vMF_mixture_Ψ(X, w_true, μs_true, Ψ0=Ψ0)
LL_true = log_px_ref.sum()

plt.plot(LL)
plt.plot([0, len(LL) - 1], [LL_true] * 2, 'k--')
plt.show()


In [None]:
from vMFne.bregman_clustering import spherical_kmeans, softBregmanClustering_vMF
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
from sklearn.metrics import confusion_matrix
import scipy

# Results of the soft clustering runs ...
all_μs, all_w, all_LL = [], [], []
# ... and of the spherical k-means runs used to initialise them.
all_μs_kmean, all_w_kmean, all_c_kmean = [], [], []

n_repets = 10
for rep in range(1, n_repets + 1):
    _, w, c = spherical_kmeans(X, K=K, max_iter=100, verbose=False)
    # cluster means recomputed from the hard k-means assignments
    μs = np.stack([X[c == k].mean(axis=0) for k in range(K)], axis=0)

    # store copies (multiplication allocates new arrays) before handing the
    # k-means result to the soft clustering as its initialisation
    all_c_kmean += [1 * c]
    all_w_kmean += [1. * w]
    all_μs_kmean += [1. * μs]

    μs, w, LL = softBregmanClustering_vMF(X, K=K, max_iter=100, w_init=w, μs_init=μs, Ψ0=Ψ0, verbose=False)
    all_μs += [μs]
    all_w += [w]
    all_LL += [LL]
    print(f' - {rep}/{n_repets}')

# One figure per repetition: left panel = spherical k-means parameters,
# right panel = soft clustering parameters. Both panels go through the same
# steps, so they share one inner loop.
MIs = np.zeros(n_repets)
for i in range(n_repets):
    plt.figure(figsize=(12, 6))
    LL = all_LL[i]
    panels = (
        (all_w_kmean[i], all_μs_kmean[i], 'learned model - sph. K-means'),
        (all_w[i], all_μs[i], f'learned model - LL= {LL[-1]!s}'),
    )
    for panel, (w, μs, panel_title) in enumerate(panels, start=1):
        plt.subplot(1, 2, panel)
        ph_x, _ = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)
        class_est = np.argmax(ph_x, axis=1)
        M = confusion_matrix(class_true, class_est)
        # Match estimated clusters to true classes by maximising the total
        # overlap (assignment problem on the negated, transposed confusion
        # matrix), then relabel the estimates accordingly.
        _, idx_class_align = scipy.optimize.linear_sum_assignment(-M.T)
        class_est_aligned = idx_class_align[class_est]
        M = confusion_matrix(class_true, class_est_aligned)
        plt.imshow(M)
        plt.colorbar()
        plt.title(panel_title)
        # MIs[i] is written by both panels; the soft-clustering value from
        # the second panel is the one that remains after the loop.
        MIs[i] = mi(class_true.flatten(), class_est.flatten())
        plt.ylabel(f'MI={MIs[i]!s}')
    plt.show()

# Stack the per-run log-likelihood traces: one row per run.
LL_traces = np.stack(all_LL, axis=0)

# left: traces of all runs, with the label-derived model as dashed reference
plt.subplot(1, 2, 1)
plt.plot(LL_traces.T)
plt.plot([0, len(LL)], [LL_true_Ψ] * 2, 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')

# right: final log-likelihood of each run against its mutual information
plt.subplot(1, 2, 2)
plt.plot(LL_traces[:, -1], MIs, 'o')
plt.show()

In [None]:
# Re-plot the confusion matrices for the first n_repets stored runs.
n_repets = 7
MIs = np.zeros(n_repets)
for i in range(n_repets):
    plt.figure(figsize=(12, 6))
    LL = all_LL[i]
    # (subplot index, weights, means, title): k-means left, soft clustering right
    panel_specs = [
        (1, all_w_kmean[i], all_μs_kmean[i], 'learned model - sph. K-means'),
        (2, all_w[i], all_μs[i], f'learned model - LL= {LL[-1]!s}'),
    ]
    for panel, w, μs, panel_title in panel_specs:
        plt.subplot(1, 2, panel)
        ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)
        class_est = np.argmax(ph_x, axis=1)
        M = confusion_matrix(class_true, class_est)
        # Align estimated cluster indices with true classes by solving the
        # assignment problem that maximises the overlap counts.
        _, idx_class_align = scipy.optimize.linear_sum_assignment(-M.T)
        class_est_aligned = idx_class_align[class_est]
        M = confusion_matrix(class_true, class_est_aligned)
        plt.imshow(M)
        plt.colorbar()
        plt.title(panel_title)
        # overwritten by the second panel, so the soft-clustering MI is kept
        MIs[i] = mi(class_true.flatten(), class_est.flatten())
        plt.ylabel(f'MI={MIs[i]!s}')
    plt.show()

# Stack the stored log-likelihood traces: one row per run.
LL_traces = np.stack(all_LL, axis=0)

# left: traces of all stored runs, with the label-derived model as dashed
# reference line
plt.subplot(1,2,1)
plt.plot(LL_traces.T)
plt.plot([0, len(LL)], [LL_true_Ψ,LL_true_Ψ], 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')

# right: final log-likelihood against mutual information. MIs only covers
# the runs evaluated in this cell, while all_LL may hold more stored runs,
# so restrict the x-values to the first len(MIs) runs to keep the lengths
# equal.
plt.subplot(1,2,2)
plt.plot(LL_traces[:len(MIs), -1], MIs, 'o')
plt.show()

In [None]:
# For each run, plot the 200 smallest values of the per-row maximum of ph_x,
# in ascending order.
for i in range(n_repets):
    ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, all_w[i], all_μs[i], Ψ0=Ψ0)
    max_posterior = np.max(ph_x, axis=-1)
    lowest = np.sort(max_posterior)[:200]
    plt.plot(lowest)
plt.show()

In [None]:
from vMFne.bregman_clustering import softBregmanClustering_vMF, spherical_kmeans
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
import scipy

all_μs, all_w, all_LL = [], [], []

n_repets = 3
K = 20
for rep in range(n_repets):
    # No initial parameters are supplied (μs_init=None, w_init=None), so the
    # initialisation is left to softBregmanClustering_vMF. The alternative
    # used elsewhere in this notebook is to initialise from spherical k-means.
    μs, w, LL = softBregmanClustering_vMF(
        X, K=K, max_iter=50, Ψ0=Ψ0, verbose=True, μs_init=None, w_init=None
    )
    all_μs += [μs]
    all_w += [w]
    all_LL += [LL]


# Confusion matrix and mutual information for every stored run.
MIs = np.zeros(len(all_LL))
for i, (w, μs, LL) in enumerate(zip(all_w, all_μs, all_LL)):
    ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψo) if False else posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)
    class_est = ph_x.argmax(axis=1)
    M = confusion_matrix(class_true, class_est)
    # relabel estimated clusters so they line up with the true classes
    # (assignment maximising the overlap counts)
    _, idx_class_align = scipy.optimize.linear_sum_assignment(-M.T)
    class_est_aligned = idx_class_align[class_est]
    M = confusion_matrix(class_true, class_est_aligned)
    plt.imshow(M)
    plt.colorbar()
    plt.title(f'learned model - LL= {LL[-1]!s}')
    MIs[i] = mi(class_true.flatten(), class_est.flatten())
    plt.ylabel(f'MI={MIs[i]!s}')
    plt.show()

# final log-likelihood of each run against its mutual information
final_LL = np.stack(all_LL, axis=0)[:, -1]
plt.plot(final_LL, MIs, 'o')
plt.show()

In [None]:
from vMFne.bregman_clustering import softBregmanClustering_vMF, spherical_kmeans
from vMFne.bregman_clustering import posterior_marginal_vMF_mixture_Ψ
import scipy

all_μs, all_w, all_LL = [], [], []

n_repets = 3
K = 20
for rep in range(n_repets):
    _, w, c = spherical_kmeans(X=X, K=K, max_iter=20, verbose=False)
    # cluster means recomputed from the hard k-means assignments
    cluster_means = [X[c == k].mean(axis=0) for k in range(K)]
    μs = np.stack(cluster_means, axis=0)
    all_μs += [μs]
    all_w += [w]
    # No log-likelihood trace is recorded for these runs; an all-ones
    # placeholder is stored instead, so the LL value read from all_LL
    # further down is just 1.0.
    all_LL += [np.ones(20)]

# Confusion matrix and mutual information for every stored run.
MIs = np.zeros(len(all_LL))
for i, (w, μs, LL) in enumerate(zip(all_w, all_μs, all_LL)):
    ph_x, px = posterior_marginal_vMF_mixture_Ψ(X, w, μs, Ψ0=Ψ0)
    class_est = ph_x.argmax(axis=1)

    # Confusion matrix once on the raw cluster indices, then again after
    # permuting them to best match the true classes.
    M = confusion_matrix(class_true, class_est)
    _, idx_class_align = scipy.optimize.linear_sum_assignment(-M.T)
    class_est_aligned = idx_class_align[class_est]
    M = confusion_matrix(class_true, class_est_aligned)

    plt.imshow(M)
    plt.colorbar()
    plt.title(f'learned model - LL= {LL[-1]!s}')
    MIs[i] = mi(class_true.flatten(), class_est.flatten())
    plt.ylabel(f'MI={MIs[i]!s}')
    plt.show()

# last entry of each stored LL array against the run's mutual information
last_LL = np.stack(all_LL, axis=0)[:, -1]
plt.plot(last_LL, MIs, 'o')
plt.show()

In [None]:
from vMFne.moVMF import moVMF, posterior_marginal_vMF_mixture_Φ

# Fit the mixture with moVMF, starting from the label-derived parameters.
# K is taken from the initial weights so it cannot disagree with the number
# of components in ηs_init / w_init (one per class label); a hard-coded K=3
# did not match them.
ηs, w, LL = moVMF(X, K=len(w_true), max_iter=20, verbose=False, ηs_init=ηs_true, w_init=w_true)
ph_x, px = posterior_marginal_vMF_mixture_Φ(X, w, ηs)

# LL per iteration, with the label-derived model as dashed reference line
plt.plot(LL)
plt.plot([0, len(LL)], [LL_true_Φ, LL_true_Φ], 'k--')
plt.xlabel('iteration')
plt.ylabel('log-likelihood')
plt.show()

# Hard assignments of the fitted model and their confusion matrix.
class_est = np.argmax(ph_x, axis=1)
M = confusion_matrix(class_true, class_est)

# left: fitted model, right: label-derived ("supervised") model
confusion_panels = [(M, 'learned model'), (M_μ_true, 'supervised model')]
for panel, (matrix, panel_title) in enumerate(confusion_panels, start=1):
    plt.subplot(1, 2, panel)
    plt.imshow(matrix)
    plt.colorbar()
    plt.title(panel_title)
