In [97]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy as scp
from scipy import sparse

NUM_SAMPLES = 200
LARGE_VAL = 10000000
ITER_THRESH = 5

In [98]:
#A function to get the 20 newsgroup data
def get_data():
    #news_groups_all = fetch_20newsgroups(subset='all')
    #news_data = news_groups_all.data

    #Load in the vectorized news group data from scikit-learn package
    news = fetch_20newsgroups(subset='all')
    all_data = np.array(news.data)
    all_targets = np.array(news.target)
    class_names = news.target_names

    #Set class pairings as described in the multiview clustering paper
    view1_classes = ['comp.graphics','rec.motorcycles', 'sci.space', 'rec.sport.hockey', 'comp.sys.ibm.pc.hardware']
    view2_classes = ['rec.autos', 'sci.med','misc.forsale', 'soc.religion.christian','comp.os.ms-windows.misc']
    
    #Create lists to hold data and labels for each of the 5 classes across 2 different views
    labels =  [num for num in range(len(view1_classes)) for _ in range(NUM_SAMPLES)]
    labels = np.array(labels)
    view1_data = list()
    view2_data = list()
    
    #Randomly sample 200 items from each of the selected classes in view1
    for ind in range(len(view1_classes)):
        class_num = class_names.index(view1_classes[ind])
        class_data = all_data[(all_targets == class_num)]
        indices = np.random.choice(class_data.shape[0], NUM_SAMPLES)
        view1_data.append(class_data[indices])
    view1_data = np.concatenate(view1_data)
   
        
    #Randomly sample 200 items from each of the selected classes in view2
    for ind in range(len(view2_classes)):
        class_num = class_names.index(view2_classes[ind])
        class_data = all_data[(all_targets == class_num)]
        indices = np.random.choice(class_data.shape[0], NUM_SAMPLES)
        view2_data.append(class_data[indices])  
    view2_data = np.concatenate(view2_data)
    
    #Vectorize the data
    vectorizer = TfidfVectorizer()
    view1_data = vectorizer.fit_transform(view1_data)
    view2_data = vectorizer.fit_transform(view2_data)

    #Shuffle and normalize vectors
    shuffled_inds = np.random.permutation(NUM_SAMPLES * len(view1_classes))
    view1_data = sparse.vstack(view1_data)
    view2_data = sparse.vstack(view2_data)
    view1_data = np.array(view1_data[shuffled_inds].todense())
    view2_data = np.array(view2_data[shuffled_inds].todense())
    labels = labels[shuffled_inds]

    return view1_data, view2_data, labels


In [99]:
v1_data, v2_data, labels = get_data()

In [101]:
def compute_posterior(data, w_probs, alphas, k):
    likes = list()
    for ind in range(k):
        like = np.power(w_probs[ind], data)
        like = np.prod(like, axis=1)
        likes.append(like)  
    likes = np.vstack(likes)
    likes_p = likes * alphas.reshape((-1, 1))
    likes_sum = np.sum(likes_p, axis=1).reshape((-1, 1))
    likes_sum[likes_sum == 0] = 1
    posterior = likes_p / likes_sum
    log_like = np.sum(np.log2(likes_sum))
    return posterior, log_like

def compute_posterior2(data, w_probs, alphas, k):
    likes = list()
    for ind in range(k):
        like = data * w_probs[ind]
        like = np.sum(like, axis=1)
        likes.append(like)  
    likes = np.vstack(likes)
    likes_p = likes * alphas.reshape((-1, 1))
    likes_sum = np.sum(likes_p, axis=1).reshape((-1, 1))
    likes_sum[likes_sum == 0] = 1
    posterior = likes_p / likes_sum
    log_like = np.sum(np.log2(likes_sum))
    return posterior, log_like

def iterate(data, posteriors, k):
    
    #For each of the mixture components, compute model params
    w_probs = list()
    for ind in range(k):
        numer = data * posteriors[ind].reshape((-1, 1))
        numer = 1 + np.sum(numer, axis=0)
        denom = np.sum(numer)
        if(denom == 0):
            denom = 1
        probs = numer/denom
        w_probs.append(probs)
    w_probs = np.vstack(w_probs)
    alphas = np.mean(posteriors, axis=1)

    #Compute new posterior
    new_posteriors, log_like = compute_posterior(data, w_probs, alphas, k)
    return w_probs, alphas, new_posteriors, log_like

def final_clusters(posteriors):
    metric = posteriors[0] + posteriors[1]
    f_clusters = np.argmax(metric, axis = 0)
    return f_clusters

def compute_entropy(partitions, labels, k, num_classes):
    
    total_entropy = 0
    num_examples = partitions.shape[0]
    for part in range(k):
        labs = labels[partitions == part]
        part_size = labs.shape[0]
        part_entropy = 0
        for cl in range(num_classes):
            prop = np.sum(labs == cl) * 1.0 / part_size
            ent = 0
            if(prop != 0):
                ent = - prop * np.log2(prop)
            part_entropy += ent
        part_entropy = part_entropy * part_size / num_examples
        total_entropy += part_entropy
    return total_entropy
    

In [102]:
#The main kmeans clustering algorithm
def multinomial(v_data, labels, k = 5):

    #Initialize cluster centers, partitions, and loop params
    w_probs2 = np.random.random((k, v_data[1].shape[1]))
    w_probs2 /= np.linalg.norm(w_probs2, axis=1).reshape((-1, 1))
    w_probs = [None, w_probs2]                              
    alphas2 = (1/k) * np.ones((k,))
    alphas = [None, alphas2]
    
    posterior2, log_likes = compute_posterior(v_data[1], w_probs[1], alphas[1], k)
    posteriors = [None, posterior2]
    objective = [0, 0]
    iter_stall = 0
    iter_num = 0
    entropy = 0
    
    while(iter_stall < ITER_THRESH):
        iter_num += 1
        view = (iter_num + 1) % 2
        
        #Switch partitions, Maximization, and Expectation
        w_probs[view], alphas[view], posteriors[view], log_like = iterate(v_data[view], posteriors[(view + 1) % 2], k)
        iter_stall += 1
        #Recompute objective function
        if(log_like > objective[view]):
            objective[view] = log_like
            iter_stall = 0

        #Obtain evaluation metrics
        f_clusters = final_clusters(posteriors)
        entropy = compute_entropy(f_clusters, labels, k, 5)

    return entropy
        

In [103]:
ent = multinomial([v1_data, v2_data], labels, 5)
print(ent)



nan
