In [1]:
import corex as cx
import vis_corex as vcx
from gensim import models
import numpy as np
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from os.path import basename
from itertools import groupby

# Load 20Newsgroups

In [2]:
# Arguments forwarded to fetch_20newsgroups below.
categories = None                  # None: do not restrict to a subset of newsgroups
remove = ('headers', 'footers')    # parts of each post stripped before vectorizing

In [3]:
# Both splits are fetched with identical settings so they can be pooled below.
fetch_options = dict(categories=categories, shuffle=True, random_state=42, remove=remove)
data_train = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)

# Pool train + test; the three lists below are aligned by position (train first, then test).
all_data = data_train.data + data_test.data
all_names = [basename(path) for split in (data_train, data_test) for path in split.filenames]
# (document index, newsgroup label) pairs, where the index is the position in all_data.
all_labels = list(enumerate(list(data_train.target) + list(data_test.target)))

# vectorize

In [4]:
# Settings forwarded to CountVectorizer in the next cell.
binarize = True      # passed as `binary=`
max_vocab = 10000    # passed as `max_features=`; matches the 10000 columns of the vectorized data

In [5]:
# Unigram vectorizer configured from the settings cell above.
vectorized_unigrams = CountVectorizer(binary=binarize, max_features=max_vocab)

# Fit on the pooled corpus and densify the result.
# NOTE(review): the dense matrix is documents x max_vocab; at the vectorizer's
# default integer dtype this is presumably well over 1 GB -- confirm it fits in memory.
vectorized_data = vectorized_unigrams.fit_transform(all_data).toarray()
vectorized_data.shape

(18846, 10000)

# Build single-layer `CorEx` representation

## hyperparameters

In [6]:
num_hidden = 20       # passed as n_hidden: number of hidden factors Y_j (presumably "m" in the paper -- TODO confirm)
cluster_dim = 2       # passed as dim_hidden: presumably the number of discrete values each Y_j can take ("k"?) -- TODO confirm
max_samples = 10000   # passed as max_samples; note this is smaller than the number of documents
max_iter = 50         # 20 iterations didn't come close to convergence
random_seed = 1978    # passed as seed

## fit

In [None]:
# Configure (but do not yet fit) the single CorEx layer from the hyperparameter cell.
corex_layer_1 = cx.Corex(
    n_hidden=num_hidden,
    dim_hidden=cluster_dim,
    marginal_description='discrete',  # for discrete data
    max_iter=max_iter,
    max_samples=max_samples,
    seed=random_seed,
    verbose=True,
    n_cpu=2,  # NOTE(review): hard-coded resource settings (n_cpu, ram) -- adjust for the host machine
    ram=8,
)

corex, rep size: 20 2
Marginal description:  discrete


In [None]:
# Fit the layer on the dense document-term matrix and report wall-clock time.
t0 = time()
ys_layer_1 = corex_layer_1.fit_transform(vectorized_data)
elapsed_seconds = time() - t0
print("time to compute: {} seconds".format(elapsed_seconds))

[-0.001  0.071  0.03   0.001  0.025  0.063  0.062 -0.001  0.003  0.017  0.01   0.318  0.013  0.012  0.001  0.506 -0.     0.105  0.003  0.117]
[  0.001   0.189   0.457   0.024  -0.     -0.005   0.144   0.001   0.023   0.051   0.011   0.661   0.036   0.017   0.003  21.08    0.001   0.197   0.014  -0.004]
[  0.002   0.03   -0.009   0.168   0.016   0.042   0.192   0.031  -0.005  -0.01    0.057  -0.004   0.001   0.084   0.016   0.039   0.003   0.119   0.029  13.693]
[  0.016   0.045  13.548   0.549   0.002   0.017   0.209  -0.004  -0.      0.059  -0.002   0.078  -0.004   0.206  -0.001   0.338   0.002   0.05    0.018  -0.066]
[  0.111   0.018  -0.035   0.563   0.003   0.027   0.216   0.003   0.012   0.048   0.046   0.071   0.015   0.238   0.054   0.114   0.005   0.01    0.012  18.004]
[  0.359   0.014  14.741   0.56    0.004   0.032   0.213   0.007   0.      0.079   0.001   0.017   0.067   0.244   0.      0.294   0.008   0.012   0.005   0.289]
[  0.492   0.062   0.13    0.559   0.003   0.043

## labels

`.labels` has one entry per datapoint, and `.labels[i]` gives the value of each $Y_j$ for the $i^{th}$ datapoint.

In [None]:
# Label vector of the first document in the pooled corpus.
corex_layer_1.labels[0]

So, in theory, two documents with the same `label` should also have similar `label vectors`.

In [None]:
# Index documents by newsgroup label: labels_dict[label] -> [document indices].
# `all_labels` holds (document index, label) pairs in shuffled order, so a
# plain single pass is used here. itertools.groupby only groups *consecutive*
# items with equal keys and would need sorted input to yield one group per label.
labels_dict = {}
for doc_idx, label in all_labels:
    labels_dict.setdefault(label, []).append(doc_idx)

In [None]:
# Take the first two documents filed under label 0 and show their label vectors side by side.
same_doc_label_1, same_doc_label_2 = labels_dict[0][:2]
print("two docs with label=0: {}, {}".format(same_doc_label_1, same_doc_label_2))
tuple(corex_layer_1.labels[doc] for doc in (same_doc_label_1, same_doc_label_2))

And presumably two docs from very different groups should have dissimilar `label vectors`.

In [None]:
# Number the newsgroup names; presumably each position matches the integer
# label used as a key in labels_dict -- verify against the dataset docs.
list(enumerate(data_train.target_names))

In [None]:
# Compare the first document of label 0 with the first document of label 12
# (see the target_names listing above for which newsgroups these are).
diff_doc_label_1, diff_doc_label_2 = (labels_dict[label][0] for label in (0, 12))
tuple(corex_layer_1.labels[doc] for doc in (diff_doc_label_1, diff_doc_label_2))

Not so convincing....

## clusters

This will identify which cluster each word should belong to.

In [None]:
# Vocabulary lookups: word -> column index (from the fitted vectorizer) and its inverse.
w2i = vectorized_unigrams.vocabulary_
i2w = {index: word for word, index in w2i.items()}
i2w[1000]

In [None]:
# Group vocabulary words by cluster: clusters_dict[cluster_id] -> [words].
# Position i of `clusters` is looked up in i2w, i.e. it corresponds to
# vocabulary column i. A plain single pass is used because itertools.groupby
# only groups *consecutive* equal keys and would need sorted input.
clusters = corex_layer_1.clusters
clusters_dict = {}
for word_idx, cluster_id in enumerate(clusters):
    clusters_dict.setdefault(cluster_id, []).append(i2w[word_idx])

# Report how many words landed in each cluster.
for c, words in clusters_dict.items():
    print("cluster {} has {} words".format(c, len(words)))

That's not at all balanced....

## visualize


In [None]:
# Produce the vis_corex visualizations for the fitted layer.
# NOTE(review): column_label=None gives the vocabulary columns no names here;
# passing the words (e.g. built from i2w) would presumably make the output
# readable -- confirm against vis_rep's signature before changing.
# NOTE(review): `prefix` presumably names the output location, and `topk` is
# set to the number of hidden factors -- confirm both against vis_corex.
vcx.vis_rep(
    corex=corex_layer_1, 
    data=vectorized_data,
    row_label=all_names,
    column_label=None,
    prefix="20newsgroups_viz",
    topk=num_hidden
)