In [20]:
import corex as cx
import vis_corex as vcx
from gensim import models
import numpy as np
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from os.path import basename
from itertools import groupby
import re

# Load 20Newsgroups

In [2]:
# Dataset filter settings handed to fetch_20newsgroups in the next cell.
categories = None                 # no category filter is applied
remove = ('headers', 'footers')   # parts of each post to strip on load

In [3]:
# Both splits are fetched with identical options; only `subset` differs.
fetch_options = dict(
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=remove,
)
data_train = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)

# Concatenate train followed by test so the three lists stay index-aligned.
all_data = data_train.data + data_test.data
all_names = [
    basename(path)
    for split in (data_train, data_test)
    for path in split.filenames
]
# (document index, newsgroup id) pairs over the combined corpus.
all_labels = list(enumerate([*data_train.target, *data_test.target]))

# vectorize

In [4]:
# Vectorizer settings (consumed by the CountVectorizer cell that follows).
binarize = True       # forwarded as CountVectorizer(binary=...)
max_vocab = 10_000    # forwarded as CountVectorizer(max_features=...)

In [5]:
# Unigram count vectorizer over the combined train+test corpus.
vectorized_unigrams = CountVectorizer(binary=binarize, max_features=max_vocab)

# Fit on every document, then materialise the result as a dense array
# (this is the array later passed to CoRex and to the visualiser).
doc_term_matrix = vectorized_unigrams.fit_transform(all_data)
vectorized_data = doc_term_matrix.toarray()
vectorized_data.shape

(18846, 10000)

# Build single layer `CoRex` representation

## hyperparameters

In [6]:
# CoRex hyperparameters; each is forwarded to cx.Corex in the "fit" section.
num_hidden = 20        # number of hidden factors Y_j (passed as n_hidden)
cluster_dim = 2        # dimension of each hidden factor (passed as dim_hidden)
max_samples = 10_000   # passed as max_samples
max_iter = 20          # passed as max_iter
random_seed = 1978     # passed as seed

## fit

In [7]:
# Single-layer CoRex model configured from the hyperparameter cell above.
corex_layer_1 = cx.Corex(
    # representation size
    n_hidden=num_hidden, dim_hidden=cluster_dim,
    # the input matrix holds discrete values
    marginal_description='discrete',
    # optimisation budget and reproducibility
    max_iter=max_iter, max_samples=max_samples, seed=random_seed,
    # runtime resources -- NOTE(review): n_cpu/ram are hard-coded for the
    # original author's machine; adjust for yours.
    n_cpu=3, ram=8,
    verbose=True,
)

corex, rep size: 20 2
Marginal description:  discrete


In [8]:
# Fit the model on the dense document-term matrix and report wall-clock time.
t0 = time()
ys_layer_1 = corex_layer_1.fit_transform(vectorized_data)
elapsed_seconds = time() - t0
print("time to compute: {} seconds".format(elapsed_seconds))

[-0.001  0.071  0.03   0.001  0.025  0.063  0.062 -0.001  0.003  0.017  0.01   0.318  0.013  0.012  0.001  0.506 -0.     0.105  0.003  0.117]
[  0.001   0.189   0.457   0.024  -0.     -0.005   0.144   0.001   0.023   0.051   0.011   0.661   0.036   0.017   0.003  21.08    0.001   0.197   0.014  -0.004]
[  0.002   0.03   -0.009   0.168   0.016   0.042   0.192   0.031  -0.005  -0.01    0.057  -0.004   0.001   0.084   0.016   0.039   0.003   0.119   0.029  13.693]
[  0.016   0.045  13.548   0.549   0.002   0.017   0.209  -0.004  -0.      0.059  -0.002   0.078  -0.004   0.206  -0.001   0.338   0.002   0.05    0.018  -0.066]
[  0.111   0.018  -0.035   0.563   0.003   0.027   0.216   0.003   0.012   0.048   0.046   0.071   0.015   0.238   0.054   0.114   0.005   0.01    0.012  18.004]
[  0.359   0.014  14.741   0.56    0.004   0.032   0.213   0.007   0.      0.079   0.001   0.017   0.067   0.244   0.      0.294   0.008   0.012   0.005   0.289]
[  0.492   0.062   0.13    0.559   0.003   0.043

## labels

`.labels` has one entry per datapoint: `.labels[i]` gives the value of each latent factor $Y_j$ for the $i^{th}$ datapoint.

In [9]:
# Label vector of the first datapoint (index 0): one value per hidden factor.
corex_layer_1.labels[0]

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1])

So, in theory, two documents with the same `label` should also have similar `label vectors`.

In [10]:
# Map each newsgroup id to the indices of the documents carrying that label.
#
# `all_labels` is a list of (document index, label) pairs in corpus order, so
# equal labels are NOT adjacent. itertools.groupby only groups consecutive
# runs, which is why the earlier version had to merge runs back together by
# hand. A single pass with setdefault builds the same dict (same keys, same
# key order, same per-label document order) without the intermediate runs.
labels_dict = {}
for doc_index, label in all_labels:
    labels_dict.setdefault(label, []).append(doc_index)

In [11]:
# Take the first two documents whose newsgroup label is 0 and show their
# CoRex label vectors side by side.
first_two_label_0 = labels_dict[0][:2]
same_doc_label_1, same_doc_label_2 = first_two_label_0
print("two docs with label=0: {}, {}".format(*first_two_label_0))
(
    corex_layer_1.labels[same_doc_label_1],
    corex_layer_1.labels[same_doc_label_2],
)

two docs with label=0: 15, 20


(array([1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1]),
 array([1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]))

And presumably two docs from very different groups should have dissimilar `label vectors`.

In [12]:
# Show which integer label id corresponds to which newsgroup name.
list(enumerate(data_train.target_names))

[(0, 'alt.atheism'),
 (1, 'comp.graphics'),
 (2, 'comp.os.ms-windows.misc'),
 (3, 'comp.sys.ibm.pc.hardware'),
 (4, 'comp.sys.mac.hardware'),
 (5, 'comp.windows.x'),
 (6, 'misc.forsale'),
 (7, 'rec.autos'),
 (8, 'rec.motorcycles'),
 (9, 'rec.sport.baseball'),
 (10, 'rec.sport.hockey'),
 (11, 'sci.crypt'),
 (12, 'sci.electronics'),
 (13, 'sci.med'),
 (14, 'sci.space'),
 (15, 'soc.religion.christian'),
 (16, 'talk.politics.guns'),
 (17, 'talk.politics.mideast'),
 (18, 'talk.politics.misc'),
 (19, 'talk.religion.misc')]

In [13]:
# First document of label 0 (alt.atheism) versus first document of
# label 12 (sci.electronics): compare their CoRex label vectors.
diff_doc_label_1, diff_doc_label_2 = labels_dict[0][0], labels_dict[12][0]
(
    corex_layer_1.labels[diff_doc_label_1],
    corex_layer_1.labels[diff_doc_label_2],
)

(array([1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1]),
 array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1]))

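Not so convincing: the two documents from different groups differ in only 4 of the 20 positions, whereas the two documents that share `label=0` differed in 6.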

## clusters

This will identify which cluster each word should belong to.

In [27]:
# word -> column index, as learned by the fitted vectorizer ...
w2i = vectorized_unigrams.vocabulary_
# ... and the inverse lookup, column index -> word.
i2w = {index: word for word, index in w2i.items()}

In [26]:
# `clusters` is indexed by vocabulary column: position -> assigned cluster id.
clusters = corex_layer_1.clusters

# Collect the words of every cluster in one pass.
#
# Cluster ids are not contiguous along the vocabulary axis, and
# itertools.groupby only groups consecutive runs, so the earlier version had
# to merge runs back together manually. setdefault yields the same dict
# (same keys, same first-seen key order, same word order) directly.
clusters_dict = {}
for word_index, cluster_id in enumerate(clusters):
    clusters_dict.setdefault(cluster_id, []).append(i2w[word_index])

# One summary line per cluster, in first-seen order.
for c, words in clusters_dict.items():
    print("cluster {} has {} words with a TCS of {}".format(c, len(words), corex_layer_1.tcs[c]))

cluster 0 has 9613 words with a TCS of 24.355386932119735
cluster 1 has 205 words with a TCS of 1.882161663588071
cluster 7 has 53 words with a TCS of 0.3741275769978916
cluster 11 has 5 words with a TCS of 0.23332265806305819
cluster 13 has 2 words with a TCS of 0.08757267504154727
cluster 2 has 15 words with a TCS of 0.8643528613458302
cluster 9 has 13 words with a TCS of 0.35030988101607713
cluster 18 has 1 words with a TCS of 0.04155243040676632
cluster 8 has 2 words with a TCS of 0.36890414436113095
cluster 19 has 1 words with a TCS of 0.03621473931307518
cluster 15 has 1 words with a TCS of 0.07427114469196701
cluster 6 has 6 words with a TCS of 0.38141525228861156
cluster 4 has 19 words with a TCS of 0.4892670086348114
cluster 3 has 35 words with a TCS of 0.6796223281389452
cluster 10 has 8 words with a TCS of 0.30942800068231724
cluster 12 has 6 words with a TCS of 0.15672660999242857
cluster 5 has 10 words with a TCS of 0.463999449130186
cluster 17 has 1 words with a TCS of 0.

That's not at all balanced. The cluster indices are sorted by `TCS` (cluster 0 has the highest), so all of the "important" words should be in `cluster=0`. But only 10k words were used in total and 9,613 of them landed in that cluster, so it didn't really do a good job of "trimming" the list of "important" words.


## visualize


In [16]:
# Generate the CoRex visualisation artefacts for the fitted layer.
# Output is written under the `prefix` directory; the groups file parsed
# further down is read from "20newsgroups_viz/text_files/".
vcx.vis_rep(
    corex=corex_layer_1, 
    data=vectorized_data,
    row_label=all_names,
    # NOTE(review): with no column labels, the group files appear to identify
    # columns by integer index (they are decoded with int(...) and i2w later)
    # -- confirm against vis_corex.
    column_label=None,
    prefix="20newsgroups_viz",
    topk=num_hidden
)

Groups in sorted_groups.txt
Pairwise plots among high TC variables in "relationships"


## groups

In [18]:
# Group-membership text file located under the "20newsgroups_viz" prefix
# that was passed to vcx.vis_rep.
path_to_groups = "20newsgroups_viz/text_files/groups_no_overlaps.txt"

In [21]:
group_regex = r'Group num: ([0-9]+), TC.*'


def _parse_group_file(lines, header_regex=group_regex):
    """Parse a CoRex groups text file into ``{group number: [(dim, val), ...]}``.

    The file is a sequence of header lines starting with ``"Group num"``,
    each followed by zero or more ``"<dim>,<val>"`` member rows.

    Differences from the earlier inline loop:

    * Rows are filed under the number found in their own header. Previously a
      finished group was stored under ``next header number - 1``, which is
      only right when numbering is strictly consecutive.
    * Groups without member rows are never recorded. Previously that was true
      for every group except the last one.
    * Blank lines are skipped, and rows are split on the last comma so a
      label that itself contains a comma no longer breaks the unpacking.
    * A header that does not match ``header_regex``, or a member row that
      appears before any header, raises ``ValueError`` with the offending
      line instead of an opaque ``TypeError`` / ``NameError``.

    Parameters
    ----------
    lines : iterable of str
        Lines of the groups file (an open text file works).
    header_regex : str
        Pattern whose first capture group is the group number.

    Returns
    -------
    dict
        Maps ``int`` group number to a list of ``(dim, val)`` string pairs,
        in file order. ``val`` is kept exactly as it appears after the comma.
    """
    header = re.compile(header_regex)
    parsed = {}
    current_group = None  # number taken from the most recent header line
    for raw_line in lines:
        line = raw_line.rstrip()
        if not line:
            continue
        if line.startswith("Group num"):
            found = header.match(line)
            if found is None:
                raise ValueError("malformed group header: {!r}".format(line))
            current_group = int(found.group(1))
            continue
        if current_group is None:
            raise ValueError("member row before any group header: {!r}".format(line))
        dim, val = line.rsplit(",", 1)
        parsed.setdefault(current_group, []).append((dim, val))
    return parsed


with open(path_to_groups, "r") as f:
    groups = _parse_group_file(f)

In [25]:
# Report every parsed group in ascending group-number order: its size, its
# TCS, and its member words (column indices decoded through i2w).
for g in sorted(groups):
    words_ = groups[g]
    group_tcs = corex_layer_1.tcs[g]
    member_words = ",".join(i2w[int(entry[0])] for entry in words_)
    print("There are {} words in group {} with a total TCS of {}:\n{}".format(
        len(words_), g, group_tcs, member_words
    ))
    print("----")

There are 9613 words in group 0 with a total TCS of 24.355386932119735:
----
There are 205 words in group 1 with a total TCS of 1.882161663588071:
use,system,problem,using,work,card,set,16,need,data,machine,disk,computer,software,ram,hard,hardware,memory,works,problems,run,running,drive,help,video,windows,pc,version,dos,files,mode,program,board,speed,bus,screen,cards,controller,486,file,installed,cpu,monitor,please,mac,drives,fine,chip,ide,motherboard,scsi,mail,hd,mb,meg,apple,graphics,vga,driver,modem,vram,anyone,eisa,ati,486dx,bios,diamond,irq,quadra,clone,fpu,dx2,25mhz,slots,simms,baud,slot,keyboard,gateway,ethernet,33mhz,66mhz,simm,800x600,manuals,com1,adapter,centris,486dx2,seagate,adaptec,speedstar,8mb,386sx,sony,68030,24x,nubus,emm386,advance,610,com2,lciii,smartdrv,reboot,connectors,386dx,connector,harddisk,ami,ini,stacker,soldered,circuitry,powerbook,800k,1mb,16mb,coprocessor,hi,viper,256k,dos6,pins,jumper,cica,ipx,cmos,512k,iisi,resistor,truetype,icon,80ns,pentium,autoexec,wi