In [1]:
%matplotlib notebook

import corex as cx
import vis_corex as vcx
from gensim import models
import numpy as np
from time import time
import re
from itertools import groupby
import matplotlib.pyplot as plt

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



# Load embeddings using `gensim`

In [2]:
# Load the word2vec-format text file (binary=False) into a gensim KeyedVectors object.
embeddings_gensim = models.KeyedVectors.load_word2vec_format(fname="data/vectors_Goldberg_sample.txt", binary=False)
# Capture the vocabulary in embedding-row order.  Later cells use word_idxs[i]
# as the word for row i of the embedding matrix / CoRex labels, so the list
# must follow the row order.  `index2word` is gensim's row-ordered word list;
# iterating the `vocab` dict only matches it when dict insertion order happens
# to be preserved, so that is not relied on here.
word_idxs = list(embeddings_gensim.index2word)
word_idxs[:5]

['the', '.', ',', 'is', 'and']

# Load embeddings into `numpy` matrix

In [3]:
# Embedding matrix held by the KeyedVectors object (its `syn0` attribute).
# NOTE(review): newer gensim releases expose this as `.vectors` — confirm
# against the installed version before upgrading.
emb_matrix = embeddings_gensim.syn0
# Display the matrix shape as the cell output.
emb_matrix.shape

(4999, 300)

# Build single layer `CoRex` representation

## hyperparameters

In [4]:
# Settings for the single-layer CoRex model constructed in the next cell.
random_seed = 1978                 # passed to Corex as `seed`
max_iter = 12                      # previous runs show this is sufficient
max_samples = len(emb_matrix)      # one sample per embedding row, i.e. use all of them
num_hidden = 10                    # number of hidden factors Y_j (passed as `n_hidden`)
cluster_dim = 2                    # passed as `dim_hidden`; TODO confirm: number of values each Y_j can take

In [5]:
# Gather the constructor arguments in one place, then build the layer from them.
corex_settings = dict(
    n_hidden=num_hidden,
    dim_hidden=cluster_dim,
    marginal_description='gaussian',   # for continuous data
    max_iter=max_iter,
    max_samples=max_samples,
    seed=random_seed,
    verbose=True,
    ram=2,                             # NOTE(review): meaning/units of `ram` not visible here — confirm in CoRex docs
)
corex_layer_1 = cx.Corex(**corex_settings)

corex, rep size: 10 2
Marginal description:  gaussian


## fit

In [6]:
# Fit the CoRex layer on the embedding matrix, keep the transformed output,
# and report how long the whole call took (wall-clock seconds).
t0 = time()
ys_layer_1 = corex_layer_1.fit_transform(emb_matrix)
elapsed_seconds = time() - t0
print("time to compute: {} seconds".format(elapsed_seconds))

[ 0.001  0.001 -0.001 -0.001 -0.001  0.001  0.005  0.002  0.     0.01 ]
[ 0.044  0.062 -0.     0.002  0.002  0.036  0.075  0.034  0.008  0.542]
[ 0.425  0.54   0.001  0.015  0.008  0.239  0.555  0.232  0.057  2.463]
[ 1.068  1.024  0.002  0.108  0.055  0.493  0.92   0.705  0.266  2.69 ]
[ 1.188  1.093  0.005  0.47   0.211  0.562  1.053  0.783  0.474  2.498]
[ 1.151  1.146  0.015  0.749  0.407  0.557  1.337  0.763  0.524  2.183]
[ 1.169  1.285  0.046  0.865  0.46   0.567  1.483  0.711  0.53   1.891]
[ 1.245  1.368  0.106  0.889  0.559  0.549  1.571  0.657  0.589  1.587]
[ 1.285  1.309  0.175  0.849  0.617  0.55   1.699  0.63   0.583  1.427]
[ 1.307  1.426  0.216  0.927  0.808  0.54   1.484  0.584  0.565  1.271]
[ 1.328  1.391  0.26   0.94   0.866  0.524  1.474  0.568  0.585  1.152]
[ 1.31   1.386  0.281  0.924  0.88   0.499  1.465  0.565  0.597  1.109]
Overall tc: 9.01642992993
Best tc: 9.01642992993
time to compute: 183.32326650619507 seconds


## labels

`.labels` has one row per data point, and `.labels[i]` gives the value of each $Y_j$ for the $i^{th}$ data point.

In [7]:
# Label vector of the first data point (row 0): one entry per hidden factor Y_j.
corex_layer_1.labels[0]

array([0, 1, 0, 0, 1, 1, 1, 1, 1, 0])

So, in theory, two words with a high `cosine similarity` in embedding space should also have similar `label vectors`.

In [8]:
# Closest words to "his" in embedding space, using gensim's default result count.
embeddings_gensim.most_similar(["his"])

[('their', 0.8471860885620117),
 ('its', 0.7965162396430969),
 ('my', 0.7934165000915527),
 ('your', 0.7914167642593384),
 ('our', 0.7593696713447571),
 ('whose', 0.738511323928833),
 ('her', 0.6965184211730957),
 ('the', 0.4893943667411804),
 ('``', 0.4586181342601776),
 ("'ll", 0.4517585039138794)]

In [9]:
# Find the row position of each word, then show the two CoRex label vectors
# side by side so they can be compared by eye.
index_of_his = word_idxs.index('his')
index_of_their = word_idxs.index('their')

labels_his = corex_layer_1.labels[index_of_his]
labels_their = corex_layer_1.labels[index_of_their]
labels_his, labels_their

(array([0, 1, 0, 1, 1, 0, 1, 1, 0, 1]), array([0, 1, 0, 1, 1, 0, 1, 1, 0, 1]))

And presumably two `dissimilar` words in embedding space should have dissimilar `label vectors`.

In [10]:
# Ask for as many neighbours of "his" as there are words and keep the last
# three entries of the ranked list, i.e. the least similar words.
embeddings_gensim.most_similar(["his"], topn=len(word_idxs))[-3:]

[('eleven', 0.07403583079576492),
 ('announced', 0.06627192348241806),
 ('premiere', 0.05658423900604248)]

In [11]:
# Same comparison as for similar words, this time against the least similar
# word found above; `index_of_his` comes from the earlier comparison cell.
index_of_premiere = word_idxs.index('premiere')

labels_his = corex_layer_1.labels[index_of_his]
labels_premiere = corex_layer_1.labels[index_of_premiere]
labels_his, labels_premiere

(array([0, 1, 0, 1, 1, 0, 1, 1, 0, 1]), array([1, 1, 0, 1, 0, 1, 0, 1, 0, 1]))

## cluster by coordinates

In [12]:
# Bucket words by their complete label vector: each distinct combination of
# hidden-factor values becomes one key whose value is the list of words sharing it.
coordinates = {}
for idx, label in enumerate(corex_layer_1.labels):
    word = word_idxs[idx]
    coordinates.setdefault(tuple(label), []).append(word)

In [13]:
# Report every label-vector bucket in sorted order with its size and members.
# Items are (key, words) tuples and keys are unique, so the default tuple
# ordering sorts them by key.
for coordinate_pair, words_ in sorted(coordinates.items()):
    summary = "The coordinate pair {} contains the following {} words:\n{}".format(
        coordinate_pair, len(words_), ",".join(words_)
    )
    print(summary)
    print("\n")

The coordinate pair (0, 0, 0, 1, 0, 0, 0, 1, 0, 1) contains the following 1 words:
rugby


The coordinate pair (0, 0, 0, 1, 0, 0, 1, 1, 0, 0) contains the following 1 words:
heir


The coordinate pair (0, 0, 0, 1, 0, 1, 0, 1, 0, 1) contains the following 1 words:
publishing


The coordinate pair (0, 0, 0, 1, 0, 1, 0, 1, 1, 1) contains the following 1 words:
;


The coordinate pair (0, 0, 0, 1, 0, 1, 1, 0, 0, 0) contains the following 1 words:
lands


The coordinate pair (0, 0, 0, 1, 0, 1, 1, 1, 0, 0) contains the following 2 words:
borders,doctorate


The coordinate pair (0, 0, 0, 1, 0, 1, 1, 1, 0, 1) contains the following 2 words:
joseph,isbn


The coordinate pair (0, 0, 0, 1, 0, 1, 1, 1, 1, 1) contains the following 1 words:
ed


The coordinate pair (0, 0, 0, 1, 1, 0, 0, 1, 0, 1) contains the following 1 words:
united


The coordinate pair (0, 0, 0, 1, 1, 0, 1, 1, 0, 1) contains the following 2 words:
la,del


The coordinate pair (0, 0, 1, 0, 0, 0, 0, 0, 0, 1) contains the following


The coordinate pair (1, 1, 0, 1, 1, 1, 0, 1, 0, 0) contains the following 1 words:
until


The coordinate pair (1, 1, 0, 1, 1, 1, 0, 1, 0, 1) contains the following 2 words:
of,steam


The coordinate pair (1, 1, 0, 1, 1, 1, 0, 1, 1, 0) contains the following 3 words:
for,because,either


The coordinate pair (1, 1, 0, 1, 1, 1, 1, 0, 0, 0) contains the following 1 words:
``


The coordinate pair (1, 1, 0, 1, 1, 1, 1, 0, 0, 1) contains the following 2 words:
could,twenty


The coordinate pair (1, 1, 0, 1, 1, 1, 1, 0, 1, 0) contains the following 1 words:
afterwards


The coordinate pair (1, 1, 0, 1, 1, 1, 1, 0, 1, 1) contains the following 1 words:
briefly


The coordinate pair (1, 1, 0, 1, 1, 1, 1, 1, 0, 0) contains the following 6 words:
that,he,she,although,every,whether


The coordinate pair (1, 1, 0, 1, 1, 1, 1, 1, 0, 1) contains the following 37 words:
to,can,out,many,north,south,west,east,over,down,off,middle,originally,san,around,x,outside,fair,approximately,dark,forward,northwes

## cluster by dimension

In [21]:
# For every hidden factor, collect the words whose label for that factor is
# truthy (non-zero), i.e. the words that "activate" it.
dimensions = {}
for idx, label in enumerate(corex_layer_1.labels):
    for dim, value in enumerate(label):
        if value:
            word = word_idxs[idx]
            dimensions.setdefault(dim, []).append(word)

In [26]:
# Walk the hidden factors in ascending order and list the words that activated each.
for dim in sorted(dimensions):
    list_ = dimensions[dim]
    message = "There are {} words that activated dimension={}:\n{}".format(len(list_), dim, ",".join(list_))
    print(message)
    print("\n")

There are 2195 words that activated dimension=0:
.,,,is,and,was,),to,are,be,(,that,were,have,he,'s,has,had,as,used,an,made,been,i,'',not,``,became,known,or,in,but,also,use,first,played,released,found,born,took,called,of,see,new,began,work,won,its,do,other,well,make,located,there,held,more,built,served,she,named,moved,when,left,take,set,can,would,show,received,died,being,said,think,out,published,end,created,based,become,most,for,only,went,started,given,came,considered,many,established,get,included,include,find,if,up,written,returned,worked,after,taken,on,seen,led,go,joined,from,later,need,founded,then,-,support,such,added,will,by,where,put,play,so,did,know,appeared,north,south,opened,using,produced,elected,appointed,formed,keep,continued,lost,help,while,developed,run,than,come,seems,with,done,married,national,announced,back,recorded,west,am,[,wrote,second,due,american,sent,changed,however,performed,',at,killed,gave,sold,give,signed,brought,want,going,described,provide,same,east,removed,

Not sure there's much here to take away....

## tcs

`.tcs` provides the `total correlation` captured by each $Y_j$; the values are sorted from greatest to least.

In [27]:
# Total correlation captured by each hidden factor Y_j, one value per factor.
corex_layer_1.tcs

array([ 1.465,  1.386,  1.31 ,  1.109,  0.924,  0.88 ,  0.597,  0.565,  0.499,  0.281])

So in this case, $Y_0$ captures the most `total correlation`.

## clusters

This will identify which cluster each `dimension` of the word embedding should belong to.  See this example:

```
X = np.array([[0,0,0,0,0], # A matrix with rows as samples and columns as variables.
              [0,0,0,1,1],
              [1,1,1,0,0],
              [1,1,1,1,1]], dtype=int)

layer1 = ce.Corex(n_hidden=2, dim_hidden=2, marginal_description='discrete', smooth_marginals=False)  

layer1.fit(X)  # Fit on data. 

layer1.clusters  # Each variable/column is associated with one Y_j
# array([0, 0, 0, 1, 1])
```

You can see that the first three `dimension`s belong together (in cluster `0`) and the last two together.

In [None]:
# One entry per input column (here: per embedding dimension) giving the hidden
# factor Y_j that column is associated with, as in the example above.
clusters = corex_layer_1.clusters
# Display the assignment array as the cell output.
clusters

This generates a clustering of each `dimension` of the word embeddings.  Not sure what meaning this may carry...if any.

In [None]:
# Invert the per-dimension assignment into {cluster id: [embedding dimensions]}.
# Clusters are keyed in order of first appearance and each list keeps the
# dimensions in ascending order.
clusters = corex_layer_1.clusters
clusters_dict = {}
for dim_index, cluster_id in enumerate(clusters):
    clusters_dict.setdefault(cluster_id, []).append(dim_index)
# Summarise how many embedding dimensions landed in each cluster.
for c, dims in clusters_dict.items():
    print("cluster {} has {} dims".format(c, len(dims)))

## visualize

Runs code to generate a bunch of visualizations, which will end up in the directory called `embedding_viz`.

In [34]:
# NOTE: this call is currently disabled.  The "groups" cells further down read
# embedding_viz_2_x_10_dim/text_files/groups_no_overlaps.txt, which presumably
# is written by this call (same prefix) — re-enable it if that directory does
# not exist yet, otherwise those cells fail on a fresh run.
# vcx.vis_rep(
#     corex=corex_layer_1, 
#     data=emb_matrix,
#     row_label=word_idxs,
#     column_label=None,
#     prefix="embedding_viz_2_x_10_dim",
#     topk=num_hidden
# )

## groups

In [31]:
# Relative path of the groups text file that the next cell opens and parses.
path_to_groups = "embedding_viz_2_x_10_dim/text_files/groups_no_overlaps.txt"

In [32]:
# Parse the groups file into {group number (int): [(dim, val), ...]}.
# A line starting with "Group num" opens a new group; every other non-blank
# line must be "<dim>,<val>" and belongs to the most recently opened group.
# Both tuple members are kept as the raw strings read from the file.
group_regex = r'Group num: ([0-9]+), TC.*'
groups = {}
groups_list = []
group_number = None  # stays None until the first header is read
with open(path_to_groups, "r") as f:
    for line in f:
        if line.startswith("Group num"):
            header = re.match(group_regex, line)
            if header is None:
                raise ValueError("Unrecognised group header: {!r}".format(line))
            # Store entries under the number taken from their own header rather
            # than guessing it as "next header - 1", which breaks when numbering
            # is not consecutive.  Every header gets an entry, even if the group
            # turns out to be empty.
            group_number = int(header.group(1))
            groups_list = groups.setdefault(group_number, [])
        elif line.strip():  # ignore blank lines instead of failing to unpack them
            if group_number is None:
                raise ValueError("Entry found before any group header: {!r}".format(line))
            dim, val = line.rstrip().split(",")
            groups_list.append((dim, val))

In [33]:
# For each parsed group, print its size, the total correlation stored at the
# same index in `corex_layer_1.tcs`, and the comma-joined dimension names.
for g, dims in groups.items():
    group_tc = corex_layer_1.tcs[g]
    dim_names = ",".join(entry[0] for entry in dims)
    print("There are {} dims in group {} with a total TCS of {}:\n{}".format(
        len(dims), g, group_tc, dim_names
        )
    )
    print("----")

There are 33 dims in group 0 with a total TCS of 1.464927693561168:
125,15,118,195,140,47,267,110,72,8,171,155,211,42,157,64,212,146,295,289,230,148,182,210,226,170,291,46,206,56,243,124,199
----
There are 32 dims in group 1 with a total TCS of 1.3858193619500112:
78,296,280,238,84,32,213,85,248,241,69,1,29,38,121,107,224,193,181,81,229,205,142,160,106,223,139,145,137,26,50,114
----
There are 54 dims in group 2 with a total TCS of 1.3099148051061538:
25,188,136,2,225,0,265,177,279,277,36,270,189,75,164,176,282,12,33,95,37,10,249,183,271,17,166,40,232,149,18,30,150,299,253,208,209,6,57,117,100,49,158,5,257,198,180,190,39,112,58,259,87,192
----
There are 28 dims in group 3 with a total TCS of 1.1090974965612574:
73,173,242,35,143,240,288,204,266,186,220,131,244,20,141,221,169,116,154,79,218,126,217,174,93,165,283,292
----
There are 33 dims in group 4 with a total TCS of 0.9244329697672145:
66,191,132,207,54,108,41,144,101,119,251,159,22,202,152,298,234,276,134,127,71,63,123,246,168,179,2