In [18]:
import corex as cx
import vis_corex as vcx
from gensim import models
import numpy as np
from time import time
import re
from itertools import groupby

# Load embeddings using `gensim`

In [2]:
# Load the sample word2vec vectors (plain-text format) into a gensim KeyedVectors object.
embeddings_gensim = models.KeyedVectors.load_word2vec_format(
    fname="data/vectors_Goldberg_sample.txt", binary=False
)
# word_idxs[i] must be the word stored in row i of the embedding matrix, because later
# cells use word_idxs.index(word) to look up rows of the CorEx labels.  Take gensim's own
# row-ordered word list instead of relying on the iteration order of the `vocab` dict.
word_idxs = list(embeddings_gensim.index2word)
word_idxs[:5]

['the', '.', ',', 'is', 'and']

# Load embeddings into `numpy` matrix

In [3]:
# Pull the raw vectors out of gensim as a 2-D numpy array; judging by the shape shown
# below, rows are vocabulary words and columns are embedding dimensions.
emb_matrix = embeddings_gensim.syn0
emb_matrix.shape

(4999, 300)

# Build a single-layer `CorEx` representation

## hyperparameters

In [4]:
num_hidden = 10      # number of latent factors Y_j; passed to Corex as n_hidden
cluster_dim = 3      # passed to Corex as dim_hidden; the label vectors shown later take values 0..2
# Passed to Corex as max_samples; set to the number of rows of emb_matrix
# (presumably so that no word is left out -- TODO confirm against the Corex docs).
max_samples = emb_matrix.shape[0]
max_iter=12          # previous runs show this is sufficient
# Passed to Corex as seed so that repeated runs can be compared.
random_seed = 1978

In [5]:
# Keyword arguments for the single CorEx layer, gathered in one place so they can be
# inspected after the model has been built.
corex_settings = dict(
    n_hidden=num_hidden,
    dim_hidden=cluster_dim,
    max_iter=max_iter,
    max_samples=max_samples,
    seed=random_seed,
    marginal_description='gaussian',  # for continuous data
    ram=2,  # NOTE(review): meaning/units of `ram` are not visible here -- confirm in Corex docs
    verbose=True,
)
corex_layer_1 = cx.Corex(**corex_settings)

corex, rep size: 10 3
Marginal description:  gaussian


## fit

In [6]:
# Fit the CorEx layer on the embedding matrix and keep the transformed output,
# reporting the wall-clock time the fit took.
t0 = time()
ys_layer_1 = corex_layer_1.fit_transform(emb_matrix)
elapsed_seconds = time() - t0
print("time to compute: {} seconds".format(elapsed_seconds))

[-0.002  0.    -0.    -0.     0.    -0.     0.002 -0.001 -0.    -0.   ]
[ 0.001  0.108  0.026  0.032  0.044  0.007  0.058  0.016  0.026  0.025]
[ 0.004  1.164  0.014  0.39   2.054  0.022  0.129  0.002  0.011  0.218]
[ 0.109  0.819  0.015  1.812  2.642  0.107  0.13   0.003  0.037  1.586]
[ 2.82   0.716  0.192 -0.135  0.675  0.533  0.29   0.029  0.184  0.788]
[ 0.883  0.748  1.032  1.462  1.977  0.583  0.736  0.151  0.514  0.901]
[ 3.526  1.29   1.005  0.533  1.71   0.47   0.516  0.38   0.527  0.495]
[ 2.407  1.884  1.103  0.75   2.49   0.566  0.73   0.356  0.568  0.778]
[ 2.016  2.281  1.219  0.87   2.672  0.576  0.831  0.302  0.556  0.8  ]
[ 1.928  2.03   1.271  0.819  3.228  0.663  0.828  0.33   0.594  0.623]
[ 1.655  1.907  1.133  0.798  3.28   0.983  0.85   0.358  0.698  0.373]
[ 1.145  1.796  1.26   0.769  3.718  1.06   0.843  0.378  0.765  0.173]
Overall tc: 11.9085004576
Best tc: 11.9085004576
time to compute: 193.36731004714966 seconds


## labels

`.labels` has one entry per datapoint, and `.labels[i]` gives the value of each $Y_j$ for the $i^{th}$ datapoint.

In [7]:
# Label vector of the first datapoint (row 0 of emb_matrix): one value per latent factor Y_j.
corex_layer_1.labels[0]

array([0, 2, 2, 0, 0, 0, 1, 1, 2, 1])

So, in theory, two words with a high `cosine similarity` in embedding space should also have similar `label vectors`.

In [8]:
# Words closest to "his" in the original embedding space, with their similarity scores.
embeddings_gensim.most_similar(["his"])

[('their', 0.8471860885620117),
 ('its', 0.7965162396430969),
 ('my', 0.7934165000915527),
 ('your', 0.7914167642593384),
 ('our', 0.7593696713447571),
 ('whose', 0.738511323928833),
 ('her', 0.6965184211730957),
 ('the', 0.4893943667411804),
 ('``', 0.4586181342601776),
 ("'ll", 0.4517585039138794)]

In [9]:
# Find the row positions of "his" and "their" (its nearest neighbour above) so that
# their CorEx label vectors can be displayed side by side.
index_of_his = word_idxs.index('his')
index_of_their = word_idxs.index('their')

corex_layer_1.labels[index_of_his], corex_layer_1.labels[index_of_their]

(array([0, 2, 2, 0, 0, 0, 0, 1, 1, 1]), array([0, 1, 2, 0, 0, 0, 0, 1, 1, 1]))

And presumably two `dissimilar` words in embedding space should have dissimilar `label vectors`.

In [10]:
# Ask for as many neighbours as there are words and keep the last three entries of the
# ranking, i.e. the words with the lowest similarity to "his".
embeddings_gensim.most_similar(["his"], topn=len(word_idxs))[-3:]

[('eleven', 0.07403583079576492),
 ('announced', 0.06627192348241806),
 ('premiere', 0.05658423900604248)]

In [11]:
# "premiere" is the least similar word found above; show its label vector next to the
# one for "his" to see whether the two differ.
index_of_premiere = word_idxs.index('premiere')

corex_layer_1.labels[index_of_his], corex_layer_1.labels[index_of_premiere]

(array([0, 2, 2, 0, 0, 0, 0, 1, 1, 1]), array([1, 0, 2, 1, 2, 1, 1, 2, 1, 0]))

## tcs

`.tcs` provides the `total correlation` captured by each $Y_j$, sorted from greatest to least.

In [12]:
# Total correlation attributed to each latent factor Y_j (one entry per factor).
corex_layer_1.tcs

array([ 3.718,  1.796,  1.26 ,  1.145,  1.06 ,  0.843,  0.769,  0.765,  0.378,  0.173])

So in this case, $Y_0$ captures the most `total correlation`.

## clusters

This will identify which cluster each `dimension` of the word embedding should belong to.  See this example:

```
X = np.array([[0,0,0,0,0], # A matrix with rows as samples and columns as variables.
              [0,0,0,1,1],
              [1,1,1,0,0],
              [1,1,1,1,1]], dtype=int)

layer1 = ce.Corex(n_hidden=2, dim_hidden=2, marginal_description='discrete', smooth_marginals=False)  

layer1.fit(X)  # Fit on data. 

layer1.clusters  # Each variable/column is associated with one Y_j
# array([0, 0, 0, 1, 1])
```

You can see that the first three `dimension`s belong together (in cluster `0`) and the last two belong together (in cluster `1`).

In [13]:
# One entry per embedding dimension (column of emb_matrix): the index of the Y_j that
# dimension is associated with.
clusters = corex_layer_1.clusters
clusters

array([4, 6, 7, 3, 6, 5, 8, 6, 0, 2, 2, 4, 7, 0, 4, 0, 0, 6, 4, 6, 1, 5, 1, 2, 2, 7, 2, 9, 0, 6, 2, 5, 0, 3, 2, 0, 7, 2, 2, 5, 7, 4, 0, 2, 8, 1, 0, 1, 0, 7, 8, 3, 5, 1, 0, 6, 3, 5, 8, 9, 6, 4, 0, 1,
       0, 2, 0, 2, 1, 4, 8, 1, 1, 0, 3, 3, 5, 3, 4, 0, 1, 5, 0, 1, 0, 0, 5, 5, 1, 9, 3, 0, 0, 2, 0, 7, 5, 0, 2, 1, 5, 0, 7, 4, 0, 1, 0, 0, 0, 1, 0, 6, 9, 0, 0, 3, 0, 4, 0, 0, 6, 1, 6, 8, 0, 0, 7, 7,
       6, 8, 5, 3, 1, 6, 1, 3, 7, 2, 7, 2, 1, 0, 5, 1, 8, 2, 0, 0, 1, 7, 2, 5, 1, 9, 1, 1, 4, 0, 2, 2, 7, 0, 3, 7, 4, 8, 2, 8, 4, 1, 1, 3, 0, 0, 0, 0, 3, 7, 2, 1, 6, 3, 0, 7, 0, 6, 5, 0, 2, 2, 8, 1,
       8, 0, 5, 0, 8, 2, 5, 1, 0, 4, 1, 2, 1, 6, 6, 1, 4, 5, 4, 0, 0, 0, 1, 0, 7, 7, 6, 5, 0, 0, 6, 7, 0, 2, 1, 1, 0, 9, 0, 2, 2, 6, 2, 1, 0, 6, 0, 0, 1, 3, 0, 0, 1, 5, 1, 1, 3, 2, 6, 1, 0, 8, 0, 5,
       5, 4, 5, 7, 4, 0, 5, 1, 7, 9, 1, 1, 6, 8, 7, 7, 5, 6, 8, 4, 4, 2, 2, 3, 3, 5, 4, 6, 8, 2, 3, 1, 0, 3, 6, 1, 7, 3, 3, 1, 0, 7, 4, 4])

This assigns each `dimension` of the word embeddings to a cluster.  It is not yet clear what meaning, if any, this grouping carries.

In [39]:
# Group the embedding dimensions by the cluster (latent factor) they were assigned to
# and report how many dimensions each cluster holds.
clusters = corex_layer_1.clusters

# cluster id -> list of dimension indices, in ascending order.  Keys appear in order of
# first occurrence in `clusters`.
clusters_dict = {}
for dim, cluster_id in enumerate(clusters):
    clusters_dict.setdefault(cluster_id, []).append(dim)

# The running total must start at zero here; without this the cell raises NameError
# on a fresh kernel.
dims_total = 0
for c, dims in clusters_dict.items():
    dims_total += len(dims)
    print("cluster {} has {} dims".format(c, len(dims)))

# Sanity check: every dimension belongs to exactly one cluster.
assert dims_total == len(clusters)

cluster 4 has 23 dims
cluster 6 has 26 dims
cluster 7 has 26 dims
cluster 3 has 23 dims
cluster 5 has 27 dims
cluster 8 has 17 dims
cluster 0 has 70 dims
cluster 2 has 34 dims
cluster 1 has 47 dims
cluster 9 has 7 dims


## visualize

The (currently commented-out) cell below generates a set of visualizations, which are written to the directory given by `prefix` (`embedding_viz`).

In [20]:
# vcx.vis_rep(
#     corex=corex_layer_1, 
#     data=emb_matrix,
#     row_label=word_idxs,
#     column_label=None,
#     prefix="embedding_viz",
#     topk=num_hidden
# )

## groups

In [47]:
# Text file listing, per group, the embedding dimensions it contains; presumably produced
# by the vcx.vis_rep call in the "visualize" section -- TODO confirm.
path_to_groups = "embedding_viz_10_x_3_dim/text_files/groups_no_overlaps.txt"

In [48]:
# Parse the groups file into `groups`: group number -> list of (dim, val) string pairs.
#
# The file is read as a sequence of sections.  Each section starts with a header line
# matching `group_regex` and is followed by one "dim,val" line per member.  Every group
# is stored under the number found in its own header, so the result does not depend on
# groups being numbered consecutively.  Groups with no members map to an empty list.
#
# Raises ValueError if a header line is malformed or a data line appears before any header.
group_regex = r'Group num: ([0-9]+), TC.*'
groups = {}
group_number = None   # number of the group currently being read
groups_list = None    # member list of that group (the same object stored in `groups`)
with open(path_to_groups, "r") as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        if line.startswith("Group num"):
            header = re.match(group_regex, line)
            if header is None:
                raise ValueError("malformed group header: {!r}".format(line))
            group_number = int(header.group(1))
            groups_list = groups.setdefault(group_number, [])
        else:
            if groups_list is None:
                raise ValueError("data line before any group header: {!r}".format(line))
            dim, val = line.split(",")
            groups_list.append((dim, val))

In [49]:
# For every parsed group, print its size, the total correlation of the matching latent
# factor, and the comma-separated list of its dimensions.
for g, dims in groups.items():
    dim_names = ",".join(dim for dim, _ in dims)
    summary = "There are {} dims in group {} with a total TCS of {}:\n{}".format(
        len(dims), g, corex_layer_1.tcs[g], dim_names
    )
    print(summary)
    print("----")

There are 102 dims in group 0 with a total TCS of 3.7178747457547745:
296,84,280,238,213,125,184,161,15,85,195,118,32,288,16,72,239,267,47,140,110,8,220,211,147,236,92,136,155,241,42,175,48,64,279,157,263,121,244,28,128,188,146,224,3,36,79,108,289,174,135,182,56,294,176,113,111,75,148,141,62,221,230,200,177,193,95,226,170,74,103,206,23,81,264,63,57,291,185,243,172,178,293,114,273,160,104,124,255,67,53,94,77,162,101,290,133,145,233,165,286,19
----
There are 41 dims in group 1 with a total TCS of 1.795976832579594:
35,173,242,143,73,227,266,109,191,11,54,207,187,202,82,66,132,275,276,204,45,251,71,295,107,20,68,256,190,13,168,152,214,80,179,212,228,246,287,199,123
----
There are 37 dims in group 2 with a total TCS of 1.2604112385829591:
78,197,262,156,14,89,225,265,69,117,38,154,59,163,261,247,252,201,210,91,116,22,43,169,98,138,61,52,285,159,122,298,112,258,192,27,55
----
There are 29 dims in group 3 with a total TCS of 1.145412547302473:
25,41,34,1,270,30,150,277,189,33,37,10,40,194,21