# Visualize the results

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

import os, re
import util

from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib notebook

In [2]:
def restore_emb(config, train_dir):
    """Rebuild the model graph, restore its checkpoint and return the first variable.

    The model class is ``cnn_context.Model`` when ``config['contextwise']`` is
    present and truthy, and ``cnn.Model`` otherwise. It is built under the
    ``'cnn'`` variable scope with ``is_train=False``; the checkpoint recorded
    in `train_dir` is then restored into a fresh session.

    Args:
        config: dict-like object of model flags, forwarded to the model
            constructor.
        train_dir: directory holding the TensorFlow checkpoint state.

    Returns:
        The evaluated value of the first entry of ``tf.all_variables()``.
        NOTE(review): callers treat this as the word-embedding matrix --
        confirm against the model definition that it is created first.

    Raises:
        IOError: if `train_dir` contains no usable checkpoint.
    """
    # Trailing comma: the Python 2 print statement keeps the cursor on this line.
    print('Restoring graph...'),
    with tf.Graph().as_default():
        with tf.variable_scope('cnn'):
            # Imports stay local so that only the selected model module is loaded.
            # The instance itself is not needed; the saver below picks up the
            # variables that exist once the constructor has run.
            if 'contextwise' in config and config['contextwise']:
                import cnn_context
                cnn_context.Model(config, is_train=False)
            else:
                import cnn
                cnn.Model(config, is_train=False)
        variables = tf.all_variables()
        saver = tf.train.Saver(variables)

        # get_checkpoint_state returns None when no checkpoint file exists;
        # fail with a readable message instead of an AttributeError.
        ckpt = tf.train.get_checkpoint_state(train_dir)
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise IOError('No checkpoint found in %s' % train_dir)

        with tf.Session() as sess:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Fetch only the variable that is returned instead of evaluating
            # every variable in the graph and discarding all but one.
            embeddings = sess.run(variables[0])
    print('done.')
    return embeddings

In [7]:
def get_2d_emb(embeddings, limit=5000, random_state=None):
    """Project the first `limit` embedding rows to two dimensions with t-SNE.

    Every kept row is scaled to unit L2 norm before the projection. The norm
    is taken per row (``axis=1``); the previous ``axis=0`` reduction rescaled
    each embedding dimension across the vocabulary instead of normalising the
    individual vectors.

    Args:
        embeddings: array indexable as ``[row, dimension]`` with one embedding
            vector per row.
        limit: number of leading rows to keep; only these are projected.
        random_state: optional seed forwarded to ``TSNE`` so a layout can be
            reproduced. The default ``None`` keeps the unseeded behaviour.

    Returns:
        The result of ``TSNE.fit_transform``: one (x, y) row per kept
        embedding row.
    """
    # Trailing comma: the Python 2 print statement keeps the cursor on this line.
    print('Reducing dimension...'),
    embeddings = embeddings[:limit]
    # keepdims gives a (rows, 1) column so the division broadcasts row-wise.
    norm = np.sqrt(np.sum(np.square(embeddings), axis=1, keepdims=True))
    # An all-zero row has norm 0; divide by 1 so it stays zero instead of NaN.
    norm[norm == 0] = 1.0
    normalized_embeddings = embeddings / norm
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000,
                random_state=random_state)
    two_d_embeddings = tsne.fit_transform(normalized_embeddings)
    print('done.')
    return two_d_embeddings

In [4]:
def construct_emb_df(data_dir, two_d_embeddings):
    """Build a table that pairs vocabulary entries with their 2-D coordinates.

    The vocabulary is loaded from ``vocab.txt`` inside `data_dir` through
    ``util.initialize_vocabulary``; the keys of its first return value become
    the ``word`` column and the matching values become the row index
    (presumably word -> integer id; verify against ``util``).
    `two_d_embeddings` supplies the ``x`` and ``y`` columns on a default
    0..n-1 index. The two frames are aligned on index, so a vocabulary row
    without a matching coordinate row ends up with missing ``x``/``y``.

    Args:
        data_dir: directory that contains ``vocab.txt``.
        two_d_embeddings: two-column array of coordinates.

    Returns:
        DataFrame with the columns ``word``, ``x`` and ``y``.
    """
    # Trailing comma: the Python 2 print statement keeps the cursor on this line.
    print('Restoring vocabulary...'),
    word2id, _ = util.initialize_vocabulary(os.path.join(data_dir, 'vocab.txt'))
    frames = [
        pd.DataFrame({'word': word2id.keys()}, index=word2id.values()),
        pd.DataFrame(two_d_embeddings, columns=['x', 'y']),
    ]
    emb_df = pd.concat(frames, axis=1)
    print('done.')
    return emb_df

In [12]:
def merge_id(emb_df, data_dir=None):
    """Attach entity and relation names to the embedding table.

    Reads ``positive_relations.tsv`` from the parent of `data_dir` and turns
    its ``subj``/``subj_qid``, ``obj``/``obj_qid`` and ``rel``/``rel_id``
    column pairs into one lookup table. Each id is converted to the token
    ``'<' + id.lower() + '>'``, and the table is outer-joined onto `emb_df`
    on ``word``. Because the join is an outer join, ids that have no row in
    `emb_df` still appear in the result, with missing ``x``/``y``.

    Args:
        emb_df: DataFrame with a ``word`` column (plus any coordinate columns).
        data_dir: data directory; the relations file is looked up one level
            above it. When omitted, the notebook-level ``data_dir`` variable
            is used, which is what this function always read before the
            parameter existed.

    Returns:
        `emb_df` extended with the columns ``entity`` (subject/object name)
        and ``rel`` (relation name); each is missing where it does not apply.

    Raises:
        NameError: if `data_dir` is not passed and no notebook-level
            ``data_dir`` exists.
    """
    if data_dir is None:
        try:
            data_dir = globals()['data_dir']
        except KeyError:
            raise NameError("merge_id() needs `data_dir`: pass it explicitly "
                            "or define a notebook-level `data_dir`")
    # Trailing comma: the Python 2 print statement keeps the cursor on this line.
    print('Merging ids...'),
    relations_path = os.path.join(data_dir, '..', 'positive_relations.tsv')
    positive_df = pd.read_csv(relations_path, sep='\t', index_col=0, encoding='utf-8')

    # rename(columns=...) is the label-mapping API; rename_axis with a mapping
    # is deprecated usage.
    subj = positive_df[['subj', 'subj_qid']].rename(
        columns={'subj': 'entity', 'subj_qid': 'id'})
    obj = positive_df[['obj', 'obj_qid']].rename(
        columns={'obj': 'entity', 'obj_qid': 'id'})
    rel = positive_df[['rel', 'rel_id']].rename(columns={'rel_id': 'id'})

    entity = pd.concat([subj, obj, rel], axis=0, ignore_index=True)
    # The vectorised .str accessor propagates missing ids as NaN instead of
    # failing on them; rows without an id cannot match any token and are dropped.
    entity['word'] = '<' + entity['id'].str.lower() + '>'
    entity = entity.dropna(subset=['word']).drop_duplicates('word')

    merge_emb_df = emb_df.merge(entity[['entity', 'rel', 'word']], how='outer', on='word')
    print('done.')
    return merge_emb_df

In [10]:
# Pick the training run to visualise (a sub-directory of ./train) and load the
# flags stored next to its checkpoints.
train_dir = os.path.join(os.getcwd(), 'train', '1474062134')
flags_path = os.path.join(train_dir, 'flags.cPickle')
config = util.load_from_dump(flags_path)
data_dir = config['data_dir']

# Pipeline: restored variable -> 2-D projection -> word/coordinate table
# -> table with entity and relation names merged in.
embeddings = restore_emb(config, train_dir)
two_d_embeddings = get_2d_emb(embeddings, limit=5000)
emb_df = construct_emb_df(data_dir, two_d_embeddings)
merge_emb_df = merge_id(emb_df)

Merging ids... done.


In [13]:
# Re-run the id merge on the current `emb_df` and preview the first rows.
# The trailing bare expression is what the notebook renders as this cell's output.
merge_emb_df = merge_id(emb_df)
merge_emb_df.head()

Merging ids... done.


Unnamed: 0,word,x,y,entity,rel
0,<pad>,-7.750008,12.951242,,
1,<unk>,-2.388376,4.990622,,
2,",",1.291499,0.967199,,
3,the,-0.28479,0.974413,,
4,in,-1.644315,1.269674,,


In [None]:
# The table is built with outer joins, so vocabulary rows beyond the t-SNE
# limit and entities that are not in the vocabulary carry NaN coordinates.
# Such rows cannot be drawn or annotated, so keep only rows with both x and y.
plotted_df = merge_emb_df.dropna(subset=['x', 'y'])
is_entity = pd.notnull(plotted_df.entity)
word_points = plotted_df[~is_entity]
entity_points = plotted_df[is_entity]

fig, ax = plt.subplots()
# Draw both layers on the same axes; an empty layer is skipped so that a
# missing category does not abort the whole figure.
for points, color, label in ((word_points, 'red', 'words'),
                             (entity_points, 'blue', 'entities')):
    if len(points):
        points.plot(kind="scatter", x='x', y='y', color=color,
                    ax=ax, legend=True, label=label)
# annotate labels
for name, x, y in zip(entity_points['entity'], entity_points['x'], entity_points['y']):
    ax.annotate(name, xy=(x, y))
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('Embeddings')

#fig = ax.get_figure()
#fig.savefig('emb.svg')