In [9]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
from collections import defaultdict
from IPython.display import display
import util
from models import BaseModel, dense_maxnorm

### Loading the data

In [10]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a 'saved_triplets.pkl' that comes from a trusted source.
with open('saved_triplets.pkl', 'rb') as f:
    # Read the pickled table of triplets back from disk.
    triplets = pickle.load(f)
# The file handle is only needed for the load, so the rest runs after it is closed.
# Columns arrive labelled 0, 1, 2; give them the names the later cells use.
triplets = triplets.rename(columns={0: "head", 1: "rel", 2: "tail"})
# Random ~80/20 row split. A dedicated seeded generator makes the split
# reproducible under Restart & Run All without touching NumPy's global RNG state.
split_rng = np.random.RandomState(0)
msk = split_rng.rand(len(triplets)) < 0.8
train = triplets[msk]
# NOTE(review): validation and test are bound to the *same* held-out rows, so
# anything tuned on `val` is evaluated on identical data in `test` -- confirm
# this is intended (e.g. because the dataset is tiny).
test = val = triplets[~msk]

### Preprocessing

In [12]:
# Remove from `train` every triple whose (head, tail) pair also appears -- in
# either direction -- in the validation/test rows, then keep only the
# validation/test rows whose head and tail both still occur in `train`.
mask = np.zeros(len(train), dtype=bool)
lookup = defaultdict(list)
# Record *positions* within `train`, not index labels: `train` is a filtered
# slice of `triplets`, so its labels are non-contiguous and can exceed
# len(train), while `mask` is indexed purely by position.
for pos, (h, r, t) in enumerate(train.itertuples(index=False)):
    lookup[(h, t)].append(pos)
for h, r, t in pd.concat((val, test)).itertuples(index=False):
    mask[lookup[(h, t)]] = True
    mask[lookup[(t, h)]] = True  # the reversed pair counts as a leak too
# A boolean NumPy array passed to .loc is applied row-by-row in order.
train = train.loc[~mask]
heads, tails = set(train['head']), set(train['tail'])
val = val.loc[val['head'].isin(heads) & val['tail'].isin(tails)]
test = test.loc[test['head'].isin(heads) & test['tail'].isin(tails)]
print('Train shape:', train.shape)
print('Validation shape:', val.shape)
print('Test shape:', test.shape)

Train shape: (66, 3)
Validation shape: (3, 3)
Test shape: (3, 3)


### Adding false statements

In [14]:
# One seeded generator is shared by both calls below, so the order of the
# calls determines which random draws each split receives.
rng = np.random.RandomState(42)
# All known triples (train + validation + test) are handed to the helper.
combined_df = pd.concat((train, val, test))
# presumably util.create_tf_pairs pairs each true triple with a generated false
# one and adds a truth column (the printed shapes go from 3 rows x 3 columns
# to 6 rows x 4 columns) -- confirm against util.create_tf_pairs
val = util.create_tf_pairs(val, combined_df, rng)
test = util.create_tf_pairs(test, combined_df, rng)
print('Validation shape:', val.shape)
print('Test shape:', test.shape)

Validation shape: (6, 4)
Test shape: (6, 4)


In [15]:
# Build the ops in a dedicated graph object instead of the process-wide default.
graph = tf.Graph()
with graph.as_default():
    # input and target placeholders
    # Each is a 1-D tensor of unspecified length: three int32 vectors for the
    # head / relation / tail inputs and one float32 vector for the target.
    head_input = tf.placeholder(tf.int32, shape=[None])
    rel_input = tf.placeholder(tf.int32, shape=[None])
    tail_input = tf.placeholder(tf.int32, shape=[None])
    target = tf.placeholder(tf.float32, shape=[None])

### Decomposing the data and categorizing

In [16]:
embedding_size = 20
# Distinct heads, relations and tails seen in training; their counts size the
# embedding matrices and the sets themselves drive the categorical encoding.
field_categories = (set(train['head']), set(train['rel']), set(train['tail']))
head_cnt, rel_cnt, tail_cnt = (len(cats) for cats in field_categories)

with graph.as_default():
    init_sd = 1.0 / np.sqrt(embedding_size)

    def _new_embedding_matrix(row_cnt):
        # One trainable row per category, drawn from a truncated normal.
        initial = tf.truncated_normal([row_cnt, embedding_size], stddev=init_sd)
        return tf.Variable(initial)

    # embedding variables (created in head, rel, tail order)
    head_embedding_vars = _new_embedding_matrix(head_cnt)
    rel_embedding_vars = _new_embedding_matrix(rel_cnt)
    tail_embedding_vars = _new_embedding_matrix(tail_cnt)
    # rows selected for the (h, r, t) triples being fed in
    head_embed = tf.nn.embedding_lookup(head_embedding_vars, head_input)
    rel_embed = tf.nn.embedding_lookup(rel_embedding_vars, rel_input)
    tail_embed = tf.nn.embedding_lookup(tail_embedding_vars, tail_input)
    # CP model output: sum over the embedding axis of the three-way product
    head_rel = tf.mul(head_embed, rel_embed)
    output = tf.reduce_sum(tf.mul(head_rel, tail_embed), 1)

# TensorFlow requires integer indices, so encode the frames against the
# training categories.
train, train_idx_array = util.make_categorical(train, field_categories)
val, val_idx_array = util.make_categorical(val, field_categories)
# The test frame is left un-encoded here:
# test, test_idx_array = util.make_categorical(test, field_categories)

### Optimization and Negative Sampling

In [17]:
from util import ContrastiveTrainingProvider

# Draw one small batch and show it both as raw integer codes and decoded.
batch_provider = ContrastiveTrainingProvider(train_idx_array,
                                             batch_pos_cnt=3,
                                             separate_head_tail=True)
batch_triples, batch_labels = batch_provider.next_batch()

# Map each column of integer codes back through the matching categories of
# the training frame.
batch_df = pd.DataFrame()
for col_pos, col_name in enumerate(('head', 'rel', 'tail')):
    batch_df[col_name] = pd.Categorical.from_codes(batch_triples[:, col_pos],
                                                   train[col_name].cat.categories)
batch_df['label'] = batch_labels

display(batch_triples)
print('which encodes:')
display(batch_df)

array([[ 6,  0, 13],
       [32,  0, 13],
       [ 8,  0, 38],
       [54,  0, 38],
       [ 1,  0, 24],
       [ 1,  0, 38]])

which encodes:


Unnamed: 0,head,rel,tail,label
0,b'kitsch',11,b'pavlova',1.0
1,b'conditioned',11,b'pavlova',0.0
2,b'remarkably',11,b'very',1.0
3,b'bye',11,b'very',0.0
4,b'large',11,b'small',1.0
5,b'large',11,b'very',0.0


In [25]:
# Display the head and tail columns of the (categorical-encoded) training frame.
train[['head','tail']]

Unnamed: 0,head,tail
0,b'main',b'principal'
1,b'kitsch',b'pavlova'
2,b'divi',b'filius'
3,b'freeware',b'license'
4,b'kami',b'shinto'
5,b'business',b'corporate'
6,b'large',b'small'
7,b'will',b'would'
8,b'dogs',b'dog'
9,b'glory',b'god'


### Defining optional loss functions

In [None]:
def least_squares_objective(output, target, add_bias=True):
    """Sum-of-squared-errors objective.

    Args:
        output: raw model scores.
        target: values the scores are compared against.
        add_bias: when true, a new variable initialised to [0.0] is added
            to the scores before the error is computed.

    Returns:
        (prediction, loss): the (optionally biased) scores and the sum of
        squared differences between them and ``target``.
    """
    prediction = output + tf.Variable([0.0]) if add_bias else output
    residual = prediction - target
    return prediction, tf.reduce_sum(tf.square(residual))

def logistic_objective(output, target, add_bias=True):
    """Summed binary cross-entropy on sigmoid-squashed scores.

    Args:
        output: raw model scores.
        target: labels the squashed scores are compared against.
        add_bias: when true, a new variable initialised to [0.0] is added
            to the scores before squashing.

    Returns:
        (probabilities, loss): the sigmoid of the scores clipped to
        [0.001, 0.999], and the negated sum of the per-element log-likelihood.
    """
    logits = output
    if add_bias:
        logits = output + tf.Variable([0.0])
    # keep probabilities away from 0 and 1 so the logs below avoid NaNs
    prob = tf.clip_by_value(tf.sigmoid(logits), 0.001, 0.999)
    positive_term = target * tf.log(prob)
    negative_term = (1 - target) * tf.log(1 - prob)
    return prob, -tf.reduce_sum(positive_term + negative_term)

def ranking_margin_objective(output, margin=1.0):
    """Pairwise hinge loss over consecutive (positive, negative) scores.

    Args:
        output: flat scores laid out as positive, negative, positive, ...
        margin: amount by which each positive score should exceed its
            paired negative score before the pair stops contributing loss.

    Returns:
        (output, loss): the scores unchanged and the summed hinge loss.
    """
    # fold the flat scores into rows of two, then take the columns apart
    paired = tf.reshape(output, [-1, 2])
    positives, negatives = tf.split(1, 2, paired)
    violations = tf.nn.relu(margin - positives + negatives)
    return output, tf.reduce_sum(violations)

### TransE Model

In [None]:
class TransE(BaseModel):
    """Embedding model in which a relation translates head towards tail.

    Entities (heads and tails share one matrix) and relations each get an
    ``embedding_size``-dimensional vector. A triple is scored by the negative
    distance between ``tail`` and ``head + rel``, so higher scores are better,
    and training minimises the pairwise ranking-margin loss.
    """

    def __init__(self, embedding_size, batch_pos_cnt=100,
                 max_iter=1000, dist='euclidean',
                 margin=1.0, opt=None):
        """Store the TransE-specific settings and configure the base model.

        Args:
            embedding_size: width of every entity / relation vector.
            batch_pos_cnt: forwarded unchanged to BaseModel.
            max_iter: forwarded unchanged to BaseModel.
            dist: distance used for scoring; one of 'manhattan',
                'euclidean' or 'sqeuclidean'. Checked when the model
                graph is created, not here.
            margin: margin handed to ranking_margin_objective.
            opt: forwarded unchanged to BaseModel.
        """
        super(TransE, self).__init__(embedding_size=embedding_size,
                                     maxnorm=1.0,
                                     batch_pos_cnt=batch_pos_cnt,
                                     max_iter=max_iter,
                                     model_type='ranking_margin',
                                     opt=opt)
        self.dist = dist
        self.margin = margin
        self.EPS = 1e-3 # for sqrt gradient when dist='euclidean'

    def _create_model(self, train_triples):
        """Create the variables, scoring op, loss and training ops.

        Args:
            train_triples: 2-D array whose columns 0, 1 and 2 hold the head,
                relation and tail ids.

        Raises:
            ValueError: if ``self.dist`` is not a supported distance name.

        Relies on ``self.head_input`` / ``self.rel_input`` / ``self.tail_input``,
        ``self.opt``, ``self.maxnorm`` and ``self._norm_constraint_op``, none of
        which are defined in this class -- presumably BaseModel provides them.
        """
        # Count unique items to determine embedding matrix sizes
        # (assumes ids are contiguous from 0 so they index the rows -- TODO confirm)
        entity_cnt = len(set(train_triples[:,0]).union(train_triples[:,2]))
        rel_cnt = len(set(train_triples[:,1]))
        init_sd = 1.0 / np.sqrt(self.embedding_size)
        # Embedding variables
        entity_var_shape = [entity_cnt, self.embedding_size]
        rel_var_shape = [rel_cnt, self.embedding_size]
        entity_init  = tf.truncated_normal(entity_var_shape, stddev=init_sd)
        rel_init = tf.truncated_normal(rel_var_shape, stddev=init_sd)
        # Ensure maxnorm constraints are initially satisfied
        entity_init = dense_maxnorm(entity_init, self.maxnorm)
        self.entity_embedding_vars = tf.Variable(entity_init)
        self.rel_embedding_vars = tf.Variable(rel_init)
        # Embedding layer for each (head, rel, tail) triple being fed in as input
        head_embed = tf.nn.embedding_lookup(self.entity_embedding_vars, self.head_input)
        tail_embed = tf.nn.embedding_lookup(self.entity_embedding_vars, self.tail_input)
        rel_embed = tf.nn.embedding_lookup(self.rel_embedding_vars, self.rel_input)
        # Relationship vector acts as a translation in entity embedding space
        diff_vec = tail_embed - (head_embed + rel_embed)
        # negative dist so higher scores are better (important for pairwise loss)
        if self.dist == 'manhattan':
            raw_output = -tf.reduce_sum(tf.abs(diff_vec), 1)
        elif self.dist == 'euclidean':
            # +eps because gradients can misbehave for small values in sqrt
            raw_output = -tf.sqrt(tf.reduce_sum(tf.square(diff_vec), 1) + self.EPS)
        elif self.dist == 'sqeuclidean':
            raw_output = -tf.reduce_sum(tf.square(diff_vec), 1)
        else:
            # ValueError is still an Exception, so existing handlers keep
            # working; the message now names the rejected value.
            raise ValueError('Unknown distance type: {!r}'.format(self.dist))
        # Model output
        self.output, self.loss = ranking_margin_objective(raw_output, self.margin)
        # Optimization with postprocessing to limit embedding vars to L2 ball
        self.train_step = self.opt.minimize(self.loss)
        # Only entities that appear in the current batch are passed to the
        # norm-constraint op.
        unique_ent_indices = tf.unique(tf.concat(0, [self.head_input, self.tail_input]))[0]
        self.post_step = self._norm_constraint_op(self.entity_embedding_vars,
                                                  unique_ent_indices,
                                                  self.maxnorm)
# feed dict for monitoring progress on validation set
# `np.float` was only an alias for the builtin `float` and has been removed
# from recent NumPy releases; `dtype=float` produces the same float64 array.
val_labels = np.array(val['truth_flag'], dtype=float)
# Keys are the placeholders of the standalone graph; columns 0, 1, 2 of the
# index array supply the head, relation and tail inputs.
val_feed_dict = {head_input: val_idx_array[:,0],
                 rel_input: val_idx_array[:,1],
                 tail_input: val_idx_array[:,2],
                 target: val_labels}

### Learning step and accuracy

In [None]:
# Instantiate the model: 20-dimensional embeddings, Euclidean distance,
# margin 1.0, batch_pos_cnt of 100 and at most 30000 iterations.
transE = TransE(embedding_size=20,
                margin=1.0,
                dist='euclidean',
                batch_pos_cnt=100, 
                max_iter=30000)

# Rebinds val_feed_dict to one built by the model itself, replacing the
# placeholder-keyed dict created for the standalone graph.
val_feed_dict = transE.create_feed_dict(val_idx_array)

def train_step_callback(itr, batch_feed_dict):
    """Print batch and validation metrics on reporting iterations.

    A report is produced every 2000th iteration and on the final one
    (``transE.max_iter - 1``); all other iterations do nothing.

    Args:
        itr: current iteration number.
        batch_feed_dict: feed dict of the batch just used for training.

    Returns:
        Always True.
    """
    if (itr % 2000) != 0 and itr != (transE.max_iter-1):
        return True
    # per-example loss of the current batch
    n_batch = len(batch_feed_dict[transE.target])
    batch_avg_loss = transE.sess.run(transE.loss, batch_feed_dict) / n_batch
    # scores and per-example loss on the validation feed
    fetches = (transE.output, transE.loss)
    val_output, val_loss = transE.sess.run(fetches, val_feed_dict)
    val_avg_loss = val_loss / len(val_labels)
    pair_acc = util.pair_ranking_accuracy(val_output)
    template = 'itr {}, batch loss: {:.2}, val loss: {:.2}, val pair ranking acc: {:.2}'
    print(template.format(itr, batch_avg_loss, val_avg_loss, pair_acc))
    return True

# Train on the integer-encoded training triples, handing the progress
# callback to fit.
transE.fit(train_idx_array, train_step_callback)

# NOTE(review): `val` went through util.make_categorical but `test` did not
# (that call is commented out) -- confirm model_threshold_and_eval copes with
# the differing encodings.
acc, pred, scores, thresh_map = util.model_threshold_and_eval(transE, test, val)
# '{:.2}' formats to two significant digits.
print('Test set accuracy: {:.2}'.format(acc))

### Visualization

In [None]:
from sklearn.manifold import TSNE
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource, reset_output
from bokeh.palettes import Spectral11

# Keep every 10th entity for the T-SNE run and plot.
subsample = 10

reset_output()
output_notebook()

# NOTE(review): `data_dir` is not defined in any cell visible here -- it must
# come from earlier notebook state; confirm it survives Restart & Run All.
lexnames = pd.read_table(os.path.join(data_dir, 'wordnet_lexnames.txt'), index_col=0)
# Pull the learned entity embedding matrix out of the model's session.
entity_embeddings = transE.sess.run(transE.entity_embedding_vars)
# NOTE(review): names are decoded with the *head* categories only, while the
# embedding matrix has one row per entity across heads and tails -- verify
# that row indices and head category codes actually line up.
entity_cats = train['head'].cat.categories
entity_names = pd.Categorical.from_codes(range(len(entity_embeddings)), 
                                         entity_cats).astype(str)
# Look up the lexname row for each entity name.
entity_lexnames = lexnames.loc[entity_names].values

# Run on just a subset of the data to save some time
emb_subset = entity_embeddings[::subsample, :] 
emb_subset_names = entity_names[::subsample]
emb_subset_lexnames = entity_lexnames[::subsample]

print('Embeddings shape:', entity_embeddings.shape)
print('Using subset:', emb_subset.shape)
print('Running T-SNE, may take a while...')
tsne = TSNE(n_iter=1000, method='barnes_hut')
lowdim = tsne.fit_transform(emb_subset)

print('Plotting...')
# Columns referenced by name in the scatter call and in the hover tooltip.
source = ColumnDataSource(
  data=dict(x=lowdim[:,0],
            y=lowdim[:,1],
            name=emb_subset_names,
            lexname=emb_subset_lexnames)
)
# One colour per distinct lexname; the palette is cycled with a modulo, so
# once there are more lexnames than palette entries some share a colour.
colormap = {}
for i,ln in enumerate(set(emb_subset_lexnames.flat)):
    colormap[ln] = Spectral11[i % len(Spectral11)]
colors = [colormap[ln] for ln in emb_subset_lexnames.flat]
# NOTE(review): the 'resize' tool and the plot_width/plot_height arguments may
# not exist in newer Bokeh releases -- verify against the installed version.
tools = 'pan,wheel_zoom,box_zoom,reset,resize,hover'
fig = figure(title="T-SNE of WordNet TransE Embeddings", 
             plot_width=800, plot_height=600, tools=tools)
fig.scatter('x', 'y', source=source, alpha=0.5, fill_color=colors, line_color=None)
# Configure the hover tool to show each point's name and lexname.
hover = fig.select(dict(type=HoverTool))
hover.tooltips = [('','@name, @lexname')]
h = show(fig)