# Setup

#### Imports

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
import spacy
from scipy import spatial
from collections import defaultdict
nlp = spacy.load('en')    

Using TensorFlow backend.


#### Load the corpus, and convert it into training data 

In [45]:
text = open('./alice_in_wonderland.txt').read()
doc = nlp(text.decode('utf8'))

#we don't really care about training on all deps 
bad_deps = ['ROOT', 'compound', 'pobj', 'punct']
examples = []

for sent in doc.sents:
    for word in sent:
        #casing should probably be a paramater
        source =  word.head.text.lower()
        target= word.text.lower()
        dep = word.dep_
        for child in word.children:
            source =  child.head.text.lower()
            target= child.text.lower()
            dep = child.dep_
            #If we see a prepositional dependency, we want to merge it
            #so ('scientist', prep, 'with') and ('with', pobj,'telescope) 
            #becomes ('scientist, 'prep_with', 'telescope)
            if dep == 'prep':
                for c2 in child.children:
                    if (c2.dep_ == 'pobj'):
                        examples.append((source,"prep_" + child.text.lower(),  c2.text))
            else:
                if not dep in bad_deps:
                    examples.append((source, dep,target))
#index all depndency triples by their head, so we can sort them in different ways 
indexed_training = defaultdict(list)
for (a,b,c)in examples:
    indexed_training[a].append((b,c))


def convert_predict_context(indexed_training):
    #x and y are training data and labels (respectively)
    #If reverse is true, we the pair (training, dep, context) will generate (training, dep_context)
    #and (context, dep_training)
    #TODO, should the reversed example be treated differently? That is done in the original paper
    #but no empirical justification is given
    x = []
    y = []
    reverse = True
    for k,Vs in indexed_training.iteritems():
        for (context_dep, context_word) in Vs:
            x.append(context_dep +"_" + context_word)
            y.append(k)
        if reverse:
            for (context_dep, context_word) in Vs:
                x.append(context_dep +"_" + k)
                y.append(context_word) 


    train_indices = dict((w, i) for i, w in enumerate(set(x)))
    label_indices = dict((w, i) for i, w in enumerate(set(y)))
    #y gets converted to one_hot vectors, but we can leave x as the indeces
    #because of the Embedding layer
    x = np.array([train_indices[w] for w in x])
    y = np.array([label_indices[w] for w in y])
    y = np_utils.to_categorical(y, np.argmax(y))
    V = np.argmax(y)
    return (x,y,train_indices,label_indices)



def convert_predict_words(indexed_training):
    #x and y are training data and labels (respectively)
    #If reverse is true, we the pair (training, dep, context) will generate (training, dep_context)
    #and (context, dep_training)
    #TODO, should the reversed example be treated differently? That is done in the original paper
    #but no empirical justification is given
    x = []
    y = []
    reverse = True
    for k,Vs in indexed_training.iteritems():
        for (context_dep, context_word) in Vs:
            y.append(context_dep +"_" + context_word)
            x.append(k)
        if reverse:
            for (context_dep, context_word) in Vs:
                y.append(context_dep +"_" + k)
                x.append(context_word) 


    train_indices = dict((w, i) for i, w in enumerate(set(x)))
    label_indices = dict((w, i) for i, w in enumerate(set(y)))
    #y gets converted to one_hot vectors, but we can leave x as the indeces
    #because of the Embedding layer
    x = np.array([train_indices[w] for w in x])
    y = np.array([label_indices[w] for w in y])
    y = np_utils.to_categorical(y, np.argmax(y))
    return (x,y,train_indices,label_indices)



#### Compile the model

In [46]:
#TODO, does it matter that the input and output vectors use the same indeces? 
(x,y,train_indices,label_indicies) = convert_predict_context(indexed_training)
num_input = np.argmax(x)
num_output = len(y[0])
print("input=",num_input,"output=",num_output)
#300 dimensions for output, because thats what the paper does 
dim = 300
#Embed into a 'dim' dimensional space, flatten it to 1d for output, then softmax it
model = Sequential()
model.add(Embedding(input_dim=num_input, output_dim=dim, init='glorot_uniform', input_length=1))
model.add(Reshape((dim, )))
model.add(Dense(input_dim=dim, output_dim=num_output, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam")

('input=', 41847, 'output=', 25879)


  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':


#### Run the model

In [47]:
model.fit(x,y,batch_size=512, epochs=20,)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f969f08a0d0>

In [48]:
def print_top_words(model, train_indices):
    train_indices_rev = {b : a for a,b in train_indices.iteritems()}
    embeddings = model.get_weights()[0]
    vecs = dict()
    for i,w in train_indices_rev.iteritems():
        vecs[i] = embeddings[i]
    count = 0
    for input_word in train_indices.keys()[0:10]:
        input_vector = embeddings[train_indices[input_word]]

        sims = dict()
        for idx,vector in vecs.iteritems():
            result = 1 - spatial.distance.cosine(vector, input_vector)
            sims[train_indices_rev[idx]] = result 

        print(input_word)
        for (a,b) in sorted(sims.items(), key=lambda x:x[1], reverse=True)[:10]:
            print("\t" + str(a) + ": " + str(b))
        
        count += 1
        if (count > 10):
            break


In [49]:
print_top_words(model,train_indices)

prep_in_sleep
	prep_in_sleep: 0.999999918645
	conj_giving: 0.863054418029
	acl_dear: 0.78758052845
	prep_of_axes: 0.78692723477
	advmod_familiarly: 0.786467974289
	conj_smiling: 0.785594899253
	advcl_gloves: 0.784982644274
	prep_in_manner: 0.783622716608
	nsubj_recognised: 0.781431259298
	conj_nursing: 0.777836998244
prt_got
	prt_got: 1.00000005935
	prt_marked: 0.978079096881
	prt_held: 0.969610789246
	prt_made: 0.950130742397
	prt_look: 0.929567484811
	prt_gazing: 0.910770170009
	prt_brightened: 0.909942484709
	prt_keeping: 0.909757633053
	prep_in_spite: 0.909645872802
	prt_picked: 0.9095285719
prep_on_sides
	prep_on_sides: 0.999999913227
	auxpass_being: 0.680741016712
	agent_by: 0.662744532987
	prep_like_fish: 0.584138588749
	dobj_pieces: 0.578506348083
	conj_picked: 0.577769585363
	prep_to_puppy: 0.576482361297
	conj_difficulty: 0.576200802656
	pcomp_on: 0.5509562169
	dobj_it: 0.545201036797
conj_director
	conj_director: 0.999999918836
	appos_gbnewby@pglaf.org: 0.99821548949
	appos_