In [6]:
import csv
import collections
import numpy as np
import nltk
import keras
import keras.backend as K

In [2]:
nltk.download('wordnet')
nltk.download('omw')

[nltk_data] Downloading package wordnet to /home/marco/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to /home/marco/nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

### generate wordnet relation pairs

In [3]:
with open('./models/knowledge-graphj-embedding-analogic/wordnet_pairs.csv', 'w') as f:
    writer = csv.writer(f)
    for s in nltk.corpus.wordnet.all_synsets():
        for v in s.also_sees():
            writer.writerow((f's.{s.name()}', 's.also_sees', f's.{v.name()}'))
        for v in s.attributes():
            writer.writerow((f's.{s.name()}', 's.attributes', f's.{v.name()}'))
        for v in s.causes():
            writer.writerow((f's.{s.name()}', 's.causes', f's.{v.name()}'))
        for v in s.entailments():
            writer.writerow((f's.{s.name()}', 's.entailments', f's.{v.name()}'))
        for v in s.hypernyms():
            writer.writerow((f's.{s.name()}', 's.hypernyms', f's.{v.name()}'))
        for v in s.hyponyms():
            writer.writerow((f's.{s.name()}', 's.hyponyms', f's.{v.name()}'))
        for v in s.instance_hypernyms():
            writer.writerow((f's.{s.name()}', 's.instance_hypernyms', f's.{v.name()}'))
        for v in s.instance_hyponyms():
            writer.writerow((f's.{s.name()}', 's.instance_hyponyms', f's.{v.name()}'))
        for v in s.lemmas():
            writer.writerow((f's.{s.name()}', 's.lemmas', f'l.{v.name()}'))
        for v in s.member_holonyms():
            writer.writerow((f's.{s.name()}', 's.member_holonyms', f's.{v.name()}'))
        for v in s.member_meronyms():
            writer.writerow((f's.{s.name()}', 's.member_meronyms', f's.{v.name()}'))
        for v in s.part_holonyms():
            writer.writerow((f's.{s.name()}', 's.part_holonyms', f's.{v.name()}'))
        writer.writerow((f's.{s.name()}', 's.pos', f'p.{s.pos()}'))
        for v in s.region_domains():
            writer.writerow((f's.{s.name()}', 's.region_domains', f's.{v.name()}'))
        for v in s.root_hypernyms():
            writer.writerow((f's.{s.name()}', 's.root_hypernyms', f's.{v.name()}'))
        for v in s.similar_tos():
            writer.writerow((f's.{s.name()}', 's.similar_tos', f's.{v.name()}'))
        for v in s.substance_holonyms():
            writer.writerow((f's.{s.name()}', 's.substance_holonyms', f's.{v.name()}'))
        for v in s.substance_meronyms():
            writer.writerow((f's.{s.name()}', 's.substance_meronyms', f's.{v.name()}'))
        for v in s.topic_domains():
            writer.writerow((f's.{s.name()}', 's.topic_domains', f's.{v.name()}'))
        for v in s.usage_domains():
            writer.writerow((f's.{s.name()}', 's.usage_domains', f's.{v.name()}'))
        for v in s.verb_groups():
            writer.writerow((f's.{s.name()}', 's.verb_groups', f's.{v.name()}'))
        for v in s.frame_ids():
            writer.writerow((f's.{s.name()}', 's.frame_ids', f'f.{v}'))
    seen_lemma_keys = set()
    for l_name in nltk.corpus.wordnet.all_lemma_names():
        ls = nltk.corpus.wordnet.lemmas(l_name)
        for l in ls:
            if l.key() in seen_lemma_keys:
                continue
            seen_lemma_keys.add(l.key())
            for v in l.also_sees():
                writer.writerow((f'l.{l.key()}', 'l.also_sees', f'l.{v.key()}'))
            for v in l.antonyms():
                writer.writerow((f'l.{l.key()}', 'l.antonyms', f'l.{v.key()}'))
            for v in l.derivationally_related_forms():
                writer.writerow((f'l.{l.key()}', 'l.derivationally_related_forms', f'l.{v.key()}'))
            for v in l.frame_ids():
                writer.writerow((f'l.{l.key()}', 'l.frame_ids', f'f.{v}'))
            for v in l.pertainyms():
                writer.writerow((f'l.{l.key()}', 'l.pertainyms', f'l.{v.key()}'))
            for v in l.region_domains():
                writer.writerow((f'l.{l.key()}', 'l.region_domains', f'l.{v.key()}'))
            writer.writerow((f'l.{l.key()}', 'l.synset', f's.{l.synset().name()}'))
            writer.writerow((f'l.{l.key()}', 'l.syntactic_marker', f'sm.{l.syntactic_marker()}'))
            for v in l.topic_domains():
                writer.writerow((f'l.{l.key()}', 'l.topic_domains', f'l.{v.key()}'))
            for v in l.usage_domains():
                writer.writerow((f'l.{l.key()}', 'l.usage_domains', f'l.{v.key()}'))
            for v in l.verb_groups():
                writer.writerow((f'l.{l.key()}', 'l.verb_groups', f'l.{v.key()}'))
    del seen_lemma_keys

### postprocess relation pairs

In [4]:
entity_set = set()
relation_set = set()
with open('./models/knowledge-graphj-embedding-analogic/wordnet_pairs.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entity_set.add(row[0])
        entity_set.add(row[2])
        relation_set.add(row[1])
entity_set = list(sorted(entity_set))
relation_set = list(sorted(relation_set))
entity_lookup = { k+1:v for (k,v) in enumerate(entity_set)}
entity_index = { v:k+1 for (k,v) in enumerate(entity_set)}
entity_max = len(entity_set)
relation_lookup = { k+1:v for (k,v) in enumerate(relation_set)}
relation_index = { v:k+1 for (k,v) in enumerate(relation_set)}
relation_max = len(relation_set)
del entity_set
del relation_set
len(entity_lookup), len(relation_lookup)

(473382, 33)

In [5]:
dataset = []
with open('./models/knowledge-graphj-embedding-analogic/wordnet_pairs.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        dataset.append((entity_index[row[0]], relation_index[row[1]], entity_index[row[2]]))
dataset = np.array(dataset, dtype='int')
dataset.shape

(1278206, 3)

In [14]:
bern = {}
for h,r,t in dataset:
    r_stat = bern.get(r)
    if r_stat is None:
        r_stat = (collections.Counter(), collections.Counter())
        bern[r] = r_stat
    h_stat, t_stat = r_stat
    h_stat[h] += 1
    t_stat[t] += 1
bern = {r:(
    np.mean(list(r_stat[0].values())),
    np.mean(list(r_stat[1].values()))) for r,r_stat in bern.items()}
bern = {r:r_stat[0]/(r_stat[0]+r_stat[1]) for r,r_stat in bern.items()}

### trans f

In [5]:
class TransF(keras.layers.Layer):
    
    def __init__(self, **kwargs):
        super(TransF, self).__init__(**kwargs)
    
    def build(self, input_shape):
        entity_shape, relation_shape = input_shape
        self.kernel_width = entity_shape[1]
        self.kernel = self.add_weight(
            shape=(relation_shape[1], entity_shape[1], entity_shape[1]),
            initializer='zeros', name='kernel')
        super(TransF, self).build(input_shape)
    
    def call(self, inputs):
        entity, relation = inputs
        relation = K.reshape(relation, K.concatenate([K.shape(relation), (1,1)]))
        relation = relation * self.kernel
        relation = K.sum(relation, axis=1, keepdims=False)
        relation = relation + K.eye(self.kernel_width)
        return (K.expand_dims(entity, axis=1) @ relation)[:,0]
    
    def compute_output_shape(self, input_shape):
        entity_shape, _ = input_shape
        return entity_shape

In [10]:
X_h = X_input_h = keras.layers.Input((1,), dtype='int32')
X_t = X_input_t = keras.layers.Input((1,), dtype='int32')
X_r = X_input_r = keras.layers.Input((1,), dtype='int32')
X_e_embedding = keras.layers.Embedding(entity_max+1, 50,
    embeddings_constraint=keras.constraints.unit_norm(axis=-1))
X_r_embedding = keras.layers.Embedding(relation_max+1, 50,
    embeddings_constraint=keras.constraints.max_norm(max_value=1., axis=-1))
X_hr_embedding = keras.layers.Embedding(relation_max+1, 6)
X_tr_embedding = keras.layers.Embedding(relation_max+1, 6)
X_hr_trans_f = TransF()
X_tr_trans_f = TransF()
X_h = X_e_embedding(X_h)
X_t = X_e_embedding(X_t)
X_hr = X_hr_embedding(X_r)
X_tr = X_hr_embedding(X_r)
X_r = X_r_embedding(X_r)
X_h = keras.layers.Flatten()(X_h)
X_t = keras.layers.Flatten()(X_t)
X_hr = keras.layers.Flatten()(X_hr)
X_tr = keras.layers.Flatten()(X_tr)
X_r = keras.layers.Flatten()(X_r)
X_h = X_hr_trans_f([X_h, X_hr])
X_t = X_tr_trans_f([X_t, X_tr])
X = keras.layers.Add()([X_h, X_r])
X = keras.layers.Subtract()([X, X_t])
X = keras.layers.Dot(-1)([X, X])
X = keras.layers.Lambda(lambda x: K.sqrt(x))(X)
M_sample = keras.Model([X_input_h, X_input_r, X_input_t], X)
M_sample.compile('adam', 'mse')
M_sample.summary()
X_p_h = X_input_p_h = keras.layers.Input((1,), dtype='int32')
X_p_t = X_input_p_t = keras.layers.Input((1,), dtype='int32')
X_p_r = X_input_p_r = keras.layers.Input((1,), dtype='int32')
X_n_h = X_input_n_h = keras.layers.Input((1,), dtype='int32')
X_n_t = X_input_n_t = keras.layers.Input((1,), dtype='int32')
X_n_r = X_input_n_r = keras.layers.Input((1,), dtype='int32')
X_p = M_sample([X_p_h, X_p_r, X_p_t])
X_n = M_sample([X_n_h, X_n_r, X_n_t])
X = keras.layers.Subtract()([X_p, X_n])
M = keras.Model([X_p_h, X_p_r, X_p_t, X_n_h, X_n_r, X_n_t], X)
def TransF_loss(t, y):
    return K.sum(K.clip(y + 3, 0, np.inf) + 0*t)
M.compile('adam', TransF_loss)
M.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 32)        15148256    input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 1, 8)         272         input_9[0][0]                    
          

In [26]:
def negative_sampling():
    negative_type = np.array([bern[r] for r in dataset[:,1]])
    negative_type = np.random.rand(dataset.shape[0]) < negative_type
    d0 = np.where(negative_type, np.random.randint(
        0, entity_max+1, size=dataset.shape[:1]), dataset[:,0])
    d1 = np.where(~negative_type, np.random.randint(
        0, entity_max+1, size=dataset.shape[:1]), dataset[:,2])
    return np.stack([d0, dataset[:,1], d1], axis=-1)

In [58]:
for epoch in range(10):
    nsamples = negative_sampling()
    M.fit([
        dataset[:,0],dataset[:,1],dataset[:,2],
        nsamples[:,0],nsamples[:,1],nsamples[:,2]],
        np.zeros((dataset.shape[0],1)), batch_size=512, epochs=1)

epoch 0
Epoch 1/1
epoch 1
Epoch 1/1
epoch 2
Epoch 1/1
epoch 3
Epoch 1/1
epoch 4
Epoch 1/1
epoch 5
Epoch 1/1
epoch 6
Epoch 1/1
epoch 7
Epoch 1/1

KeyboardInterrupt: 

In [59]:
keras.models.save_model(M, './models/knowledge-graphj-embedding-analogic/model.hdf5')

In [86]:
test_synset = nltk.corpus.wordnet.lemmas('drink', pos='v')[0].synset()
test_synset.name(), test_synset.hypernyms()

('drink.v.01', [Synset('consume.v.02')])

In [87]:
result = M_sample.predict([
    np.repeat(np.array([[entity_index['s.drink.v.01']]]), entity_max+1, axis=0),
    np.repeat(np.array([[relation_index['s.hypernyms']]]), entity_max+1, axis=0),
    np.arange(0, entity_max+1, 1, dtype='int32')[:, np.newaxis]], batch_size=512)
order = np.argsort(result, axis=0)
[entity_lookup[i] for i in order[:30,0]]

['s.declare.v.01',
 's.evaluate.v.02',
 's.praise.v.01',
 's.mark.v.05',
 's.communication.n.02',
 's.displace.v.01',
 's.rise.v.01',
 's.propel.v.01',
 's.reduce.v.01',
 's.worsen.v.01',
 's.send.v.01',
 's.plan.v.02',
 's.unpack.v.01',
 's.radiate.v.05',
 's.try.v.01',
 's.draw.v.07',
 's.forgive.v.01',
 's.state.v.01',
 's.re-create.v.01',
 's.reception_room.n.01',
 's.separate.v.12',
 's.end.v.02',
 's.weaken.v.01',
 's.decrease.v.02',
 's.explain.v.01',
 's.pass.v.05',
 's.grow.v.02',
 's.convert.v.02',
 's.play.v.01',
 's.hash_out.v.01']