In [1]:
import csv
import collections
import numpy as np
import nltk
import keras
import keras.backend as K

Using TensorFlow backend.


In [64]:
nltk.download('wordnet')
nltk.download('omw')

[nltk_data] Downloading package wordnet to /home/marco/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to /home/marco/nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

### generate wordnet relation pairs

In [65]:
with open('./models/knowledge-graphj-embedding-analogic/wordnet_pairs.csv', 'w') as f:
    writer = csv.writer(f)
    for s in nltk.corpus.wordnet.all_synsets():
        for v in s.also_sees():
            writer.writerow((f's.{s.name()}', 's.also_sees', f's.{v.name()}'))
        for v in s.attributes():
            writer.writerow((f's.{s.name()}', 's.attributes', f's.{v.name()}'))
        for v in s.causes():
            writer.writerow((f's.{s.name()}', 's.causes', f's.{v.name()}'))
        for v in s.entailments():
            writer.writerow((f's.{s.name()}', 's.entailments', f's.{v.name()}'))
        for v in s.hypernyms():
            writer.writerow((f's.{s.name()}', 's.hypernyms', f's.{v.name()}'))
        for v in s.hyponyms():
            writer.writerow((f's.{s.name()}', 's.hyponyms', f's.{v.name()}'))
        for v in s.instance_hypernyms():
            writer.writerow((f's.{s.name()}', 's.instance_hypernyms', f's.{v.name()}'))
        for v in s.instance_hyponyms():
            writer.writerow((f's.{s.name()}', 's.instance_hyponyms', f's.{v.name()}'))
        for v in s.lemmas():
            writer.writerow((f's.{s.name()}', 's.lemmas', f'l.{v.name()}'))
        for v in s.member_holonyms():
            writer.writerow((f's.{s.name()}', 's.member_holonyms', f's.{v.name()}'))
        for v in s.member_meronyms():
            writer.writerow((f's.{s.name()}', 's.member_meronyms', f's.{v.name()}'))
        for v in s.part_holonyms():
            writer.writerow((f's.{s.name()}', 's.part_holonyms', f's.{v.name()}'))
        for v in s.part_meronyms():
            writer.writerow((f's.{s.name()}', 's.part_meronyms', f's.{v.name()}'))
        writer.writerow((f's.{s.name()}', 's.pos', f'p.{s.pos()}'))
        for v in s.region_domains():
            writer.writerow((f's.{s.name()}', 's.region_domains', f's.{v.name()}'))
        for v in s.root_hypernyms():
            writer.writerow((f's.{s.name()}', 's.root_hypernyms', f's.{v.name()}'))
        for v in s.similar_tos():
            writer.writerow((f's.{s.name()}', 's.similar_tos', f's.{v.name()}'))
        for v in s.substance_holonyms():
            writer.writerow((f's.{s.name()}', 's.substance_holonyms', f's.{v.name()}'))
        for v in s.substance_meronyms():
            writer.writerow((f's.{s.name()}', 's.substance_meronyms', f's.{v.name()}'))
        for v in s.topic_domains():
            writer.writerow((f's.{s.name()}', 's.topic_domains', f's.{v.name()}'))
        for v in s.usage_domains():
            writer.writerow((f's.{s.name()}', 's.usage_domains', f's.{v.name()}'))
        for v in s.verb_groups():
            writer.writerow((f's.{s.name()}', 's.verb_groups', f's.{v.name()}'))
        for v in s.frame_ids():
            writer.writerow((f's.{s.name()}', 's.frame_ids', f'f.{v}'))
    seen_lemma_keys = set()
    for l_name in nltk.corpus.wordnet.all_lemma_names():
        ls = nltk.corpus.wordnet.lemmas(l_name)
        for l in ls:
            if l.key() in seen_lemma_keys:
                continue
            seen_lemma_keys.add(l.key())
            for v in l.also_sees():
                writer.writerow((f'l.{l.key()}', 'l.also_sees', f'l.{v.key()}'))
            for v in l.antonyms():
                writer.writerow((f'l.{l.key()}', 'l.antonyms', f'l.{v.key()}'))
            for v in l.derivationally_related_forms():
                writer.writerow((f'l.{l.key()}', 'l.derivationally_related_forms', f'l.{v.key()}'))
            for v in l.frame_ids():
                writer.writerow((f'l.{l.key()}', 'l.frame_ids', f'f.{v}'))
            for v in l.pertainyms():
                writer.writerow((f'l.{l.key()}', 'l.pertainyms', f'l.{v.key()}'))
            for v in l.region_domains():
                writer.writerow((f'l.{l.key()}', 'l.region_domains', f'l.{v.key()}'))
            writer.writerow((f'l.{l.key()}', 'l.synset', f's.{l.synset().name()}'))
            writer.writerow((f'l.{l.key()}', 'l.syntactic_marker', f'sm.{l.syntactic_marker()}'))
            for v in l.topic_domains():
                writer.writerow((f'l.{l.key()}', 'l.topic_domains', f'l.{v.key()}'))
            for v in l.usage_domains():
                writer.writerow((f'l.{l.key()}', 'l.usage_domains', f'l.{v.key()}'))
            for v in l.verb_groups():
                writer.writerow((f'l.{l.key()}', 'l.verb_groups', f'l.{v.key()}'))
    del seen_lemma_keys

### postprocess relation pairs

In [2]:
entity_set = set()
relation_set = set()
with open('./models/knowledge-graphj-embedding-analogic/wordnet_pairs.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entity_set.add(row[0])
        entity_set.add(row[2])
        relation_set.add(row[1])
entity_set = list(sorted(entity_set))
relation_set = list(sorted(relation_set))
entity_lookup = { k+1:v for (k,v) in enumerate(entity_set)}
entity_index = { v:k+1 for (k,v) in enumerate(entity_set)}
entity_max = len(entity_set)
relation_lookup = { k+1:v for (k,v) in enumerate(relation_set)}
relation_index = { v:k+1 for (k,v) in enumerate(relation_set)}
relation_max = len(relation_set)
del entity_set
del relation_set
len(entity_lookup), len(relation_lookup)

(473382, 34)

In [3]:
dataset = []
with open('./models/knowledge-graphj-embedding-analogic/wordnet_pairs.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        dataset.append((entity_index[row[0]], relation_index[row[1]], entity_index[row[2]]))
dataset = np.array(dataset, dtype='int')
dataset.shape

(1287303, 3)

In [4]:
bern = {}
for h,r,t in dataset:
    r_stat = bern.get(r)
    if r_stat is None:
        r_stat = (collections.Counter(), collections.Counter())
        bern[r] = r_stat
    h_stat, t_stat = r_stat
    h_stat[h] += 1
    t_stat[t] += 1
bern = {r:(
    np.mean(list(r_stat[0].values())),
    np.mean(list(r_stat[1].values()))) for r,r_stat in bern.items()}
bern = {r:r_stat[0]/(r_stat[0]+r_stat[1]) for r,r_stat in bern.items()}

### trans f

In [5]:
class TransF(keras.layers.Layer):
    
    def __init__(self, **kwargs):
        super(TransF, self).__init__(**kwargs)
    
    def build(self, input_shape):
        entity_shape, relation_shape = input_shape
        self.kernel_width = entity_shape[1]
        self.kernel = self.add_weight(
            shape=(relation_shape[1], entity_shape[1], entity_shape[1]),
            initializer='zeros', name='kernel')
        super(TransF, self).build(input_shape)
    
    def call(self, inputs):
        self: keras.layers.Layer
        entity, relation = inputs
        relation = K.reshape(relation, K.concatenate([K.shape(relation), (1,1)]))
        relation = relation * self.kernel
        relation = K.sum(relation, axis=1, keepdims=False)
        relation = relation + K.eye(self.kernel_width)
        result = (K.expand_dims(entity, axis=1) @ relation)[:,0]
        unit_norm_output_loss = \
            K.abs(K.sqrt(K.sum(K.square(result), axis=-1, keepdims=True)) - 1.)
        self.add_loss(unit_norm_output_loss)
        return result
    
    def compute_output_shape(self, input_shape):
        entity_shape, _ = input_shape
        return entity_shape

In [6]:
X_h = X_input_h = keras.layers.Input((1,), dtype='int32')
X_t = X_input_t = keras.layers.Input((1,), dtype='int32')
X_r = X_input_r = keras.layers.Input((1,), dtype='int32')
X_s = X_input_s = keras.layers.Input((1,))
X_e_embedding = keras.layers.Embedding(entity_max+1, 32)
X_r_embedding = keras.layers.Embedding(relation_max+1, 32,
    embeddings_constraint=keras.constraints.max_norm(max_value=1., axis=-1))
X_hr_embedding = keras.layers.Embedding(relation_max+1, 8)
X_tr_embedding = keras.layers.Embedding(relation_max+1, 8)
X_hr_trans_f = TransF()
X_tr_trans_f = TransF()
X_h = X_e_embedding(X_h)
X_t = X_e_embedding(X_t)
X_hr = X_hr_embedding(X_r)
X_tr = X_hr_embedding(X_r)
X_r = X_r_embedding(X_r)
X_h = keras.layers.Flatten()(X_h)
X_t = keras.layers.Flatten()(X_t)
X_hr = keras.layers.Flatten()(X_hr)
X_tr = keras.layers.Flatten()(X_tr)
X_hr = keras.layers.Multiply()([X_hr, X_s])
X_tr = keras.layers.Multiply()([X_tr, X_s])
X_r = keras.layers.Flatten()(X_r)
X_h = X_hr_trans_f([X_h, X_hr])
X_t = X_tr_trans_f([X_t, X_tr])
X = keras.layers.Add()([X_h, X_r])
X = keras.layers.Subtract()([X, X_t])
X = keras.layers.Dot(-1)([X, X])
X = keras.layers.Lambda(lambda x: K.sqrt(x))(X)
M_transf = keras.Model([X_input_h, X_input_r, X_input_t, X_input_s], X)
M_transf.compile('adam', 'mse')
M_transf.summary()
X_p_h = X_input_p_h = keras.layers.Input((1,), dtype='int32')
X_p_t = X_input_p_t = keras.layers.Input((1,), dtype='int32')
X_p_r = X_input_p_r = keras.layers.Input((1,), dtype='int32')
X_n_h = X_input_n_h = keras.layers.Input((1,), dtype='int32')
X_n_t = X_input_n_t = keras.layers.Input((1,), dtype='int32')
X_n_r = X_input_n_r = keras.layers.Input((1,), dtype='int32')
X_p = M_transf([X_p_h, X_p_r, X_p_t, X_input_s])
X_n = M_transf([X_n_h, X_n_r, X_n_t, X_input_s])
X = keras.layers.Concatenate()([X_p, X_n])
M = keras.Model([
    X_input_p_h, X_input_p_r, X_input_p_t,
    X_input_n_h, X_input_n_r, X_input_n_t,
    X_input_s], X)
def TransF_loss(t, y):
    p,n = y[:,0], y[:,1]
    margin_loss = K.sum(K.clip(p + 3 - n, 0, np.inf))
    energy_loss = K.sum(p)
    return margin_loss + energy_loss + 0*K.sum(t)
M.compile('adam', TransF_loss)
M.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 8)         280         input_3[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 32)        15148256    input_1[0][0]                    
          

In [7]:
def negative_sampling():
    negative_type = np.array([bern[r] for r in dataset[:,1]])
    negative_type = np.random.rand(dataset.shape[0]) < negative_type
    d0 = np.where(negative_type, np.random.randint(
        0, entity_max+1, size=dataset.shape[:1]), dataset[:,0])
    d1 = np.where(~negative_type, np.random.randint(
        0, entity_max+1, size=dataset.shape[:1]), dataset[:,2])
    return np.stack([d0, dataset[:,1], d1], axis=-1)

In [8]:
N_EPOCH = 25
for epoch in range(N_EPOCH):
    nsamples = negative_sampling()
    s = np.zeros if epoch < (N_EPOCH*2/3) else np.ones
    M.fit([
        dataset[:,0],dataset[:,1],dataset[:,2],
        nsamples[:,0],nsamples[:,1],nsamples[:,2],
        s((dataset.shape[0],1))],
        np.zeros((dataset.shape[0],2)), batch_size=512,
        initial_epoch=epoch, epochs=epoch+1)

Epoch 1/1
Epoch 2/2
Epoch 3/3
Epoch 4/4
Epoch 5/5
Epoch 6/6
Epoch 7/7
Epoch 8/8
Epoch 9/9
Epoch 10/10
Epoch 11/11
Epoch 12/12

KeyboardInterrupt: 

In [23]:
keras.models.save_model(M, './models/knowledge-graphj-embedding-analogic/model.hdf5')

In [22]:
keras.models.load_model('./models/knowledge-graphj-embedding-analogic/model.hdf5', custom_objects={
    'TransF': TransF,
    'TransF_loss': TransF_loss,
})

<keras.engine.training.Model at 0x7fc1588764e0>

### test

In [58]:
test_synset = nltk.corpus.wordnet.lemmas('usa', pos='n')[0].synset()
test_synset.name(), test_synset.part_meronyms()

('united_states.n.01',
 [Synset('alabama.n.01'),
  Synset('alaska.n.01'),
  Synset('american_state.n.01'),
  Synset('arizona.n.01'),
  Synset('arkansas.n.01'),
  Synset('california.n.01'),
  Synset('colony.n.03'),
  Synset('colorado.n.01'),
  Synset('connecticut.n.01'),
  Synset('connecticut.n.02'),
  Synset('dakota.n.02'),
  Synset('delaware.n.04'),
  Synset('district_of_columbia.n.01'),
  Synset('east.n.03'),
  Synset('florida.n.01'),
  Synset('georgia.n.01'),
  Synset('great_lakes.n.01'),
  Synset('hawaii.n.01'),
  Synset('idaho.n.01'),
  Synset('illinois.n.01'),
  Synset('indiana.n.01'),
  Synset('iowa.n.02'),
  Synset('kansas.n.01'),
  Synset('kentucky.n.01'),
  Synset('louisiana.n.01'),
  Synset('louisiana_purchase.n.01'),
  Synset('maine.n.01'),
  Synset('maryland.n.01'),
  Synset('massachusetts.n.01'),
  Synset('michigan.n.01'),
  Synset('mid-atlantic_states.n.01'),
  Synset('midwest.n.01'),
  Synset('minnesota.n.01'),
  Synset('mississippi.n.01'),
  Synset('mississippi.n.02'),

In [62]:
print(relation_index)
M_transf.predict([
    np.array([[entity_index['s.united_states.n.01']]]),
    np.array([[relation_index['s.part_meronym']]]),
    np.array([[entity_index['s.alaska.n.01']]]),
    np.ones((1, 1))], batch_size=512)

{'l.also_sees': 1, 'l.antonyms': 2, 'l.derivationally_related_forms': 3, 'l.frame_ids': 4, 'l.pertainyms': 5, 'l.region_domains': 6, 'l.synset': 7, 'l.syntactic_marker': 8, 'l.topic_domains': 9, 'l.usage_domains': 10, 'l.verb_groups': 11, 's.also_sees': 12, 's.attributes': 13, 's.causes': 14, 's.entailments': 15, 's.frame_ids': 16, 's.hypernyms': 17, 's.hyponyms': 18, 's.instance_hypernyms': 19, 's.instance_hyponyms': 20, 's.lemmas': 21, 's.member_holonyms': 22, 's.member_meronyms': 23, 's.part_holonyms': 24, 's.pos': 25, 's.region_domains': 26, 's.root_hypernyms': 27, 's.similar_tos': 28, 's.substance_holonyms': 29, 's.substance_meronyms': 30, 's.topic_domains': 31, 's.usage_domains': 32, 's.verb_groups': 33}


KeyError: 's.part_meronym'

In [26]:
result = M_transf.predict([
    np.repeat(np.array([[entity_index['s.drink.v.01']]]), entity_max+1, axis=0),
    np.repeat(np.array([[relation_index['s.hyponyms']]]), entity_max+1, axis=0),
    np.arange(0, entity_max+1, 1, dtype='int32')[:, np.newaxis],
    np.ones((entity_max+1, 1))], batch_size=512)
[entity_lookup[i] for i in np.argsort(result, axis=0)[:50,0]]

['s.belong.v.05',
 's.decide.v.03',
 's.ferment.v.02',
 's.download.v.01',
 's.separate.v.08',
 's.zoom.v.02',
 's.confederate.v.02',
 's.seed.v.05',
 's.heckle.v.02',
 's.cope.v.01',
 's.film.v.01',
 's.madden.v.03',
 's.rerun.v.03',
 's.computerize.v.01',
 's.sack.v.01',
 's.neutralize.v.02',
 's.sleet.v.01',
 's.even.v.03',
 's.declare.v.07',
 's.shuffle.v.03',
 's.legislate.v.01',
 's.fictionalize.v.01',
 's.alkalinize.v.01',
 's.raddle.v.01',
 's.ting.v.01',
 's.madder.v.01',
 's.gate.v.02',
 's.score.v.06',
 's.better.v.01',
 's.recast.v.02',
 's.join_battle.v.01',
 's.rout.v.03',
 's.drop.v.20',
 's.oppose.v.03',
 's.suffuse.v.02',
 's.outlaw.v.01',
 's.hob.v.01',
 's.rush_off.v.01',
 's.eke_out.v.01',
 's.down.v.04',
 's.uncover.v.01',
 's.pull.v.08',
 's.blank.v.01',
 's.secure.v.03',
 's.outgrow.v.02',
 's.notice.v.02',
 's.truss.v.03',
 's.encrust.v.03',
 's.ennoble.v.01',
 's.flip.v.01']