# Task 2: Learning from Embeddings using AmpliGraph

In [1]:
import numpy as np
import pandas as pd
import ampligraph
import requests
from ampligraph.datasets import load_from_csv
import tensorflow as tf

## 2.1 Link Prediction

In [2]:
from ampligraph.evaluation import evaluate_performance
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

In [3]:
# Download the Game of Thrones triples CSV next to the notebook and load it.
url = 'https://ampligraph.s3-eu-west-1.amazonaws.com/datasets/GoT.csv'
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of silently saving an error page as CSV.
response.raise_for_status()
# The context manager guarantees the file is flushed and closed before it is read back.
with open('GoT.csv', 'wb') as csv_file:
    csv_file.write(response.content)
X = load_from_csv('.', 'GoT.csv', sep=',')

In [4]:
# The first 3000 triples are used for training; everything after them is held out.
X_train = X[:3000]
X_test = X[3000:]

In [5]:
# Only TensorFlow messages at ERROR level are logged from here on.
tf.logging.set_verbosity(tf.logging.ERROR)

# Passed as `filter_triples` to evaluate_performance in the cells below.
# Note: this is another name for the same array as X, not a copy.
positives_filter = X

### ComplEx

In [6]:
from ampligraph.latent_features import ComplEx

In [7]:
# Hyperparameters match the DistMult and TransE cells further down, so the
# three models are trained under the same settings.
complex_model = ComplEx(
    # schedule and reproducibility
    epochs=200,
    batches_count=100,
    seed=0,
    k=150,
    eta=5,
    # optimizer
    optimizer='adam',
    optimizer_params={'lr': 1e-3},
    # loss and regularisation
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p': 3, 'lambda': 1e-5},
    verbose=True,
)

In [8]:
# Train ComplEx on the training split; early stopping is disabled.
complex_model.fit(X_train, early_stopping=False)

Average ComplEx Loss:   0.032386: 100%|██████████| 200/200 [02:20<00:00,  1.42epoch/s]


In [9]:
# Sanity check: score the first held-out triple with the trained ComplEx model.
complex_model.predict(X_test[0])

array([0.8109181], dtype=float32)

In [10]:
# Evaluate ComplEx on the held-out triples; `ranks` feeds the metrics in the next cell.
ranks = evaluate_performance(
    X_test,
    model=complex_model,
    # Corruption strategy filter defined above
    filter_triples=positives_filter,
    # corrupt subj and obj separately while evaluating
    use_default_protocol=True,
    verbose=True,
)



100%|██████████| 119/119 [00:01<00:00, 89.14it/s]


In [11]:
# Compute every ranking metric for the ComplEx ranks first, then report them together.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

print("MRR: %.2f" % mrr)
print("Hits@10: %.2f" % hits_10)
print("Hits@3: %.2f" % hits_3)
print("Hits@1: %.2f" % hits_1)

MRR: 0.49
Hits@10: 0.62
Hits@3: 0.52
Hits@1: 0.43


### DistMult

In [12]:
from ampligraph.latent_features import DistMult

In [13]:
# Hyperparameters match the ComplEx and TransE cells, so the three models are
# trained under the same settings.
distmult_model = DistMult(
    # schedule and reproducibility
    epochs=200,
    batches_count=100,
    seed=0,
    k=150,
    eta=5,
    # optimizer
    optimizer='adam',
    optimizer_params={'lr': 1e-3},
    # loss and regularisation
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p': 3, 'lambda': 1e-5},
    verbose=True,
)

In [14]:
# Train DistMult on the training split; early stopping is disabled.
distmult_model.fit(X_train, early_stopping=False)

Average DistMult Loss:   0.031504: 100%|██████████| 200/200 [01:18<00:00,  2.55epoch/s]


In [15]:
# Evaluate DistMult on the held-out triples; this overwrites the `ranks` from the previous model.
ranks = evaluate_performance(
    X_test,
    model=distmult_model,
    # Corruption strategy filter defined above
    filter_triples=positives_filter,
    # corrupt subj and obj separately while evaluating
    use_default_protocol=True,
    verbose=True,
)



100%|██████████| 119/119 [00:00<00:00, 272.94it/s]


In [16]:
# Compute every ranking metric for the DistMult ranks first, then report them together.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

print("MRR: %.2f" % mrr)
print("Hits@10: %.2f" % hits_10)
print("Hits@3: %.2f" % hits_3)
print("Hits@1: %.2f" % hits_1)

MRR: 0.49
Hits@10: 0.62
Hits@3: 0.51
Hits@1: 0.42


### TransE

In [17]:
from ampligraph.latent_features import TransE

In [18]:
# Hyperparameters match the ComplEx and DistMult cells, so the three models are
# trained under the same settings.
transe_model = TransE(
    # schedule and reproducibility
    epochs=200,
    batches_count=100,
    seed=0,
    k=150,
    eta=5,
    # optimizer
    optimizer='adam',
    optimizer_params={'lr': 1e-3},
    # loss and regularisation
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p': 3, 'lambda': 1e-5},
    verbose=True,
)

In [19]:
# Train TransE on the training split; early stopping is disabled.
transe_model.fit(X_train, early_stopping=False)

Average TransE Loss:   0.046237: 100%|██████████| 200/200 [01:19<00:00,  2.51epoch/s]


In [20]:
# Evaluate TransE on the held-out triples; this overwrites the `ranks` from the previous model.
ranks = evaluate_performance(
    X_test,
    model=transe_model,
    # Corruption strategy filter defined above
    filter_triples=positives_filter,
    # corrupt subj and obj separately while evaluating
    use_default_protocol=True,
    verbose=True,
)



100%|██████████| 119/119 [00:00<00:00, 278.04it/s]


In [21]:
# Compute every ranking metric for the TransE ranks first, then report them together.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

print("MRR: %.2f" % mrr)
print("Hits@10: %.2f" % hits_10)
print("Hits@3: %.2f" % hits_3)
print("Hits@1: %.2f" % hits_1)

MRR: 0.34
Hits@10: 0.59
Hits@3: 0.41
Hits@1: 0.21


## 2.2 Relation Prediction

In [25]:
# Distinct values of the middle (relation) column of the triples, as returned by np.unique.
relation_list = np.unique(X[:, 1])

In [50]:
def relation_prediction(subject, object, model=None, relations=None):
    """Return the relation the model scores highest for ``(subject, ?, object)``.

    Parameters
    ----------
    subject : str
        Subject entity of every candidate triple.
    object : str
        Object entity of every candidate triple. The name shadows the builtin
        ``object``; it is kept so existing callers keep working.
    model : optional
        Any object with a ``predict(triples)`` method returning one score per
        triple. When omitted, the notebook-level ``current_model`` is used.
    relations : sequence of str, optional
        Candidate relation labels. When omitted, the notebook-level
        ``relation_list`` is used.

    Returns
    -------
    The element of ``relations`` with the highest score. On ties the
    candidate that appears first in ``relations`` wins.

    Raises
    ------
    ValueError
        If there are no candidate relations to score.
    """
    # Fall back to the notebook globals so the original two-argument calls still work.
    if model is None:
        model = current_model
    if relations is None:
        relations = relation_list
    if len(relations) == 0:
        raise ValueError("relation_prediction needs at least one candidate relation")

    # Build all candidate triples up front and score them in a single predict call.
    candidates = np.array([[subject, relation, object] for relation in relations])
    scores = np.asarray(model.predict(candidates)).ravel()

    # argmax returns the first maximum, matching the tie-breaking of a stable descending sort.
    return relations[int(np.argmax(scores))]

In [51]:
# relation_prediction reads the notebook-level `current_model`, so it must be
# rebound to the ComplEx model before the call.
current_model = complex_model
relation_prediction('Jorah Mormont', 'Daenerys Targaryen')

'ALLIED_WITH'

In [52]:
# relation_prediction reads the notebook-level `current_model`, so it must be
# rebound to the TransE model before the call.
current_model = transe_model
relation_prediction('Jorah Mormont', 'Daenerys Targaryen')

'SPOUSE'

In [53]:
# relation_prediction reads the notebook-level `current_model`, so it must be
# rebound to the DistMult model before the call.
current_model = distmult_model
relation_prediction('Jorah Mormont', 'Daenerys Targaryen')

'SWORN_TO'

## 2.3 Nearest Neighbor Search

In [22]:
from ampligraph.discovery import find_nearest_neighbours

In [23]:
# Query the ComplEx model for 5 neighbours of 'Arya Stark'.
find_nearest_neighbours(complex_model, entities=['Arya Stark'], n_neighbors=5)

(array([['Arya Stark', 'Edrick Stark', 'Desmond', 'Gariss', 'Poxy Tym']],
       dtype='<U12'),
 array([[0.        , 0.9139245 , 0.9399056 , 0.98441035, 1.0022637 ]],
       dtype=float32))

In [24]:
# Query the DistMult model for 5 neighbours of 'Arya Stark'.
find_nearest_neighbours(distmult_model, entities=['Arya Stark'], n_neighbors=5)

(array([['Arya Stark', 'Edrick Stark', 'Desmond', 'Gariss', 'Quent']],
       dtype='<U12'),
 array([[0.        , 0.56513625, 0.5897246 , 0.5920119 , 0.6459339 ]],
       dtype=float32))

In [25]:
# Query the TransE model for 5 neighbours of 'Arya Stark'.
find_nearest_neighbours(transe_model, entities=['Arya Stark'], n_neighbors=5)

(array([['Arya Stark', 'Hayhead', 'Poxy Tym', 'Harwin', 'Shyra']],
       dtype='<U10'),
 array([[0.        , 0.90866417, 0.9719526 , 0.9743764 , 1.0016923 ]],
       dtype=float32))