In [38]:
from scipy.optimize import linear_sum_assignment
from torch_geometric.datasets import DBP15K
from sklearn.metrics import pairwise_distances
from time import time
import numpy as np
import torch

In [2]:
class SumEmbedding(object):
    """Callable transform that sums ``data.x1`` and ``data.x2`` over dim 1.

    Both attributes are overwritten on the object that is passed in, and that
    same object is returned, so the transform works in place.

    NOTE(review): presumably dim 1 indexes the per-entity token embeddings,
    turning each entity into a single vector -- confirm against the dataset.
    """

    def __call__(self, data):
        # Compute both reductions before writing either attribute back.
        summed_first = data.x1.sum(dim=1)
        summed_second = data.x2.sum(dim=1)
        data.x1 = summed_first
        data.x2 = summed_second
        return data

In [4]:
# Build the 'en_fr' DBP15K dataset with SumEmbedding registered as its
# transform, then keep only its first element.
# NOTE(review): root='' presumably resolves relative to the current working
# directory -- confirm where the files actually end up.
dataset = DBP15K(
    root='',
    pair='en_fr',
    transform=SumEmbedding(),
)
data = dataset[0]

In [36]:
# Benchmark optimal one-to-one matching on growing prefixes of the two
# embedding matrices, reporting the match quality and the solver time.
times = []
subset_sizes = [10, 100, 300, 500, 1000, 2000, 4000, 8000, 14120]
for i in subset_sizes:
    # Work on the first i rows of each side only.
    x_fr, x_en = data.x1[:i], data.x2[:i]

    # Both matrices hold pairwise distances (lower = closer) even though the
    # names say "sim"; the first call relies on the library's default metric.
    sim_cos = pairwise_distances(x_fr, x_en, metric='cosine')
    sim_eucl = pairwise_distances(x_fr, x_en)

    # Only the two assignment solves are inside the timed region.
    start = time()
    match_eucl = linear_sum_assignment(sim_eucl)
    match_cos = linear_sum_assignment(sim_cos)
    end = time()
    times.append(end - start)

    # Share of rows whose assigned column has the same index as the row.
    # NOTE(review): this treats row k <-> column k as the correct pair --
    # confirm that the dataset ordering guarantees that alignment.
    hits_at_1_eucl = np.sum(match_eucl[0] == match_eucl[1]) / i
    hits_at_1_cos = np.sum(match_cos[0] == match_cos[1]) / i
    print(f'{i} eucl: {hits_at_1_eucl:.02f}, cos: {hits_at_1_cos:.02f}, time: {end - start:.02f}')

10 eucl: 1.00, cos: 1.00, time: 0.00
100 eucl: 0.98, cos: 0.98, time: 0.00
300 eucl: 0.94, cos: 0.97, time: 0.01
500 eucl: 0.93, cos: 0.96, time: 0.03
1000 eucl: 0.93, cos: 0.95, time: 0.10
2000 eucl: 0.90, cos: 0.94, time: 0.70
4000 eucl: 0.89, cos: 0.93, time: 5.08
8000 eucl: 0.88, cos: 0.92, time: 36.01
14120 eucl: 0.87, cos: 0.91, time: 192.11


In [39]:
def _neighbour_degree_features(edge_index, num_nodes):
    """Build a ``(num_nodes, 5)`` array of degree statistics for one graph.

    Row ``k`` holds ``[degree, min, max, mean, std]`` for node ``k``. The last
    four values are taken over the degrees of the nodes that share an edge
    with ``k``; ``std`` is NumPy's default (population) standard deviation.

    Every edge counts towards both of its endpoints, so direction is ignored
    and an edge that is listed in both directions is counted twice.

    Args:
        edge_index: 2 x E collection of integer node ids (tensor, array or
            nested list); row 0 and row 1 hold the two endpoints of each edge.
        num_nodes: number of rows of the returned array. Ids outside
            ``range(num_nodes)`` still contribute to their neighbours'
            degrees but get no row of their own.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(num_nodes, 5)``. Nodes that
        appear in no edge get an all-zero row (the previous inline code
        raised ``KeyError`` for them).
    """
    # Convert once to plain Python ints instead of unpacking one tensor row
    # per edge.
    first_ends, second_ends = torch.as_tensor(edge_index).tolist()
    edges = list(zip(first_ends, second_ends))

    # Pass 1: degree of every node that occurs in at least one edge.
    degree = {}
    for a, b in edges:
        degree[a] = degree.get(a, 0) + 1
        degree[b] = degree.get(b, 0) + 1

    # Pass 2: for every node, the degrees of its neighbours in edge order.
    # Python lists are used because np.append copies the whole array on
    # every call.
    neighbour_degrees = {}
    for a, b in edges:
        neighbour_degrees.setdefault(a, []).append(degree[b])
        neighbour_degrees.setdefault(b, []).append(degree[a])

    # Zero-initialised so that isolated nodes have well-defined features.
    features = np.zeros((num_nodes, 5))
    for node in range(num_nodes):
        collected = neighbour_degrees.get(node)
        if not collected:
            continue
        degree_seq = np.asarray(collected, dtype=float)
        features[node] = (
            degree[node],
            degree_seq.min(),
            degree_seq.max(),
            degree_seq.mean(),
            degree_seq.std(),
        )
    return features


# Structural features for the first graph (edge_index1 / x1).
fr_features = _neighbour_degree_features(data.edge_index1, data.x1.shape[0])
x1 = torch.tensor(fr_features).float()

# Structural features for the second graph (edge_index2 / x2).
en_features = _neighbour_degree_features(data.edge_index2, data.x2.shape[0])
x2 = torch.tensor(en_features).float()

In [41]:
# Repeat the matching benchmark using the x1 / x2 feature matrices instead of
# the dataset's own embeddings.
times = []
subset_sizes = [10, 100, 300, 500, 1000, 2000, 4000, 8000, 14120]
for i in subset_sizes:
    # Work on the first i rows of each side only.
    x_fr, x_en = x1[:i], x2[:i]

    # Cosine variant: a distance matrix, minimised by the solver below.
    sim_cos = pairwise_distances(x_fr, x_en, metric='cosine')
    # NOTE(review): despite the "eucl" name and label, this is the matrix of
    # inner products between rows, and it is maximised below -- it is not a
    # Euclidean distance. Confirm which of the two was intended.
    sim_eucl = torch.matmul(x_fr, x_en.T)

    # Only the two assignment solves are inside the timed region.
    start = time()
    match_eucl = linear_sum_assignment(sim_eucl, maximize=True)
    match_cos = linear_sum_assignment(sim_cos)
    end = time()
    times.append(end - start)

    # Share of rows whose assigned column has the same index as the row.
    # NOTE(review): this treats row k <-> column k as the correct pair --
    # confirm that the dataset ordering guarantees that alignment.
    hits_at_1_eucl = np.sum(match_eucl[0] == match_eucl[1]) / i
    hits_at_1_cos = np.sum(match_cos[0] == match_cos[1]) / i
    print(f'{i} eucl: {hits_at_1_eucl:.04f}, cos: {hits_at_1_cos:.04f}, time: {end - start:.02f}')

10 eucl: 0.2000, cos: 0.1000, time: 0.00
100 eucl: 0.0800, cos: 0.0800, time: 0.00
300 eucl: 0.0500, cos: 0.0333, time: 0.05
500 eucl: 0.0380, cos: 0.0360, time: 0.18
1000 eucl: 0.0140, cos: 0.0100, time: 1.25
2000 eucl: 0.0120, cos: 0.0080, time: 9.35
4000 eucl: 0.0088, cos: 0.0037, time: 84.24
8000 eucl: 0.0039, cos: 0.0040, time: 700.05
14120 eucl: 0.0022, cos: 0.0024, time: 4193.74
