In [1]:
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import pairwise_distances
from time import time
import numpy as np
import torch

In [2]:
x1_path = '../preprocessing/processed_data/graph_data/fr_entities_features.pt'
x2_path = '../preprocessing/processed_data/graph_data/en_entities_features.pt'
edge_index1_path = '../preprocessing/processed_data/graph_data/fr_edge_index.pt'
edge_index2_path = '../preprocessing/processed_data/graph_data/en_edge_index.pt'

x1 = torch.load(x1_path).float()
x2 = torch.load(x2_path).float()
edge_index1 = torch.load(edge_index1_path).float()
edge_index2 = torch.load(edge_index2_path).float()

#### Hungarian algorithm on embeddings

In [6]:
times = []
for i in [10, 100, 300, 500, 1000, 2000, 4000, 8000, 15000]:
    x_pm = x1[:i]
    x_ot = x2[:i]
    sim_eucl = pairwise_distances(x_pm, x_ot)
    sim_cos = pairwise_distances(x_pm, x_ot, metric='cosine')
    start = time()
    match_eucl = linear_sum_assignment(sim_eucl)
    match_cos = linear_sum_assignment(sim_cos)
    end = time()
    times.append(end - start)
    hits_at_1_eucl = (match_eucl[0] == match_eucl[1]).sum() / i
    hits_at_1_cos = (match_cos[0] == match_cos[1]).sum() / i
    
    print(f'{i} eucl: {hits_at_1_eucl:.04f}, cos: {hits_at_1_cos:.04f}, time: {end - start:.02f}')

10 eucl: 0.6000, cos: 0.6000, time: 0.00
100 eucl: 0.1700, cos: 0.1200, time: 0.00
300 eucl: 0.0500, cos: 0.0400, time: 0.04
500 eucl: 0.0320, cos: 0.0220, time: 0.14
1000 eucl: 0.0240, cos: 0.0120, time: 0.79
2000 eucl: 0.0095, cos: 0.0065, time: 4.78
4000 eucl: 0.0100, cos: 0.0063, time: 38.23
8000 eucl: 0.0053, cos: 0.0037, time: 268.26
15000 eucl: 0.0031, cos: 0.0015, time: 1749.35


#### Preparing topological features

In [7]:
pm_degree_dict = {}
pm_neigh_dict = {}
pm_features = np.empty((x1.shape[0], 5))

for i in range(x1.shape[0]):
    pm_degree_dict.setdefault(i, 0)
    pm_neigh_dict.setdefault(i, 0)
    
for edge in edge_index1.T:
    s, t = list(map(int, edge))
    pm_degree_dict[s] = pm_degree_dict.setdefault(s, 0) + 1
    pm_degree_dict[t] = pm_degree_dict.setdefault(t, 0) + 1

for edge in edge_index1.T:
    s, t = list(map(int, edge))
    pm_neigh_dict[s] = np.append(pm_neigh_dict.setdefault(s, np.array([])), pm_degree_dict[t])
    pm_neigh_dict[t] = np.append(pm_neigh_dict.setdefault(t, np.array([])), pm_degree_dict[s])
    
for i in range(x1.shape[0]):
    degree_seq = pm_neigh_dict[i]
    deg = pm_degree_dict[i]
    min_deg = np.min(degree_seq)
    max_deg = np.max(degree_seq)
    mean_deg = np.mean(degree_seq)
    std_deg = np.std(degree_seq)
    pm_features[i][0] 
    pm_features[i][1] = min_deg
    pm_features[i][2] = max_deg
    pm_features[i][3] = mean_deg
    pm_features[i][4] = std_deg
    
del(pm_degree_dict)
del(pm_neigh_dict)

x1 = torch.tensor(pm_features).float()


ot_degree_dict = {}
ot_neigh_dict = {}
ot_features = np.empty((x2.shape[0], 5))

for i in range(x2.shape[0]):
    ot_degree_dict.setdefault(i, 0)
    ot_neigh_dict.setdefault(i, 0)
    
for edge in edge_index2.T:
    s, t = list(map(int, edge))
    ot_degree_dict[s] = ot_degree_dict.setdefault(s, 0) + 1
    ot_degree_dict[t] = ot_degree_dict.setdefault(t, 0) + 1

for edge in edge_index2.T:
    s, t = list(map(int, edge))
    ot_neigh_dict[s] = np.append(ot_neigh_dict.setdefault(s, np.array([])), ot_degree_dict[t])
    ot_neigh_dict[t] = np.append(ot_neigh_dict.setdefault(t, np.array([])), ot_degree_dict[s])

for i in range(x2.shape[0]):
    degree_seq = ot_neigh_dict[i]
    deg = ot_degree_dict[i]
    min_deg = np.min(degree_seq)
    max_deg = np.max(degree_seq)
    mean_deg = np.mean(degree_seq)
    std_deg = np.std(degree_seq)
    ot_features[i][0] = deg
    ot_features[i][1] = min_deg
    ot_features[i][2] = max_deg
    ot_features[i][3] = mean_deg
    ot_features[i][4] = std_deg
    
del(ot_degree_dict)
del(ot_neigh_dict)

x2 = torch.tensor(ot_features).float()

#### Hungarian algorithm on topological features

In [8]:
times = []
for i in [10, 100, 300, 500, 1000, 2000, 4000, 8000, 15000]:
    x_pm = x1[:i]
    x_ot = x2[:i]
    sim_eucl = x_pm @ x_ot.T
    sim_cos = pairwise_distances(x_pm, x_ot, metric='cosine')
    start = time()
    match_eucl = linear_sum_assignment(sim_eucl, maximize=True)
    match_cos = linear_sum_assignment(sim_cos)
    end = time()
    times.append(end - start)
    hits_at_1_eucl = (match_eucl[0] == match_eucl[1]).sum() / i
    hits_at_1_cos = (match_cos[0] == match_cos[1]).sum() / i
    print(f'{i} eucl: {hits_at_1_eucl:.04f}, cos: {hits_at_1_cos:.04f}, time: {end - start:.02f}')

10 eucl: 0.3000, cos: 0.0000, time: 0.00
100 eucl: 0.0600, cos: 0.0400, time: 0.00
300 eucl: 0.0267, cos: 0.0200, time: 0.06
500 eucl: 0.0140, cos: 0.0120, time: 0.26
1000 eucl: 0.0070, cos: 0.0060, time: 1.82
2000 eucl: 0.0045, cos: 0.0035, time: 13.36
4000 eucl: 0.0015, cos: 0.0020, time: 120.77
8000 eucl: 0.0011, cos: 0.0004, time: 985.25
15000 eucl: 0.0003, cos: 0.0002, time: 7461.92
