<a href="https://colab.research.google.com/github/nicholasgaubatz/regal_data-mining/blob/main/Regal_EXAMPLE_Deezer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Section

In [1]:
import networkx as nx
import pandas as pd
import os
from scipy import sparse
from aligning import *
from extract import *
from representation import *
from config import RepMethod

# Read in the graphs

We compare the Deezer user networks of Romania (`RO_edges.csv`) and Hungary (`HU_edges.csv`).

In [2]:
# Edge list of the Romanian Deezer network.
df2 = pd.read_csv(filepath_or_buffer="RO_edges.csv")

In [4]:
# Edge list of the Hungarian Deezer network.
df = pd.read_csv(filepath_or_buffer="HU_edges.csv")

# Extract the features
Step 1: build a NetworkX graph and its adjacency matrix from each edge list.


In [5]:
# Build the graph from the HU edge list; each row of df is one edge
# between the nodes named in columns 'node_1' and 'node_2'.
G = nx.from_pandas_edgelist(df, 'node_1', 'node_2')

In [6]:
# Build the graph from the RO edge list; each row of df2 is one edge
# between the nodes named in columns 'node_1' and 'node_2'.
G2 = nx.from_pandas_edgelist(df2, 'node_1', 'node_2')

In [7]:
# Sparse adjacency matrix of the RO graph. No nodelist is given, so rows and
# columns follow G2.nodes() order (insertion order for a graph built from an
# edge list), not sorted node ids.
adj_matrix2 = nx.adjacency_matrix(G2)

In [8]:
# Sparse adjacency matrix of the HU graph. No nodelist is given, so rows and
# columns follow G.nodes() order (insertion order for a graph built from an
# edge list), not sorted node ids.
adj_matrix = nx.adjacency_matrix(G)

In [9]:
# Place both adjacency matrices on the diagonal of one larger sparse matrix:
# the first adj_matrix.shape[0] rows belong to graph 1, the remaining rows to
# graph 2, with no edges between the two blocks.
adj_matrix_combined = sparse.block_diag((adj_matrix, adj_matrix2))

# Wrap each adjacency matrix in a Graph object (graph 1, graph 2, combined).
graph1, graph2, graph3 = map(Graph, (adj_matrix, adj_matrix2, adj_matrix_combined))

The node-identity computations for graphs 1 and 2 are commented out to reduce runtime; only the combined graph (graph 3) is used in the following steps.


In [10]:
# Parameters for node-identity extraction.
K, delta = 2, 0.5  # K: maximum hop distance; delta: discount factor

# Only the combined graph is processed; the per-graph extractions are kept
# below (disabled) for reference.
# node_identities_1 = extract_node_identity(graph1, K, delta)
# node_identities_2 = extract_node_identity(graph2, K, delta)
node_identities_3 = extract_node_identity(graph3, K, delta)

# print("Node Identities for Graph 1:\n", node_identities_1)
# print("\nNode Identities for Graph 2:\n", node_identities_2)
print("\nNode Identities for Graph 3:\n", node_identities_3)


Node Identities for Graph 3:
 [[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Get the representations of the graphs

In [15]:
# Configure the representation method with max_layer=2.
rep_method = RepMethod(max_layer=2)
# The landmark count p has not been determined yet at this point, so this
# prints None.
print('Number of landmark nodes:', rep_method.p)


Number of landmark nodes: None


In [17]:
# Called for its effect on rep_method: before this call rep_method.p printed
# as None, afterwards it holds the number of landmarks chosen for graph3.
get_number_of_landmarks(graph3, rep_method)
print(rep_method.p)

164


In [18]:
# Select the landmark nodes of the combined graph; the result is printed as
# an array of node indices.
# NOTE(review): no random seed is set anywhere visible in this notebook, so a
# re-run presumably picks different landmarks — confirm how
# get_random_landmarks samples and seed it if reproducibility matters.
landmarks = get_random_landmarks(graph3, rep_method)
print(landmarks)

[14623 81269  1097 85302 14958 61694 55571 43556 13041 37734  7632 89019
 16892 31835 82496 40883 24124 82766 18136 38922 72899 51647 55594 57601
 50749 75292 29086 35141 45612 41196 59095 73986 76039 88956 27286 76235
 39989 68538 25338 17073 58046 58524 86311 85258 30278 54648  5017 65756
 52038 45040  7461 31549 13454 27108 38517 86340 18765 62861 52624 88940
 34220 81913 70658 10398 16969 37887 84232  3588 54112 74146 16359 37492
 61573 48096 17207 62950 58615 76644 85452 56589 72697 52172 65987 53785
  1366  9559 11487 87887 88661 38286 13399 79748 32509 25903 23259 26304
  4537 89195  9135 15880 23655 33826 88203 84235 40263 21753 71769 59922
 89182 15258 30192 50369 39786 86480 81707 79084 71647 67268  5104 14181
 47066 17959 24081 37816  7027 78756 69434 65790 71519 75802 77826 87137
 35396 82967 49680 70071 50550 71379 31923 88691 28602 29908 72072 43349
 58460 45022 22455 45053 50141 43778 39186 53185 43404  1023 12709  1830
  8841 81711 17208 28274 10582 11245 88459 17413]


In [19]:
# Build the C matrix from the combined graph's node identities and the
# selected landmarks.
# NOTE(review): presumably the node-to-landmark similarity matrix of REGAL —
# confirm against compute_C_matrix.
C = compute_C_matrix(node_identities_3, landmarks)

# Report the dimensions explicitly. A bare f-string expression that is not the
# last statement of a cell is evaluated and discarded, so without print() the
# shape never shows up in the output.
print(f'Shape of C: {len(C)} x {len(C[0])}')
print('C:')
print(C)

C:
[[5.49675402e-137 2.89719808e-044 1.87786949e-031 ... 0.00000000e+000
  1.02528200e-029 3.05362431e-045]
 [6.96118029e-151 1.80644620e-016 2.22933171e-020 ... 0.00000000e+000
  4.91042987e-016 1.54008828e-051]
 [2.27160567e-182 1.95556811e-008 2.78700543e-029 ... 0.00000000e+000
  2.97832627e-016 2.16669124e-067]
 ...
 [2.29912624e-206 4.97870684e-002 1.09469770e-035 ... 0.00000000e+000
  1.58321429e-023 9.57726952e-081]
 [3.02944698e-216 1.42642339e-002 6.38150345e-040 ... 0.00000000e+000
  1.24904918e-028 7.55581902e-086]
 [2.04302899e-199 2.03468369e-004 2.97569687e-035 ... 0.00000000e+000
  1.05306174e-020 1.75413810e-082]]


In [20]:
# Split the combined computation into per-graph embeddings; the third argument
# is the number of nodes in graph 1 (rows of adj_matrix).
representations_1, representations_2 = compute_representation(
    C, landmarks, adj_matrix.shape[0]
)

# Shown before the NaN clean-up below, so these are the raw values.
print('Representations of nodes from first graph:')
print(representations_1)

# Replace NaN entries with 0 in both embeddings.
# NOTE(review): the NaNs presumably come from normalizing all-zero rows (see
# the warning this cell emits) — confirm.
representations_1, representations_2 = (
    np.nan_to_num(rep, nan=0) for rep in (representations_1, representations_2)
)

Representations of nodes from first graph:
[[ 3.89470535e-17  1.20504527e-16  5.55569788e-18 ... -3.13747780e-16
  -6.42444483e-17 -1.68039186e-17]
 [ 1.25069366e-08  3.84751916e-09 -1.38671027e-08 ... -2.27453537e-07
  -5.28107896e-08  8.87090734e-09]
 [ 4.31710648e-04  3.98268271e-04 -1.05025406e-03 ... -1.50638840e-02
  -2.25656010e-03  9.57798403e-04]
 ...
 [ 1.27193429e-02 -2.83883121e-03 -1.70711300e-04 ...  2.03121827e-02
   1.24944324e-01  9.84710416e-01]
 [-6.18056691e-02  1.25579976e-01 -2.02405597e-01 ... -3.54195622e-01
  -2.72677365e-01  2.66440518e-01]
 [ 1.11556354e-01  1.76611023e-01  1.44704657e-01 ... -3.42635375e-01
  -6.59155712e-02  3.68128298e-01]]


  Y_twiddle = Y_twiddle / np.linalg.norm(Y_twiddle, axis=1).reshape(Y_twiddle.shape[0], 1) # Normalization of rows
  Y_twiddle = Y_twiddle / np.linalg.norm(Y_twiddle, axis=1).reshape(Y_twiddle.shape[0], 1) # Normalization of rows


# Calculate Similarity Matrix


In [21]:
# Compare the embeddings of graph 1 against those of graph 2; the result
# prints as a sparse matrix of (row, column) similarity scores.
# NOTE(review): the literal 3 is presumably the number of best matches kept
# per node (the printed matrix shows three entries per row) — confirm against
# get_similarity_matrix and consider naming the constant.
similarity_matrix = get_similarity_matrix(representations_1, representations_2, 3)
print(similarity_matrix)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  (0, 37648)	0.979615959807361
  (0, 15718)	0.9068525180728447
  (0, 13200)	0.8181353338838454
  (1, 7504)	0.9308335557587669
  (1, 37988)	0.874328127202474
  (1, 31435)	0.8666210299802761
  (2, 11732)	0.6320771817431458
  (2, 35706)	0.6111633740480582
  (2, 24666)	0.5933802182632709
  (3, 25339)	0.8205490097869452
  (3, 1880)	0.7876250500013583
  (3, 16153)	0.7377116415840013
  (4, 16100)	0.9655165825698269
  (4, 13807)	0.9651179141559776
  (4, 166)	0.9620707485190786
  (5, 32454)	0.7102062782728552
  (5, 3479)	0.7048729497774576
  (5, 5943)	0.7034931212027898
  (6, 3442)	0.7938868485925901
  (6, 6242)	0.6258704830381911
  (6, 8416)	0.6258464764252127
  (7, 39978)	0.5423649397645444
  (7, 34837)	0.5040460878784344
  (7, 39675)	0.49986007045205927
  (8, 25229)	0.9707501148957018
  :	:
  (47529, 37103)	0.7211347821705898
  (47530, 41112)	0.9056442317929785
  (47530, 25978)	0.9048812175711838
  (47530, 5238)	0.9028089220727741
  (47531, 37836)	0.6405172927626103
  (47531, 38062)	0.637091