In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from scipy.stats import pearsonr
from scipy.spatial import distance

import network_distance as nd # From: https://www.michelecoscia.com/wp-content/uploads/2021/04/network_distance.zip

In [2]:
# Per-node input table, read from a comma-separated file: a node id column
# ("node") followed by the two per-node variables ("X" and "Y").
vector_df = pd.read_csv("../data/obama_nodevectors.csv", sep=",")

vector_df

Unnamed: 0,node,X,Y
0,347971482,0.429952,0.142627
1,181692013,-0.391304,-0.103878
2,1433227584,0.812500,0.702424
3,394166747,0.770833,0.699578
4,24655764,0.812500,0.015197
...,...,...,...
199,549600704,0.666667,0.336801
200,21773704,0.666667,0.146318
201,14662354,0.479167,0.274170
202,14979330,-0.391304,-0.240186


In [3]:
# Build one "node -> value" lookup per variable, keyed by the "node" column.
v1_dict = vector_df.set_index("node")["X"].to_dict()
v2_dict = vector_df.set_index("node")["Y"].to_dict()

# Every node id that has a value in at least one lookup, in ascending order.
# Iterating over this one fixed ordering is what keeps the two arrays below
# aligned element by element.
nodes = sorted(v1_dict.keys() | v2_dict.keys())

# Lay each lookup out along `nodes`; a node absent from a lookup contributes 0.
v1_array = np.array([v1_dict.get(node, 0) for node in nodes])
v2_array = np.array([v2_dict.get(node, 0) for node in nodes])

v1_array

array([-0.26086957,  0.76157407,  0.02083333,  0.47916667,  0.41666667,
       -0.17783816, -0.26086957, -0.18523551, -0.06521739,  0.47916667,
       -0.39130435,  0.60416667, -0.65217391,  0.8125    ,  0.375     ,
        0.70833333,  0.375     ,  0.05027174,  0.77083333, -0.26086957,
        0.66666667,  0.89583333,  0.80555556,  0.8125    ,  0.64583333,
        0.375     ,  0.1236413 , -0.45652174,  0.59375   ,  0.70833333,
        0.10027174,  0.8125    ,  0.66666667, -0.52173913,  0.8125    ,
       -0.10688406,  0.2123591 ,  0.18312198,  0.375     , -0.04347826,
        0.25498188,  0.64583333, -0.39130435, -0.34782609,  0.3125    ,
        0.66666667,  0.03025362,  0.41666667, -0.26086957, -0.26086957,
       -0.41304348, -0.26086957,  0.375     , -0.82608696,  0.66666667,
        0.8125    , -0.08695652, -0.67391304, -0.41304348,  0.2181677 ,
        0.8125    , -0.19565217, -0.41304348,  0.66666667,  0.66666667,
        0.60416667, -0.03492351, -0.61956522,  0.66666667,  0.18

In [4]:
# Summary statistics comparing the two node-aligned vectors. The report is
# assembled one line at a time; the empty first and last entries reproduce the
# blank line printed before and after the report.
print("\n".join([
    "",
    f"Euclidean distance: {distance.euclidean(v1_array, v2_array)};",
    f"Cosine distance: {distance.cosine(v1_array, v2_array)};",
    f"Pearson correlation: {pearsonr(v1_array, v2_array)};",
    f"X variance: {np.var(v1_array)};",
    f"Y variance: {np.var(v2_array)}.",
    "",
]))


Euclidean distance: 4.470842537966641;
Cosine distance: 0.14040258639628211;
Pearson correlation: PearsonRResult(statistic=0.8295942972278013, pvalue=5.089008071273281e-53);
X variance: 0.22515240634802855;
Y variance: 0.07401456886131398.



In [5]:
# Load the graph from a CSV edge list: each line holds one edge, written as
# comma-separated node ids that are parsed as integers. The result is an
# undirected networkx graph.
G = nx.read_edgelist(
    "../data/obama_edgelist.csv",
    nodetype=int,
    delimiter=",",
)

# Show the node ids followed by the number of edges.
print(G.nodes, G.number_of_edges())

[602807375, 17955706, 1656057482, 1132189957, 18448384, 167621823, 564824282, 393577394, 123481439, 40532427, 1255179128, 257939371, 222695674, 1706026082, 272490401, 58316145, 206877842, 62645216, 158232890, 210427918, 26519763, 18169110, 17606369, 158427758, 67026176, 1344539976, 1923895026, 66546898, 61664932, 80940214, 15097698, 31703974, 334988425, 24868543, 522718572, 13284722, 127486569, 2720553168, 118735414, 34842246, 838006482, 497005469, 59491108, 269946558, 59133139, 425937551, 226409888, 61313751, 347971482, 6742412, 15905092, 86177206, 19747572, 836914616, 619215615, 48050455, 16146692, 28731961, 413470255, 478153611, 116783980, 438019172, 319420349, 20175865, 233761277, 24655764, 1890308449, 549600704, 882947018, 3232895802, 21343653, 492833134, 66124619, 16667912, 282695161, 52944689, 379498878, 27503607, 177702195, 35603124, 71036272, 1202205522, 238670666, 15966651, 182550591, 1536617431, 944456239, 1130537934, 19550464, 907258501, 35700018, 27676828, 61015562, 490706