In [1]:
%load_ext autoreload
%load_ext notexbook
%autoreload 2
%load_ext autotime

import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import sys
sys.path.append("..")
from tqdm import tqdm
import networkx as nx

plt.style.use("../config/custom_plt.mplstyle")

# Fixed list of 17 hex colour strings. Its consumers are outside this excerpt;
# presumably it serves as a categorical palette for later plots (TODO confirm).
colors = [
    "#7494d3",
    "#5cb545",
    "#9956c6",
    "#a7b338",
    "#6a6bc6",
    "#d09e40",
    "#ce62bb",
    "#56be85",
    "#d1477d",
    "#397f4d",
    "#cf4b4a",
    "#40bbc1",
    "#d8662c",
    "#99af66",
    "#b76989",
    "#6d7127",
    "#b6744a"
]

time: 2.17 s (started: 2023-03-16 15:14:42 +01:00)


In [2]:
# %texify --code-font-family Monaco --linespread 1.2 --code-font-size 13

time: 531 µs (started: 2023-03-16 15:14:45 +01:00)


- 1  --  citeseer
- 2  --  cora
- 3  --  film
- 4  --  dblp
- 5  --  imdb
- ?  --  acm [x]
- 6  --  amazon_photo
- 7  --  cornell
- 8  --  wisconsin
- 9  --  texas
- 10  --  lastfm
- ?  --  twitch_ru [x]
- 11  --  twitch_ptbr
- 12  --  twitch_fr
- 13  --  twitch_es
- 14  --  twitch_engb
- ?  --  twitch_de [x]

In [54]:
from sklearn.preprocessing import LabelEncoder
import networkx.algorithms.community as nx_comm

def load_results(inf_coords_path, labels, g, communities=None, seed=123):
    """Load inferred coordinates and attach class labels and Louvain communities.

    Parameters
    ----------
    inf_coords_path : str or path-like
        Whitespace-separated text file with exactly six columns per row
        (named ``index, kappa, hyp_rad, p1, p2, p3`` after loading). Anything
        after a ``#`` is treated as a comment and skipped.
    labels : pandas.DataFrame
        Must provide a string-typed ``index`` column (node id) and a ``label``
        column. Any additional columns are carried into the result.
    g : networkx graph
        Graph on which Louvain communities are detected. Not used when
        ``communities`` is supplied.
    communities : iterable of iterables of nodes, optional
        Precomputed partition of the graph's nodes. Passing it lets a caller
        run Louvain once per graph and reuse the result for several coordinate
        files instead of recomputing it on every call.
    seed : int, default 123
        Seed forwarded to ``louvain_communities`` when the partition is
        computed here.

    Returns
    -------
    pandas.DataFrame
        One row per node id that is present in the coordinate file, in
        ``labels`` and in the community partition (both joins are inner joins,
        so unmatched nodes are dropped silently). Adds ``encoded_label``
        (integer-encoded ``label``) and ``label_louvain`` (community id).

    Side effects
    ------------
    Prints the number of distinct Louvain communities in the result.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    inf_coords = pd.read_csv(inf_coords_path, comment="#", header=None, sep=r"\s+")
    inf_coords.columns = ['index', 'kappa', 'hyp_rad', 'p1', 'p2', 'p3']
    # Node ids are compared as strings throughout (labels and graph side too).
    inf_coords['index'] = inf_coords['index'].astype(str)
    inf_coords = inf_coords.merge(labels, on="index")
    inf_coords['encoded_label'] = LabelEncoder().fit_transform(inf_coords['label'])
    # Encoding happens before de-duplication, as in the original pipeline, so
    # the label vocabulary is taken from all merged rows.
    inf_coords = inf_coords.drop_duplicates(subset=['index'])

    # Louvain communities
    if communities is None:
        communities = nx_comm.louvain_communities(g, seed=seed)

    # Flatten the partition into node id -> community id. Node ids are cast to
    # str so the join key has the same type as inf_coords['index'] even when
    # the graph was built with non-string nodes.
    node_to_community = {
        str(node): community_id
        for community_id, members in enumerate(communities)
        for node in members
    }
    communities_louvain = pd.DataFrame({
        'index': list(node_to_community.keys()),
        'label_louvain': list(node_to_community.values()),
    })

    inf_coords = inf_coords.merge(communities_louvain, on='index')
    print('Number of communities from Louvain: ', len(np.unique(inf_coords['label_louvain'])))
    return inf_coords

time: 1.67 ms (started: 2023-03-16 15:35:20 +01:00)


In [55]:
# CiteSeer: read the graph, derive per-node labels from the .content table and
# load the three inferred-coordinate variants (le_ml, umap_ml, only_umap).
citeseer_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/citeseer/citeseer_GC_fixed.edge")
# Raw string for the regex separator: "\s" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
citeseer_features = pd.read_csv(
    "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/citeseer/citeseer.content",
    header=None,
    sep=r"\s+",
    low_memory=False,
)
# First column is used as the node id (cast to str, the key type load_results
# merges on); last column is used as the class label.
citeseer_labels = pd.DataFrame({
    "index": citeseer_features.iloc[:, 0].astype(str),
    "label": citeseer_features.iloc[:, -1],
})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/citeseer/umap"
# Same coordinate file name in each variant directory; calls run in this order.
citeseer_le_ml, citeseer_umap_ml, citeseer_only_umap = (
    load_results(f"{base_path}/{variant}/citeseer_GC_fixed.inf_coord", citeseer_labels, citeseer_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  37
Number of communities from Louvain:  37
Number of communities from Louvain:  37
time: 19.8 s (started: 2023-03-16 15:35:21 +01:00)


In [56]:
# Cora: read the graph, derive per-node labels from the .content table and
# load the three inferred-coordinate variants (le_ml, umap_ml, only_umap).
cora_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/cora/cora_GC.edge")
# Raw string for the regex separator: "\s" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
cora_features = pd.read_csv(
    "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/cora/cora.content",
    header=None,
    sep=r"\s+",
    low_memory=False,
)
# First column is used as the node id (cast to str, the key type load_results
# merges on); last column is used as the class label.
cora_labels = pd.DataFrame({
    "index": cora_features.iloc[:, 0].astype(str),
    "label": cora_features.iloc[:, -1],
})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/cora/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
cora_le_ml, cora_umap_ml, cora_only_umap = (
    load_results(f"{base_path}/{variant}/cora_GC.inf_coord", cora_labels, cora_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  27
Number of communities from Louvain:  27
Number of communities from Louvain:  27
time: 916 ms (started: 2023-03-16 15:35:40 +01:00)


In [57]:
# Film: read the graph, take node ids and labels from the node table and load
# the three inferred-coordinate variants (le_ml, umap_ml, only_umap).
film_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/film/out1_graph_edges.edge")
# Raw string for the regex separator: "\s" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
film_features = pd.read_csv(
    "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/film/out1_node_feature_label.txt",
    sep=r"\s+",
)
# Node ids are cast to str, the key type load_results merges on.
film_labels = pd.DataFrame({
    "index": film_features["node_id"].astype(str),
    "label": film_features["label"],
})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/film/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
film_le_ml, film_umap_ml, film_only_umap = (
    load_results(f"{base_path}/{variant}/out1_graph_edges.inf_coord", film_labels, film_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  35
Number of communities from Louvain:  35
Number of communities from Louvain:  35
time: 3.66 s (started: 2023-03-16 15:35:41 +01:00)


In [60]:
# DBLP: read the graph, build a node-id -> label table from the label matrix
# and load the three inferred-coordinate variants (le_ml, umap_ml, only_umap).
dblp_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/dblp/dblp_pprefp/eS1/dblp_pprefp_GC.edge")
# The label file is read as a numeric matrix (presumably one-hot rows -- TODO
# confirm); argmax over axis 1 reduces every row to a single integer class id.
dblp_labels = np.loadtxt("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/dblp/dblp_label.txt").argmax(axis=1)
# reset_index turns the row position into an 'index' column, which is then
# cast to str so it can be merged with the string node ids in load_results.
dblp_labels_dict = (
    pd.DataFrame({"label": dblp_labels})
    .reset_index()
    .astype({"index": str})
)

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/dblp/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
dblp_le_ml, dblp_umap_ml, dblp_only_umap = (
    load_results(f"{base_path}/{variant}/dblp_pprefp_GC.inf_coord", dblp_labels_dict, dblp_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  18
Number of communities from Louvain:  18
Number of communities from Louvain:  18
time: 2.95 s (started: 2023-03-16 15:36:19 +01:00)


In [61]:
# IMDB: read the graph, build a node-id -> label table from the label matrix
# and load the three inferred-coordinate variants (le_ml, umap_ml, only_umap).
imdb_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/imdb/imdb_mam/eS1/imdb_mam_GC.edge")
# The label file is read as a numeric matrix (presumably one-hot rows -- TODO
# confirm); argmax over axis 1 reduces every row to a single integer class id.
imdb_labels = np.loadtxt("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/imdb/imdb_label.txt").argmax(axis=1)
# reset_index turns the row position into an 'index' column, which is then
# cast to str so it can be merged with the string node ids in load_results.
imdb_labels_dict = (
    pd.DataFrame({"label": imdb_labels})
    .reset_index()
    .astype({"index": str})
)

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/imdb/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
imdb_le_ml, imdb_umap_ml, imdb_only_umap = (
    load_results(f"{base_path}/{variant}/imdb_mam_GC.inf_coord", imdb_labels_dict, imdb_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  27
Number of communities from Louvain:  27
Number of communities from Louvain:  27
time: 2.38 s (started: 2023-03-16 15:36:42 +01:00)


In [62]:
# Amazon Photo: read the graph, build a node-id -> label table from the label
# file and load the three inferred-coordinate variants.
amazon_photo_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/amazon_photo/amazon_photo_GC.edge")
# The label file has no header; reset_index exposes the row position as the
# first column, which serves as the node id.
amazon_photo_labels = pd.read_csv(
    "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/amazon_photo/amazon_photo_labels.txt",
    header=None,
).reset_index()
amazon_photo_labels.columns = ["index", "label"]
# Node ids are cast to str, the key type load_results merges on.
amazon_photo_labels = amazon_photo_labels.astype({"index": str})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/amazon_photo/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
amazon_photo_le_ml, amazon_photo_umap_ml, amazon_photo_only_umap = (
    load_results(f"{base_path}/{variant}/amazon_photo_GC.inf_coord", amazon_photo_labels, amazon_photo_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  14
Number of communities from Louvain:  14
Number of communities from Louvain:  14
time: 7.48 s (started: 2023-03-16 15:37:03 +01:00)


In [63]:
# WebKB / Cornell: read the graph, take node ids and labels from the node
# table and load the three inferred-coordinate variants.
cornell_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/cornell/out1_graph_edges.edge")
# Raw string for the regex separator: "\s" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
cornell_features = pd.read_csv(
    "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/cornell/out1_node_feature_label.txt",
    sep=r"\s+",
)
# Node ids are cast to str, the key type load_results merges on.
cornell_labels = pd.DataFrame({
    "index": cornell_features["node_id"].astype(str),
    "label": cornell_features["label"],
})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/cornell/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
cornell_le_ml, cornell_umap_ml, cornell_only_umap = (
    load_results(f"{base_path}/{variant}/out1_graph_edges.inf_coord", cornell_labels, cornell_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  14
Number of communities from Louvain:  14
Number of communities from Louvain:  14
time: 79.8 ms (started: 2023-03-16 15:37:24 +01:00)


In [64]:
# WebKB / Wisconsin: read the graph, take node ids and labels from the node
# table and load the three inferred-coordinate variants.
wisconsin_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/wisconsin/out1_graph_edges.edge")
# Raw string for the regex separator: "\s" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
wisconsin_features = pd.read_csv(
    "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/wisconsin/out1_node_feature_label.txt",
    sep=r"\s+",
)
# Node ids are cast to str, the key type load_results merges on.
wisconsin_labels = pd.DataFrame({
    "index": wisconsin_features["node_id"].astype(str),
    "label": wisconsin_features["label"],
})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/wisconsin/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
wisconsin_le_ml, wisconsin_umap_ml, wisconsin_only_umap = (
    load_results(f"{base_path}/{variant}/out1_graph_edges.inf_coord", wisconsin_labels, wisconsin_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  12
Number of communities from Louvain:  12
Number of communities from Louvain:  12
time: 114 ms (started: 2023-03-16 15:37:53 +01:00)


In [65]:
# WebKB / Texas: read the graph, take node ids and labels from the node table
# and load the three inferred-coordinate variants.
texas_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/texas/out1_graph_edges.edge")
# Raw string for the regex separator: "\s" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
texas_features = pd.read_csv(
    "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/texas/out1_node_feature_label.txt",
    sep=r"\s+",
)
# Node ids are cast to str, the key type load_results merges on.
texas_labels = pd.DataFrame({
    "index": texas_features["node_id"].astype(str),
    "label": texas_features["label"],
})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/webkb/texas/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
texas_le_ml, texas_umap_ml, texas_only_umap = (
    load_results(f"{base_path}/{variant}/out1_graph_edges.inf_coord", texas_labels, texas_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  12
Number of communities from Louvain:  12
Number of communities from Louvain:  12
time: 104 ms (started: 2023-03-16 15:38:16 +01:00)


In [66]:
# LastFM Asia: read the graph, normalise the target table to (index, label)
# and load the three inferred-coordinate variants.
lastfm_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/lastfm_asia/lastfm_asia_edges/eS1/lastfm_asia_edges.edge")
lastfm_labels = pd.read_csv("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/lastfm_asia/lastfm_asia_target.csv")
# The two CSV columns are renamed positionally to the names load_results expects.
lastfm_labels.columns = ["index", "label"]
# Node ids are cast to str, the key type load_results merges on.
lastfm_labels = lastfm_labels.astype({"index": str})

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/lastfm_asia/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
lastfm_le_ml, lastfm_umap_ml, lastfm_only_umap = (
    load_results(f"{base_path}/{variant}/lastfm_asia_edges.inf_coord", lastfm_labels, lastfm_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  29
Number of communities from Louvain:  29
Number of communities from Louvain:  29
time: 3.59 s (started: 2023-03-16 15:38:34 +01:00)


In [67]:
# Twitch PTBR: read the graph, derive (index, new_id, label) from the target
# table and load the three inferred-coordinate variants.
twitch_ptbr_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/PTBR/musae_PTBR_edges.edge")
# Built in a single chain so no column is ever assigned on a slice of another
# frame (the original pattern triggers pandas' SettingWithCopyWarning).
#   index  -> 'new_id' as str (the key type load_results merges on)
#   label  -> 'mature' cast to int
# 'new_id' is kept so the resulting columns match the original cell.
twitch_ptbr_labels = (
    pd.read_csv("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/PTBR/musae_PTBR_target.csv")
    .assign(
        index=lambda frame: frame["new_id"].astype(str),
        label=lambda frame: frame["mature"].astype(int),
    )
    .loc[:, ["index", "new_id", "label"]]
)

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/PTBR/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
twitch_ptbr_le_ml, twitch_ptbr_umap_ml, twitch_ptbr_only_umap = (
    load_results(f"{base_path}/{variant}/musae_PTBR_edges.inf_coord", twitch_ptbr_labels, twitch_ptbr_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  8
Number of communities from Louvain:  8
Number of communities from Louvain:  8
time: 2.37 s (started: 2023-03-16 15:38:53 +01:00)


In [68]:
# Twitch FR: read the graph, derive (index, new_id, label) from the target
# table and load the three inferred-coordinate variants.
twitch_fr_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/FR/musae_FR_edges.edge")
# Built in a single chain so no column is ever assigned on a slice of another
# frame (the original pattern triggers pandas' SettingWithCopyWarning).
#   index  -> 'new_id' as str (the key type load_results merges on)
#   label  -> 'mature' cast to int
# 'new_id' is kept so the resulting columns match the original cell.
twitch_fr_labels = (
    pd.read_csv("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/FR/musae_FR_target.csv")
    .assign(
        index=lambda frame: frame["new_id"].astype(str),
        label=lambda frame: frame["mature"].astype(int),
    )
    .loc[:, ["index", "new_id", "label"]]
)

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/FR/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
twitch_fr_le_ml, twitch_fr_umap_ml, twitch_fr_only_umap = (
    load_results(f"{base_path}/{variant}/musae_FR_edges.inf_coord", twitch_fr_labels, twitch_fr_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  7
Number of communities from Louvain:  7
Number of communities from Louvain:  7
time: 6.73 s (started: 2023-03-16 15:39:11 +01:00)


In [69]:
# Twitch ES: read the graph, derive (index, new_id, label) from the target
# table and load the three inferred-coordinate variants.
twitch_es_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/ES/musae_ES_edges.edge")
# Built in a single chain so no column is ever assigned on a slice of another
# frame (the original pattern triggers pandas' SettingWithCopyWarning).
#   index  -> 'new_id' as str (the key type load_results merges on)
#   label  -> 'mature' cast to int
# 'new_id' is kept so the resulting columns match the original cell.
twitch_es_labels = (
    pd.read_csv("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/ES/musae_ES_target.csv")
    .assign(
        index=lambda frame: frame["new_id"].astype(str),
        label=lambda frame: frame["mature"].astype(int),
    )
    .loc[:, ["index", "new_id", "label"]]
)

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/ES/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
twitch_es_le_ml, twitch_es_umap_ml, twitch_es_only_umap = (
    load_results(f"{base_path}/{variant}/musae_ES_edges.inf_coord", twitch_es_labels, twitch_es_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  10
Number of communities from Louvain:  10
Number of communities from Louvain:  10
time: 3.61 s (started: 2023-03-16 15:39:34 +01:00)


In [70]:
# Twitch ENGB: read the graph, derive (index, new_id, label) from the target
# table and load the three inferred-coordinate variants.
twitch_engb_graph = nx.read_edgelist("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/ENGB/musae_ENGB_edges.edge")
# Built in a single chain so no column is ever assigned on a slice of another
# frame (the original pattern triggers pandas' SettingWithCopyWarning).
#   index  -> 'new_id' as str (the key type load_results merges on)
#   label  -> 'mature' cast to int
# 'new_id' is kept so the resulting columns match the original cell.
twitch_engb_labels = (
    pd.read_csv("/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/ENGB/musae_ENGB_target.csv")
    .assign(
        index=lambda frame: frame["new_id"].astype(str),
        label=lambda frame: frame["mature"].astype(int),
    )
    .loc[:, ["index", "new_id", "label"]]
)

base_path = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator/twitch/ENGB/umap/"
# Same coordinate file name in each variant directory; calls run in this order.
twitch_engb_le_ml, twitch_engb_umap_ml, twitch_engb_only_umap = (
    load_results(f"{base_path}/{variant}/musae_ENGB_edges.inf_coord", twitch_engb_labels, twitch_engb_graph)
    for variant in ("le_ml", "umap_ml", "only_umap")
)

Number of communities from Louvain:  17
Number of communities from Louvain:  17
Number of communities from Louvain:  17
time: 5.01 s (started: 2023-03-16 15:39:53 +01:00)
