In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import sys
import copy
sys.path.append("..")
from tqdm.notebook import tqdm
from numba import jit
from scipy import stats
import networkx as nx


import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')
# Matplotlib style sheet, given as a path relative to the working directory.
plt.style.use("../config/custom_plt.mplstyle")

# Categorical palette of 17 hex colours.
# NOTE(review): not referenced by any of the visible cells (the 3D plots use
# `new_colors` instead) -- confirm whether it is still needed.
colors = [
    "#7494d3",
    "#5cb545",
    "#9956c6",
    "#a7b338",
    "#6a6bc6",
    "#d09e40",
    "#ce62bb",
    "#56be85",
    "#d1477d",
    "#397f4d",
    "#cf4b4a",
    "#40bbc1",
    "#d8662c",
    "#99af66",
    "#b76989",
    "#6d7127",
    "#b6744a"
]

In [2]:
from sklearn.preprocessing import LabelEncoder
import networkx.algorithms.community as nx_comm

def load_results(inf_coords_path, labels, g):
    """Load inferred coordinates and attach metadata and Louvain labels.

    Parameters
    ----------
    inf_coords_path : str or path-like
        Whitespace-separated file with six columns, which are named
        ``index, kappa, hyp_rad, p1, p2, p3`` after loading. Lines starting
        with ``#`` are skipped.
    labels : pandas.DataFrame
        Must contain an ``index`` column (string node ids) and a ``label``
        column.
    g : networkx.Graph
        Graph on which Louvain communities are detected. Its node ids must be
        comparable with the string ``index`` column for the merge to match.

    Returns
    -------
    pandas.DataFrame
        One row per node id with the coordinate columns plus ``label``,
        ``encoded_label`` (integer code from ``LabelEncoder``) and
        ``label_louvain`` (community number). Both merges are inner joins
        (the pandas default), so nodes missing from ``labels`` or from ``g``
        are dropped.

    Side effects
    ------------
    Prints the number of distinct Louvain communities left after merging.
    """
    # Raw string: "\s" is not a valid escape in a normal string literal and
    # triggers a SyntaxWarning on recent Python versions.
    inf_coords = pd.read_csv(inf_coords_path, comment="#", header=None, sep=r"\s+")
    inf_coords.columns = ['index', 'kappa', 'hyp_rad', 'p1', 'p2', 'p3']
    inf_coords['index'] = inf_coords['index'].astype(str)
    inf_coords = inf_coords.merge(labels, on="index")
    inf_coords['encoded_label'] = LabelEncoder().fit_transform(inf_coords['label'])
    # `labels` may list a node more than once; keep the first match only.
    inf_coords = inf_coords.drop_duplicates(subset=['index'])

    # Louvain communities (fixed seed so that community numbers are reproducible)
    communities = nx_comm.louvain_communities(g, seed=123)
    node_to_community = {
        node: community_id
        for community_id, members in enumerate(communities)
        for node in members
    }
    communities_louvain = pd.DataFrame({
        'index': list(node_to_community.keys()),
        'label_louvain': list(node_to_community.values()),
    })

    inf_coords = inf_coords.merge(communities_louvain, on='index')
    print('Number of communities from Louvain: ', inf_coords['label_louvain'].nunique())

    return inf_coords

In [3]:
# Root folder of the f_mercator datasets.
# NOTE: machine-specific absolute path -- adjust this single constant to run
# the notebook elsewhere.
F_MERCATOR_DIR = "/home/rob/MEGAsync/datasets/networks/machine_learning_datasets/f_mercator"
citeseer_dir = f"{F_MERCATOR_DIR}/citeseer"

# Graph topology (node ids are read as strings) and its edge table, which
# `plot_embedding` iterates over.
citeseer_graph = nx.read_edgelist(f"{citeseer_dir}/citeseer_GC_fixed.edge")
citeseer_edges = nx.to_pandas_edgelist(citeseer_graph)

# Node metadata: the first column holds the node id, the last one the label.
# Raw string for the separator: "\s" is an invalid escape in a normal literal.
citeseer_features = pd.read_csv(f"{citeseer_dir}/citeseer.content", header=None, sep=r"\s+", low_memory=False)
citeseer_labels = pd.DataFrame()
citeseer_labels['index'] = citeseer_features.iloc[:, 0].astype(str)
citeseer_labels['label'] = citeseer_features.iloc[:, -1]

# One inferred-coordinates file per embedding variant.
base_path = f"{citeseer_dir}/umap"
citeseer_le_ml = load_results(f"{base_path}/le_ml/citeseer_GC_fixed.inf_coord", citeseer_labels, citeseer_graph)
citeseer_umap_ml = load_results(f"{base_path}/umap_ml/citeseer_GC_fixed.inf_coord", citeseer_labels, citeseer_graph)
citeseer_only_umap = load_results(f"{base_path}/only_umap/citeseer_GC_fixed.inf_coord", citeseer_labels, citeseer_graph)

# Cluster labels stored in a separate CSV; ids are cast to str so they match
# the `index` column of the coordinate tables.
citeseer_find_k = pd.read_csv(f"{F_MERCATOR_DIR}/citeseer_labels_umap_find_k_cC.csv")
citeseer_find_k['index'] = citeseer_find_k['index'].astype(str)

# Merge on all columns shared by both frames (pandas default).
citeseer_le_ml = citeseer_le_ml.merge(citeseer_find_k)
citeseer_umap_ml = citeseer_umap_ml.merge(citeseer_find_k)
citeseer_only_umap = citeseer_only_umap.merge(citeseer_find_k)

Number of communities from Louvain:  37
Number of communities from Louvain:  37
Number of communities from Louvain:  37


In [4]:
# Inspect the merged `le_ml` table (pandas elides the middle rows when rendering).
citeseer_le_ml

Unnamed: 0,index,kappa,hyp_rad,p1,p2,p3,label,encoded_label,label_louvain,label_clustering_find_k
0,210,3.374480,11.3413,8.702180,-9.573470,-0.727437,ML,5,26,3
1,845,3.817040,11.2180,0.788784,0.652710,12.917400,ML,5,17,0
2,940,1.754250,11.9955,-10.643300,7.375810,-0.474620,IR,4,2,0
3,1625,3.217410,11.3889,-12.062900,-2.940680,3.707690,ML,5,6,0
4,1894,7.307700,10.5686,-5.138150,11.505700,3.021160,IR,4,21,2
...,...,...,...,...,...,...,...,...,...,...
2105,beaudouin-lafon00instrumental,0.720304,12.8856,3.201750,-11.742100,-4.447580,HCI,3,28,3
2106,rousseau97socialpsychological,2.699510,11.5644,6.540750,-11.152300,-0.867993,Agents,1,32,1
2107,bachpedersen98multidimensional,1.405880,12.2168,8.470210,9.769900,-0.844370,DB,2,9,0
2108,georgeff99beliefdesireintention,6.329990,10.7122,10.050900,-6.859120,4.454230,Agents,1,32,1


In [5]:
import pyvista as pv
# Default colour of the global PyVista theme.
pv.global_theme.color = 'white'

# Palettes for the 3D plots. `plot_embedding` picks `new_colors[k]` for the
# k-th group of the label column, so the active palette must have at least as
# many entries as that column has distinct values. Keep exactly one
# `new_colors` assignment uncommented, matching the label column being plotted.


# Metadata labels (`label` column) -- 18 colours
# new_colors = ["#b18281", "#6d45cd", "#62a03b", "#c84ccb", "#a68b3c", "#482a79","#d74327", 
#              "#6f7dcf", "#cf783d", "#608aa4", "#cd4859", "#5f9c7b", "#d2478d", "#44532d", 
#              "#b773b5", "#703425", "#342d40", "#723057"]

# Topology labels (`label_louvain` column) -- 37 colours
# new_colors = ["#dca083", "#7dbad3", "#983aa1", "#b1c232", "#db4393", "#a5662f", "#733fd4",
#               "#24192f", "#3d2560", "#4d8163", "#64c27a", "#a9ad84", "#86af46", "#db502c",
#               "#882d2a", "#50222f", "#d64ed7", "#c4a2c5", "#34475e", "#5a3d23", "#e19a2e",
#               "#bea44b", "#557f8f", "#d8405a", "#462b8d", "#6a6d2b", "#5ec0ab", "#2b3f28",
#               "#648ace", "#d67681", "#54c840", "#397c30", "#8f6c64", "#903767", "#d082cd",
#               "#715c8b", "#7070dc"]

# Feature-cluster labels (`label_clustering_find_k` column) -- 7 colours, currently active
new_colors = ["#b8934e", "#c44f39", "#819bb1", "#4d393d", "#69aa55", "#c65b94", "#7d4cba"]


def get_spherical_cap_structure_grid(b, opening_angle, R, color_idx, radius=1.0):
    """Build a spherical cap centred on direction ``b`` as a PyVista grid.

    The cap is first sampled around the north pole ``[0, 0, 1]`` (30 azimuthal
    by 20 polar samples) on a sphere of radius ``R`` and then rotated so that
    the pole lands on ``b``.

    Parameters
    ----------
    b : array-like of 3 floats
        Direction of the cap centre. It is normalised internally, because the
        rotation formula below is only valid for unit vectors.
    opening_angle : float
        Polar half-angle of the cap, in radians.
    R : float
        Radius of the sphere the cap lies on.
    color_idx : int
        Unused; kept so existing callers keep working.
    radius : float, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    pyvista.StructuredGrid
        Surface grid of the rotated cap.
    """
    # Cap around the north pole; adapted from https://stackoverflow.com/a/45458451
    phi = np.linspace(0, 2 * np.pi, 30)
    theta = np.linspace(0, opening_angle, 20)
    X = R * np.stack([
        np.outer(np.cos(phi), np.sin(theta)),
        np.outer(np.sin(phi), np.sin(theta)),
        np.outer(np.ones(np.size(phi)), np.cos(theta)),
        ], axis=-1)

    # Rotate X such that [0, 0, 1] gets rotated to `b`;
    # <https://math.stackexchange.com/a/476311/36678>.
    a = np.array([0.0, 0.0, 1.0])
    b = np.asarray(b, dtype=float)
    b_norm = np.linalg.norm(b)
    if b_norm > 0:
        # A zero vector has no direction; it is left untouched, which keeps
        # the cap at the north pole.
        b = b / b_norm
    a_x_b = np.cross(a, b)
    a_dot_b = np.dot(a, b)
    # The formula divides by (1 + a.b), which vanishes when `b` points to the
    # south pole. Compare with a tolerance: an exact float equality misses
    # directions that are antipodal only up to rounding.
    if np.isclose(a_dot_b, -1.0, rtol=0.0, atol=1e-12):
        # The cap is symmetric around its axis, so negating the points gives
        # the cap around the opposite pole.
        X_rot = -X
    else:
        X_rot = (
            X +
            np.cross(a_x_b, X) +
            np.cross(a_x_b, np.cross(a_x_b, X)) / (1.0 + a_dot_b)
            )

    return pv.StructuredGrid(X_rot[..., 0], X_rot[..., 1], X_rot[..., 2])

In [6]:
def get_geodesic(p1, p2):
    """Sample the great-circle arc between two points (spherical interpolation).

    Parameters
    ----------
    p1, p2 : array-like of floats
        End points of the arc. The angle between them is computed from their
        directions; the interpolation weights are applied to the vectors as
        given, so unit vectors yield points on the unit sphere.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(50, len(p1))`` going from ``p1`` to ``p2``.

    Notes
    -----
    Exactly antipodal points have no unique geodesic; the result is
    numerically unstable in that case.
    """
    p1 = np.asarray(p1, dtype=float)
    p2 = np.asarray(p2, dtype=float)

    cos_omega = np.dot(p1, p2) / (np.linalg.norm(p1) * np.linalg.norm(p2))
    # Rounding can push the cosine slightly outside [-1, 1], which would make
    # arccos return NaN.
    omega = np.arccos(np.clip(cos_omega, -1.0, 1.0))

    t = np.linspace(0, 1)[:, None]  # column vector: one row per sample

    if np.isclose(omega, 0.0):
        # (Nearly) coincident directions: the slerp weights become 0/0, while
        # plain linear interpolation is exact to first order.
        return (1 - t) * p1 + t * p2

    sin_omega = np.sin(omega)
    return (np.sin((1 - t) * omega) / sin_omega) * p1 + (np.sin(t * omega) / sin_omega) * p2

In [7]:
def compute_prob_S2(p1, p2, kappa1, kappa2, beta=2.98, mu=0.0375, R=1):
    """Connection probability between two nodes from their angular distance.

    Computes ``1 / (1 + chi**beta)`` with
    ``chi = R * angle / sqrt(kappa1 * kappa2 * mu)``, where ``angle`` is the
    angle between ``p1`` and ``p2``.

    Parameters
    ----------
    p1, p2 : array-like of floats
        Node positions; only their directions matter.
    kappa1, kappa2 : float
        ``kappa`` values of the two nodes.
    beta, mu, R : float, optional
        Model parameters. The defaults are the values previously hard-coded
        in this function (presumably the ones inferred for this embedding --
        TODO confirm).

    Returns
    -------
    float
        Probability in ``[0, 1]``.
    """
    cos_angle = np.dot(p1, p2) / (np.linalg.norm(p1) * np.linalg.norm(p2))
    # Rounding can yield |cos| slightly above 1 for (anti)parallel vectors,
    # for which arccos would return NaN.
    angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))

    chi = (R * angle) / np.sqrt(kappa1 * kappa2 * mu)
    return 1 / (1 + np.power(chi, beta))

In [8]:
def plot_embedding(df, label, edges=None, palette=None, min_prob=0.5):
    """Render an embedding on the unit sphere with PyVista.

    Draws an opaque sphere, the edges whose connection probability is at
    least ``min_prob`` (as geodesics, with a width proportional to that
    probability) and one spherical cap per node. The cap size grows with the
    node's ``kappa`` and its colour is given by the node's group in ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``index``, ``kappa``, ``p1``, ``p2``, ``p3`` and
        the column named by ``label``.
    label : str
        Column used to group (and colour) the nodes.
    edges : pandas.DataFrame, optional
        Edge table whose rows are ``(source, target)`` pairs of node ids.
        Defaults to the notebook-level ``citeseer_edges``.
    palette : sequence of colours, optional
        One colour per group, in ``groupby`` order. Defaults to the
        notebook-level ``new_colors``.
    min_prob : float, optional
        Edges with a lower connection probability are not drawn.

    Returns
    -------
    pyvista.Plotter
        Configured plotter; call ``show`` or ``screenshot`` on it.

    Raises
    ------
    IndexError
        If ``palette`` has fewer colours than ``label`` has distinct values.
    ValueError
        If an edge endpoint is missing from ``df['index']``.

    Side effects
    ------------
    Sets the global PyVista plot theme to ``"document"`` and prints the
    number of edges that were filtered out.
    """
    if edges is None:
        edges = citeseer_edges
    if palette is None:
        palette = new_colors

    # Fail before the expensive edge rendering rather than half-way through
    # the node loop.
    n_groups = df[label].nunique()
    if n_groups > len(palette):
        raise IndexError(
            f"palette has {len(palette)} colours but column {label!r} "
            f"has {n_groups} distinct values"
        )

    pv.set_plot_theme("document")
    plotter = pv.Plotter(window_size=[4096, 4096])

    plotter.enable_anti_aliasing('ssaa')

    # Background sphere
    R = 1
    u, v = np.mgrid[0:2*np.pi:100j, 0:np.pi:100j]
    x = R*np.cos(u)*np.sin(v)
    y = R*np.sin(u)*np.sin(v)
    z = R*np.cos(v)
    grid = pv.StructuredGrid(x, y, z)
    plotter.add_mesh(grid, color='#fdfdfd', opacity=1)

    # Plot edges
    coords = df[['p1', 'p2', 'p3']].values
    pos = coords / np.linalg.norm(coords, axis=1)[:, None]  # new array, `df` stays untouched
    kappa = df['kappa'].values

    # Node id -> row position, built once (a list scan per edge endpoint is
    # quadratic). `setdefault` keeps the first occurrence, like `list.index`.
    node_row = {}
    for row, node in enumerate(df['index'].tolist()):
        node_row.setdefault(node, row)

    count = 0
    for source, target in tqdm(edges.values):
        try:
            s_i = node_row[source]
            t_i = node_row[target]
        except KeyError as err:
            raise ValueError(f"{err.args[0]!r} is not in df['index']") from err

        # Compute the probability of connection
        p1, p2 = pos[s_i], pos[t_i]
        prob = compute_prob_S2(p1, p2, kappa[s_i], kappa[t_i])
        if prob < min_prob: # filter out low probable links
            count += 1
            continue

        l = get_geodesic(p1, p2)
        # `add_lines` reads its points as independent (start, end) pairs, so
        # a plain polyline would be drawn with every other segment missing.
        # Repeating the interior points yields explicit consecutive pairs.
        segments = np.repeat(l, 2, axis=0)[1:-1]
        plotter.add_lines(segments, color='#8a8a8a', width=6*prob)

    print('Number of low probable links: ', count)

    # Plot nodes as caps slightly above the sphere surface
    max_kappa = kappa.max()
    R = 1.001
    for idx, (name, group) in enumerate(df.groupby(label)):
        centres = group[['p1', 'p2', 'p3']].values
        # Opening angle proportional to kappa, at most 0.25 rad
        sizes = group['kappa'].values / max_kappa * 0.25
        for centre, size in zip(centres, sizes):
            p = centre / np.linalg.norm(centre)
            cap = get_spherical_cap_structure_grid(p, size, R, color_idx=idx)
            plotter.add_mesh(cap, color=palette[idx])

    plotter.camera_position = 'yz'

    plotter.camera.azimuth = -30
    plotter.camera.elevation = -30

    return plotter

In [9]:
# plotter = plot_embedding(citeseer_le_ml, 'label')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_le_ml_labels_metadata.jpg")


In [10]:
# plotter = plot_embedding(citeseer_umap_ml, 'label')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_umap_ml_labels_metadata.jpg")


In [11]:
# plotter = plot_embedding(citeseer_only_umap, 'label')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_only_umap_labels_metadata.jpg")


In [12]:
# plotter = plot_embedding(citeseer_le_ml, 'label_louvain')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_le_ml_labels_topology.jpg")


In [13]:
# plotter = plot_embedding(citeseer_umap_ml, 'label_louvain')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_umap_ml_labels_topology.jpg")


In [14]:
# plotter = plot_embedding(citeseer_only_umap, 'label_louvain')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_only_umap_labels_topology.jpg")


In [15]:
# plotter = plot_embedding(citeseer_le_ml, 'label_clustering_find_k')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_le_ml_labels_features.jpg")


In [16]:
# plotter = plot_embedding(citeseer_umap_ml, 'label_clustering_find_k')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_umap_ml_labels_features.jpg")


In [17]:
# plotter = plot_embedding(citeseer_only_umap, 'label_clustering_find_k')
# plotter.screenshot("/home/rob/Dropbox/NodesFeaturesEmbeddings/Report/figures-publication-and-random-initialization-27-04-23/plots/citeseer_only_umap_labels_features.jpg")

  0%|          | 0/3668 [00:00<?, ?it/s]

Number of low probable links:  819


array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]]