In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# os.environ["GEOMSTATS_BACKEND"] = "torch"

import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from geomstats.geometry.hyperbolic import Hyperbolic
from geomstats.datasets.prepare_graph_data import HyperbolicEmbedding, Graph

from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold

from scipy.io import loadmat
from scipy.spatial.distance import pdist, squareform

from hyperdt.tree import DecisionTreeClassifier, HyperbolicDecisionTreeClassifier
from hyperdt.forest import HyperbolicRandomForestClassifier
from hyperdt.conversions import convert

from hyperbolics.utils.distortions import map_score, distortion

INFO: Using numpy backend


In [3]:
# NOTE(review): this constant is not referenced by any other visible cell; the
# embedding loop defines its own lowercase `epochs`. Confirm whether it is still needed.
EPOCHS = 2

In [5]:
# Copied from 17_graphs_2.ipynb


def load_graph(graph_dir, graph_type="directed", edge_type="unweighted", add_isolates=False, top_cc=False):
    """Load a graph dataset from ``graph_dir`` and build its derived representations.

    Files read from ``graph_dir``:

    * ``adjacency.tsv`` (required): edge list; the first two columns are used as
      (out_node, in_node).
    * ``labels.tsv`` (required): one label per row, first column.
    * ``names_labels.tsv`` (optional): label names; falls back to the unique
      values found in ``labels.tsv``.
    * ``names.tsv`` (optional): node names; falls back to ``0..len(labels)-1``.

    Side effect: a dense, tab-separated adjacency matrix is written to
    ``<graph_dir with '/raw/' replaced by '/interim/'>/adjacency_dense.tsv``
    (the directory is created if missing) and handed to geomstats' ``Graph``.

    Parameters
    ----------
    graph_dir : str
        Directory containing the files listed above.
    graph_type : str
        ``"directed"`` builds an ``nx.DiGraph``; any other value builds an ``nx.Graph``.
    edge_type : str
        Currently unused; accepted for interface compatibility.
    add_isolates : bool
        If True, add every row position of the names table as a node so that
        nodes without edges are kept.
    top_cc : bool
        If True, keep only the largest connected component (largest *weakly*
        connected component for directed graphs).

    Returns
    -------
    dict
        ``labels``: list of ``(node_name, label)`` tuples;
        ``label_names``: list of label names;
        ``names``: list of node names;
        ``networkx_graph``: the (possibly restricted) networkx graph;
        ``geomstats_graph``: geomstats ``Graph`` built from the dense adjacency file;
        ``distances``: all-pairs shortest-path matrix from ``nx.floyd_warshall_numpy``.
    """
    # Specify paths
    adjacency_path = f"{graph_dir}/adjacency.tsv"
    dense_adjacency_path = f"{graph_dir.replace('/raw/', '/interim/')}/adjacency_dense.tsv"
    labels_path = f"{graph_dir}/labels.tsv"
    label_names_path = f"{graph_dir}/names_labels.tsv"
    names_path = f"{graph_dir}/names.tsv"

    # Adjacency matrix: (out_node, in_node)
    adjacency = pd.read_table(adjacency_path, header=None, usecols=[0, 1])

    # Labels: (label, )
    labels = pd.read_table(labels_path, header=None, usecols=[0])[0]

    # Label name: (label_name, )
    if os.path.exists(label_names_path):
        label_names = pd.read_table(label_names_path, header=None, usecols=[0])[0]
    else:
        # `labels` is already a Series, so take its unique values directly and
        # keep the result a Series (list(...) below must yield the values).
        label_names = pd.Series(labels.unique())

    # Node name: (node_name, )
    if os.path.exists(names_path):
        names = pd.read_table(names_path, header=None, usecols=[0])[0]
    else:
        # Keep this a Series: iterating a DataFrame would yield column labels,
        # which would corrupt the node relabelling below.
        names = pd.Series(np.arange(len(labels)))

    # Networkx object
    is_directed = graph_type == "directed"
    base_graph = nx.DiGraph if is_directed else nx.Graph
    networkx_graph = nx.from_pandas_edgelist(adjacency, source=0, target=1, create_using=base_graph)
    if add_isolates:
        networkx_graph.add_nodes_from(names.index)

    # Add names and labels to nodes
    nx.set_node_attributes(networkx_graph, dict(zip(names.index, labels)), "label")
    networkx_graph = nx.relabel_nodes(networkx_graph, dict(zip(names.index, names)))

    # Get connected components
    if top_cc:
        # nx.connected_components is undefined for directed graphs; use the
        # weakly connected variant there.
        find_components = nx.weakly_connected_components if is_directed else nx.connected_components
        networkx_graph = networkx_graph.subgraph(max(find_components(networkx_graph), key=len))

    # Pairwise distances
    distances = nx.floyd_warshall_numpy(networkx_graph)

    # Geomstats object
    dense_adjacency = nx.to_numpy_array(networkx_graph)
    os.makedirs(os.path.dirname(dense_adjacency_path), exist_ok=True)
    np.savetxt(dense_adjacency_path, dense_adjacency, fmt="%d", delimiter="\t")
    # NOTE(review): with top_cc=True the dense adjacency covers only the kept
    # component while labels_path still lists every node; confirm that Graph
    # tolerates the mismatch (the "labels" entry returned below is aligned).
    geomstats_graph = Graph(graph_matrix_path=dense_adjacency_path, labels_path=labels_path)

    return {
        "labels": list(networkx_graph.nodes(data="label")),
        "label_names": list(label_names),
        "names": list(networkx_graph.nodes()),
        "networkx_graph": networkx_graph,
        "geomstats_graph": geomstats_graph,
        "distances": distances,
    }


########################################################################################################################

# Undirected polblogs graph, restricted to its largest connected component
polblogs = load_graph(
    "data/raw/polblogs",
    graph_type="undirected",
    top_cc=True,
)

########################################################################################################################


def assess_embedding(embedding, true_dists=polblogs["distances"], dim=2, coords="ball"):
    """Score an embedding against graph distances.

    Pairwise distances between the rows of ``embedding`` are computed with the
    metric of ``Hyperbolic(dim, default_coords_type=coords)``. ``true_dists`` is
    capped at the largest embedded distance before both matrices are passed to
    ``distortion`` and ``map_score``.

    Note that the default for ``true_dists`` is evaluated once, when this
    function is defined, from the ``polblogs`` graph loaded above.

    Returns
    -------
    tuple
        ``(distortion_value, map_value)``.
    """
    hyperbolic_space = Hyperbolic(dim, default_coords_type=coords)
    embedded_dists = squareform(pdist(embedding, metric=hyperbolic_space.metric.dist))

    # Cap graph distances at the largest distance realised by the embedding
    largest_embedded = np.max(embedded_dists)
    capped_dists = np.clip(true_dists, a_min=None, a_max=largest_embedded)

    n_points = len(capped_dists)
    return (
        distortion(capped_dists, embedded_dists, n=n_points, jobs=-1),
        map_score(capped_dists, embedded_dists, n=n_points, jobs=-1),
    )

In [18]:
# Save labels for convenience

# (node_name, label) pairs -> frame indexed by node name (column 0)
labels = pd.DataFrame(polblogs["labels"]).set_index(0)

# Written without a header row; the index (node name) is the first CSV column
labels.to_csv("data/processed/polblogs_labels.csv", header=False)

In [7]:
# Sweep over embedding dimensions and tree depths: embed polblogs with
# geomstats, score the embedding (distortion / MAP), then compare Euclidean and
# hyperbolic decision trees with k-fold cross-validation on hyperboloid coordinates.

n_neg = 10
n_con = 10
lr = 1e-3
epochs = 100
folds = 5

depths = [1, 2, 3, 4, 5, 6, 7, 8]
# dims = [2, 4, 6, 8, 10, 12, 14, 16]
dims = [4, 6, 8]

# polblogs["labels"] holds (node_name, label) pairs; keep only the label
y = np.array([x[1] for x in polblogs["labels"]])
cv = list(KFold(n_splits=folds, shuffle=True, random_state=42).split(y))

output_dir = "data/processed/geomstats_embeddings/polblogs"
run_tag = f"neg{n_neg}_con{n_con}_lr{lr}_ep{epochs}"
os.makedirs(output_dir, exist_ok=True)  # np.savetxt / to_csv do not create directories

# Rows accumulate in a plain list for the whole sweep; `results` is the
# DataFrame view of it. Keeping the two separate matters: rebinding the list to
# a DataFrame inside the loop would break `.append` on the next dimension.
records = []
results = pd.DataFrame()
for embed_dim in dims:
    try:
        embed = HyperbolicEmbedding(dim=embed_dim, n_negative=n_neg, n_context=n_con, lr=lr, max_epochs=epochs)
        polblogs_gs_embed = embed.embed(polblogs["geomstats_graph"])
        print(f"Embedding shape: {polblogs_gs_embed.shape}")

    except Exception as e:
        print(f"Embed failed at {embed_dim}: {e}")
        continue

    # Save embedding
    np.savetxt(
        f"{output_dir}/dim{embed_dim}_{run_tag}.tsv",
        polblogs_gs_embed,
        delimiter="\t",
    )

    # Convert to hyperboloid
    polblogs_hyperboloid = convert(polblogs_gs_embed, "poincare", "hyperboloid")

    # Embedding scores
    try:
        # Use distinct names: assigning to `distortion` / `map_score` would
        # shadow the imported scoring functions that assess_embedding calls.
        # Pass the actual dimension so the manifold matches the embedding.
        distortion_val, map_val = assess_embedding(polblogs_gs_embed, dim=embed_dim)
    except Exception as e:
        print(f"Distortion failed at {embed_dim}: {e}")
        continue

    for depth in depths:
        try:
            dt_scores = []
            hdt_scores = []
            for train, test in cv:
                dt = DecisionTreeClassifier(max_depth=depth)
                dt.fit(polblogs_hyperboloid[train], y[train])
                dt_scores.append(dt.score(polblogs_hyperboloid[test], y[test]))

                hdt = HyperbolicDecisionTreeClassifier(max_depth=depth)
                hdt.fit(polblogs_hyperboloid[train], y[train])
                hdt_scores.append(hdt.score(polblogs_hyperboloid[test], y[test]))
            records.append(
                {
                    "Dim": embed_dim,
                    "Depth": depth,
                    "Score_DT": np.mean(dt_scores),
                    "Std_DT": np.std(dt_scores),
                    "Score_HDT": np.mean(hdt_scores),
                    "Std_HDT": np.std(hdt_scores),
                    "Distortion": distortion_val,
                    "MAP": map_val,
                }
            )
        except Exception as e:
            print(f"DT failed at {embed_dim}, {depth}: {e}")
            continue

    # Checkpoint: rewrite the full results table after each embedding dimension
    results = pd.DataFrame(records)
    results.to_csv(
        f"{output_dir}/results_{run_tag}.tsv",
        sep="\t",
        index=False,
    )

INFO: Number of edges: 1222
INFO: Mean vertices by edges: 27.357610474631752
INFO: iteration 0 loss_value 2.688718
INFO: iteration 1 loss_value 2.637627
INFO: iteration 2 loss_value 2.614299
INFO: iteration 3 loss_value 2.585342


KeyboardInterrupt: 

In [None]:
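# Instead of the geomstats embeddings above, evaluate on the precomputed hSVM embeddings (.mat files under hyplinear/data/realnet)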

# Use micro-F1 score
from sklearn.metrics import f1_score

# Cross-validate each classifier on every dataset, pooling the fold scores of
# all five embedding runs into a single mean +/- std per (classifier, dataset).
classifiers = [
    HyperbolicDecisionTreeClassifier(weights="balanced"),
    HyperbolicRandomForestClassifier(n_estimators=100, weights="balanced"),
]
datasets = ["football", "karate", "polblogs", "polbooks"]

for clf in classifiers:
    for dataset in datasets:
        scores = []
        for run in range(1, 6):
            data = loadmat(f"hyplinear/data/realnet/{dataset}_data_{run}.mat")
            hyperboloid = convert(data["B"], "poincare", "hyperboloid")
            # Re-encode the raw labels as integer codes (inverse indices of np.unique)
            labels = np.unique(data["label"].ravel(), return_inverse=True)[1]
            fold_scores = cross_val_score(clf, hyperboloid, labels, cv=StratifiedKFold(5), scoring="f1_micro")
            scores.extend(fold_scores)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print(f"{clf} \t{dataset}:\t{mean_score:.4f} +/- {std_score:.4f}")
    print()

HyperbolicDecisionTreeClassifier() 	football:	0.3478 +/- 0.0778
HyperbolicDecisionTreeClassifier() 	karate:	0.9314 +/- 0.0714
HyperbolicDecisionTreeClassifier() 	polblogs:	0.9147 +/- 0.0163
HyperbolicDecisionTreeClassifier() 	polbooks:	0.8210 +/- 0.0811

HyperbolicRandomForestClassifier() 	football:	0.3443 +/- 0.0684
HyperbolicRandomForestClassifier() 	karate:	0.9371 +/- 0.0709
HyperbolicRandomForestClassifier() 	polblogs:	0.9222 +/- 0.0134
HyperbolicRandomForestClassifier() 	polbooks:	0.7981 +/- 0.0926

