In [1]:
import pathpyG as pp

import torch
from pathpyG.nn.dbgnn import DBGNN
from torch_geometric.transforms import RandomNodeSplit
import torch_geometric
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

# Run everything on the CPU and keep the chosen device in one place for later `.to(device)` calls.
pp.config['torch']['device'] = 'cpu'
device = pp.config['torch']['device']

# Seed the global random number generators so that Restart & Run All reproduces the
# timestamp shuffle, the node split, the weight initialisation and dropout (all torch).
# NumPy is seeded as well because scikit-learn estimators created without `random_state`
# (the TSNE calls below) draw from NumPy's global generator.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Load the synthetic dataset

In [2]:
# Read the temporal network from the `.tedges` file in ../data
t = pp.TemporalGraph.from_csv('../data/temporal_clusters.tedges')

In [3]:
# One colour per node: three consecutive groups of ten nodes each (green, red, blue)
node_colors = [colour for colour in ('green', 'red', 'blue') for _ in range(10)]

In [4]:
# Plot options handed to `pp.plot` as keyword arguments
style = {'node_color': node_colors}

In [5]:
# Draw the temporal network with the style options defined above
# (the trailing ';' suppresses the textual repr of the return value)
pp.plot(t, **style);

In [6]:
# Read the path data from the `.ngram` file in ../data
paths_original = pp.PathData.from_csv('../data/temporal_clusters.ngram')
# Show the summary line followed by the number of nodes, one item per line
print(paths_original, paths_original.num_nodes, sep='\n')

PathData with 7460 walks and 0 dags and total weight 29042
30


In [7]:
# Calculate paths, step 1: turn the temporal network into an event DAG.
# NOTE(review): delta=1 is presumably the maximum time difference between two
# consecutive edges of a time-respecting path — confirm against the pathpyG docs.
dag = pp.algorithms.temporal_graph_to_event_dag(t, delta=1)
print(dag)

Graph with 89032 nodes and 60000 edges

Node attributes
	node_id		<class 'list'>
	node_name		<class 'list'>
	node_idx		<class 'list'>

Edge attributes
	edge_ts		<class 'torch.Tensor'> -> torch.Size([60000])

Graph attributes
	num_nodes		<class 'int'>



In [8]:
# Calculate paths, step 2: extract the path data from the event DAG
paths = pp.PathData.from_temporal_dag(dag)
print(paths)

PathData with 29032 walks and 0 dags and total weight 29032


In [11]:
# Count how many distinct node sequences occur among the extracted walks
unique_node_sequences = {
    tuple(pp.PathData.walk_to_node_seq(walk).tolist())
    for walk in paths.paths.values()
}
print(len(unique_node_sequences))

7078


In [9]:
# Create the first-order graph (order=1) corresponding to the extracted paths
g = pp.HigherOrderGraph(paths, order=1)

# Plot the time-aggregated network (first-order graph)
pp.plot(g);

In [10]:
# Create the second-order graph (order=2) corresponding to the extracted paths
g2 = pp.HigherOrderGraph(paths, order=2)

# Plot the second-order graph
pp.plot(g2);

In [15]:
# Load the same edge list a second time and give its edges the timestamps of `t`
# in a random order; only the 't' attribute is replaced.
t_shuffled = pp.TemporalGraph.from_csv('../data/temporal_clusters.tedges')
num_timestamps = len(t_shuffled.data['t'])
shuffled_order = torch.randperm(num_timestamps)
t_shuffled.data['t'] = t.data['t'][shuffled_order]

In [16]:
# Timestamps of the original temporal network, for comparison with the shuffled ones
t.data.t

tensor([    0,     1,     2,  ..., 59997, 59998, 59999])

In [18]:
# Timestamps after the random permutation
t_shuffled.data.t

tensor([30587, 15778, 16467,  ..., 11520, 29890, 56463])

In [25]:
# Calculate paths for the shuffled network: build its event DAG with the same delta as before
dag_shuffled = pp.algorithms.temporal_graph_to_event_dag(t_shuffled, delta=1)
print(dag_shuffled)

Graph with 118038 nodes and 60000 edges

Node attributes
	node_id		<class 'list'>
	node_idx		<class 'list'>
	node_name		<class 'list'>

Edge attributes
	edge_ts		<class 'torch.Tensor'> -> torch.Size([60000])

Graph attributes
	num_nodes		<class 'int'>



In [26]:
# Extract the path data of the shuffled network from its event DAG
paths_shuffled = pp.PathData.from_temporal_dag(dag_shuffled)
# Show the summary line followed by the number of nodes, one item per line
print(paths_shuffled, paths_shuffled.num_nodes, sep='\n')

PathData with 58038 walks and 0 dags and total weight 58038
30


In [29]:
# Create the second-order graph corresponding to the paths of the shuffled network
g2_shuffled = pp.HigherOrderGraph(paths_shuffled, order=2)
print(g2_shuffled)

# Plot the second-order graph of the shuffled network
pp.plot(g2_shuffled);

HigherOrderGraph (k=2) with 849 nodes and 1871 edges
	Total edge weight = 1962.0
Edge attributes
	edge_weight		<class 'torch.Tensor'> -> torch.Size([1871])

Graph attributes
	node_id		<class 'list'>
	num_nodes		<class 'int'>



# Prepare the data

In [None]:
# Edge indices of the first-order graph `g` and of the second-order graph `g2`
edge_index_g1, edge_index_g2 = g.data.edge_index, g2.data.edge_index

In [None]:
# Edge weights of the first-order graph `g` and of the second-order graph `g2`
edge_weights, edge_weights_higher_order = (
    g.data['edge_weight'],
    g2.data['edge_weight'],
)

In [None]:
# Define bipartite mapping
import torch

def generate_bipatite_edge_index(mapping = 'last', graph = None):
    """Build the bipartite edge index that links higher-order nodes to first-order nodes.

    Row 0 of the result holds the indices of the higher-order nodes (the keys of
    ``graph.node_index_to_id``); row 1 holds, for each of them, one entry of its
    node id (the matching value of ``graph.node_index_to_id``).

    Args:
        mapping: ``'last'`` links every higher-order node to element ``[1]`` of its id,
            ``'first'`` links it to element ``[0]``. Any other value links it to both
            (all ``[0]`` links first, followed by all ``[1]`` links).
        graph: object exposing a ``node_index_to_id`` dict. Defaults to the
            notebook-level second-order graph ``g2``, so existing calls are unchanged.

    Returns:
        ``torch.Tensor`` with two rows as described above.
    """
    if graph is None:
        # Backward-compatible default: the second-order graph built earlier in the notebook
        graph = g2

    ho_indices = list(graph.node_index_to_id.keys())
    ho_ids = list(graph.node_index_to_id.values())
    first_nodes = [node_id[0] for node_id in ho_ids]
    last_nodes = [node_id[1] for node_id in ho_ids]

    if mapping == 'last':
        sources, targets = ho_indices, last_nodes
    elif mapping == 'first':
        sources, targets = ho_indices, first_nodes
    else:
        # Every other value keeps the original fallback: link to both endpoints
        sources, targets = ho_indices + ho_indices, first_nodes + last_nodes

    return torch.tensor([sources, targets])


# The original DBGNN implementation uses mapping='last'
bipatite_edge_index = generate_bipatite_edge_index(mapping='last')

In [None]:
# Define the PyG data object
from torch_geometric.data import Data

# Node counts are taken from the largest index occurring in each edge index
# (+1 because indexing starts from 0). One tensor-level `.max()` covers both rows
# and avoids iterating the tensors element by element in Python.
# NOTE(review): a node that has the highest index but no incident edge would not be
# counted here — confirm that every node appears in at least one edge.
num_nodes = g.data['edge_index'].max().item() + 1
num_ho_nodes = g2.data['edge_index'].max().item() + 1

data = Data(
    num_nodes = num_nodes,
    num_ho_nodes = num_ho_nodes,
    # One-hot features for first-order and higher-order nodes
    x = torch.eye(num_nodes, num_nodes),
    x_h = torch.eye(num_ho_nodes, num_ho_nodes),
    edge_index = edge_index_g1,
    edge_index_higher_order = edge_index_g2,
    edge_weights = edge_weights.float(),
    edge_weights_higher_order = edge_weights_higher_order.float(),
    bipartite_edge_index = bipatite_edge_index,
    # Class label = node id // 10, i.e. ten consecutive node ids share one class
    y = torch.tensor([ int(i) // 10 for i in paths.node_id])
)

# DBGNN 

In [None]:
from sklearn.metrics import balanced_accuracy_score

def test(model, data):
    model.eval()

    _, pred = model(data).max(dim=1)

    metrics_train = balanced_accuracy_score(
        data.y[data.train_mask].cpu(),
        pred[data.train_mask].cpu().numpy()
        )

    metrics_test = balanced_accuracy_score(
        data.y[data.test_mask].cpu(),
        pred[data.test_mask].cpu().numpy()
        )

    return metrics_train, metrics_test

In [None]:
# Random node split (num_val=0, num_test=0.3), then move the data to the target device
node_split = RandomNodeSplit(num_val=0, num_test=0.3)
data = node_split(data).to(device)

# DBGNN sized to the one-hot inputs of the first- and higher-order nodes
dbgnn_settings = dict(
    num_features=[num_nodes, num_ho_nodes],
    num_classes=len(data.y.unique()),
    hidden_dims=[16, 32, 8],
    p_dropout=0.4,
)
model = DBGNN(**dbgnn_settings).to(device)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
# Inspect the data object after the node split and the device transfer
data

In [None]:
# Show the layer structure of the model
print(model)

In [None]:
# Train for 1000 epochs on the training nodes and report balanced accuracy every 10 epochs.
losses = []
for epoch in range(1000):
    # `test` switches the model to eval mode; re-enable training mode every epoch
    # so that dropout stays active during optimisation.
    model.train()
    optimizer.zero_grad()

    output = model(data)
    loss = loss_function(output[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    # Store a plain float: appending the tensor would keep a graph-attached
    # tensor alive for each of the 1000 epochs.
    losses.append(loss.item())

    if epoch % 10 == 0:
        train_ba, test_ba = test(model, data)
        print(f'Epoch: {epoch}, Loss: {loss}, Train balanced accuracy: {train_ba}, Test balanced accuracy: {test_ba}')

# Latent space representation of edges

In [None]:
# Look up which node id the second-order graph stores at index 0
g2.node_index_to_id[0]

In [None]:
model.eval()
# Output of the first higher-order layer for the one-hot higher-order features,
# projected to 2D with t-SNE. The layer is called as a module rather than via `.forward`.
latent = model.higher_order_layers[0](data.x_h, data.edge_index_higher_order).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(latent.cpu())

# A second-order node (v, w) gets the class colour only if both endpoints share the class
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = []
for v, w in g2.nodes:
    label_v, label_w = int(data.y[v]), int(data.y[w])
    colors.append(class_colors.get(label_v, 'grey') if label_v == label_w else 'grey')

plt.figure(figsize=(13,10))
plt.scatter(node_embedding[:,0], node_embedding[:,1], c=colors, alpha=0.5)

# `src`/`dst` instead of `s`/`t`: assigning to `t` would overwrite the TemporalGraph
# `t` loaded at the top of the notebook and break re-running the earlier cells.
for e in g2.edges:
    src = g2.node_id_to_index[e[0]]
    dst = g2.node_id_to_index[e[1]]
    plt.plot([node_embedding[src,0], node_embedding[dst,0]], [node_embedding[src,1], node_embedding[dst,1]],
             color='lightsteelblue',
             linestyle='-',
             alpha=0.2,
             lw=0.2)
plt.axis('off')
plt.show()

In [None]:
model.eval()
# Output of the second higher-order layer, projected to 2D with t-SNE.
# The first layer's output is recomputed here rather than read from a `latent`
# left behind by another cell, so running this cell twice gives the same input.
# The intermediate tensor stays on the model's device (no `.cpu()` before the layer).
latent_first = model.higher_order_layers[0](data.x_h, data.edge_index_higher_order).detach()
latent = model.higher_order_layers[1](latent_first, data.edge_index_higher_order).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(latent.cpu())

# A second-order node (v, w) gets the class colour only if both endpoints share the class
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = []
for v, w in g2.nodes:
    label_v, label_w = int(data.y[v]), int(data.y[w])
    colors.append(class_colors.get(label_v, 'grey') if label_v == label_w else 'grey')

plt.figure(figsize=(13,10))
plt.scatter(node_embedding[:,0], node_embedding[:,1], c=colors, alpha=0.5)

# `src`/`dst` instead of `s`/`t`: assigning to `t` would overwrite the TemporalGraph
# `t` loaded at the top of the notebook and break re-running the earlier cells.
for e in g2.edges:
    src = g2.node_id_to_index[e[0]]
    dst = g2.node_id_to_index[e[1]]
    plt.plot([node_embedding[src,0], node_embedding[dst,0]], [node_embedding[src,1], node_embedding[dst,1]],
             color='lightsteelblue',
             linestyle='-',
             alpha=0.2,
             lw=0.2)
plt.axis('off')
plt.show()

# Latent space representation of nodes

In [None]:
model.eval()
# Final model output for every first-order node, projected to 2D with t-SNE.
# The model is called as a module rather than via `.forward`.
latent = model(data).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=10).fit_transform(latent.cpu())

# Colour every node by its class label; unknown labels fall back to grey
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = [class_colors.get(int(data.y[v]), 'grey') for v in g.nodes]

plt.figure(figsize=(13,10))
plt.scatter(node_embedding[:,0], node_embedding[:,1], c=colors, alpha=0.5)

# `src`/`dst` instead of `s`/`t`: assigning to `t` would overwrite the TemporalGraph
# `t` loaded at the top of the notebook and break re-running the earlier cells.
# NOTE(review): edge endpoints are used directly as row indices here, whereas the
# second-order plots go through `node_id_to_index` — confirm ids equal indices for `g`.
for e in g.edges:
    src = e[0]
    dst = e[1]
    plt.plot([node_embedding[src,0], node_embedding[dst,0]], [node_embedding[src,1], node_embedding[dst,1]],
             color='lightsteelblue',
             linestyle='-',
             alpha=0.2,
             lw=0.2)
plt.axis('off')
plt.show()