# Load librabies

In [1]:
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch.optim as optim
import dgl
import random
import networkx as nx
import numpy as np
import scipy.sparse as sp
import itertools

  from .autonotebook import tqdm as notebook_tqdm


# Load Data


In [2]:
# Load the graph and the deleted edge information from the file
with open("../../graphs_and_deleted_edges.pickle", "rb") as f:
    all_graphs = pickle.load(f)

In [3]:
print(all_graphs)

Graph(num_nodes=3, num_edges=12,
      ndata_schemes={'node_id': Scheme(shape=(), dtype=torch.int64), 'Up': Scheme(shape=(1,), dtype=torch.float32), 'Down': Scheme(shape=(1,), dtype=torch.float32), 'Right': Scheme(shape=(1,), dtype=torch.float32), 'Left': Scheme(shape=(1,), dtype=torch.float32), 'Out': Scheme(shape=(1,), dtype=torch.float32), 'In': Scheme(shape=(1,), dtype=torch.float32)}
      edata_schemes={'src_port': Scheme(shape=(), dtype=torch.int64), 'dst_port': Scheme(shape=(), dtype=torch.int64), 'edge_type': Scheme(shape=(), dtype=torch.int64)})


In [4]:
# Split edge set for training and testing
g = all_graphs
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = 1
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]


In [5]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [6]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [7]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [8]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [9]:
model = GraphSAGE(1, 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [10]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(2_000):
    # forward
    h = model(train_g, train_g.ndata['Up'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))


# Thumbnail credits: Link Prediction with Neo4j, Mark Needham
# sphinx_gallery_thumbnail_path = '_static/blitz_4_link_predict.png'

In epoch 0, loss: 3398269.75
In epoch 5, loss: 698297.9375
In epoch 10, loss: 139072.03125
In epoch 15, loss: 72298.734375
In epoch 20, loss: 26253.185546875
In epoch 25, loss: 42183.68359375
In epoch 30, loss: 11491.306640625
In epoch 35, loss: 14485.5341796875
In epoch 40, loss: 4355.78955078125
In epoch 45, loss: 4372.287109375
In epoch 50, loss: 17022.5546875
In epoch 55, loss: 10105.083984375
In epoch 60, loss: 17702.01953125
In epoch 65, loss: 9411.9287109375
In epoch 70, loss: 13864.666015625
In epoch 75, loss: 9938.47265625
In epoch 80, loss: 4470.14404296875
In epoch 85, loss: 7918.541015625
In epoch 90, loss: 7635.53076171875
In epoch 95, loss: 4985.84228515625
In epoch 100, loss: 15267.671875
In epoch 105, loss: 5373.400390625
In epoch 110, loss: 4500.8779296875
In epoch 115, loss: 4969.04833984375
In epoch 120, loss: 6060.79638671875
In epoch 125, loss: 1177.4405517578125
In epoch 130, loss: 1996.7987060546875
In epoch 135, loss: 10155.92578125
In epoch 140, loss: 10220.992

In [11]:
train_g.edata

{'src_port': tensor([3, 2, 4, 4, 0, 0, 5, 3, 2, 1, 1]), 'dst_port': tensor([3, 2, 4, 4, 0, 0, 5, 5, 5, 1, 1]), 'edge_type': tensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1])}

In [20]:
# Make a prediction on the first node pair.
u, v = train_g.edges()
print('Edge features of the first edge:', train_g.edata[0])
print('Prediction score of the first edge:', pred(train_g, h)[0])


KeyError: 0

# Prepare training and testing sets

In [None]:
# store the graph and the deleted edge information in a list
graphs = []
target = []


# Get all edges 
all_eids = all_graphs.edges()

# Convert the tensor to a list for easier iteration
all_eids = list(all_eids)

# Loop through all edge ids
for eid in range(len(all_eids[0])):
    # Get the source and destination node id
    u = all_eids[0][eid]
    v = all_eids[1][eid]
    # Append the edge to the deleted edges list
    target.append((u, v, all_graphs.edges[u, v].data))
    # Remove the edge from the graph
    all_graphs = dgl.remove_edges(all_graphs, eid)                     
    # Append the graph to the list
    graphs.append(all_graphs)
    # Add the edge back to the graph
    all_graphs = dgl.add_edges(all_graphs, u, v)

In [None]:
target[:3]

[(tensor(1),
  tensor(0),
  {'src_port': tensor([2]), 'dst_port': tensor([2]), 'edge_type': tensor([0])}),
 (tensor(1),
  tensor(0),
  {'src_port': tensor([4]), 'dst_port': tensor([4]), 'edge_type': tensor([1])}),
 (tensor(0),
  tensor(2),
  {'src_port': tensor([0]), 'dst_port': tensor([0]), 'edge_type': tensor([0])})]

# The model

In [None]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [None]:
# Define the GCN message passing function
class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, inputs):
        # Compute the node representations by message passing
        g.ndata['h'] = inputs
        g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h'))
        h = g.ndata.pop('h')
        return self.linear(h)

# Define the GCN model
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GCN, self).__init__()
        self.gcn1 = GCNLayer(in_feats, hidden_size)
        self.gcn2 = GCNLayer(hidden_size, num_classes)

    def forward(self, g, inputs):
        h = self.gcn1(g, inputs)
        h = torch.relu(h)
        h = self.gcn2(g, h)
        return h

In [None]:
# Define the model, loss function, and optimizer
model = GCN()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of training epochs
epochs = 100

for epoch in range(epochs):
    # Zero the gradients
    optimizer.zero_grad()
    
    # Forward pass
    outputs = model(features, adjacency_matrix)
    
    # Compute the loss
    loss = criterion(outputs, labels)
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    # Print the loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

In [None]:
# Split the data into training and validation sets
train_graphs = []
train_deleted_edges = []
val_graphs = []
val_deleted_edges = []

for graph, deleted_edge in zip(all_graphs, all_deleted_edges):
    if random.random() < 0.8:
        train_graphs.append(graph)
        train_deleted_edges.append(deleted_edge)
    else:
        val_graphs.append(graph)
        val_deleted_edges.append(deleted_edge)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define your GCN model
class GCN(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super(GCN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features)
        )
    
    def forward(self, x, adj):
        x = self.layers(x)
        x = torch.mm(adj, x)
        return x

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# in_features is the number of features of each node
in_features = graphs[0].number_of_nodes()
# hidden_features is the number of features of the hidden layer
hidden_features = 16
# out_features is the number of features of the output
out_features = 1

# Define the adjacency matrices
# Convert the DGL heterographs to NetworkX graphs
nx_graphs = [g.to_networkx() for g in graphs]

# Define the adjacency matrices
adjacency_matrices = [torch.tensor(nx.adjacency_matrix(g).todense()) for g in nx_graphs]
node_features = [torch.tensor(g.ndata) for g in graphs]


# Train the GCN model
model = GCN(in_features, hidden_features, out_features)
model.train()
num_epochs = 100
for epoch in range(num_epochs):
    # Zero the gradients
    optimizer.zero_grad()
    
    # Forward pass
    outputs = [model(g, adj) for g, adj in zip(graphs, adjacency_matrices)]
    outputs = torch.cat(outputs, dim=0)
    loss = criterion(outputs, target)
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    # Print the loss every 10 epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

# Evaluation
model.eval()
with torch.no_grad():
    outputs = [model(g, adj) for g, adj in zip(graphs, adjacency_matrices)]
    outputs = torch.cat(outputs, dim=0)
    loss = criterion(outputs, target)
    print(f'Final loss: {loss.item()}')


ValueError: could not determine the shape of object type 'HeteroNodeDataView'

In [None]:
print(len(train_graphs), len(train_deleted_edges))

11 11


# Training