In [7]:
from torch_geometric.data import Data
from torch_geometric.nn.aggr import MaxAggregation
import torch_geometric
import pandas as pd
from torch_geometric.nn import Linear, MessagePassing
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.sampler import NegativeSampling, NeighborSampler
import torch.nn.functional as F
import torch
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, f1_score

In [8]:
class GraphSAGE(MessagePassing):
    def __init__(self, input_dim=1, output_dim=128):
        super().__init__(aggr='max')

        self.output_dim = output_dim

        self.weight1 = Linear(input_dim + 1024, output_dim, bias=True, weight_initializer="glorot")
        self.pool_weight1 = Linear(input_dim, 1024, bias=True, weight_initializer="glorot")

    def infer(self, node_features, adj_list):
        num_nodes = node_features.size(0)
        h = node_features
        h_out = torch.rand((num_nodes, self.output_dim))
        for v in range(num_nodes):
            if(len(adj_list[v]) > 0):
                h_neighborhood = torch.max(self.pool_weight1(h[adj_list[v]]), dim=0).values
                h_out[v] = F.relu(self.weight1(torch.cat((h[v], h_neighborhood))))
        h_out = F.normalize(h_out, dim=1)
        return h_out

    def forward(self, batch):
        x = batch.x
        edge_index = batch.edge_index
        h_neighborhood = self.propagate(edge_index[:, batch.src_index], x=x)
        x = F.relu(self.weight1(torch.cat((x, h_neighborhood), dim=1)))
        x = F.normalize(x, dim=1)
        return x

    def message(self, x_i, x_j):
        return F.relu(self.pool_weight1(x_j))
    
def compute_loss(Z, Z_pos, Z_neg):
    eps = 1e-9
    dot = torch.sum(Z * Z_pos, dim=1)
    term1 = -torch.log(torch.sigmoid(dot)+eps)
    term2 = 0
    for i in range(Z_neg.size(1)):
        term2 -= torch.log(torch.sigmoid(-torch.sum(Z * Z_neg[:, i, :], dim=1))+eps)
    return torch.mean(term1 + term2)

In [9]:
path = './Data/BlogCatalog-dataset/data/edges.csv'
edge_list = pd.read_csv(path, header=None).to_numpy()
num_nodes = 10313
embedding_size = 128
batch_size = 512
epochs = 10

node_features = torch.zeros((num_nodes, 1))
adj_list = [[] for _ in range(num_nodes)]

for edge in edge_list:
    edge[0] = int(edge[0])
    edge[1] = int(edge[1])
    node_features[edge[0]] += 1
    node_features[edge[1]] += 1
    adj_list[edge[0]].append(edge[1])
    adj_list[edge[1]].append(edge[0])

Z = torch.zeros((num_nodes, embedding_size))
edge_index=torch.tensor(edge_list, dtype=torch.long).t().contiguous()

data = Data(x=node_features, edge_index=edge_index, num_nodes=num_nodes)

ns = NegativeSampling(mode="triplet", amount=5)
loader = LinkNeighborLoader(data, num_neighbors=[25], batch_size=batch_size, shuffle=True, neg_sampling=ns, subgraph_type="bidirectional")

model = GraphSAGE(output_dim=embedding_size)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for _ in range(epochs):
    for i, batch in enumerate(loader):
        if batch.src_index.size(0) < batch_size:
            break
        z_tot = model(batch)
        z = z_tot[:batch_size]

        optimizer.zero_grad()
        
        positive_indices = batch.dst_pos_index
        negative_indices = batch.dst_neg_index

        z_pos = z_tot[positive_indices]
        z_neg = z_tot[negative_indices]

        loss = compute_loss(z, z_pos, z_neg)
        
        loss.backward()
        optimizer.step()

        print(f"Iteration {i}, loss: {loss}")

Iteration 0, loss: 6.321434020996094
Iteration 1, loss: 6.251202583312988
Iteration 2, loss: 6.214101314544678
Iteration 3, loss: 6.106999397277832
Iteration 4, loss: 6.142408847808838
Iteration 5, loss: 6.124651908874512
Iteration 6, loss: 6.105917453765869
Iteration 7, loss: 6.122644424438477
Iteration 8, loss: 6.114306926727295
Iteration 9, loss: 6.0928955078125
Iteration 10, loss: 6.098301887512207
Iteration 11, loss: 6.103994846343994
Iteration 12, loss: 6.0083160400390625
Iteration 13, loss: 6.038002967834473
Iteration 14, loss: 6.073243618011475
Iteration 15, loss: 6.037333011627197
Iteration 16, loss: 6.124039173126221
Iteration 17, loss: 6.043212890625
Iteration 18, loss: 6.102758407592773
Iteration 19, loss: 6.034023284912109
Iteration 20, loss: 5.952716827392578
Iteration 21, loss: 6.048224449157715
Iteration 22, loss: 6.03082799911499
Iteration 23, loss: 5.9785237312316895
Iteration 24, loss: 5.979532718658447
Iteration 25, loss: 5.938207149505615
Iteration 26, loss: 5.9318

In [10]:
model.eval()

with torch.no_grad():
    x = model.infer(node_features, adj_list)

    groups_path = "./Data/BlogCatalog-dataset/data/group-edges.csv"
    groups = torch.tensor(pd.read_csv(groups_path).to_numpy(), dtype=torch.long)
    y = torch.zeros((num_nodes, 1), dtype=torch.long)
    for (node, group) in groups:
        y[node] = group

    logreg = LogisticRegression(solver='liblinear')
    ovr = OneVsRestClassifier(logreg)
    kf = KFold(n_splits=5)

    micro_f1 = make_scorer(f1_score, average='micro')
    micro_scores = cross_val_score(ovr, x, y, cv=kf, scoring=micro_f1)

    macro_f1 = make_scorer(f1_score, average='macro')
    macro_scores = cross_val_score(ovr, x, y, cv=kf, scoring=macro_f1)

    print("Micro F1 score for each fold: ", micro_scores*100)
    print("Macro F1 score for each fold: ", macro_scores*100)

    print("Mean Micro F1 score: ", np.mean(micro_scores)*100)
    print("Mean Macro F1 score: ", np.mean(macro_scores)*100)

Micro F1 score for each fold:  [13.52399418 14.68734852 10.22782356  6.35305529  6.64403492]
Macro F1 score for each fold:  [0.70871359 0.79056314 0.54921231 0.30773929 0.32834819]
Mean Micro F1 score:  10.287251293281988
Mean Macro F1 score:  0.5369153043294947
