In [5]:
from torch_geometric.data import Data
from torch_geometric.nn.aggr import MaxAggregation
import torch_geometric
import pandas as pd
from torch_geometric.nn import Linear, MessagePassing
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.sampler import NegativeSampling, NeighborSampler
import torch.nn.functional as F
import torch
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, f1_score

In [6]:
from load_data import *
dataset_name = "BlogCatalog"
data_dir = "./Data/" + dataset_name

#total_graph = load_geometric_dataset(dataset_name)
#total_graph = load_reddit(data_dir)
#total_graph = load_youtube(data_dir)
#total_graph = load_flickr(data_dir)
total_graph = load_blogcatalog(data_dir)
#total_graph = load_cora(data_dir)
#total_graph = load_pubmed(data_dir)
print(total_graph['Multioutput'])
print(total_graph['N_nodes'], total_graph['N_edges'])

True
10312 333983


In [7]:
class GraphSAGE(MessagePassing):
    def __init__(self, input_dim=1, output_dim=128):
        super().__init__(aggr='max')

        self.output_dim = output_dim

        self.weight1 = Linear(input_dim + 1024, output_dim, bias=True, weight_initializer="glorot")
        self.pool_weight1 = Linear(input_dim, 1024, bias=True, weight_initializer="glorot")

    def infer(self, node_features, adj_list):
        num_nodes = node_features.size(0)
        h = node_features
        h_out = torch.rand((num_nodes, self.output_dim))
        for v in range(num_nodes):
            if(len(adj_list[v]) > 0):
                h_neighborhood = torch.max(self.pool_weight1(h[adj_list[v]]), dim=0).values
                h_out[v] = F.relu(self.weight1(torch.cat((h[v], h_neighborhood))))
        h_out = F.normalize(h_out, dim=1)
        return h_out

    def forward(self, batch):
        x = batch.x
        edge_index = batch.edge_index
        h_neighborhood = self.propagate(edge_index[:, batch.src_index], x=x)
        x = F.relu(self.weight1(torch.cat((x, h_neighborhood), dim=1)))
        x = F.normalize(x, dim=1)
        return x

    def message(self, x_i, x_j):
        return F.relu(self.pool_weight1(x_j))
    
def compute_loss(Z, Z_pos, Z_neg):
    eps = 1e-9
    dot = torch.sum(Z * Z_pos, dim=1)
    term1 = -torch.log(torch.sigmoid(dot)+eps)
    term2 = 0
    for i in range(Z_neg.size(1)):
        term2 -= torch.log(torch.sigmoid(-torch.sum(Z * Z_neg[:, i, :], dim=1))+eps)
    return torch.mean(term1 + term2)

In [8]:
edge_list = total_graph["edges_list"] 
num_nodes = total_graph["N_nodes"]
adj_list = total_graph["edges"]
embedding_size = 128
batch_size = 512
epochs = 1
nb_size = 25

node_features = torch.zeros((num_nodes, 1))

for edge in edge_list:
    node_features[edge[0]-1] += 1
    node_features[edge[1]-1] += 1

edge_index=torch.tensor(edge_list, dtype=torch.long).t().contiguous()

data = Data(x=node_features, edge_index=edge_index, num_nodes=num_nodes)

ns = NegativeSampling(mode="triplet", amount=5)
loader = LinkNeighborLoader(data, num_neighbors=[nb_size], batch_size=batch_size, shuffle=True, neg_sampling=ns, subgraph_type="bidirectional")

model = GraphSAGE(output_dim=embedding_size)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for _ in range(epochs):
    for i, batch in enumerate(loader):
        if batch.src_index.size(0) < batch_size:
            break
        z_tot = model(batch)
        z = z_tot[:batch_size]

        optimizer.zero_grad()
        
        positive_indices = batch.dst_pos_index
        negative_indices = batch.dst_neg_index

        z_pos = z_tot[positive_indices]
        z_neg = z_tot[negative_indices]

        loss = compute_loss(z, z_pos, z_neg)
        
        loss.backward()
        optimizer.step()

        print(f"Iteration {i}, loss: {loss}")

Iteration 0, loss: 6.738674640655518
Iteration 1, loss: 6.611703872680664
Iteration 2, loss: 6.602439880371094
Iteration 3, loss: 6.566192626953125
Iteration 4, loss: 6.601250648498535
Iteration 5, loss: 6.595719337463379
Iteration 6, loss: 6.556314945220947
Iteration 7, loss: 6.499269962310791
Iteration 8, loss: 6.593091011047363
Iteration 9, loss: 6.501219749450684
Iteration 10, loss: 6.474925518035889
Iteration 11, loss: 6.480581283569336
Iteration 12, loss: 6.484546661376953
Iteration 13, loss: 6.418727874755859
Iteration 14, loss: 6.4392194747924805
Iteration 15, loss: 6.41502571105957
Iteration 16, loss: 6.419413089752197
Iteration 17, loss: 6.432174205780029
Iteration 18, loss: 6.516456604003906
Iteration 19, loss: 6.476301670074463
Iteration 20, loss: 6.4725446701049805
Iteration 21, loss: 6.445981025695801
Iteration 22, loss: 6.433075904846191
Iteration 23, loss: 6.448967933654785
Iteration 24, loss: 6.377514362335205
Iteration 25, loss: 6.39747953414917
Iteration 26, loss: 6.

In [9]:
## Create 5-fold validation set for NC

import utils

NC_5folds = {}
kf = KFold(n_splits=5, shuffle=True)
nodes = np.array([i for i in range(total_graph['N_nodes'])])
for i, (train_index, test_index) in enumerate(kf.split(nodes)):  
    NC_5folds[i] = {"train":nodes[train_index], "test":nodes[test_index]}


reverse_fraction = 0
LP_test_X_unb, LP_test_Y_unb, training_graph_unbalanced, test_graph_unbalanced = utils.split_graphs(total_graph, directed=True)
LP_test_X, LP_test_Y = utils.balance_test_graph(total_graph, LP_test_X_unb, LP_test_Y_unb, test_graph_unbalanced, directed=True, reverse_fraction=reverse_fraction)
LP_train_X, LP_train_Y = utils.balance_training_graph(training_graph_unbalanced, total_graph, directed=True)

splitting graphs
0.09999940116533226
0.19999880233066453
0.29999820349599676
0.39999760466132905
0.4999970058266613
0.5999964069919935
0.6999958081573259
0.7999952093226581
0.8999946104879903
0.9999940116533226
balancing test graph
0.19999880233066453
0.39999760466132905
0.5999964069919935
0.7999952093226581
0.9999940116533226
balancing training graph
0.09999940116533226
0.19999880233066453
0.29999820349599676
0.39999760466132905
0.4999970058266613
0.5999964069919935
0.6999958081573259
0.7999952093226581
0.8999946104879903
0.9999940116533226


In [23]:
X = model.infer(node_features, adj_list)

In [35]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multioutput import MultiOutputClassifier
from  sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
# helper functions
import utils

mb = MultiLabelBinarizer(classes=[i for i in range(total_graph['N_classes'])])
scaler = StandardScaler()

f1_macro_list = []
f1_micro_list = []

model.eval()

with torch.no_grad():
    # 5-fold cross validation
    for i in range(5):
        print(i)
        training_nodes = NC_5folds[i]['train']
        test_nodes = NC_5folds[i]['test']
        X_train = np.array([X[node] for node in training_nodes])
        X_test = np.array([X[node] for node in test_nodes])
        # For the datasets that only have one one label per node, it gives better results to not use multioutputclassifier
        if not total_graph['Multioutput']:
            Y_train_sequence = np.array([total_graph['groups'][node][0] for node in training_nodes],dtype=int)
            Y_test_sequence = np.array([total_graph['groups'][node][0] for node in test_nodes], dtype=int)
            log_reg = LogisticRegression(multi_class="ovr", max_iter=5000)
            Y_train = Y_train_sequence
            Y_test = Y_test_sequence
            log_reg.fit(X_train, Y_train)
            Y_pred = log_reg.predict(X_test)
            Y_pred = utils.onehot(Y_pred, total_graph['N_classes'])
            Y_test = utils.onehot(Y_test, total_graph['N_classes'])
        else: 
            Y_train_sequence = np.array([total_graph['groups'][node]  for node in training_nodes], dtype=object)
            Y_test_sequence = np.array([total_graph['groups'][node] for node in test_nodes], dtype=object)
            Y_train = mb.fit_transform(Y_train_sequence)
            Y_test = mb.fit_transform(Y_test_sequence)
            log_reg = MultiOutputClassifier(LogisticRegression(multi_class="ovr", max_iter=5000))
            log_reg.fit(X_train, Y_train)
            Y_pred = log_reg.predict(X_test)
            
        f1_macro = utils.compute_f1_macro(Y_test, Y_pred, total_graph['N_classes'])
        f1_micro = utils.compute_f1_micro(Y_test, Y_pred,total_graph['N_classes'])
        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)
        print(f1_macro, f1_micro)
    
print(np.mean(f1_micro_list))
print(np.mean(f1_macro_list))

0
0.0 0.0
1
0.0 0.0
2
0.0 0.0
3
0.0 0.0
4
0.0 0.0
0.0
0.0


In [33]:
Y_train = LP_train_Y
Y_test = LP_test_Y

from sklearn.metrics import roc_auc_score

with torch.no_grad():
    # build representation of edge datasets using inner product of the representation of the two nodes
    X_train = np.zeros((len(LP_train_X), 1))
    for i, edge in enumerate(LP_train_X):
        u = edge[0]
        v = edge[1]
        X_train[i] = utils.get_edge_representation(X[u], X[v])
    X_test = np.zeros((len(LP_test_X), 1))
    for i, edge in enumerate(LP_test_X):
        u = edge[0]
        v = edge[1]
        X_test[i] = utils.get_edge_representation(X[u], X[v])
        
    print("fit model")
    classifier = LogisticRegression()
    classifier.fit(X_train, Y_train)
    Y_probs = classifier.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(Y_test, Y_probs)
    print(roc_auc)
  

fit model
0.5137099803359884
