In [9]:

import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import random
from sklearn.model_selection import KFold
from load_data import *
from copy import deepcopy
import sys

import torch_geometric

In [10]:

dataset_name = "Actor"
data_dir = "../Data/" + dataset_name


#total_graph = load_cora(data_dir)
from_torch_geometric = True
total_graph = load_geometric_dataset(dataset_name)
#total_graph = load_blogcatalog(data_dir)
#total_graph = load_toy(data_dir)
#total_graph = load_pubmed(data_dir)
#total_graph = load_flickr(data_dir)
#total_graph = load_reddit(data_dir)

def get_node_features(total_graph, mode="degree"):
    if mode=="degree":
        node_features = np.zeros((total_graph['N_nodes'],2))
        for i in range(total_graph['N_nodes']):
            neighbors = total_graph['edges'][i]
            degree = len(neighbors)
            node_features[i,0] = degree
            second_neighbors_count = 0
            for n in neighbors:
                second_neighbors_count += len(total_graph['edges'][n])
            node_features[i,1] = second_neighbors_count
    elif mode=="degree_dist":
        node_features = total_graph['adj_matrix']/(np.linalg.norm(total_graph['adj_matrix'], axis=1, keepdims=True)+1e-9)
    elif mode=="node_nr":
        node_features = np.zeros((total_graph['N_nodes'],total_graph['N_nodes']))
        for i in range(total_graph['N_nodes']):
            node_features[i,i] = 1
    return node_features

print(total_graph['N_nodes'], total_graph['N_edges'])

nr of self-loop edges: 93
7600 26659


In [11]:


class GraphSage(torch.nn.Module):
    def __init__(self, input_dim, output_dim, K):
        super(GraphSage, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, output_dim))
        for i in range(K-1):
            self.layers.append(nn.Linear(output_dim, output_dim))
        for i in range(K):
            nn.init.zeros_(self.layers[i].bias)
        self.K = K
    
    def infer(self, node_features, neighborhood_dict):
        eps = 1e-9
        h = node_features
        N_nodes = h.shape[0]
        for k in range(self.K):
            layer = self.layers[k]
            h_updated = torch.zeros((N_nodes, layer.out_features))
            for v in range(N_nodes):        
                neighborhood = neighborhood_dict[v]
                
                hv = h[v].view(1, -1)
                hN = h[neighborhood]
                conc = torch.cat([hN, hv], dim=0)
                aggregated = torch.mean(conc, dim=0, keepdim=True)
                output = layer(aggregated)
                hv = F.relu(output)
                h_updated[v] =  hv    
            h = h_updated
            h = h / (torch.norm(h, dim=1, keepdim=True)+eps)
        return h

    
    def aggregate_neighbors(self, batch):
        batch_nodes = batch.src_index
        subgraph_edge_indices = batch.edge_index

        neighborhoods = {i.item():[] for i in batch_nodes}
        for src, target in zip(subgraph_edge_indices[0], subgraph_edge_indices[1]):
            src = src.item()
            target = target.item()
            if not neighborhoods.get(src):
                neighborhoods[src] = []
            neighborhoods[src].append(target)
        B = [[] for k in range(self.K+1)]
        B[-1] = batch_nodes[:].tolist()
        for k in range(self.K, 0, -1):
            B[k-1] = B[k][:]
            for node in B[k]:  
                B[k-1].extend(neighborhoods[node])
        return B, neighborhoods


    def forward(self, batch):
        eps = 1e-9
        B, neighborhood_dict = self.aggregate_neighbors(batch)
        h = batch.x
        N_nodes = h.shape[0]
        for k in range(self.K):
            layer = self.layers[k]
            h_updated = torch.zeros((N_nodes, layer.out_features))
            for i,v in enumerate(B[k+1]):      # this is because B[0] is base case, B[1] are nodes corresponding to layer 1 etc  
                neighborhood = neighborhood_dict[v]
                hv = h[v].view(1, -1)
                hN = h[neighborhood]
                conc = torch.cat([hN, hv], dim=0)
                aggregated = torch.mean(conc, dim=0, keepdim=True)
                output = layer(aggregated)
                hv = F.relu(output)
                h_updated[v] =  hv    
            h = h_updated
            h = h / (torch.norm(h, dim=1, keepdim=True)+eps)
        return h



def compute_loss(Z, Z_pos, Z_neg):
    eps = 1e-9
    dot = torch.sum(Z * Z_pos, dim=1)
    term1 = -torch.log(torch.sigmoid(dot)+eps)
    term2 = 0
    for q in range(Z_neg.shape[0]):
        term2 = -torch.log(torch.sigmoid(-torch.sum(Z * Z_neg[q,:,:], dim=1))+eps)
    return torch.mean(term1+term2)



In [12]:
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.sampler import NegativeSampling

# Define node features
feature_type = "node_nr"
node_features_np = get_node_features(total_graph, mode=feature_type) 
node_features_torch = torch.tensor(node_features_np).float()
#node_features_torch = total_graph['node_feats'] # for datasets coming from torch_geometric, there exist precomputed features

if from_torch_geometric:
    node_links_torch = total_graph['edges_list']
else:
    node_links_torch = torch.tensor(total_graph['edges_list']).T

print(total_graph['N_nodes'], total_graph['N_edges'])
    
graph_data = torch_geometric.data.Data(node_features_torch, node_links_torch)
# parameters
num_epochs = 1
K = 1   # number of iterations
Q = 10
batch_size = 256
input_size = node_features_torch.shape[1]
output_dim = 128
nb_size = 10
# Create the model
model = GraphSage(input_size, output_dim, K)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

ns = NegativeSampling(mode="triplet", amount=Q)
for epoch in range(num_epochs):

    loader = LinkNeighborLoader(graph_data, batch_size=batch_size, num_neighbors=[nb_size]*K, neg_sampling=ns, shuffle=True, replace=True)

    print("starting training")
    for i,batch in enumerate(loader):
        if batch.src_index.shape[0] != batch_size:
            break 
        positive_samples = batch.dst_pos_index.tolist()
        negative_samples = batch.dst_neg_index.numpy()
        Z_tot = model(batch)
        Z = Z_tot[batch.src_index.tolist()]
        Z_pos = Z_tot[positive_samples]
        Z_neg = torch.zeros((Q, batch_size, output_dim))
        for q in range(Q):
            Z_neg[q] = Z_tot[negative_samples[:,q]]
       
        loss = compute_loss(Z, Z_pos, Z_neg)
        print(i/len(loader), loss)
        loss.backward()
        optimizer.step()
    


7600 26659
starting training
0.0 tensor(1.3794, grad_fn=<MeanBackward0>)
0.00847457627118644 tensor(1.3778, grad_fn=<MeanBackward0>)
0.01694915254237288 tensor(1.3776, grad_fn=<MeanBackward0>)
0.025423728813559324 tensor(1.3701, grad_fn=<MeanBackward0>)
0.03389830508474576 tensor(1.3737, grad_fn=<MeanBackward0>)
0.0423728813559322 tensor(1.3698, grad_fn=<MeanBackward0>)
0.05084745762711865 tensor(1.3892, grad_fn=<MeanBackward0>)
0.059322033898305086 tensor(1.3760, grad_fn=<MeanBackward0>)
0.06779661016949153 tensor(1.3841, grad_fn=<MeanBackward0>)
0.07627118644067797 tensor(1.3771, grad_fn=<MeanBackward0>)
0.0847457627118644 tensor(1.3747, grad_fn=<MeanBackward0>)
0.09322033898305085 tensor(1.3874, grad_fn=<MeanBackward0>)
0.1016949152542373 tensor(1.3797, grad_fn=<MeanBackward0>)
0.11016949152542373 tensor(1.3835, grad_fn=<MeanBackward0>)
0.11864406779661017 tensor(1.3760, grad_fn=<MeanBackward0>)
0.1271186440677966 tensor(1.3779, grad_fn=<MeanBackward0>)
0.13559322033898305 tensor(1.

### Evaluate NC

In [13]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multioutput import MultiOutputClassifier
from  sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
# helper functions
import utils

# Create 5fold train/test data
NC_5folds = {}
kf = KFold(n_splits=5, shuffle=True)
nodes = np.array([i for i in range(total_graph['N_nodes'])])
#labels = np.array([total_graph['grops'][n] for n in nodes])
for i, (train_index, test_index) in enumerate(kf.split(nodes)):  
    NC_5folds[i] = {"train":list(nodes[train_index]), "test":list(nodes[test_index])}


N_classes = total_graph['N_classes']
mb = MultiLabelBinarizer(classes=[i for i in range(N_classes)])
f1_macro_list = []
f1_micro_list = []


# 5-fold cross validation
with torch.no_grad():
    for i in range(5):
        print(i)
        training_nodes = torch.tensor(NC_5folds[i]['train'])
        test_nodes = torch.tensor(NC_5folds[i]['test'])
        training_features = node_features_torch[training_nodes]
        test_features = node_features_torch[test_nodes]

        neighborhoods_train = utils.compute_neighborhoods_subgraph(total_graph['edges'], NC_5folds[i]['train'], 25)
        neighborhoods_test = utils.compute_neighborhoods_subgraph(total_graph['edges'], NC_5folds[i]['test'], 25)

        X_train = model.infer(training_features, neighborhoods_train)
        X_test = model.infer(test_features, neighborhoods_test)
        # For the datasets that only have one one label per node, it gives better results to not use multioutputclassifier
        if not total_graph['Multioutput']:
            yt = []
            for n in NC_5folds[i]['train']:
                if len(total_graph['groups'][n]):
                    yt.append(total_graph['groups'][n][0])
                else:
                    yt.append(0)
            #Y_train_sequence = np.array([total_graph['groups'][node][0]  for node in  NC_5folds[i]['train']],dtype=int)
            Y_train_sequence = np.array(yt,dtype=int)
            yt = []
            for n in NC_5folds[i]['test']:
                if len(total_graph['groups'][n]):
                    yt.append(total_graph['groups'][n][0])
                else:
                    yt.append(0)
            #Y_test_sequence = np.array([total_graph['groups'][node][0] for node in  NC_5folds[i]['test'] if len(total_graph['groups'][node])], dtype=int)
            Y_test_sequence = np.array(yt, dtype=int)
            log_reg = LogisticRegression(multi_class="ovr", max_iter=200)
            Y_train = Y_train_sequence
            Y_test = Y_test_sequence
            log_reg.fit(X_train, Y_train)
            Y_pred = log_reg.predict(X_test)
            Y_pred = utils.onehot(Y_pred, N_classes)
            Y_test = utils.onehot(Y_test, N_classes)
        else:
            print("fitting model")
            Y_train_sequence = [total_graph['groups'][node]  for node in NC_5folds[i]['train']]
            Y_test_sequence = [total_graph['groups'][node] for node in NC_5folds[i]['test']]
            Y_train = mb.fit_transform(Y_train_sequence)
            Y_test = mb.fit_transform(Y_test_sequence)
            log_reg = MultiOutputClassifier(LogisticRegression(multi_class="ovr", max_iter=200))   #
            log_reg.fit(X_train, Y_train)
            Y_pred = log_reg.predict(X_test)
    
        f1_macro = utils.compute_f1_macro(Y_test, Y_pred, N_classes)
        f1_micro = utils.compute_f1_micro(Y_test, Y_pred, N_classes)

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)
        print(f1_macro, f1_micro)

        
    print(np.mean(f1_micro_list))
    print(np.mean(f1_macro_list))

0
0.16256483118598392 0.25526315789473686
1
0.15975803489517645 0.2546052631578947
2
0.15669936390073452 0.2355263157894737
3
0.15770013944734246 0.25723684210526315
4
0.15112560169702616 0.2388157894736842
0.24828947368421056
0.15756959422525268


### Evaluate LP

In [14]:


from sklearn.metrics import roc_auc_score

directed_graph = False
reverse_fraction = 0
LP_test_X_unb, LP_test_Y_unb, training_graph_unbalanced, test_graph_unbalanced = utils.split_graphs(total_graph, directed=directed_graph)
LP_test_X, LP_test_Y = utils.balance_test_graph(total_graph, LP_test_X_unb, LP_test_Y_unb, test_graph_unbalanced, directed=directed_graph, reverse_fraction=reverse_fraction)
LP_train_X, LP_train_Y = utils.balance_training_graph(training_graph_unbalanced, total_graph, directed=directed_graph)


neighborhoods_train = utils.compute_neighborhoods_subgraph(training_graph_unbalanced, total_graph['nodes'], nb_size)
neighborhoods_test = utils.compute_neighborhoods_subgraph(test_graph_unbalanced, total_graph['nodes'], nb_size)

node_features_np = get_node_features(total_graph, mode=feature_type)
node_features_torch = torch.tensor(node_features_np).float()

Y_train = LP_train_Y
Y_test = LP_test_Y
with torch.no_grad():
    # build representation of edge datasets using inner product of the representation of the two nodes
    X_train = np.zeros((len(LP_train_X), 1))
    Z_train = model.infer(node_features_torch, neighborhoods_train)
    for i, edge in enumerate(LP_train_X):
        u = edge[0]
        v = edge[1]
        X_train[i] = utils.get_edge_representation(Z_train[u], Z_train[v])
    X_test = np.zeros((len(LP_test_X), 1))
    Z_test = model.infer(node_features_torch, neighborhoods_test)
    for i, edge in enumerate(LP_test_X):
        u = edge[0]
        v = edge[1]
        X_test[i] = utils.get_edge_representation(Z_test[u], Z_test[v])
        
    print("fit model")
    classifier = LogisticRegression()
    classifier.fit(X_train, Y_train)
    Y_probs = classifier.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(Y_test, Y_probs)
    print(roc_auc)
  

splitting graphs
0.099932478055368
0.199864956110736
0.299797434166104
0.399729912221472
0.49966239027683995
0.599594868332208
0.699527346387576
0.799459824442944
0.8993923024983119
0.9993247805536799
balancing test graph
0.19993998049366044
0.39987996098732087
0.5998199414809813
0.7997599219746417
0.9996999024683022
balancing training graph
0.099932478055368
0.199864956110736
0.299797434166104
0.399729912221472
0.49966239027683995
0.599594868332208
0.699527346387576
0.799459824442944
0.8993923024983119
0.9993247805536799
fit model
0.6760356974220538


In [16]:

with open("../Results/graphsage/{}_metrics{}.csv".format(dataset_name, reverse_fraction), "w") as file:
    settings_str = "Results for graphsage embedding generated with {} epochs, K={}, Q={}, nb_size={}\n".format(num_epochs, K, Q, nb_size)
    file.write(settings_str)
    header = "Dataset; F1 macro; F1 micro; ROC-AUC \n"
    file.write(header)
    data_row = "{dataset};{f1mac};{f1mic};{roc}".format(dataset=dataset_name, f1mac=np.mean(f1_macro_list), f1mic=np.mean(f1_micro_list), roc=roc_auc)
    file.write(data_row)