To install PyTorch Geometric, run the cell below.

In [None]:
#!pip install torch=='1.9.0'
#!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.9.0+cu102.html

# IMPORT

In [None]:
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score,average_precision_score

from torch_geometric.utils import negative_sampling
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges
from torch_geometric.transforms import RandomLinkSplit,NormalizeFeatures,Constant,OneHotDegree
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv,SAGEConv,GATConv
from scipy.stats import entropy

import torch
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

import copy
import itertools
import json

# LOAD DATASET

In [None]:
from torch_geometric.data import Data

# Assemble the "current" graph from the tensors stored on disk.
current_data = Data(
    x=torch.load("../data/gnn/node_feature_matrix.pt"),
    edge_index=torch.load("../data/gnn/edge_index.pt"),
)

# Feature normalisation (L1-norm, per the original note).
transform = NormalizeFeatures()
current_data = transform(current_data)

# Link-level train/test split: no validation edges, 25% of the edges held
# out for testing. The original note says this step also performs the
# negative sampling.
transform = RandomLinkSplit(num_val=0.0, num_test=0.25)
train_data, val_data, current_test_data = transform(current_data)

In [None]:
# Build the "future" graph used as an out-of-time test set.
future_data = Data()
future_data.x = torch.load("../data/gnn/future_node_feature_matrix.pt")
future_data.edge_index = torch.load("../data/gnn/future_edge_index.pt")

#NORMALIZATION
transform = NormalizeFeatures()
future_data = transform(future_data)

#NEGATIVE SAMPLING
# Ask for as many negative edges as there are positive ones.
num_pos_edge = future_data.edge_index.size(1)
future_neg_edge_index = negative_sampling(
        edge_index=future_data.edge_index, #positive edges
        num_nodes=future_data.num_nodes, # number of nodes
        num_neg_samples=num_pos_edge) # number of neg_sample equal to number of pos_edges

# The sampler is not guaranteed to return exactly the requested number of
# negatives, so size the zero labels from what was actually sampled. This
# keeps edge_label aligned with edge_label_index.
num_neg_edge = future_neg_edge_index.size(1)

# Positive edges first (label 1), then negative edges (label 0), as float32.
future_data.edge_label = torch.cat([torch.ones(num_pos_edge), torch.zeros(num_neg_edge)])
future_data.edge_label_index = torch.cat([future_data.edge_index, future_neg_edge_index], dim=-1)

In [None]:
# Notebook cell output: display the training split produced above.
train_data

In [None]:
# Notebook cell output: display the future graph with its edge labels.
future_data

In [None]:
#MY GNN CUSTOM MODULE
class LinkPredModel(torch.nn.Module):
    """Two-layer graph encoder that scores candidate edges by dot product.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Output size of the first convolution.
        num_classes: Output size of the second convolution. Despite the
            name, it is used here as the dimension of the node embeddings
            that are compared by dot product.
        layer: Convolution class instantiated as ``layer(in_dim, out_dim)``.
        dropout: Dropout probability applied after each activation.
        loss: Loss class; it is instantiated without arguments.
    """

    def __init__(self, input_dim, hidden_dim, num_classes, layer = GCNConv, dropout=0.25, loss = torch.nn.BCEWithLogitsLoss):
        super().__init__()
        self.conv1 = layer(input_dim, hidden_dim)
        self.conv2 = layer(hidden_dim, num_classes)

        # Loss applied to the raw scores returned by forward().
        self.loss_fn = loss()

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise the parameters of both convolution layers."""
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, batch):
        """Return one raw score (logit) per column of ``batch.edge_label_index``.

        ``batch`` must expose ``x``, ``edge_index`` and ``edge_label_index``.
        """
        x = batch.x.float()
        edge_index = batch.edge_index
        edge_label_index = batch.edge_label_index

        # Each layer: convolution -> leaky ReLU -> dropout.
        # `training=self.training` is required: F.dropout defaults to
        # training=True, which would keep dropping activations after
        # model.eval() and make evaluation metrics noisy.
        h = self.conv1(x, edge_index)
        h = F.leaky_relu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)
        h = self.conv2(h, edge_index)
        h = F.leaky_relu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)

        # Dot product between the embeddings of the two endpoints of every
        # candidate edge.
        h_src = h[edge_label_index[0]]
        h_dst = h[edge_label_index[1]]
        pred = torch.sum(h_src * h_dst, dim=-1)

        return pred

    def loss(self, pred, link_label):
        """Apply the configured loss to the scores and the edge labels."""
        return self.loss_fn(pred, link_label)

In [None]:
from sklearn.metrics import *

def train(model, train_data, val_data, test_data, device,
          optimizer, num_epochs=200, verbose=True):
    """Fit ``model`` on ``train_data`` and track AVGPR on the three splits.

    After every optimisation step the model is evaluated with ``test`` on
    the train, validation and test splits. The model with the highest
    validation AVGPR seen so far is kept as a deep copy.

    Returns:
        ``(best_model, avgpr_trains, avgpr_vals, avgpr_tests)`` where the
        three lists hold one AVGPR value per epoch.
    """
    report_template = 'Epoch: {:03d}\n AVGPR Train: {:.4f}, Val: {:.4f}, Test: {:.4f}\n ROC Train: {:.4f}, Val: {:.4f}, Test: {:.4f}\n F1-Score Train: {:.4f}, Val: {:.4f}, Test: {:.4f}\n Loss: {}'

    top_val_avgpr = 0
    best_model = copy.deepcopy(model)
    train_data = train_data.to(device)
    best_epoch = -1

    avgpr_trains, avgpr_vals, avgpr_tests = [], [], []
    histories = (avgpr_trains, avgpr_vals, avgpr_tests)

    for epoch in range(num_epochs):
        # One full-batch optimisation step.
        model.train()
        optimizer.zero_grad()
        scores = model(train_data)
        loss = model.loss(scores, train_data.edge_label.type_as(scores))
        loss.backward()
        optimizer.step()

        # Evaluate on train / val / test, in that order.
        # Every result is an (avgpr, f1, roc) triple.
        per_split = [test(model, split, device)
                     for split in (train_data, val_data, test_data)]
        avgprs, f1s, rocs = zip(*per_split)

        for history, value in zip(histories, avgprs):
            history.append(value)

        if verbose:
            print(report_template.format(epoch, *avgprs, *rocs, *f1s, loss.item()))

        # Keep the snapshot with the best validation AVGPR.
        if avgprs[1] > top_val_avgpr:
            top_val_avgpr = avgprs[1]
            best_epoch = epoch
            best_model = copy.deepcopy(model)

    if verbose:
        print(f'Best Epoch: {best_epoch}')

    return best_model, avgpr_trains, avgpr_vals, avgpr_tests

In [None]:
def test(model, test_data, device):
    """Evaluate ``model`` on the labelled edges of ``test_data``.

    The model is switched to eval mode and left in that mode.

    Returns:
        ``(avgpr_score, f1_model_score, roc_score)``. AVGPR and ROC AUC are
        computed on the sigmoid probabilities, F1 on the predictions
        thresholded at 0.5.
    """
    model.eval()

    test_data = test_data.to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        h = model(test_data)

    pred_cont = torch.sigmoid(h).cpu().detach().numpy()
    # Hard 0/1 predictions; strictly greater than 0.5 counts as positive.
    pred = (pred_cont > 0.5).astype(int)

    label = test_data.edge_label.cpu().detach().numpy()

    roc_score = roc_auc_score(label, pred_cont)
    avgpr_score = average_precision_score(label, pred_cont)
    f1_model_score = f1_score(label, pred)

    return avgpr_score, f1_model_score, roc_score

In [None]:
# Model dimensions shared by the tuning and single-train sections.
input_dim = train_data.num_node_features
# Passed to LinkPredModel as the output size of its second convolution.
num_classes = 2

# `device` is used by every later cell but was never defined in this
# notebook; fall back to the CPU when no GPU is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Tuning

Run this part of the notebook only if you want to repeat the hyperparameter tuning. Further below you will find the model with the best hyperparameter configuration found.

In [None]:
# Widest search space: it contains every value used by the smaller grids below.
# Expanded as a full Cartesian product (which is what gnn_tuning does) it
# yields 3 * 7 * 3 * 4 * 5 * 9 = 11340 configurations.
grid_all = {
    'optimizer': ['sgd','adam','adagrad'],
    'hidden_dim': [4,8,16, 32, 64, 128, 256],
    'layer': [GATConv, GCNConv, SAGEConv],
    'dropout': [0,0.25,0.50,0.75],
    'weight_decay': [5e-5,5e-3,5e-2,5e-1,5e-6],
    'lr': [1,0.1,0.075,0.05,0.025,0.01,0.001,0.0001,0.00001]
}

In [None]:
# Search space without a 'weight_decay' entry, so gnn_tuning falls back to
# its default weight decay. Full product: 3 * 5 * 3 * 3 * 6 = 810 configurations.
grid_1 = {
    'optimizer': ['sgd','adam','adagrad'],
    'hidden_dim': [16, 32, 64, 128, 256],
    'layer': [GATConv, GCNConv, SAGEConv],
    'dropout': [0.25,0.50,0.75],
    'lr': [1,0.1,0.01,0.001,0.0001,0.00001]
}

## Used for the first experiments (STRUCT F, period 1)

In [None]:
# Reduced search space with no 'weight_decay' entry.
# Full product: 3 * 2 * 3 * 2 * 3 = 108 configurations.
grid_2 = {
    'optimizer': ['sgd','adam','adagrad'],
    'hidden_dim': [16,32],
    'layer': [GATConv, GCNConv, SAGEConv],
    'dropout': [0.0, 0.25],
    'lr': [0.1,0.01,0.001]
}

## Used for CONSTANT, period 1

In [None]:
# Small Adam + GCNConv search space without dropout.
# Full product: 1 * 2 * 1 * 1 * 2 = 4 configurations.
grid_3 = {
    'optimizer': ['adam'],
    'hidden_dim': [16,32],
    'layer': [GCNConv],
    'dropout': [0.0],
    'lr': [0.1,0.01]
}

In [None]:
# Adam + GCNConv search space over small hidden sizes, learning rate and
# weight decay. Full product: 1 * 2 * 1 * 1 * 3 * 5 = 30 configurations.
# This is the grid passed to gnn_tuning further down in this notebook.
grid_4 = {
    'optimizer': ['adam'],
    'hidden_dim': [4,8],
    'layer': [GCNConv],
    'dropout': [0.0],
    'lr': [0.025,0.05,0.075],
    'weight_decay': [5e-5,5e-3,5e-2,5e-1,5e-6]
    
}

In [None]:
def _make_optimizer(name, params, lr, weight_decay):
    """Build the optimizer selected by ``name`` for the given parameters."""
    if name == 'sgd':
        return torch.optim.SGD(params, lr=lr, momentum=0.9, weight_decay=weight_decay)
    if name == 'adam':
        return torch.optim.Adam(params=params, lr=lr, weight_decay=weight_decay)
    # Any other name (including 'adagrad') selects Adagrad, as before.
    return torch.optim.Adagrad(params, lr=lr, weight_decay=weight_decay)


def _save_learning_curves(avgpr_trains, avgpr_vals, avgpr_tests, path):
    """Plot the per-epoch AVGPR of the three splits and save the figure to ``path``."""
    import os

    directory = os.path.dirname(path)
    if directory:
        # savefig does not create missing folders.
        os.makedirs(directory, exist_ok=True)

    # One point per recorded epoch, whatever number of epochs train() ran.
    x = range(len(avgpr_trains))
    plt.clf()
    #train orange test blue val green
    plt.plot(x, avgpr_trains, color='orange', label='avgpr_train')
    plt.plot(x, avgpr_vals, color='green', label='avgpr_val')
    plt.plot(x, avgpr_tests, color='blue', label = 'avgpr_test')
    plt.xlabel('Epoch')
    plt.ylabel('AVGPR-score')
    plt.legend()
    plt.ylim(top=1)
    plt.grid()
    plt.savefig(path, bbox_inches='tight')
    plt.clf()


def gnn_tuning(grid, input_dim, num_classes, device, train_data, current_test_data, test_data, logname, fold, plotF1=True):
    """Grid-search ``LinkPredModel`` hyperparameters and log every result.

    Every combination of the values in ``grid`` is trained with ``train``,
    using ``current_test_data`` as the validation split. The best epoch of
    each run is then scored on the train, current-test and ``test_data``
    splits, and the scores are appended to ``logname``.

    NOTE(review): the winning configuration is the one with the highest
    AVGPR on ``test_data``, i.e. selection is done on the final test set.
    This is kept as in the original; confirm it is intended.

    Args:
        grid: Maps hyperparameter names to lists of candidate values.
            Requires 'optimizer', 'hidden_dim', 'layer', 'dropout' and
            'lr'; 'weight_decay' is optional (default 5e-4).
        input_dim: Node feature size given to the model.
        num_classes: Output size of the model's second convolution.
        device: Device on which models and data are placed.
        train_data: Training split.
        current_test_data: Split used for model selection inside ``train``.
        test_data: Split used to rank the configurations.
        logname: Path of the text log, opened in append mode.
        fold: Sub-folder of ``learningCurves/`` receiving the plots.
        plotF1: When true, save the AVGPR learning curves of each run.

    Returns:
        ``(top_model, best_config, scores)`` for the winning configuration.

    Raises:
        ValueError: If ``grid`` is empty.
    """
    if not grid:
        raise ValueError('grid must contain at least one hyperparameter')

    names, values = zip(*grid.items())
    configurations = [dict(zip(names, combo)) for combo in itertools.product(*values)]
    print('There are ', len(configurations), ' configurations for this model')

    avgpr_max = 0
    best_config = {}
    best_config_i = 0
    scores = {}
    top_model = None
    default_weight_decay = 5e-4

    with open(logname,'a+') as log:
        for i, config in enumerate(configurations):
            print('Configuration number ', str(i), ' start')

            log.write('CONFIGURATION ' + str(i) + '\n\n')
            # Classes are not JSON serialisable: log the layer by name.
            nameLayer = config['layer'].__name__
            writeConfig = {**config, 'layer': nameLayer}
            log.write(json.dumps(writeConfig,indent=2))
            log.write('\n')

            hidden_dim = config['hidden_dim']
            layer = config['layer']
            dropout = config['dropout']
            weight_decay = config.get('weight_decay', default_weight_decay)
            model = LinkPredModel(input_dim, hidden_dim, num_classes, dropout = dropout, layer = layer).to(device)
            model.reset_parameters()

            opt = config['optimizer']
            lr = config['lr']
            optimizer = _make_optimizer(opt, model.parameters(), lr, weight_decay)

            best_model, avgpr_trains, avgpr_vals, avgpr_tests = \
            train(model, train_data, current_test_data, test_data, device, optimizer,verbose=False)

            print('GNN fitted')

            avgpr_train, f1_score_train, roc_score_train = test(best_model, train_data, device)
            avgpr_val, f1_score_current_test, roc_score_val = test(best_model, current_test_data, device)
            # Score the split that was passed in, not the notebook-level
            # `future_data` global.
            avgpr_test, f1_score_future_test, roc_score_test = test(best_model, test_data, device)

            print('GNN performance computed')

            log.write('TRAIN SET\n')
            log.write(f' AVGPR SCORE: {avgpr_train}\n')
            log.write(f' ROC SCORE: {roc_score_train}\n')
            log.write(f' F1-SCORE: {f1_score_train}\n\n')

            log.write('CURRENT TEST SET\n')
            log.write(f' AVGPR SCORE: {avgpr_val}\n')
            log.write(f' ROC SCORE: {roc_score_val}\n')
            log.write(f' F1-SCORE: {f1_score_current_test}\n\n')

            log.write('FUTURE TEST SET\n')
            log.write(f' AVGPR SCORE: {avgpr_test}\n')
            log.write(f' ROC SCORE: {roc_score_test}\n')
            log.write(f' F1-SCORE: {f1_score_future_test}\n')

            print('GNN performance written')

            # The first configuration is always recorded, so a result
            # exists even when no run scores above zero.
            if top_model is None or avgpr_max < avgpr_test:
                avgpr_max = avgpr_test
                best_config = config
                best_config_i = i
                scores = {
                    'avgpr_train': avgpr_train,
                    'roc_score_train': roc_score_train,
                    'f1_score_train': f1_score_train,
                    'avgpr_current_test': avgpr_val,
                    'roc_score_val': roc_score_val,
                    'f1_score_current_test': f1_score_current_test,
                    'avgpr_future_test': avgpr_test,
                    'roc_score_test': roc_score_test,
                    'f1_score_future_test': f1_score_future_test
                }
                top_model = copy.deepcopy(best_model)

            if plotF1:
                _save_learning_curves(
                    avgpr_trains, avgpr_vals, avgpr_tests,
                    f'learningCurves/{fold}/{nameLayer}_{hidden_dim}_{dropout}_{opt}_{lr}_{weight_decay}.pdf')

            print('Configuration number ', str(i), ' end')
            print()

            log.write('\n\n')
            # Flush after every run so progress is visible while tuning.
            log.flush()

        log.write(f'Best configuration number: {best_config_i}\n')
        best_config_write = {**best_config, 'layer': best_config['layer'].__name__}
        log.write(json.dumps(best_config_write,indent=2))
        log.write('\n')
        log.write(json.dumps(scores,indent=2))
        log.write('\n')
        # The with-statement flushes and closes the file.

    return top_model, best_config, scores

In [None]:
# Run the grid search over grid_4, using the future graph as the test split.
best_model, best_config, scores = gnn_tuning(
    grid=grid_4,
    input_dim=input_dim,
    num_classes=num_classes,
    device=device,
    train_data=train_data,
    current_test_data=current_test_data,
    test_data=future_data,
    logname='new_grid_gnn_text_august2016.txt',
    fold='GNN/new_grid/text',
)

## Single Train

In [None]:
# Single run: GCN encoder with 16 hidden units and no dropout.
model = LinkPredModel(
    input_dim=input_dim,
    hidden_dim=16,
    num_classes=num_classes,
    layer=GCNConv,
    dropout=0.0,
).to(device)
model.reset_parameters()

In [None]:
# Optimizer for the single run: Adam with weight decay.
# Alternatives left commented out in the original cell, all with the same
# weight decay: SGD (lr=0.1, momentum=0.9), RMSprop (lr=0.1), Adagrad (lr=0.01).
weight_decay = 5e-4
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
    weight_decay=weight_decay,
)

In [None]:
# Train with the current test split for model selection and the future
# graph as the test split; keep the per-epoch AVGPR histories for plotting.
best_model, prtrain, prval, prtest = train(
    model=model,
    train_data=train_data,
    val_data=current_test_data,
    test_data=future_data,
    device=device,
    optimizer=optimizer,
)

In [None]:
#train orange test blue val green
# Take the x-axis length from the recorded history, so the plot stays valid
# if train() is run with a different number of epochs.
num_epochs = len(prtrain)
x = range(num_epochs)
plt.clf()
plt.plot(x, prtrain, color='orange', label='avgpr_train')
plt.plot(x, prval, color='green', label='avgpr_val')
plt.plot(x, prtest, color='blue', label = 'avgpr_test')
plt.xlabel('Epoch')
plt.ylabel('AVGPR-score')
plt.legend()
plt.ylim(top=1)
plt.grid()
plt.savefig('new_AVGPR_learningCurve_GNNAll_august2016.pdf',bbox_inches='tight')
plt.show()
plt.clf()

In [None]:
# test() returns (avgpr, f1, roc); unpacking only two names raised a
# ValueError. ROC AUC is the third element of the triple.
avgpr_train, f1_score_train, roc_train = test(best_model, train_data, device)
avgpr_val, f1_score_current_test, roc_val = test(best_model, current_test_data, device)
avgpr_test, f1_score_future_test, roc_test = test(best_model, future_data, device)

In [None]:
# Print ROC AUC and F1 for each split, with a blank line between splits.
report_rows = (
    ('TRAIN SET', roc_train, f1_score_train),
    ('CURRENT TEST SET', roc_val, f1_score_current_test),
    ('FUTURE TEST SET', roc_test, f1_score_future_test),
)

for row_idx, (title, roc_value, f1_value) in enumerate(report_rows):
    if row_idx > 0:
        print()
    print(title)
    print(f' ROC AUC SCORE: {roc_value}')
    print(f' F1-SCORE: {f1_value}')