To install PyTorch Geometric, uncomment and run the cell below.

In [None]:
#!pip install torch=='1.9.0'
#!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.9.0+cu102.html

# IMPORT

In [None]:
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score,average_precision_score

from torch_geometric.utils import negative_sampling
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges
from torch_geometric.transforms import RandomLinkSplit,NormalizeFeatures,Constant,OneHotDegree
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv,SAGEConv,GATConv
from scipy.stats import entropy

import torch
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

import copy
import itertools
import json

# LOAD DATASET

Run exactly one of the following four cells: each defines `time_slices` for a different time interval, and the last cell executed is the one that takes effect.

In [None]:
# Time period used in the works: four consecutive slices of roughly one month each.
# Every slice index maps to ISO (YYYY-MM-DD) date strings:
#   'start' / 'end'               -> bounds of the slice itself
#   'future_start' / 'future_end' -> bounds of the slice that immediately follows it
time_slices = {
    0: {
        'future_start': '2016-07-03',
        'future_end': '2016-08-02',
        'start' : '2016-06-03',
        'end': '2016-07-02'
    },
    
    1: {
        'future_start': '2016-08-03',
        'future_end': '2016-09-02',
        'start': '2016-07-03',
        'end': '2016-08-02'
    },
    
    2:{
        'future_start': '2016-09-03',
        'future_end': '2016-10-02',
        'start': '2016-08-03',
        'end': '2016-09-02'
    },
    
    3:{
        'future_start': '2016-10-03',
        'future_end': '2016-11-02',
        'start': '2016-09-03',
        'end': '2016-10-02'
    }
}

In [None]:
# 10-day time slices.
# Slice 0 is the initial two-month window (2016-06-03 to 2016-08-02); every later
# slice, and every 'future_*' window, spans 10 days.
# 'start'/'end' bound the slice itself, 'future_start'/'future_end' bound the slice that follows.
time_slices = {
    0: {
        'future_start': '2016-08-03',
        'future_end': '2016-08-12',
        'start' : '2016-06-03',
        'end': '2016-08-02'
    },
    
    1: {
        'future_start': '2016-08-13',
        'future_end': '2016-08-22',
        'start': '2016-08-03',
        'end': '2016-08-12'
    },
    
    2:{
        'future_start': '2016-08-23',
        'future_end': '2016-09-01',
        'start': '2016-08-13',
        'end': '2016-08-22'
    },
    
    3:{
        'future_start': '2016-09-02',
        'future_end': '2016-09-11',
        'start': '2016-08-23',
        'end': '2016-09-01'
    },
    
    4:{
        'future_start': '2016-09-12',
        'future_end': '2016-09-21',
        'start': '2016-09-02',
        'end': '2016-09-11'
    },
    
}

In [None]:
# Nov-Dec 2016 time interval, monthly slices.
# Slice 0 accumulates everything from 2016-06-03 to 2016-12-02; the following slices are one month long.
# 'start'/'end' bound the slice itself, 'future_start'/'future_end' bound the slice that follows.
# NOTE(review): the label says "Nov-Dec 2016" but the slices after the first one cover
# December 2016 to March 2017 -- confirm the intended label.
time_slices = {
    0: {
        'future_start': '2016-12-03',
        'future_end': '2017-01-02',
        'start' : '2016-06-03',
        'end': '2016-12-02'
    },
    
    1: {
        'future_start': '2017-01-03',
        'future_end': '2017-02-02',
        'start': '2016-12-03',
        'end': '2017-01-02'
    },
    
    2:{
        'future_start': '2017-02-03',
        'future_end': '2017-03-02',
        'start': '2017-01-03',
        'end': '2017-02-02'
    },
}

In [None]:
# Nov-Dec 2016 time interval with 10-day time slices.
# Slice 0 accumulates everything from 2016-06-03 to 2016-12-02; every later slice,
# and every 'future_*' window, spans 10 days.
# 'start'/'end' bound the slice itself, 'future_start'/'future_end' bound the slice that follows.
# NOTE(review): the label says "Nov-Dec 2016" but the 10-day slices run from
# 2016-12-03 to 2017-01-21 -- confirm the intended label.
time_slices = {
    0: {
        'future_start': '2016-12-03',
        'future_end': '2016-12-12',
        'start' : '2016-06-03',
        'end': '2016-12-02'
    },
    
    1: {
        'future_start': '2016-12-13',
        'future_end': '2016-12-22',
        'start': '2016-12-03',
        'end': '2016-12-12'
    },
    
    2:{
        'future_start': '2016-12-23',
        'future_end': '2017-01-01',
        'start': '2016-12-13',
        'end': '2016-12-22'
    },
    
    3:{
        'future_start': '2017-01-02',
        'future_end': '2017-01-11',
        'start': '2016-12-23',
        'end': '2017-01-01'
    },
    4:{
        'future_start': '2017-01-12',
        'future_end': '2017-01-21',
        'start': '2017-01-02',
        'end': '2017-01-11'
    },
}

In [None]:
# Index of the time slice at which the training window closes.
PERIOD = 1

# The three windows (train, future, future-test) all open at the start of slice 0 ...
start_train = start_future = start_future_test = time_slices[0]['start']

# ... and differ only in where they close:
end_train = time_slices[PERIOD]['end']                     # end of the chosen slice
end_future = time_slices[PERIOD]['future_end']             # end of the slice right after it
end_future_test = time_slices[PERIOD + 1]['future_end']    # one further slice ahead; needs slice PERIOD + 1 to exist

In [None]:
# Paths of the three graph files (read as edge lists by load_pyg_dataset); each file
# name encodes the start and end date of the window it covers.
# The commented-out variants point at the `dataset/graph` location (presumably the
# real dataset), the active ones at the dummy data shipped next to the notebook.
#gpath_train = f"dataset/graph/custom_json_{start_train}_{end_train}"
#gpath_future = f"dataset/graph/custom_json_{start_future}_{end_future}"
#gpath_test_future = f"dataset/graph/custom_json_{start_future_test}_{end_future_test}"
gpath_train = f"../dummy-data/gnn/custom_json_{start_train}_{end_train}"
gpath_future = f"../dummy-data/gnn/custom_json_{start_future}_{end_future}"
gpath_test_future = f"../dummy-data/gnn/custom_json_{start_future_test}_{end_future_test}"

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# JSON files with the text features for the training window and for the future window.
# The commented-out variants point at the `dataset/textFeatures` location, the active
# ones at the dummy data shipped next to the notebook.
#textfpath_train = f"dataset/textFeatures/textFeatures_{start_train}_{end_train}.json" 
#textfpath_test = f"dataset/textFeatures/textFeatures_{start_future}_{end_future}.json"
textfpath_train = f"../dummy-data/gnn/textFeatures_{start_train}_{end_train}.json" 
textfpath_test = f"../dummy-data/gnn/textFeatures_{start_future}_{end_future}.json"

In [None]:
def _reciprocal_mean_follower_entropy(graph, entropy_by_user):
    """Map every node of ``graph`` to 1 / (mean entropy of its predecessors).

    Nodes without predecessors, or whose predecessors have a mean entropy of
    zero, get 0. Raises ``KeyError`` if a predecessor is missing from
    ``entropy_by_user``.
    """
    index = {}
    for user in graph.nodes():
        follower_entropies = [entropy_by_user[follower] for follower in graph.predecessors(user)]
        mean_entropy = sum(follower_entropies) / len(follower_entropies) if follower_entropies else 0
        index[user] = 1 / mean_entropy if mean_entropy != 0 else 0
    return index


def load_pyg_dataset(gpath_current, gpath_future, feature = 'struct', textfpath = None, topicSim=False, num_topics=30):
    """Build a PyTorch Geometric ``Data`` object for link prediction.

    Node features are computed on the graph read from ``gpath_current``. The
    edges of the returned object come from the graph read from
    ``gpath_future``, restricted to the nodes of the current graph (nodes of
    the current graph that do not appear in the future graph are dropped).

    Args:
        gpath_current: Path of the edge list of the current graph.
        gpath_future: Path of the edge list of the future graph that provides
            the edges (the positive set).
        feature: Which node features to attach:
            ``'struct'``  PageRank, in-degree, out-degree, average neighbour degree;
            ``'text'``    numPost, numComment, numTag, corpusLength, corpusStdev;
            ``'topic'``   the text features plus one feature per LDA topic;
            ``'all'``     structural and text features (no per-topic features);
            ``'constant'`` a constant feature added by the ``Constant`` transform;
            ``'one-hot-encode'`` accepted but not implemented.
        textfpath: Path of the JSON file with the text features. Required when
            ``feature`` is ``'text'``, ``'topic'`` or ``'all'``.
        topicSim: When true, and text features are loaded, also attach
            ``reciprocal_mean_followers_entropy_all``, an index based on the
            entropy of the LDA distributions of each node's predecessors.
        num_topics: Number of LDA topics read from each ``lda_all`` vector.

    Returns:
        The ``Data`` object, with ``train_mask``, ``val_mask``, ``test_mask``
        and ``y`` set to ``None``.

    Raises:
        ValueError: If ``feature`` is not one of the supported values, or if
            text features are requested without ``textfpath``.
        NotImplementedError: If ``feature`` is ``'one-hot-encode'``.
    """
    if feature not in ('constant', 'one-hot-encode', 'struct', 'text', 'topic', 'all'):
        raise ValueError('Invalid feature value')

    use_struct = feature in ('struct', 'all')
    use_text = feature in ('text', 'topic', 'all')

    # LOAD GRAPH AND COMPUTE NODE FEATURES
    G_current_all = nx.read_edgelist(gpath_current, create_using=nx.DiGraph)

    if use_struct:
        # COMPUTE STRUCTURAL FEATURES (on the full current graph)
        pr = nx.pagerank(G_current_all)
        indeg = dict(G_current_all.in_degree())
        outdeg = dict(G_current_all.out_degree())
        neighdeg = nx.average_neighbor_degree(G_current_all)

    if use_text:
        # COMPUTE TEXTUAL FEATURES
        if textfpath is None:
            raise ValueError('textfpath parameter must be not None if feature is text, topic or all')
        # Keep the file open only while parsing it.
        with open(textfpath, encoding='utf-8') as f:
            textf_raw = json.load(f)

        textf = {
            name: textf_raw[name]
            for name in ('numPost', 'numComment', 'numTag', 'corpusLength', 'corpusStdev')
        }

        if feature == 'topic':
            # One node attribute per LDA topic: topic_i maps each user to the
            # i-th component of that user's LDA vector.
            for i in range(num_topics):
                textf[f'topic_{i}'] = {user: lda[i] for user, lda in textf_raw['lda_all'].items()}

        if topicSim:
            # Index based on the entropy of the LDA distributions, meant to
            # reflect the importance of a user in terms of its contents.
            # The study of this feature is still not covered in our works.
            entropy_all = {user: entropy(lda_all) for user, lda_all in textf_raw['lda_all'].items()}
            textf['reciprocal_mean_followers_entropy_all'] = \
                _reciprocal_mean_follower_entropy(G_current_all, entropy_all)

    # POSITIVE SET WITH LABELLING ON THE FUTURE PERIOD:
    # keep the future edges whose endpoints both exist in the current graph.
    G_future = nx.read_edgelist(gpath_future, create_using=nx.DiGraph)
    G_current = nx.DiGraph(G_future.subgraph(G_current_all.nodes()))

    if use_struct:
        # ASSIGN STRUCTURAL FEATURES TO EACH NODE
        nx.set_node_attributes(G_current, pr, 'pagerank')
        nx.set_node_attributes(G_current, indeg, 'in_degree')
        nx.set_node_attributes(G_current, outdeg, 'out_degree')
        nx.set_node_attributes(G_current, neighdeg, 'neigh_degree')

    if use_text:
        # ASSIGN TEXTUAL FEATURES TO EACH NODE
        for name, fdict in textf.items():
            nx.set_node_attributes(G_current, fdict, name)

    # CREATE PYG DATASET
    # The builtin `all` is passed on purpose: it is the value from_networkx
    # takes to group every node attribute into the feature matrix `x`.
    group_attrs = all if (use_struct or use_text) else None

    current_data = from_networkx(G_current, group_node_attrs = group_attrs)
    current_data.train_mask = current_data.val_mask = current_data.test_mask = current_data.y = None

    if feature == 'constant':
        current_data = Constant()(current_data)

    if feature == 'one-hot-encode':
        raise NotImplementedError('one-hot-encode not implemented')

    return current_data

In [None]:
# Current snapshot: features from the training window, edges from the future window.
current_data = load_pyg_dataset(
    gpath_train,
    gpath_future,
    feature='topic',
    textfpath=textfpath_train,
    topicSim=True,
)

# NORMALIZATION (L1-Norm) of the node features
current_data = NormalizeFeatures()(current_data)

# TRAIN TEST SPLIT + NEGATIVE SAMPLING: no validation edges, 25% of the edges held out for testing
transform = RandomLinkSplit(num_val=0.0, num_test=0.25)
train_data, val_data, current_test_data = transform(current_data)

In [None]:
# Future snapshot: features from the future window, edges from the window after it.
future_data = load_pyg_dataset(gpath_future, gpath_test_future, feature = 'topic', textfpath = textfpath_test,topicSim=True)

#NORMALIZATION
transform = NormalizeFeatures()
future_data = transform(future_data)

#NEGATIVE SAMPLING
future_neg_edge_index = negative_sampling(
        edge_index=future_data.edge_index, #positive edges
        num_nodes=future_data.num_nodes, # number of nodes
        num_neg_samples=future_data.edge_index.size(1)) # ask for as many negatives as there are positive edges

# Supervision labels: ones for the positive edges, zeros for the sampled negatives.
# The number of zeros is taken from what negative_sampling actually returned (the
# requested amount is only approximate), so edge_label always has one entry per
# column of edge_label_index.
num_pos_edge = future_data.edge_index.size(1)
num_neg_edge = future_neg_edge_index.size(1)
future_data.edge_label = torch.cat([torch.ones(num_pos_edge), torch.zeros(num_neg_edge)])
future_data.edge_label_index = torch.cat([future_data.edge_index, future_neg_edge_index], dim=-1)

In [None]:
# Notebook display: show the summary of the training split.
train_data

In [None]:
# Notebook display: show the summary of the future snapshot.
future_data

# DATASET MANIPULATION UTILITIES

In [None]:
def getTrainTestNegEdgeIndex(dataset):
    """Return the negative edges stored in ``dataset.edge_label_index``.

    The first ``dataset.edge_index.size(1)`` columns of ``edge_label_index``
    are taken to be the positive edges; everything after them is returned.

    Args:
        dataset: Object exposing ``edge_index`` and ``edge_label_index`` as
            tensors with one column per edge.

    Returns:
        A new ``torch.long`` tensor with one column per negative edge, on the
        same device as ``dataset.edge_label_index``.
    """
    num_pos = dataset.edge_index.size(1)
    # Slice the index tensor directly: going through numpy and a float32
    # tensor would round node ids above 2**24 and only works for CPU tensors.
    # copy=True keeps the result independent of the dataset's own tensor.
    return dataset.edge_label_index[:, num_pos:].to(torch.long, copy=True)

In [None]:
def getValNegEdgeIndex(dataset):
    """Return the edges of ``dataset.edge_label_index`` whose label is 0.

    Args:
        dataset: Object exposing ``edge_label`` (one label per supervision
            edge) and ``edge_label_index`` (one column per supervision edge).

    Returns:
        A new ``torch.long`` tensor with one column per negative edge; it has
        zero columns when no edge is labelled 0.
    """
    # Select the negatives by label instead of searching for the first label
    # change: a missing change used to yield -1, which as a slice start
    # returned the last edge as a "negative".
    negative_mask = dataset.edge_label == 0
    return dataset.edge_label_index[:, negative_mask].long()

# GAE MODULE

In [None]:
class GCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder: in_channels -> 2 * out_channels -> out_channels.

    Args:
        in_channels: Number of input features per node.
        out_channels: Size of the node embeddings returned by ``forward``.
        cached: Forwarded to both ``GCNConv`` layers. Caching stores the
            normalised adjacency of the first graph the layer sees and reuses
            it for every later call, so it is only valid when the encoder is
            always applied to one and the same graph. It defaults to ``False``
            because this notebook encodes several different graphs (training
            split, test split, future snapshot) with the same model.
    """

    def __init__(self, in_channels, out_channels, cached=False):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=cached)
        self.conv2 = GCNConv(2 * out_channels, out_channels, cached=cached)

    def forward(self, x, edge_index):
        """Return the node embeddings for features ``x`` on the graph ``edge_index``."""
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

In [None]:
def train(data):
    """Run one optimisation step of the global ``model`` on ``data``.

    The node features and edges of ``data`` are moved to the global
    ``device``, encoded, and scored with ``model.recon_loss`` against the
    same edges; the global ``optimizer`` then applies the update.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    features = data.x.float().to(device)
    pos_edges = data.edge_index.to(device)

    embeddings = model.encode(features, pos_edges)
    reconstruction_loss = model.recon_loss(embeddings, pos_edges)
    # A variational model would add its KL term here, e.g.
    # (1 / data.num_nodes) * model.kl_loss(); the plain GAE has none.

    reconstruction_loss.backward()
    optimizer.step()
    return float(reconstruction_loss)


def test(data, pos_edge_index, neg_edge_index):
    """Evaluate the global ``model`` on the given positive and negative edges.

    The nodes are encoded from ``data.x`` using ``data.edge_index`` as the
    message-passing graph; the embeddings are then scored by ``model.test``.

    Args:
        data: Object providing the node features ``x`` and ``edge_index``.
        pos_edge_index: Edges that should be predicted as existing.
        neg_edge_index: Edges that should be predicted as missing.

    Returns:
        Whatever ``model.test`` returns; callers unpack it as ``(auc, ap)``.
    """
    x = data.x.float().to(device)
    current_pos_edge_index = data.edge_index.to(device)
    # Keep the evaluation edges on the same device as the embeddings, like
    # every other tensor handed to the model.
    pos_edge_index = pos_edge_index.to(device)
    neg_edge_index = neg_edge_index.to(device)
    model.eval()
    with torch.no_grad():
        z = model.encode(x, current_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [None]:
# Negatives that RandomLinkSplit sampled for the training split.
train_neg_edge_index = getTrainTestNegEdgeIndex(train_data)

# Held-out edges of the current snapshot. In the test split produced by
# RandomLinkSplit, `edge_index` holds the message-passing (training) edges; the
# held-out positives and their sampled negatives live in `edge_label_index`,
# marked by `edge_label` (1 = positive, 0 = negative).
val_pos_edge_index = current_test_data.edge_label_index[:, current_test_data.edge_label == 1]
val_neg_edge_index = getValNegEdgeIndex(current_test_data)

# Future snapshot: all of its edges are positives, paired with the negatives sampled for it.
test_pos_edge_index = future_data.edge_index
test_neg_edge_index = future_neg_edge_index

In [None]:
from torch_geometric.nn import GAE

In [None]:
# parameters
out_channels = 2  # size of the node embeddings produced by the encoder
num_features = train_data.num_node_features

# model: graph auto-encoder wrapped around the GCN encoder
model = GAE(GCNEncoder(num_features, out_channels))
model.reset_parameters()

# move to GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# initialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Training schedule and bookkeeping for the best epoch found so far.
num_epochs = 100
best_epoch = 0
ap_max = 0.0

# Per-epoch average-precision history for the three evaluation sets.
avgpr_trains = []
avgpr_vals = []
avgpr_tests = []

# Snapshot of the model at the best epoch; starts as a copy of the untrained model.
best_model = copy.deepcopy(model)

for epoch in range(1, num_epochs + 1):
    
    # One optimisation step on the training split.
    loss = train(train_data)
    
    # AUC and average precision on the training split, the held-out split of the
    # current snapshot ("val") and the future snapshot ("test").
    auc_train, ap_train = test(train_data, train_data.edge_index, train_neg_edge_index)
    auc_val, ap_val = test(current_test_data, val_pos_edge_index, val_neg_edge_index)
    auc_test, ap_test = test(future_data, test_pos_edge_index, test_neg_edge_index)
    print('Epoch: {:03d}, AUC: train: {:.4f}, val: {:.4f}, test: {:.4f}, AP: train: {:.4f}, val: {:.4f}, test: {:.4f}'\
          .format(epoch, auc_train, auc_val, auc_test, ap_train, ap_val, ap_test))
    
    # Keep the most recent model among those with the highest test AP (">=" lets ties move forward).
    # NOTE(review): the best epoch is selected on the future (test) graph, so `ap_max` is an
    # optimistic estimate; selecting on `ap_val` would keep the test set unseen -- confirm intent.
    if ap_test >= ap_max:
        best_epoch = epoch
        ap_max = ap_test
        best_model = copy.deepcopy(model)
        
    avgpr_trains.append(ap_train)
    avgpr_vals.append(ap_val)
    avgpr_tests.append(ap_test)

# The string literal below is disabled plotting code for the learning curves; it is never executed.
"""
#train orange test blue val green
x = range(num_epochs)
plt.clf()
plt.plot(x, avgpr_trains, color='orange', label='avgpr_train')
plt.plot(x, avgpr_vals, color='green', label='avgpr_val')
plt.plot(x, avgpr_tests, color='blue', label = 'avgpr_test')
plt.xlabel('Epoch')
plt.ylabel('AVGPR-score')
plt.legend()
plt.ylim(top=1)
plt.grid()
plt.savefig(f'learningCurves/GAE/august2016/new_all.pdf'\
            ,bbox_inches='tight')
plt.clf()
"""

print(f'Best epoch: {best_epoch}')