# IMPORT

In [None]:
import torch
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, GRUCell
from torch_geometric.data import Data
from sklearn.metrics import roc_auc_score,average_precision_score

import random

import bisect 

import gc
import copy

from itertools import permutations

import pandas as pd

from torch_geometric.utils import negative_sampling
import torch_geometric.transforms as T
from torch_geometric.transforms import SVDFeatureReduction
from torch_geometric.utils import train_test_split_edges
from torch_geometric.transforms import RandomLinkSplit,NormalizeFeatures,Constant,OneHotDegree
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv,SAGEConv,GATv2Conv, GINConv, Linear
from scipy.stats import entropy

import torch
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

import copy
import itertools
import json

# LOAD DATASET

In [None]:
from steemitdata import get_steemit_dataset

In [None]:
# Steemit snapshots loaded twice, once per node-feature preprocessing mode:
#   snapshots_c -> preprocess='constant' (constant encoder as node features)
#   snapshots_t -> preprocess='text'     (textual features as node features)
snapshots_c, snapshots_t = (
    get_steemit_dataset(preprocess=mode) for mode in ('constant', 'text')
)

In [None]:
# Dump the node features and the edge index of every text-feature snapshot
# to disk, one pair of files per snapshot position.
import os

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('steemit-t3gnn-data', exist_ok=True)
for i, snapshot_t in enumerate(snapshots_t):
    torch.save(snapshot_t.x, f'steemit-t3gnn-data/{i}_x.pt')
    torch.save(snapshot_t.edge_index, f'steemit-t3gnn-data/{i}_edge_index.pt')

In [None]:
# Snapshots whose node features are replaced by standard-normal noise.
# The graphs are loaded with preprocess='constant' and only `x` is overwritten.
# NOTE(review): 384 is presumably the width of the text features - confirm.
snapshots_ts = get_steemit_dataset(preprocess='constant')
for snap in snapshots_ts:
    snap.x = torch.randn((snap.num_nodes, 384))

# LOAD MODEL

In [None]:
from t3gnn import T3GNNLP

In [None]:
def test(model, test_data, data, isnap, device='cpu'):
    """Return the average-precision score of ``model`` on ``test_data``.

    Args:
        model: Callable module; ``model(x, edge_index, edge_label_index=...,
            isnap=...)`` must return a pair whose first item holds one logit
            per entry of ``edge_label_index``.
        test_data: Graph object exposing ``x``, ``edge_index``,
            ``edge_label_index``, ``edge_label`` and ``to(device)``.
        data: Unused; kept so existing call sites keep working.
        isnap: Snapshot index, forwarded unchanged to the model.
        device: Device ``test_data`` is moved to before the forward pass.

    Returns:
        The value of ``average_precision_score(labels, sigmoid(logits))``.
    """
    model.eval()
    test_data = test_data.to(device)

    # Pure evaluation: no gradients are needed, so do not record an autograd
    # graph for the forward pass.
    with torch.no_grad():
        logits, _ = model(test_data.x, test_data.edge_index,
                          edge_label_index=test_data.edge_label_index,
                          isnap=isnap)
        pred_cont_link = torch.sigmoid(logits).cpu().detach().numpy()

    label_link = test_data.edge_label.cpu().detach().numpy()

    return average_precision_score(label_link, pred_cont_link)

In [None]:
from sklearn.metrics import *

def train_single_snapshot(model, data, train_data, val_data, test_data, isnap,
                          last_embeddings, optimizer, device='cpu', num_epochs=50,
                          verbose=False, tol=1):
    """Fine-tune ``model`` on one snapshot and score it on train and test data.

    Each epoch performs one optimisation step on ``train_data`` and then
    evaluates average precision on ``val_data`` through ``test``. Training
    continues while ``previous_score - tol <= current_score``.

    Args:
        model: Module called as ``model(x, edge_index, edge_label_index=...,
            isnap=..., previous_embeddings=...)`` returning
            ``(pred, current_embeddings)``; must also expose ``loss``.
        data: Forwarded to ``test`` only.
        train_data: Graph object used for the optimisation steps.
        val_data: Graph object scored after every epoch.
        test_data: Graph object scored once after training.
        isnap: Snapshot index, forwarded to the model.
        last_embeddings: Passed to the model as ``previous_embeddings``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the data objects are moved to.
        num_epochs: Maximum number of epochs.
        verbose: When true, print the index of the last accepted epoch.
        tol: Allowed drop of the validation score between two consecutive
            epochs. Average precision lies in [0, 1], so with the default of
            1 the stop condition can only fire on a NaN score and all
            ``num_epochs`` epochs run.

    Returns:
        ``(model, optimizer, train_score, test_score, embeddings)`` where
        ``embeddings`` are those produced in the last accepted epoch (an
        empty list if no epoch was accepted).
    """
    train_data = train_data.to(device)
    avgpr_val_max = 0
    best_epoch = -1
    best_current_embeddings = []

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        pred, current_embeddings = model(
            train_data.x, train_data.edge_index,
            edge_label_index=train_data.edge_label_index,
            isnap=isnap, previous_embeddings=last_embeddings)

        # Loss used to fine-tune on the current snapshot.
        loss = model.loss(pred, train_data.edge_label.type_as(pred))

        # NOTE(review): retain_graph=True is kept from the original code;
        # presumably required by how embeddings are shared across calls -
        # confirm before removing.
        loss.backward(retain_graph=True)
        optimizer.step()

        avgpr_score_val = test(model, val_data, data, isnap, device)

        # Written as a positive test so that a NaN score stops training.
        if avgpr_val_max - tol <= avgpr_score_val:
            avgpr_val_max = avgpr_score_val
            best_epoch = epoch
            best_current_embeddings = current_embeddings
        else:
            break

    avgpr_score_train = test(model, train_data, data, isnap, device)
    avgpr_score_test = test(model, test_data, data, isnap, device)

    if verbose:
        print(f'Best Epoch: {best_epoch}')

    # The very same module object is returned (no copy is taken): the
    # optimizer handed back alongside it stays bound to its parameters.
    return model, optimizer, avgpr_score_train, avgpr_score_test, best_current_embeddings

In [None]:
def train_roland(snapshots, hidden_conv1, hidden_conv2, update='gru', device='cpu'):
    """Train a T3GNNLP model snapshot by snapshot, testing on the next snapshot.

    For every consecutive pair ``(snapshots[i], snapshots[i + 1])``:

    * ``snapshots[i]`` is split with ``RandomLinkSplit(num_val=0.0,
      num_test=0.25)``; the held-out split is used as validation data.
    * The edges of ``snapshots[i + 1]`` are the positive test edges.
    * Negative test edges are edges present in ``snapshots[i]`` but absent
      from ``snapshots[i + 1]``, capped at the number of positives.

    Args:
        snapshots: Sequence of graph objects exposing ``x`` and
            ``edge_index``. Node count and feature width are read from the
            first snapshot.
        hidden_conv1: Width of the first zero-initialised embedding tensor.
        hidden_conv2: Width of the second zero-initialised embedding tensor.
            NOTE(review): neither width is passed to ``T3GNNLP``; presumably
            they must match the model's internal layer sizes - confirm.
        update: Forwarded to ``T3GNNLP``.
        device: Accepted for API compatibility but not forwarded; training
            runs on ``train_single_snapshot``'s default device.

    Returns:
        ``(model, train_scores, test_scores)`` with one average-precision
        value per evaluated snapshot in each list.
    """
    num_snap = len(snapshots)
    input_channels = snapshots[0].x.size(1)
    num_nodes = snapshots[0].x.size(0)
    # Initial "previous" embeddings: all zeros, one row per node.
    last_embeddings = [torch.zeros(num_nodes, hidden_conv1),
                       torch.zeros(num_nodes, hidden_conv2)]
    avgpr_train_singles = []
    avgpr_test_singles = []

    roland = T3GNNLP(input_channels, num_nodes, dropout=0.3, update=update)
    rolopt = torch.optim.Adam(params=roland.parameters(), lr=0.01, weight_decay=5e-3)
    roland.reset_parameters()

    for i in range(num_snap - 1):
        # Train / validation data come from the current snapshot, test data
        # from the following one.
        snapshot = copy.deepcopy(snapshots[i])
        transform = RandomLinkSplit(num_val=0.0, num_test=0.25)
        train_data, _, val_data = transform(snapshot)
        test_data = copy.deepcopy(snapshots[i + 1])

        # Negative set: edges that existed in the current snapshot but are
        # not in the next one.
        past_edges = set(zip(snapshot.edge_index[0].tolist(),
                             snapshot.edge_index[1].tolist()))
        current_edges = set(zip(test_data.edge_index[0].tolist(),
                                test_data.edge_index[1].tolist()))

        num_pos_edge = test_data.edge_index.size(1)
        negative_edges = list(past_edges.difference(current_edges))[:num_pos_edge]
        # Build the index directly as int64; going through a float32 tensor
        # would corrupt node ids above 2**24.
        future_neg_edge_index = torch.tensor(
            [[src for src, _ in negative_edges],
             [dst for _, dst in negative_edges]],
            dtype=torch.long)
        num_neg_edge = future_neg_edge_index.size(1)

        # Positives first, negatives second - same order in labels and index.
        test_data.edge_label = torch.cat([torch.ones(num_pos_edge),
                                          torch.zeros(num_neg_edge)])
        test_data.edge_label_index = torch.cat(
            [test_data.edge_index, future_neg_edge_index], dim=-1)

        # Train and test the model for the current snapshot.
        roland, rolopt, avgpr_train, avgpr_test, last_embeddings = \
            train_single_snapshot(roland, snapshot, train_data, val_data, test_data, i,
                                  last_embeddings, rolopt)

        # Save and display the evaluation.
        print(f'Snapshot: {i}\n\tLinkPre AVGPR Train: {avgpr_train}, Test: {avgpr_test}')
        avgpr_train_singles.append(avgpr_train)
        avgpr_test_singles.append(avgpr_test)

    avgpr_train_all = sum(avgpr_train_singles) / len(avgpr_train_singles)
    avgpr_test_all = sum(avgpr_test_singles) / len(avgpr_test_singles)

    print(f'LinkPre AVGPR over time: Train {avgpr_train_all}, Test: {avgpr_test_all}')

    return roland, avgpr_train_singles, avgpr_test_singles

In [None]:
def train_roland_random(snapshots, hidden_conv1, hidden_conv2, update='gru', device='cpu'):
    """Train a T3GNNLP model per snapshot, testing with random negative edges.

    Same procedure as ``train_roland`` except for the test negatives: they
    are drawn with ``negative_sampling`` from the next snapshot instead of
    being taken from edges that disappeared.

    Args:
        snapshots: Sequence of graph objects exposing ``x``, ``edge_index``
            and ``num_nodes``. Node count and feature width are read from
            the first snapshot.
        hidden_conv1: Width of the first zero-initialised embedding tensor.
        hidden_conv2: Width of the second zero-initialised embedding tensor.
            NOTE(review): neither width is passed to ``T3GNNLP``; presumably
            they must match the model's internal layer sizes - confirm.
        update: Forwarded to ``T3GNNLP``.
        device: Accepted for API compatibility but not forwarded; training
            runs on ``train_single_snapshot``'s default device.

    Returns:
        ``(model, train_scores, test_scores)`` with one average-precision
        value per evaluated snapshot in each list.
    """
    num_snap = len(snapshots)
    input_channels = snapshots[0].x.size(1)
    num_nodes = snapshots[0].x.size(0)
    # Initial "previous" embeddings: all zeros, one row per node.
    last_embeddings = [torch.zeros(num_nodes, hidden_conv1),
                       torch.zeros(num_nodes, hidden_conv2)]
    avgpr_train_singles = []
    avgpr_test_singles = []

    roland = T3GNNLP(input_channels, num_nodes, dropout=0.3, update=update)
    rolopt = torch.optim.Adam(params=roland.parameters(), lr=0.01, weight_decay=5e-3)
    roland.reset_parameters()

    for i in range(num_snap - 1):
        # Train / validation data come from the current snapshot, test data
        # from the following one.
        snapshot = copy.deepcopy(snapshots[i])
        transform = RandomLinkSplit(num_val=0.0, num_test=0.25)
        train_data, _, val_data = transform(snapshot)
        test_data = copy.deepcopy(snapshots[i + 1])

        # Negative set: node pairs sampled at random among the non-edges of
        # the test snapshot, as many as there are positive edges if possible.
        num_pos_edge = test_data.edge_index.size(1)
        future_neg_edge_index = negative_sampling(
            edge_index=test_data.edge_index,
            num_nodes=test_data.num_nodes,
            num_neg_samples=num_pos_edge)
        # The sampler may return fewer pairs than requested, so count what
        # was actually produced; otherwise labels and index could disagree
        # in length.
        num_neg_edge = future_neg_edge_index.size(1)

        # Positives first, negatives second - same order in labels and index.
        test_data.edge_label = torch.cat([torch.ones(num_pos_edge),
                                          torch.zeros(num_neg_edge)])
        test_data.edge_label_index = torch.cat(
            [test_data.edge_index, future_neg_edge_index], dim=-1)

        # Train and test the model for the current snapshot.
        roland, rolopt, avgpr_train, avgpr_test, last_embeddings = \
            train_single_snapshot(roland, snapshot, train_data, val_data, test_data, i,
                                  last_embeddings, rolopt)

        # Save and display the evaluation.
        print(f'Snapshot: {i}\n\tLinkPre AVGPR Train: {avgpr_train}, Test: {avgpr_test}')
        avgpr_train_singles.append(avgpr_train)
        avgpr_test_singles.append(avgpr_test)

    avgpr_train_all = sum(avgpr_train_singles) / len(avgpr_train_singles)
    avgpr_test_all = sum(avgpr_test_singles) / len(avgpr_test_singles)

    print(f'LinkPre AVGPR over time: Train {avgpr_train_all}, Test: {avgpr_test_all}')

    return roland, avgpr_train_singles, avgpr_test_singles

## Train

In [None]:
import random

# Use the GPU when one is available and fall back to the CPU otherwise, so
# that code moving tensors to `device` also works on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed every random number generator used in this file with the same value.
# Only draws made after these calls are covered by the seed.
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
np.random.seed(42)
random.seed(42)

# Disable cuDNN auto-tuning and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.cuda.empty_cache()

In [None]:
# Widths of the two zero-initialised embedding tensors that train_roland and
# train_roland_random create as the initial "previous" embeddings.
hidden_conv1, hidden_conv2 = 64, 32

In [None]:
# Run on the constant-feature snapshots; the trained model is discarded.
# NOTE: `avgpr_trains` is rebound by each of the training runs in this file.
_, avgpr_trains, avgpr_tests_c = train_roland(
    snapshots=snapshots_c,
    hidden_conv1=hidden_conv1,
    hidden_conv2=hidden_conv2,
    update='mlp',
)

In [None]:
# Run on the text-feature snapshots; the trained model is discarded.
_, avgpr_trains, avgpr_tests_t = train_roland(
    snapshots=snapshots_t,
    hidden_conv1=hidden_conv1,
    hidden_conv2=hidden_conv2,
    update='mlp',
)

In [None]:
# Run on the random-feature snapshots; the trained model is discarded.
_, avgpr_trains, avgpr_tests_ts = train_roland(
    snapshots=snapshots_ts,
    hidden_conv1=hidden_conv1,
    hidden_conv2=hidden_conv2,
    update='mlp',
)

In [None]:
# Run on the text-feature snapshots with randomly sampled test negatives;
# the trained model is discarded.
_, avgpr_trains, avgpr_tests_tr = train_roland_random(
    snapshots=snapshots_t,
    hidden_conv1=hidden_conv1,
    hidden_conv2=hidden_conv2,
    update='mlp',
)

In [None]:
import matplotlib

# Font settings handed to matplotlib's 'font' rc group.
font = dict(size=10)
matplotlib.rc('font', **font)

#plt.style.use('default')

In [None]:
# Test average precision per evaluated snapshot, one curve per experiment.
# The x axis is derived from the number of results instead of a fixed 25, so
# the cell keeps working when the number of snapshots changes.
x = range(len(avgpr_tests_c))
plt.plot(x, avgpr_tests_c, label='constant', linewidth=3)
plt.plot(x, avgpr_tests_t, label='text', linewidth=3)
plt.plot(x, avgpr_tests_ts, label='random-feature', linewidth=3)
plt.plot(x, avgpr_tests_tr, label='random-sampling', linewidth=3)
plt.xlabel('2-Week', fontsize=15)
plt.xlim((0, len(x) - 1))
plt.ylabel('AUPRC', fontsize=15)
plt.title('Link prediction performance over time', fontsize=15)
plt.legend()
#plt.savefig('linkpre-results-random-feature-sampling15.pdf',bbox_inches='tight')
plt.show()

In [None]:
# Hard-coded counts plotted further down as the 'comment_op' series, one
# value per 2-week window (25 values). Where the numbers come from is not
# shown in this file.
comment_op = [
    191712,
    174692,
    156561,
    141733,
    130279,
    181194,
    135637,
    133169,
    174483,
    289970,
    341374,
    448947,
    552799,
    411647,
    337296,
    298362,
    269092,
    270818,
    243382,
    235857,
    190407,
    199524,
    222980,
    217708,
    234757
]

In [None]:
# Compare, on a log scale, the edge count of every constant-feature snapshot
# after the first ('transaction_op') with the hard-coded 'comment_op' counts.
plt.plot(
    range(25),
    [snap_c.edge_index.size(1) for snap_c in snapshots_c[1:]],
    label='transaction_op',
    linewidth=7,
)
plt.plot(range(25), comment_op, label='comment_op', linewidth=7)
plt.title('Number of operations per 2-week')
plt.xlabel('2-Week')
plt.ylabel('Count')
plt.yscale('log')
plt.xlim(0, 24)
plt.legend()
#plt.savefig('operations-wide20.pdf',bbox_inches='tight')
plt.show()