## Import Package

In [1]:
# Base
from time import time
from datetime import timedelta
import random
import numpy as np
import pandas as pd

# Graph
import networkx as nx
# node embedding
from node2vec import Node2Vec

# Pytorch
import torch
from torch import nn
from torch.nn import init
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [2]:
class GraphStructure():   
    def __init__(self, G):
          self.G = G

    '''calucate disconnected pairs for negative sample'''
    def disconnected_node_pairs(self, node_list):
        possible_node_pairs = list()
        adjacency_matrix = nx.to_numpy_array(self.G, nodelist=node_list)
        for i in range(adjacency_matrix.shape[0]):
            for j in range(adjacency_matrix.shape[1]):
                if i != j:
                    try:
                        n = nx.shortest_path_length(G, str(i), str(j))
                    except:
                        n = 0
                    if n <= 2 and adjacency_matrix[i, j] == 0:
                        possible_node_pairs.append((node_list[i], node_list[j]))
#                 if i != j and adjacency_matrix[i][j] == 0:
#                     possible_node_pairs.append((node_list[i], node_list[j]))
        return possible_node_pairs

    '''calucate removable pairs for positive sample'''
    def removable_node_pairs(self, node_pairs_df):
        # check whether removing a node pair will cause
        # 1: graphic segmentation
        # 2: reduce the number of nodes
        removable_links_index = list()
        original_node_num = self.G.number_of_nodes()
        temp_node_pairs_df = node_pairs_df.copy()
        for i in tqdm(node_pairs_df.index.values):
            temp_G = nx.from_pandas_edgelist(temp_node_pairs_df.drop(index = i), "node1", "node2", create_using=nx.Graph())
            if (nx.number_connected_components(temp_G) == 1) and (temp_G.number_of_nodes() == original_node_num):
                removable_links_index.append(i)
                temp_node_pairs_df = temp_node_pairs_df.drop(index = i) 
        return removable_links_index

def load_dataset(file_path, split_symbol, read_title=False):
    node_pairs = list()
    with open(file_path, 'r') as f:
        if read_title:
            title = f.readline()
        for line in f.readlines():
            node_pairs.append(list(line.strip().split(split_symbol)))
        dataset_df = pd.DataFrame(node_pairs, columns=['node1', 'node2'])
    return dataset_df

def preprocess(node_pairs_df):
    instances = list()
    for i, row in node_pairs_df.iterrows():
        s_index, t_index, label = row
        instance = {
            'source': torch.LongTensor(np.array([int(s_index)-1])),
            'target': torch.LongTensor(np.array([int(t_index)-1])),
            'label': torch.FloatTensor(np.array([float(label)]))
        }
        instances.append(instance)
    return instances

## Load data

In [3]:
if __name__ == '__main__':
    # Random seed
    seed = 42
    valid_sample_ratio = 0.1
    test_sample_ratio = 0.2
    random.seed(seed)
    torch.cuda.manual_seed(seed)

#     node_pairs_df = load_dataset('fb-pages-food.edges', split_symbol=',', read_title=False)
    node_pairs_df = load_dataset('out.dimacs10-polblogs', split_symbol='\t', read_title=True)

## Dataset Splitting and Labeling

In [4]:
    # node_pairs = [ pair for pair in zip(node_pairs_df['node1'], node_pairs_df['node2'])]
    test_snapshot = nx.from_pandas_edgelist(node_pairs_df, 'node1', 'node2', create_using=nx.Graph())
    test_node_pairs_df = pd.DataFrame(list(test_snapshot.edges()), columns=['node1', 'node2'])    
    print('total # of nodes:', test_snapshot.number_of_nodes())
    print('total # of edges:', test_snapshot.number_of_edges())

total # of nodes: 1224
total # of edges: 16715


#### Test data

In [5]:
    test_gs = GraphStructure(test_snapshot)
    # sampling edges(negative)
    test_no_edge_pairs = test_gs.disconnected_node_pairs(list(dict.fromkeys(node_pairs_df['node1'].to_list()+node_pairs_df['node2'].to_list())))
    test_no_edge_pairs_df = pd.DataFrame(test_no_edge_pairs, columns=['node1', 'node2'])
    test_negative_df = test_no_edge_pairs_df
    
    # to run for about 6 mins 
    # removable_node_pairs_index = gs.removable_node_pairs(node_pairs_df)
    
    # sampling edges(postive)
    sample_ratio = 0.2
    test_positive_instance = random.sample(list(test_snapshot.edges()), int(test_snapshot.number_of_edges()*sample_ratio))
    test_positive_df = pd.DataFrame(test_positive_instance, columns=['node1', 'node2'])
    
    # labeling
    test_negative_df['label'] = 0
    test_positive_df['label'] = 1
    print("test # of negative: %d\t# of positive: %d" % (len(test_negative_df), len(test_positive_df)))
    
    test_negative_df = test_negative_df.sample(len(test_positive_df), replace=True)
    test_dataset_df = test_negative_df.append(test_positive_df)
    test_positive_num, test_negative_num = test_dataset_df.label.value_counts()
    print("sample after:\n# of negative: %d\t# of positive: %d\n" % (test_positive_num, test_negative_num))
    print(test_dataset_df)    

test # of negative: 1463522	# of positive: 3343
sample after:
# of negative: 3343	# of positive: 3343

        node1 node2  label
666016    410   281      0
1089926  1166   248      0
1247237   632   732      0
364504    118   612      0
566664    186   632      0
...       ...   ...    ...
3338      234   644      1
3339      231   397      1
3340      312   583      1
3341      177   181      1
3342      129   292      1

[6686 rows x 3 columns]


#### Training data

In [6]:
    previous_snapshot = test_snapshot.copy()
    # remove postive edges
    for pair in test_positive_instance:
        previous_snapshot.remove_edge(*pair)
        
    train_positive_df = pd.DataFrame(previous_snapshot.edges(), columns=['node1', 'node2'])
    train_gs = GraphStructure(previous_snapshot)
    train_no_edge_pairs = train_gs.disconnected_node_pairs(list(dict.fromkeys(train_positive_df['node1'].to_list()+train_positive_df['node2'].to_list())))
    train_no_edge_pairs_df = pd.DataFrame(train_no_edge_pairs, columns=['node1', 'node2'])
    train_negative_df = train_no_edge_pairs_df
    
    # labeling
    train_negative_df['label'] = 0
    train_positive_df['label'] = 1
    print("# of negative: %d\t# of positive: %d" % (len(train_negative_df), len(train_positive_df)))

    train_negative_df = train_negative_df.sample(len(train_positive_df), replace=True)
    train_dataset_df = train_negative_df.append(train_positive_df)
    train_positive_num, train_negative_num = train_dataset_df.label.value_counts()
    print("sample after:\n# of negative: %d\t# of positive: %d\n" % (train_positive_num, train_negative_num))
    print(train_dataset_df)

# of negative: 1395312	# of positive: 13372
sample after:
# of negative: 13372	# of positive: 13372

        node1 node2  label
1131863   511   764      0
296741    277   146      0
1314033  1106  1108      0
1107438   964  1063      0
54017      50   190      0
...       ...   ...    ...
13367    1081  1155      1
13368    1117  1157      1
13369    1168  1210      1
13370    1180  1181      1
13371    1189  1213      1

[26744 rows x 3 columns]


#### Preprocessing

In [7]:
    test_instances = preprocess(test_dataset_df)
    train_instances = preprocess(train_dataset_df)
    
    print('# of train instances:', len(train_instances))
    print('# of test instances:', len(test_instances))
    print('# of total instances:', len(train_instances)+len(test_instances))

# of train instances: 26744
# of test instances: 6686
# of total instances: 33430


## Graph Node Embedding with Node2Vec

In [8]:
    node2vec = Node2Vec(previous_snapshot, dimensions=128, walk_length=80, num_walks=10)

HBox(children=(HTML(value='Computing transition probabilities'), FloatProgress(value=0.0, max=1224.0), HTML(va…

Generating walks (CPU: 1):   0%|          | 0/10 [00:00<?, ?it/s]




Generating walks (CPU: 1): 100%|██████████| 10/10 [00:36<00:00,  3.69s/it]


In [9]:
    n2v_model = node2vec.fit(window=10, min_count=1, batch_words=4)

In [10]:
    node_embedding = n2v_model.wv.vectors
    node_embedding.shape

(1224, 128)

## Training

In [11]:
    class NodePairDataset(Dataset):
        def __init__(self, instances):
            self.instances = instances

        def __len__(self):
            return len(self.instances)

        def __getitem__(self, i):
            instance = self.instances[i]
            source = instance['source']
            target = instance['target']
            label = instance['label']
            return source, target, label
        
    def collate_fn(batch):
        source, target, labels = zip(*batch)
        source = torch.stack(source)
        target = torch.stack(target)
        labels = torch.stack(labels)
        return source, target, labels

    def get_dataloader(instances, collate_fn=collate_fn,batch_size=1, num_workers=2):
        dataset = NodePairDataset(instances)
        dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=True, batch_size=batch_size, num_workers=num_workers)
        return dataloader

In [12]:
    class LinkEmbedding(nn.Module):
        def __init__(self, inputs_dim, output_dim):
            super(LinkEmbedding, self).__init__()
            self.weight = nn.Parameter(nn.init.xavier_uniform_(torch.empty(inputs_dim, output_dim)))
            
            
        def forward(self, hidden_state, source, target):
            propagation = torch.mul(hidden_state[source, :], hidden_state[target, :])
            propagation = propagation.matmul(self.weight)
            return propagation
    
    class GraphConvolution(nn.Module):
        def __init__(self, inputs_dim, hidden_features):
            super(GraphConvolution, self).__init__()
            self.weight = nn.Parameter(nn.init.kaiming_normal_(torch.empty(inputs_dim, hidden_features), mode='fan_in', nonlinearity='relu'))
            
        def forward(self, input_features, adj_matrix):
            # aggregate 
            aggregate  = torch.mm(input_features, self.weight)
            propagation = torch.mm(adj_matrix, aggregate)
            return propagation
        
    class GCN(nn.Module):
        def __init__(self, inputs_dim, hidden_dim, output_dim, class_num=1):
            super(GCN, self).__init__()
            self.gcn_layer1 = GraphConvolution(inputs_dim, hidden_dim)
            self.gcn_layer2 = GraphConvolution(hidden_dim, hidden_dim)
            self.link_embed_layer = LinkEmbedding(hidden_dim, class_num)
            self.relu = nn.ReLU()
            self.sigmoid = nn.Sigmoid()

        def forward(self, input_features, adj_matrix, source, target):
            hidden_state = self.relu(self.gcn_layer1(input_features, adj_matrix))
            hidden_state = self.gcn_layer2(hidden_state, adj_matrix)
            hidden_state = self.link_embed_layer(hidden_state, source, target)
            return hidden_state

In [13]:
    class GCNTrainer():
        def __init__(self, features, adj_matrix, train_instances, valid_instances=None, test_instances=None, 
            hidden_dim=16, epoch=1, learning_rate=1e-2, batch_size=1,num_workers=2, valid=False):

            # parameters
            self.valid = valid
            self.epochs = epoch
            self.learning_rate = learning_rate
            self.batch_size = batch_size
            self.num_workers = num_workers
            # early stop
            self.best_valid_loss = 1e10
            self.max_patience = 0
            self.patience = 0

            # setup cuda device
            self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

            # dataset
            self.train_instances = train_instances
            self.valid_instances = valid_instances
            self.test_instances = test_instances
            self.features = torch.FloatTensor(features).cuda()
            self.adj_matrix = torch.FloatTensor(self.normalize(adj_matrix)).cuda()
            
            # GCN Model
            self.model = GCN(self.features.shape[1], hidden_dim, 1)
            self.model.cuda()
            # print(self.model)

            # AdamW optimizer with hyper-parameter
            self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)

            # Binary Cross Entropy with Loss for criterion
            self.criterion = nn.BCEWithLogitsLoss()
        

        def normalize(self, A):
            '''
            :var I: identity matrix
            :var A: adjacency matrix
            :var D: degree matrix
            :var A_hat: adding self-loops
            :var D_inv: degree inverse matrix
            '''
            I = np.matrix(np.identity(A.shape[0]))
            A_hat = I + A
            
            D = np.array(np.sum(A, axis=0))
            D_inv = D**-0.5
            D_inv[np.isinf(D_inv)] = 0.
            D_inv = np.diag(D_inv)

            A_hat = D_inv * A_hat * D_inv
            return A_hat
        
        def accuracy(self, predicts, labels):
            predicts_labels = torch.round(torch.sigmoid(predicts))
            total_correct = (predicts_labels == labels).sum().float()
            return torch.round((total_correct / labels.shape[0]) * 100)

        def train(self):
            start_time = time()
            self.optimizer.zero_grad()

            train_dataloader = get_dataloader(self.train_instances, collate_fn=collate_fn, batch_size=self.batch_size, num_workers=self.num_workers)
            for epoch in range(self.epochs):
                self.model.train()
                epoch_loss, epoch_acc = 0, 0
                ''' train '''
                for i, batch in enumerate(train_dataloader, start=1):
                    batch = (tensor.cuda() for tensor in batch)
                    source, target, labels = batch
                    # forward
                    # feature: all node embedding
                    outputs = self.model(self.features, self.adj_matrix, source, target)
                    outputs = outputs.reshape(labels.size())
                    # backward
                    loss = self.criterion(outputs, labels)
                    acc = self.accuracy(outputs, labels)
                    epoch_loss += loss.item()
                    epoch_acc += acc
                    
                    loss.backward()
                    # optimize
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    
                    # Progressbar
                    elapsed_time = time() - start_time
                    elapsed_time = timedelta(seconds=int(elapsed_time))
                    # print("Epoch %d/%d | loss: %.6f | acc: %f | batch: [%d/%d] | %s" % (epoch+1, self.epochs, loss, acc, i, len(train_dataloader), elapsed_time))
                
                print("Epoch %d/%d - train_loss: %.6f - train_acc: %f" 
                      % (epoch+1, self.epochs, epoch_loss/len(train_dataloader), epoch_acc/len(train_dataloader)))
                
                ''' validate '''
                if self.valid:
                    valid_loss, valid_acc = self.validate()
                    elapsed_time = time() - start_time
                    elapsed_time = timedelta(seconds=int(elapsed_time))
                    print("Epoch %d/%d - valid_loss: %.6f - valid_acc: %f" % (epoch+1, self.epochs, valid_loss, valid_acc))

                    # early stoping
                    if valid_loss < self.best_valid_loss:
                        self.patience = 0
                        self.best_valid_loss = valid_loss
                    else:
                        self.patience += 1

                    if self.patience > self.max_patience:
                        print('Earlystop at epoch %d' % (epoch+1))
                        break


        def validate(self):
            total_loss, total_acc = 0, 0
            self.model.eval()
            with torch.no_grad():
                valid_dataloader = get_dataloader(self.valid_instances, collate_fn=collate_fn, batch_size=self.batch_size, num_workers=self.num_workers)
                for batch in valid_dataloader:
                    batch = (tensor.cuda() for tensor in batch)
                    source, target, labels = batch
                    outputs = self.model(self.features, self.adj_matrix, source, target)
                    outputs = outputs.reshape(labels.size())
                    loss = self.criterion(outputs, labels)
                    # loss and accuracy
                    total_loss += loss.item()
                    total_acc += self.accuracy(outputs, labels)
            
            total_loss /= len(valid_dataloader)
            total_acc /= len(valid_dataloader)
            return float(total_loss), float(total_acc)

        def test(self):
            total_loss, total_acc = 0, 0
            self.model.eval()
            with torch.no_grad():
                test_dataloader = get_dataloader(self.test_instances, collate_fn=collate_fn, batch_size=self.batch_size, num_workers=self.num_workers)
                for batch in test_dataloader:
                    batch = (tensor.cuda() for tensor in batch)
                    source, target, labels = batch
                    outputs = self.model(self.features, self.adj_matrix, source, target)
                    outputs = outputs.reshape(labels.size())
                    loss = self.criterion(outputs, labels)
                    # loss and accuracy
                    total_loss += loss.item()
                    total_acc += self.accuracy(outputs, labels)
            
            total_loss /= len(test_dataloader)
            total_acc /= len(test_dataloader)
            return total_loss, float(total_acc)

In [14]:
    adj_matrix = nx.to_numpy_array(previous_snapshot)
    print(adj_matrix)

[[0. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
    torch.cuda.empty_cache()
    trainer = GCNTrainer(features=node_embedding, adj_matrix=adj_matrix, 
                         train_instances=train_instances, 
                         valid_instances=None,
                         test_instances=test_instances,
                         hidden_dim=64, epoch=300, learning_rate=1e-2, batch_size=128, num_workers=2)

  D_inv = D**-0.5


In [18]:
    trainer.train()

Epoch 1/300 - train_loss: 0.568621 - train_acc: 67.239235
Epoch 2/300 - train_loss: 0.499618 - train_acc: 75.210526
Epoch 3/300 - train_loss: 0.473987 - train_acc: 77.550240
Epoch 4/300 - train_loss: 0.460492 - train_acc: 78.181816
Epoch 5/300 - train_loss: 0.455196 - train_acc: 78.320572
Epoch 6/300 - train_loss: 0.445189 - train_acc: 79.033493
Epoch 7/300 - train_loss: 0.430102 - train_acc: 80.143539
Epoch 8/300 - train_loss: 0.423889 - train_acc: 80.655502
Epoch 9/300 - train_loss: 0.426761 - train_acc: 81.315788
Epoch 10/300 - train_loss: 0.423019 - train_acc: 81.430618
Epoch 11/300 - train_loss: 0.413002 - train_acc: 82.358849
Epoch 12/300 - train_loss: 0.389020 - train_acc: 83.038277
Epoch 13/300 - train_loss: 0.384336 - train_acc: 83.435402
Epoch 14/300 - train_loss: 0.497431 - train_acc: 79.344498
Epoch 15/300 - train_loss: 0.393819 - train_acc: 83.028702
Epoch 16/300 - train_loss: 0.367515 - train_acc: 84.315788
Epoch 17/300 - train_loss: 0.373923 - train_acc: 84.086121
Epoch 

In [19]:
    trainer.test()

(0.8730430625519663, 83.94339752197266)