In [4]:
# Base random seed for the notebook; the setup cell uses it to seed Python,
# NumPy, PyTorch and DGL.
seed = 1234

### Setup

In [5]:
%%capture
!pip install dgl-cu111 dglgo -f https://data.dgl.ai/wheels/repo.html
# !pip install dgl dglgo -f https://data.dgl.ai/wheels/repo.html

%matplotlib inline
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import scipy.sparse as sp
import networkx as nx

import argparse
import math 
import time
import random
import itertools
import heapq
from tqdm import tqdm
from dgl.data import register_data_args

# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU/CUDA and DGL) and switch cuDNN to deterministic mode.
seed = seed  # no-op self-assignment: `seed` must already be defined by an earlier cell
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Deterministic cuDNN kernels and no auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
dgl.random.seed(seed)

### Neighborhood extension functions

In [6]:
class ExtendNeighborhood():
    """Add extra (bidirectional) edges to a DGL graph to enlarge neighborhoods.

    Two strategies are provided:

    * ``add_edges_centrality_based`` connects the ``alpha`` most central nodes
      to ``beta`` randomly chosen non-neighbors each.
    * ``add_edges_similarity_based`` scores candidate non-edges with a
      link-prediction index and samples ``gamma`` of them proportionally to
      their score.

    Args:
        graph: DGL graph that new edges are added to (passed to
            ``dgl.add_edges``).
        graph_undirected: NetworkX undirected view of the same graph, used for
            neighborhood, degree and centrality queries.
        alpha: Number of source ("important") nodes to pick.
        beta: Number of random targets per important node (centrality mode).
        gamma: Number of edges to add (similarity mode).
        explore: Fraction of each source node's non-neighbors used as
            candidate targets (similarity mode).
        extend_metric: One of ``'degree'``, ``'eigenvector'`` (centrality mode)
            or ``'resource_alloc'``, ``'jaccard'``, ``'adamic_adar'``
            (similarity mode).
        ebunch: Stored on the instance but not read by any method here.
    """

    def __init__(self, graph, graph_undirected, alpha, beta, gamma, explore, extend_metric, ebunch=None):
        self.graph = graph
        self.graph_undirected = graph_undirected
        self.extend_metric = extend_metric
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.explore = explore
        self.ebunch = ebunch

    def calc_score_aa(self, u, v):
        """Return the Adamic-Adar score of the pair ``(u, v)``.

        The score is ``sum(1 / log(degree(w)))`` over the common neighbors
        ``w`` of ``u`` and ``v``. Neighbors whose log-degree is not positive
        (degree 1) are skipped instead of raising. Returns ``0.0`` when there
        is nothing to sum.
        """
        score = 0.0
        for w in nx.common_neighbors(self.graph_undirected, u, v):
            log_degree = math.log(self.graph_undirected.degree(w))
            if log_degree > 0:
                # The logarithm is applied exactly once: 1 / log(deg(w)).
                score += 1 / log_degree
        return score

    def add_edges_centrality_based(self):
        """Connect the ``alpha`` most central nodes to random non-neighbors.

        Returns:
            A DGL graph with, for every selected node, ``beta`` sampled
            targets connected in both directions. Targets are drawn with
            replacement from all nodes that are not adjacent to the selected
            node; that candidate set contains the selected node itself unless
            it already has a self-loop, so self-loops and duplicate edges can
            be produced.

        Raises:
            ValueError: If ``extend_metric`` is not ``'degree'`` or
                ``'eigenvector'``.
        """
        # Both branches yield a dict mapping node -> centrality value.
        if self.extend_metric == 'degree':
            centrality = nx.degree_centrality(self.graph_undirected)
        elif self.extend_metric == 'eigenvector':
            di_graph = nx.DiGraph(self.graph_undirected)
            centrality = nx.eigenvector_centrality(di_graph)
        else:
            raise ValueError(
                f"Unknown centrality extend_metric {self.extend_metric!r}; "
                "expected 'degree' or 'eigenvector'")

        important_nodes = heapq.nlargest(self.alpha, centrality, key=centrality.get)
        all_nodes = set(self.graph_undirected)

        # TODO, should the selected nodes change for each imp_node?
        new_dataset = self.graph
        for item in important_nodes:
            tgt_nodes = all_nodes - set(self.graph_undirected[item])
            selected_nodes = torch.tensor(np.random.choice(list(tgt_nodes), self.beta))
            # One copy of the important node per sampled target.
            important_node = torch.full((len(selected_nodes),), item, dtype=torch.int64)
            new_dataset = dgl.add_edges(new_dataset, selected_nodes, important_node)
            new_dataset = dgl.add_edges(new_dataset, important_node, selected_nodes)

        return new_dataset

    def non_edges_important_nodes(self):
        """Sample candidate non-edges that start at degree-weighted source nodes.

        ``alpha`` distinct source nodes are drawn with probability proportional
        to their degree centrality. For each source, a fraction ``explore`` of
        its non-adjacent nodes is drawn without replacement.

        Returns:
            A list of ``(source, target)`` tuples.
        """
        centrality = nx.degree_centrality(self.graph_undirected)
        scores = list( np.array(list(centrality.values())) / np.sum(list(centrality.values())) )
        selected_src_nodes = np.random.choice(list(centrality.keys()), self.alpha, p=scores, replace=False)
        edges_list = []
        for u in selected_src_nodes:
            tgt_nodes = set(self.graph_undirected) - set(self.graph_undirected[u])
            selected_tgt_nodes = np.random.choice(list(tgt_nodes), int(self.explore*len(tgt_nodes)), replace=False)
            edges = list(zip(np.ones(len(selected_tgt_nodes), dtype=int)*u, selected_tgt_nodes))
            edges_list.extend(edges)
        return edges_list

    def _score_candidate_edges(self, ebunch, score_fn):
        """Score every pair in ``ebunch`` and keep those with a positive score.

        Pairs whose score cannot be computed (division by zero, log domain
        error, node missing from the graph) are skipped.

        Returns:
            ``(edges, scores)`` where ``scores`` is a float64 array normalised
            to sum to 1 (empty when no pair scored positive).
        """
        edges = []
        scores = []
        for u, v in ebunch:
            try:
                pred_val = score_fn(u, v)
            except (ZeroDivisionError, ValueError, KeyError, nx.NetworkXError):
                continue
            if pred_val > 0:
                edges.append((u, v))
                scores.append(pred_val)
        scores = np.array(scores, dtype='float64')
        if scores.size:
            scores /= scores.sum()
        return edges, scores

    def adamic_adar_index(self, ebunch):
        """Adamic-Adar scores for ``ebunch``: ``sum(1 / log(deg(w)))`` over common neighbors.

        A pair is dropped entirely when any common neighbor has degree 1
        (``log(1) == 0``). Returns ``(edges, normalised_scores)``.
        """
        def score(u, v):
            return sum(1 / math.log(self.graph_undirected.degree(w))
                       for w in nx.common_neighbors(self.graph_undirected, u, v))

        return self._score_candidate_edges(ebunch, score)

    def resource_allocation_index(self, ebunch):
        """Resource-allocation scores for ``ebunch``: ``sum(1 / deg(w))`` over common neighbors.

        Returns ``(edges, normalised_scores)``.
        """
        def score(u, v):
            return sum(1 / self.graph_undirected.degree(w)
                       for w in nx.common_neighbors(self.graph_undirected, u, v))

        return self._score_candidate_edges(ebunch, score)

    def jaccard_coefficient_index(self, ebunch):
        """Jaccard scores for ``ebunch``: ``|common neighbors| / |N(u) | N(v)|``.

        Returns ``(edges, normalised_scores)``.
        """
        def score(u, v):
            cnbors = list(nx.common_neighbors(self.graph_undirected, u, v))
            union_size = len(set(self.graph_undirected[u]) | set(self.graph_undirected[v]))
            if union_size == 0:
                return 0
            return len(cnbors) / union_size

        return self._score_candidate_edges(ebunch, score)

    def add_edges_similarity_based(self):
        """Add ``gamma`` edges sampled proportionally to a similarity index.

        Candidates are all non-edges when ``alpha`` equals the node count and
        ``explore == 1``; otherwise they come from
        ``non_edges_important_nodes``. If fewer than ``gamma`` candidates have
        a positive score, the remainder is filled with uniformly sampled
        candidates.

        Returns:
            A DGL graph with the selected edges added in both directions, or
            the original graph when no edge was selected.

        Raises:
            ValueError: If ``extend_metric`` is not ``'resource_alloc'``,
                ``'jaccard'`` or ``'adamic_adar'``.
        """
        scorers = {
            'resource_alloc': self.resource_allocation_index,
            'jaccard': self.jaccard_coefficient_index,
            'adamic_adar': self.adamic_adar_index,
        }
        # Validate up front so an unknown metric fails with a clear message
        # before any candidate sampling is done.
        if self.extend_metric not in scorers:
            raise ValueError(
                f"Unknown similarity extend_metric {self.extend_metric!r}; "
                f"expected one of {sorted(scorers)}")

        if self.alpha == self.graph_undirected.number_of_nodes() and self.explore == 1:
            # nx.non_edges yields lazily; materialise it because the candidates
            # are iterated by the scorer and then indexed again below.
            ebunch = list(nx.non_edges(self.graph_undirected))
        else:
            ebunch = self.non_edges_important_nodes()

        edges, scores = scorers[self.extend_metric](ebunch)

        if len(edges) > self.gamma:
            rnd_indices = np.random.choice(len(edges), int(self.gamma), p=scores, replace=False)
            edges = [edges[i] for i in rnd_indices]

        # Top up with uniformly random candidates when too few pairs scored > 0.
        extra_random = int(self.gamma - len(edges))
        if extra_random > 0:
            rnd_indices = np.random.choice(len(ebunch), extra_random, replace=False)
            edges.extend(ebunch[i] for i in rnd_indices)

        if not edges:
            # Nothing to add; an empty tensor cannot be column-indexed below.
            return self.graph

        selected_edges = torch.tensor(edges)
        new_edges_1 = selected_edges[:,0]
        new_edges_2 = selected_edges[:,1]

        new_dataset = dgl.add_edges(self.graph, new_edges_1, new_edges_2)
        new_dataset = dgl.add_edges(new_dataset, new_edges_2, new_edges_1)

        return new_dataset



In [7]:
# extend_neighborhood = ExtendNeighborhood(g, graph_undirected, centrality_metric='degree', alpha=alpha, beta=beta, gamma=gamma, explore=explore, extend_metric=extend_metric)
# if extend_metric == 'adamic_adar':
#     print("---Adamic Adar index---")
#     extended_graph = extend_neighborhood.add_edges()

### Load Data



In [8]:
# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
def load_dataset(dataset_name, verbose=False):
    """Load a DGL node-classification dataset plus its NetworkX views.

    Args:
        dataset_name: One of ``"cora"``, ``"citeseer"``, ``"pubmed"``,
            ``"amazonphoto"`` or ``"amazoncomputer"``.
        verbose: Accepted for call compatibility; currently not used (the DGL
            datasets are always constructed with ``verbose=False``).

    Returns:
        Tuple ``(g, graph_nx, graph_undirected, features, labels, train_mask,
        val_mask, test_mask, num_feats, n_classes, n_edges)``. For the two
        Amazon datasets the masks are a random 70/10/20 split drawn with
        ``np.random.shuffle``; for the others they are read from ``g.ndata``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    dataset_classes = {
        "cora": "CoraGraphDataset",
        "citeseer": "CiteseerGraphDataset",
        "pubmed": "PubmedGraphDataset",
        "amazonphoto": "AmazonCoBuyPhotoDataset",
        "amazoncomputer": "AmazonCoBuyComputerDataset",
    }
    if dataset_name not in dataset_classes:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError(
            f"Not defined! Unknown dataset_name {dataset_name!r}; "
            f"expected one of {sorted(dataset_classes)}")
    dataset = getattr(dgl.data, dataset_classes[dataset_name])(verbose=False)

    g = dataset[0]
    graph_nx = dgl.to_networkx(g)
    graph_undirected = nx.Graph(graph_nx)

    features = g.ndata['feat']
    labels = g.ndata['label']
    num_feats = features.shape[1]
    n_classes = dataset.num_classes
    n_edges = g.num_edges()

    if dataset_name in ('amazonphoto', 'amazoncomputer'):
        # No masks are read from the graph here: build a random 70/10/20 split.
        length = g.num_nodes()
        train_num = int(np.floor(0.7*length))
        valid_num = int(np.floor(0.1*length))
        # Use the remainder so the three parts always cover every node once.
        test_num = length - train_num - valid_num

        # 0 -> train, 1 -> validation, 2 -> test.
        mask = np.hstack((np.zeros(train_num), np.ones(valid_num), np.full(test_num, 2.0)))
        np.random.shuffle(mask)

        train_mask = torch.tensor(mask == 0)
        val_mask = torch.tensor(mask == 1)
        test_mask = torch.tensor(mask == 2)
    else:
        train_mask = g.ndata['train_mask']
        val_mask = g.ndata['val_mask']
        test_mask = g.ndata['test_mask']

    return g, graph_nx, graph_undirected, features, labels, train_mask, val_mask, test_mask, num_feats, n_classes, n_edges

In [10]:
# print(dataset_name)
# g, graph_nx, graph_undirected, features, labels, train_mask, val_mask, test_mask, num_feats, n_classes, n_edges = load_dataset(dataset_name, verbose=True)
# features_back, labels_back = features, labels

### GNN models

In [11]:
from dgl.nn import SAGEConv
from dgl.nn import GraphConv
from dgl.nn import GATConv
from dgl.nn import GATv2Conv

In [12]:
class GAT(nn.Module):
    """Stack of ``GATConv`` layers: one input layer, ``num_layers - 1`` hidden
    layers and one output layer.

    ``heads[i]`` is the number of attention heads of layer ``i``; the output
    layer uses ``heads[-1]``. Every layer except the last is built with
    ``activation``; the input layer is always built without a residual
    connection.
    """

    def __init__(self, num_layers, in_dim, num_hidden, num_classes, heads,
                 activation, feat_drop, attn_drop, negative_slope, residual):
        super().__init__()
        self.num_layers = num_layers
        shared = (feat_drop, attn_drop, negative_slope)

        # Input projection (no residual).
        stack = [GATConv(in_dim, num_hidden, heads[0], *shared, False, activation)]
        # Hidden layers: the previous layer's heads are concatenated, so the
        # input width is num_hidden * heads[idx - 1].
        stack.extend(
            GATConv(num_hidden * heads[idx - 1], num_hidden, heads[idx],
                    *shared, residual, activation)
            for idx in range(1, num_layers)
        )
        # Output projection (no activation).
        stack.append(GATConv(num_hidden * heads[-2], num_classes, heads[-1],
                             *shared, residual, None))

        self.gat_layers = nn.ModuleList(stack)
        self.activation = activation

    def forward(self, g, inputs):
        """Return per-node logits, averaging the heads of the output layer."""
        hidden = inputs
        for layer in list(self.gat_layers)[:self.num_layers]:
            # Concatenate the heads of every non-output layer.
            hidden = layer(g, hidden).flatten(1)
        return self.gat_layers[-1](g, hidden).mean(1)

In [13]:
class GATv2(nn.Module):
    """Stack of ``GATv2Conv`` layers: one input layer, ``num_layers - 1``
    hidden layers and one output layer.

    ``heads[i]`` is the number of attention heads of layer ``i``; the output
    layer uses ``heads[-1]``. Every layer except the last is built with
    ``activation``; the input layer is always built without a residual
    connection.
    """

    def __init__(self, num_layers, in_dim, num_hidden, num_classes, heads,
                 activation, feat_drop, attn_drop, negative_slope, residual):
        super().__init__()
        self.num_layers = num_layers
        shared = (feat_drop, attn_drop, negative_slope)

        # Input projection (no residual).
        stack = [GATv2Conv(in_dim, num_hidden, heads[0], *shared, False, activation)]
        # Hidden layers: the previous layer's heads are concatenated, so the
        # input width is num_hidden * heads[idx - 1].
        stack.extend(
            GATv2Conv(num_hidden * heads[idx - 1], num_hidden, heads[idx],
                      *shared, residual, activation)
            for idx in range(1, num_layers)
        )
        # Output projection (no activation).
        stack.append(GATv2Conv(num_hidden * heads[-2], num_classes, heads[-1],
                               *shared, residual, None))

        self.gat_layers = nn.ModuleList(stack)
        self.activation = activation

    def forward(self, g, inputs):
        """Return per-node logits, averaging the heads of the output layer."""
        hidden = inputs
        for layer in list(self.gat_layers)[:self.num_layers]:
            # Concatenate the heads of every non-output layer.
            hidden = layer(g, hidden).flatten(1)
        return self.gat_layers[-1](g, hidden).mean(1)

In [14]:
class GraphSAGE(nn.Module):
    """Two ``SAGEConv`` layers with the ``'mean'`` aggregator and a ReLU in between.

    Both layers output ``h_feats`` features, so ``h_feats`` is also the width
    of the returned tensor.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        aggregator = 'mean'  # 'gcn' was tried as an alternative aggregator
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator)
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator)

    def forward(self, g, in_feat):
        """Apply conv1 -> ReLU -> conv2 on graph ``g``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

class GCN(nn.Module):
    """Two ``GraphConv`` layers with a ReLU in between.

    Both layers output ``h_feats`` features, so ``h_feats`` is also the width
    of the returned tensor.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, h_feats)

    def forward(self, g, in_feat):
        """Apply conv1 -> ReLU -> conv2 on graph ``g``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

### Train & Evaluation

In [15]:
class EarlyStopping:
    """Stop training once a higher-is-better score stops improving.

    Args:
        patience: Number of consecutive non-improving ``step`` calls after
            which ``early_stop`` becomes ``True``.
        checkpoint_path: File the model's ``state_dict`` is saved to whenever
            the score matches or beats the best seen so far.
    """

    def __init__(self, patience=10, checkpoint_path='es_checkpoint.pt'):
        self.patience = patience
        self.checkpoint_path = checkpoint_path
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def step(self, acc, model):
        """Record one score and return whether training should stop.

        A score strictly below the best one increments the patience counter;
        any other score becomes the new best, resets the counter and saves a
        checkpoint.
        """
        score = acc
        if self.best_score is not None and score < self.best_score:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # First score, or a score at least as good as the best so far.
            self.best_score = score
            self.save_checkpoint(model)
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        '''Save the model's state_dict to ``checkpoint_path``.'''
        torch.save(model.state_dict(), self.checkpoint_path)

In [16]:
def accuracy(logits, labels):
    """Return the fraction of rows of ``logits`` whose arg-max equals ``labels``."""
    _, indices = torch.max(logits, dim=1)
    correct = torch.sum(indices == labels)
    return correct.item() * 1.0 / len(labels)

def evaluate(model, features, labels, mask, graph=None):
    """Return the accuracy of ``model`` on the nodes selected by ``mask``.

    Args:
        model: Callable as ``model(graph, features)``; switched to eval mode.
        features: Node features passed to the model.
        labels: Ground-truth labels for all nodes.
        mask: Index/boolean mask selecting the nodes to score.
        graph: Graph to run the model on. When omitted, the notebook-level
            global ``g`` is used (legacy behaviour; raises ``NameError`` if no
            such global exists).
    """
    if graph is None:
        graph = g  # legacy fallback to the notebook-level graph
    model.eval()
    with torch.no_grad():
        logits = model(graph, features)
        return accuracy(logits[mask], labels[mask])

In [17]:
# Parameters
lr = 0.005
weight_decay = 5e-4
epochs = 200
fastmode = False
early_stop = False

### Run function

In [18]:
def run(seed, dataset_name, model_name, alpha, beta, gamma, explore, extend_metric):
    """Train one model on one (optionally edge-extended) dataset and return test accuracy.

    Args:
        seed: Seed applied to Python, NumPy, PyTorch and DGL before anything else.
        dataset_name: Dataset name understood by ``load_dataset``.
        model_name: ``'sage'``, ``'gcn'``, ``'gat'`` or ``'gatv2'``.
        alpha, beta, gamma, explore: Forwarded to ``ExtendNeighborhood``.
        extend_metric: ``'None'`` (the string) to train on the original graph;
            ``'degree'``/``'eigenvector'`` for centrality-based extension; any
            other value is handled by the similarity-based extension.

    Returns:
        Accuracy on the test mask after ``epochs`` epochs.

    Raises:
        ValueError: If ``model_name`` is not a known model.

    Note:
        Reads the notebook-level globals ``lr``, ``weight_decay``, ``epochs``,
        ``fastmode``, ``device`` and ``add_self_loop``; they must be defined
        before calling this function.
    """
    known_models = ('sage', 'gcn', 'gat', 'gatv2')
    if model_name not in known_models:
        # Fail before the (slow) dataset load; raising a bare string is a TypeError.
        raise ValueError(
            f"Not a model! Unknown model_name {model_name!r}; expected one of {known_models}")

    def accuracy(logits, labels):
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

    def evaluate(model, features, labels, mask):
        # Uses the enclosing function's current graph `g`.
        model.eval()
        with torch.no_grad():
            logits = model(g, features)
            return accuracy(logits[mask], labels[mask])

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    dgl.random.seed(seed)

    g, graph_nx, graph_undirected, features, labels, train_mask, val_mask, test_mask, num_feats, n_classes, n_edges = load_dataset(dataset_name, verbose=True)

    in_feats = g.ndata['feat'].shape[1]
    if model_name == 'sage':
        model = GraphSAGE(in_feats, 16)
    elif model_name == 'gcn':
        model = GCN(in_feats, 16)
    else:
        # 'gat' and 'gatv2' share one configuration; the output width is fixed
        # to 16 (like the other models) rather than the dataset's class count.
        num_heads, num_layers, num_out_heads = 8, 2, 1
        num_hidden = out_dim = 16
        in_drop, attn_drop, negative_slope, residual = 0.6, 0.6, 0.2, False
        heads = ([num_heads] * num_layers) + [num_out_heads]
        gat_cls = GAT if model_name == 'gat' else GATv2
        model = gat_cls(num_layers, in_feats, num_hidden, out_dim, heads, F.elu, in_drop, attn_drop, negative_slope, residual)

    if extend_metric != 'None':
        extend_neighborhood = ExtendNeighborhood(g, graph_undirected, alpha=alpha, beta=beta, gamma=gamma, explore=explore, extend_metric=extend_metric)
        if extend_metric in ('degree', 'eigenvector'):
            g = extend_neighborhood.add_edges_centrality_based()
        else:
            g = extend_neighborhood.add_edges_similarity_based()
    else:
        print("Neighborhood is not extended!")

    if add_self_loop:
        # Normalise to exactly one self-loop per node.
        g = g.remove_self_loop().add_self_loop()

    model = model.to(device)
    g = g.to(device)
    labels = labels.to(device)
    features = features.to(device)
    train_mask = train_mask.to(device)
    val_mask = val_mask.to(device)
    test_mask = test_mask.to(device)

    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(epochs):
        model.train()

        # forward
        logits = model(g, features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Per-epoch metrics are computed but not reported or used for model
        # selection; they are kept so progress logging can be added back.
        train_acc = accuracy(logits[train_mask], labels[train_mask])

        if fastmode:
            val_acc = accuracy(logits[val_mask], labels[val_mask])
        else:
            val_acc = evaluate(model, features, labels, val_mask)

    acc = evaluate(model, features, labels, test_mask)

    return acc

## Run Experiments: Centrality

### Experiments One

In [None]:
# seed = 1000
# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# dgl.random.seed(seed)

In [None]:
# Dataset used by the baseline sweep below; uncomment exactly one line.
# dataset_name = 'cora'
# dataset_name = 'citeseer'
dataset_name = 'pubmed'
# dataset_name = 'amazonphoto'
# dataset_name = 'amazoncomputer'

In [None]:
# Notebook-level configuration for the centrality experiments.
seed = 1000
# seed = 4
add_self_loop = True  # read as a global by `run`

# dummies: placeholder values for runs that do not extend the graph
gamma = explore = alpha = beta = 1

# model_names = ['gcn', 'sage', 'gat', 'gatv2']
model_name = 'gatv2'

# extend_metrics = ["degree", "eigenvector"]
extend_metrics = ["degree"]

In [None]:
# Baseline sweep: train the un-extended model once per seed (0-199) and print
# each seed's test accuracy next to the seed.
for i in range(0, 200):
    original_acc = run(seed=i, dataset_name=dataset_name, extend_metric='None', model_name=model_name, alpha=alpha, beta=beta, gamma=gamma, explore=explore)
    print(i, original_acc)
    # if original_acc == 0.801:
        # print(i)
        # break  

Test one!

In [None]:
# Single comparison on Cora/GCN: baseline accuracy first, then the
# degree-centrality extension for each (alpha, beta) pair listed below.
dataset_name = 'cora'
model_name = 'gcn'
seed = 3

# Baseline without extension; alpha/beta/gamma/explore come from earlier cells.
original_acc = run(seed=seed, dataset_name=dataset_name, extend_metric='None', model_name=model_name, alpha=alpha, beta=beta, gamma=gamma, explore=explore)
print(original_acc) 

betas = [3]
# betas = [1]
alphas = [388]

# NOTE: the loop variables rebind the notebook-level `alpha` and `beta`.
for beta in betas: 
  for alpha in alphas:
    test_run = run(seed=seed, dataset_name=dataset_name, extend_metric='degree', \
                  model_name=model_name, alpha=alpha, beta=beta, gamma=1, explore=1)
    print("alpha:", alpha, "  beta:", beta, " | res:", test_run) 

### Experiments full

#### Test Settings and Run function 

In [None]:
def test_settings(name):
    if name == 'cora':
        num_nodes = 2708
        test_list_alpha_centrality = []
        test_list_alpha_similarity = []
        test_list_beta = []

        percents = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
        percents = [int(num*num_nodes) for num in percents]
        # print(percents)

        for i in range(1, num_nodes):
            if i < 100:
                if i % 20 == 0:
                    test_list_alpha_centrality.append(i)
            if 100 <= i < 1000:
                if i % 80 == 0:
                    test_list_alpha_centrality.append(i)
            if i < 6:
                test_list_beta.append(i)
            if 10 <= i < 100:
                if i % 20 == 0: 
                    test_list_beta.append(i)
            if i in percents:
                test_list_alpha_similarity.append(i)
    elif name == 'citeseer':
        num_nodes = 3327
        test_list_alpha_centrality = []
        test_list_alpha_similarity = []
        test_list_beta = []

        percents = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
        percents = [int(num*num_nodes) for num in percents]
        # print(percents)

        for i in range(1, num_nodes):
            if i < 100:
                if i % 20 == 0:
                    test_list_alpha_centrality.append(i)
            if 100 <= i < 1200:
                if i % 100 == 0:
                    test_list_alpha_centrality.append(i)
            if i < 5:
                test_list_beta.append(i)
            if 10 <= i < 100:
                if i % 20 == 0: 
                    test_list_beta.append(i)
            if i in percents:
                test_list_alpha_similarity.append(i)
    elif name == 'pubmed':
        num_nodes = 19717
        test_list_alpha_centrality = []
        test_list_alpha_similarity = []
        test_list_beta = []

        percents = [0.5, 0.6, 0.7, 0.8]
        percents = [int(num*num_nodes) for num in percents]
        # print(percents)

        for i in range(1, num_nodes):
            if i < 300:
                if i % 50 == 0:
                    test_list_alpha_centrality.append(i)
            if 300 < i < 10000:
                if i % 700 == 0:
                    test_list_alpha_centrality.append(i)
            if i < 5:
                test_list_beta.append(i)
            if i in percents:
                test_list_alpha_similarity.append(i)
    elif name == 'amazonphoto':
        num_nodes = 7650
        test_list_alpha_centrality = []
        test_list_alpha_similarity = []
        test_list_beta = []

        percents = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
        percents = [int(num*num_nodes) for num in percents]
        # print(percents)

        for i in range(1, num_nodes):
            if i < 300:
                if i % 50 == 0:
                    test_list_alpha_centrality.append(i)
            if 500 < i < 10000:
                if i % 800 == 0:
                      test_list_alpha_centrality.append(i)
            if i < 5:
                test_list_beta.append(i)
            if i in percents:
                test_list_alpha_similarity.append(i)
    elif name == 'amazoncomputer':
        num_nodes = 13752
        test_list_alpha_centrality = []
        test_list_alpha_similarity = []
        test_list_beta = []

        percents = [0.5, 0.6, 0.7, 0.8, 0.9]
        percents = [int(num*num_nodes) for num in percents]
        # print(percents)

        for i in range(1, num_nodes):
            if i < 500:
                if i % 50 == 0:
                    test_list_alpha_centrality.append(i)
            if 500 < i < 8000:
                if i % 1000 == 0:
                    test_list_alpha_centrality.append(i)
            if i < 5:
                test_list_beta.append(i)
            if i in percents:
                test_list_alpha_similarity.append(i)
    else: 
          raise("Not defined!")
    
    return test_list_alpha_centrality, test_list_alpha_similarity, test_list_beta

In [None]:
add_self_loop = True  # read as a global by `run`
def run_experiments(seed, dataset_name, model_name, extend_metrics):
    """Sweep the centrality-based extension grid and log runs that match or beat the baseline.

    The baseline (no extension) accuracy is written as a header to
    ``"{dataset_name}_{model_name}.txt"``. Then, for every metric in
    ``extend_metrics`` and every ``(alpha, beta)`` pair from
    ``test_settings(dataset_name)``, one model is trained with ``run``; a
    result line is appended only when its accuracy is at least the baseline.

    Args:
        seed: Seed used for the baseline and for every sweep run.
        dataset_name: Dataset name understood by ``run``/``test_settings``.
        model_name: Model name understood by ``run``.
        extend_metrics: Iterable of centrality metric names, e.g.
            ``["degree", "eigenvector"]``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    dgl.random.seed(seed)

    # Placeholder values: gamma/explore stay at 1 for the whole sweep, while
    # alpha/beta are only placeholders for the baseline run.
    gamma = explore = alpha = beta = 1

    original_acc = run(seed=seed, dataset_name=dataset_name, extend_metric='None', model_name=model_name, alpha=alpha, beta=beta, gamma=gamma, explore=explore)
    results_file = f"{dataset_name}_{model_name}.txt"
    with open(results_file, 'w') as f:
        f.write("Original, model:{}, accuracy:{}, seed:{}".format(model_name, original_acc, seed))
        f.write("\n\n")
        f.write("{},{},{},{},{}".format("accuracy", "alpha", "beta", "model_name", "extend_metric"))
        f.write("\n--------------------------------------------\n")

    test_list_alpha_centrality, _, test_list_beta = test_settings(dataset_name)

    for extend_metric in extend_metrics:
        for alpha in tqdm(test_list_alpha_centrality):
            for beta in test_list_beta:
                acc = run(seed=seed, dataset_name=dataset_name, model_name=model_name, alpha=alpha, beta=beta, gamma=gamma, explore=explore, extend_metric=extend_metric)

                if acc >= original_acc:
                    # Re-open in append mode per result so partial sweeps are
                    # already on disk if a later run fails.
                    with open(results_file, 'a') as f:
                        f.write("{},{},{},{},{}".format(acc, alpha, beta, model_name, extend_metric))
                        f.write("\n")

#### Run 

In [None]:
# dataset_name: cora, citeseer, pubmed, amazonphoto, amazoncomputer
# model_name: gat, gatv2, gcn, sage

In [None]:
# run_experiments(seed=31, dataset_name='cora', model_name='gat', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=31, dataset_name='cora', model_name='gatv2', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=31, dataset_name='cora', model_name='gcn', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=31, dataset_name='cora', model_name='sage', extend_metrics = ["degree", "eigenvector"])

In [None]:
# run_experiments(seed=138, dataset_name='citeseer', model_name='gat', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=138, dataset_name='citeseer', model_name='gatv2', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=10, dataset_name='citeseer', model_name='gcn', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=10, dataset_name='citeseer', model_name='sage', extend_metrics = ["degree", "eigenvector"])

In [None]:
# run_experiments(seed=10, dataset_name='pubmed', model_name='gat', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=10, dataset_name='pubmed', model_name='gatv2', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=123, dataset_name='pubmed', model_name='gcn', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=123, dataset_name='pubmed', model_name='sage', extend_metrics = ["degree", "eigenvector"])

In [None]:
# run_experiments(seed=11, dataset_name='amazonphoto', model_name='gat', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=10, dataset_name='amazonphoto', model_name='gatv2', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=123, dataset_name='amazonphoto', model_name='gcn', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=123, dataset_name='amazonphoto', model_name='sage', extend_metrics = ["degree", "eigenvector"])

In [None]:
# run_experiments(seed=11, dataset_name='amazoncomputer', model_name='gat', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=10, dataset_name='amazoncomputer', model_name='gatv2', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=123, dataset_name='amazoncomputer', model_name='gcn', extend_metrics = ["degree", "eigenvector"])
# run_experiments(seed=30, dataset_name='amazoncomputer', model_name='sage', extend_metrics = ["degree", "eigenvector"])

In [None]:
# from google.colab import files
# files.download('/content/pubmed_gat.txt')

## Run Experiments: Similarity

### Experiment One

In [None]:
# beta = 1
# add_self_loop = True
# dataset_name = 'amazonphoto'
# seed = 1000
# model_name = 'gatv2'

# acc = run(seed=seed, dataset_name=dataset_name, model_name=model_name, alpha=4125, \
#           beta=beta, gamma=5600, explore=0.01, extend_metric='adamic_adar')

# acc

In [None]:
# Dataset under test together with its node count.
# To switch datasets, swap in one of the alternative pairs listed below.
dataset_name, num_nodes = 'cora', 2708

# dataset_name, num_nodes = 'citeseer', 3327

# dataset_name, num_nodes = 'pubmed', 19717

# dataset_name, num_nodes = 'amazoncomputer', 13752

# dataset_name, num_nodes = 'amazonphoto', 7650

In [None]:
# Scratch check: 90% of the node count (before integer truncation).
0.9 * num_nodes

In [None]:
# --- Sweep configuration ---
model_name = 'gatv2'
add_self_loop = True
seed = 1234
# seed = 1000
beta = 1  # dummy value

# Similarity metrics compared in every sweep. To try a single metric, use one of:
#   'adamic_adar', "resource_alloc", "jaccard"
extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"]

# --- Disabled variant: sweep alpha at a fixed gamma / explore ---
# alphas = [int(num_nodes*percent*0.1) for percent in range(1, 9, 1)]
# gamma = 3200
# explore = 0.01
#
# for alpha in alphas:
#     for extend_metric in extend_metrics:
#         test_run = run(seed=seed, dataset_name=dataset_name, extend_metric=extend_metric,
#                        model_name=model_name, alpha=alpha, beta=beta, gamma=gamma, explore=explore)
#         print(f"{extend_metric},{alpha},{gamma},{explore}", " | res:", test_run)
#     print()

# --- Active variant: alpha fixed at 90% of the nodes, sweep gamma ---
explore = 0.1
gammas = list(range(300, 1200, 100))
alpha = int(0.9 * num_nodes)

for gamma in gammas:
    for extend_metric in extend_metrics:
        test_run = run(
            seed=seed,
            dataset_name=dataset_name,
            model_name=model_name,
            extend_metric=extend_metric,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            explore=explore,
        )
        print(f"{extend_metric},{alpha},{gamma},{explore}", " | res:", test_run)
    # Blank line between gamma groups.
    print()

In [None]:
# Alpha fixed at 20% of the nodes; sweep gamma from 1700 to 2000 in steps of 50.
explore = 0.1
gammas = list(range(1700, 2001, 50))
alpha = int(0.2 * num_nodes)

for gamma in gammas:
    for extend_metric in extend_metrics:
        test_run = run(
            seed=seed,
            dataset_name=dataset_name,
            model_name=model_name,
            extend_metric=extend_metric,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            explore=explore,
        )
        print(f"{extend_metric},{alpha},{gamma},{explore}", " | res:", test_run)
    # Blank line between gamma groups.
    print()

In [None]:
# Alpha fixed at 20% of the nodes; sweep gamma over 2500..2750 (step 50) with explore = 0.01.
explore = 0.01
gammas = list(range(2500, 2800, 50))
alpha = int(0.2 * num_nodes)

for gamma in gammas:
    for extend_metric in extend_metrics:
        test_run = run(
            seed=seed,
            dataset_name=dataset_name,
            model_name=model_name,
            extend_metric=extend_metric,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            explore=explore,
        )
        print(f"{extend_metric},{alpha},{gamma},{explore}", " | res:", test_run)
    # Blank line between gamma groups.
    print()

### Experiments Full

#### Test Settings and Run function 

In [19]:
def test_settings_similarity(name):
    if name == 'cora':
        num_nodes = 2708
        test_list_gamma = []
        test_list_explore = [0.1]
        percents = [0.4, 0.6, 0.8, 1]
        test_list_alpha_similarity = [int(num*num_nodes) for num in percents]
        for i in range(num_nodes):
            if i > 300: 
                if i % 200 == 0:
                    test_list_gamma.append(i)

    elif name == 'citeseer':
        num_nodes = 3327
        test_list_gamma = []
        test_list_explore = [0.1]
        percents = [0.4, 0.6, 0.8, 1]
        test_list_alpha_similarity = [int(num*num_nodes) for num in percents]
        for i in range(num_nodes):
            if i > 300: 
                if i % 200 == 0:
                    test_list_gamma.append(i)

    elif name == 'pubmed':
        num_nodes = 19717
        test_list_gamma = []
        test_list_explore = [0.1]
        percents = [0.2]
        test_list_alpha_similarity = [int(num*num_nodes) for num in percents]
        for i in range(num_nodes):
            if 799 < i < 10000: 
                if i % 800 == 0:
                    test_list_gamma.append(i)

    elif name == 'amazonphoto':
        num_nodes = 7650
        test_list_gamma = []
        test_list_explore = [0.004, 0.01]

        percents = [0.6]
        test_list_alpha_similarity = [int(num*num_nodes) for num in percents]
        # print(percents)

        for i in range(1, num_nodes):
                if i % 800 == 0:
                    test_list_gamma.append(i)
    elif name == 'amazoncomputer':
        num_nodes = 13752
        test_list_gamma = []
        test_list_explore = [0.01]
        percents = [0.3]
        test_list_alpha_similarity = [int(num*num_nodes) for num in percents]
        for i in range(1, num_nodes):
                if i % 800 == 0:
                    test_list_gamma.append(i)

    else: 
          raise("Not defined!")
    
    return test_list_alpha_similarity, test_list_gamma, test_list_explore

In [20]:
# Module-level flag; it is not read inside this cell — presumably consumed by
# run()/graph construction elsewhere in the notebook (TODO confirm).
add_self_loop = True
def run_experiments_similarity(seed, dataset_name, model_name, extend_metrics):
    """Sweep explore/alpha/gamma for each similarity metric and log runs that match or beat the baseline.

    A baseline value is first obtained from ``run(..., extend_metric='None')``.
    Then ``run`` is called for every combination of extension metric, explore,
    alpha and gamma (all three grids come from ``test_settings_similarity``);
    each result that is ``>=`` the baseline is appended as a CSV-style row to
    ``"{dataset_name}_{model_name}.txt"`` in the current working directory.
    That file is truncated and given a header at the start of every call.

    Args:
        seed: Integer seed applied to ``random``, NumPy, PyTorch (CPU and
            CUDA) and DGL, and forwarded to ``run``.
        dataset_name: Dataset key forwarded to ``test_settings_similarity``
            and ``run``.
        model_name: Model key forwarded to ``run`` and used in the file name.
        extend_metrics: Iterable of metric names, each forwarded to ``run``
            as ``extend_metric``.

    Returns:
        None. Results are written to the results file.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    dgl.random.seed(seed)

    # Values used for the baseline run. beta is never swept in this function,
    # so it stays at 1 for every later call as well.
    alpha = beta = gamma = explore = 1

    original_acc = run(seed=seed, dataset_name=dataset_name, extend_metric='None', model_name=model_name,
                       alpha=alpha, beta=beta, gamma=gamma, explore=explore)

    results_file = f"{dataset_name}_{model_name}.txt"
    with open(results_file, 'w') as f:
        f.write("Original, model:{}, accuracy:{}, seed:{}".format(model_name, original_acc, seed))
        f.write("\n\n")
        f.write("{},{},{},{},{},{}".format("accuracy", "alpha", "gamma", "explore", "model_name", "extend_metric"))
        f.write("\n--------------------------------------------\n")

    test_list_alpha_similarity, test_list_gamma, test_list_explore = test_settings_similarity(dataset_name)

    for extend_metric in extend_metrics:
        for explore in test_list_explore:
            for alpha in test_list_alpha_similarity:
                for gamma in tqdm(test_list_gamma):
                    acc = run(seed=seed, dataset_name=dataset_name, model_name=model_name,
                              alpha=alpha, beta=beta, gamma=gamma, explore=explore,
                              extend_metric=extend_metric)
                    if acc >= original_acc:
                        # Re-open in append mode per row so that finished rows
                        # are already on disk if a later run fails or is
                        # interrupted.
                        with open(results_file, 'a') as f:
                            f.write("{},{},{},{},{},{}".format(acc, alpha, gamma, explore, model_name, extend_metric))
                            f.write("\n")

#### Run 

In [None]:
# dataset_name: cora, citeseer, pubmed, amazonphoto, amazoncomputer
# model_name: gat, gatv2, gcn, sage

In [None]:
# run_experiments_similarity(seed=31, dataset_name='cora', model_name='gat', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=31, dataset_name='cora', model_name='gatv2', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=31, dataset_name='cora', model_name='gcn', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=31, dataset_name='cora', model_name='sage', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])

In [18]:
# run_experiments_similarity(seed=138, dataset_name='citeseer', model_name='gat', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=138, dataset_name='citeseer', model_name='gatv2', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=10, dataset_name='citeseer', model_name='gcn', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=10, dataset_name='citeseer', model_name='sage', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])

In [None]:
# run_experiments_similarity(seed=10, dataset_name='pubmed', model_name='gat', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=10, dataset_name='pubmed', model_name='gatv2', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=123, dataset_name='pubmed', model_name='gcn', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
run_experiments_similarity(seed=42, dataset_name='pubmed', model_name='sage', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])

In [None]:
# run_experiments_similarity(seed=11, dataset_name='amazonphoto', model_name='gat', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=10, dataset_name='amazonphoto', model_name='gatv2', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=123, dataset_name='amazonphoto', model_name='gcn', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=123, dataset_name='amazonphoto', model_name='sage', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])

In [None]:
# run_experiments_similarity(seed=11, dataset_name='amazoncomputer', model_name='gat', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=10, dataset_name='amazoncomputer', model_name='gatv2', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=123, dataset_name='amazoncomputer', model_name='gcn', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])
# run_experiments_similarity(seed=30, dataset_name='amazoncomputer', model_name='sage', extend_metrics = ["adamic_adar", "resource_alloc", "jaccard"])

In [None]:
# from google.colab import files
# files.download(results_file) 