In [1]:
import argparse
from dataset_utils import DataLoader
from utils import random_planetoid_splits
from GNN_models import GPRGNN_conv

import torch
import torch.nn.functional as F
from tqdm import tqdm

import torch_geometric.transforms as T
from torch_geometric.utils import negative_sampling
from pytorch_lightning import seed_everything
# Fix the global random seed right after the imports, before any later cell
# does stochastic work. The value 15 is repeated in the configuration cell.
seed_everything(15)

import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score
from torch_geometric.nn import GCNConv

Global seed set to 15


In [2]:
import argparse
import configparser
import argparse
from dataset_utils import DataLoader
from utils import random_planetoid_splits
from GNN_models import *

import torch
import torch.nn.functional as F
from tqdm import tqdm

from copy import deepcopy
import numpy as np
from pytorch_lightning import seed_everything
def _build_parser():
    """Return an ArgumentParser populated with every option of this notebook.

    Options are registered in a fixed order taken from a single table, so the
    resulting namespace (and its printed form) lists them in that order.
    """
    option_table = (
        ('--K', {'type': int, 'default': 10}),
        ('--alpha', {'type': float, 'default': 0.1}),
        ('--hidden', {'type': int, 'default': 64}),
        ('--dropout', {'type': float, 'default': 0.5}),
        ('--dprate', {'type': float, 'default': 0}),
        ('--Init', {'type': str,
                    'choices': ['SGC', 'PPR', 'NPPR', 'Random', 'WS', 'Null'],
                    'default': 'Random'}),
        ('--Gamma', {'default': None}),
        ('--ppnp', {'default': 'GPR_prop',
                    'choices': ['PPNP', 'GPR_prop']}),
        ('--dataset', {'default': 'squirrel',
                       'choices': ['chameleon', 'squirrel', 'film']}),
        ('--train_rate', {'type': float, 'default': 0.6}),
        ('--val_rate', {'type': float, 'default': 0.2}),
        ('--epochs', {'type': int, 'default': 1000}),
        ('--early_stopping', {'type': int, 'default': 200}),
        ('--lr', {'type': float, 'default': 0.002}),
        ('--weight_decay', {'type': float, 'default': 0.0005}),
        ('--RPMAX', {'type': int, 'default': 10}),
        ('--auto', {'default': False}),
    )
    built = argparse.ArgumentParser()
    for flag, options in option_table:
        built.add_argument(flag, **options)
    return built


parser = _build_parser()

# An empty argv is parsed on purpose: inside a notebook there is no command
# line, so every option takes its default value.
args = parser.parse_args([])
print('arg:', args)

# Re-seed, then load the dataset selected by the configuration.
seed_everything(15)
dname = args.dataset
dataset, data = DataLoader(dname)
Init = args.Init

Global seed set to 15


arg: Namespace(Gamma=None, Init='Random', K=10, RPMAX=10, alpha=0.1, auto=False, dataset='squirrel', dprate=0, dropout=0.5, early_stopping=200, epochs=1000, hidden=64, lr=0.002, ppnp='GPR_prop', train_rate=0.6, val_rate=0.2, weight_decay=0.0005)


In [3]:
# Build the link-level split transform first, then apply it. `data` is rebound
# to the transform's result, which the next cell unpacks into train/val/test.
link_split = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.05,
    is_undirected=True,
    add_negative_train_samples=False,
)
data = link_split(data)

In [4]:
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Unpack the three link-split parts and move each one to the chosen device.
train_data, val_data, test_data = (part.to(device) for part in data)

In [5]:
# Show the edge-label tensor shape of each split, one per line
# (train, validation, test).
print(*(part.edge_label.shape for part in (train_data, val_data, test_data)),
      sep='\n')

torch.Size([168720])
torch.Size([39698])
torch.Size([19848])


In [6]:
class Net(torch.nn.Module):
    """Link-prediction model: a GPRGNN encoder plus a linear edge scorer.

    Args:
        in_channels: Input feature size handed to ``GPRGNN_conv``.
        out_channels: Output size handed to ``GPRGNN_conv``; the edge scorer
            consumes two such vectors concatenated.
        num_classes: Accepted for interface compatibility; not used here.
        dropout: Dropout probability applied to embeddings before scoring.

    Note:
        The encoder is configured from the notebook-level ``args`` namespace.
    """

    def __init__(self, in_channels, out_channels, num_classes, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.encoder = GPRGNN_conv(in_channels, out_channels, args=args)
        # One output value per (source, target) embedding pair.
        self.edge_proj1 = torch.nn.Linear(2 * out_channels, 1)

    def encode(self, x, edge_index):
        """Run the encoder on ``x`` / ``edge_index`` and return its output."""
        return self.encoder(x, edge_index)

    def decode_edge(self, z, edge_label_index):
        """Score candidate edges.

        Dropout is applied to ``z`` (only in training mode), then the rows
        selected by ``edge_label_index[0]`` and ``edge_label_index[1]`` are
        concatenated along dim 1 and passed through the linear scorer.
        """
        dropped = torch.dropout(z, self.dropout, train=self.training)
        sources = dropped[edge_label_index[0]]
        targets = dropped[edge_label_index[1]]
        pair_features = torch.cat((sources, targets), dim=1)
        return self.edge_proj1(pair_features)

    def decode_all(self, z):
        """Return index pairs (as a 2 x N tensor) whose inner product is > 0."""
        scores = torch.matmul(z, z.t())
        return torch.nonzero(scores > 0, as_tuple=False).t()



In [7]:
# Instantiate the link-prediction network on the selected device. Both the
# encoder output size and the (unused) class count come from
# dataset.num_classes.
model = Net(
    in_channels=dataset.num_features,
    out_channels=dataset.num_classes,
    num_classes=dataset.num_classes,
).to(device)

# Fixed learning rate and no weight decay for this stage; the values from
# `args` are not used here.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0)

# Binary cross-entropy on raw logits: positive vs. sampled negative edges.
criterion = torch.nn.BCEWithLogitsLoss()

def train():
    """Run one optimisation step of the link-prediction objective.

    Encodes the training graph, draws a fresh set of negative edges (as many
    as there are labelled training edges), scores positives and negatives
    together and minimises ``criterion`` on the resulting logits.

    Returns:
        The loss tensor of this step.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(train_data.x, train_data.edge_index)

    # Negatives are re-sampled on every call, so each epoch sees new ones.
    sampled_negatives = negative_sampling(
        edge_index=train_data.edge_index,
        num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1),
        method='sparse',
    ).to(device)

    # Candidate edges: labelled training edges followed by the negatives.
    candidates = torch.cat(
        [train_data.edge_label_index, sampled_negatives], dim=-1)

    # Targets: the existing labels followed by zeros for the negatives.
    negative_targets = train_data.edge_label.new_zeros(sampled_negatives.size(1))
    targets = torch.cat([train_data.edge_label, negative_targets], dim=0)

    logits = model.decode_edge(embeddings, candidates).view(-1)
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test_edge(data):
    """Return the ROC-AUC of the model's edge scores on ``data``.

    The model is put in eval mode, the graph in ``data`` is encoded, the edges
    listed in ``data.edge_label_index`` are scored, and the sigmoid of those
    scores is compared against ``data.edge_label``.
    """
    model.eval()
    embeddings = model.encode(data.x, data.edge_index)
    logits = model.decode_edge(embeddings, data.edge_label_index)
    probabilities = logits.view(-1).sigmoid()
    labels = data.edge_label.cpu().numpy()
    return roc_auc_score(labels, probabilities.cpu().numpy())

# Train for 1000 epochs and keep the state that achieved the best validation
# AUC: its test AUC, the encoder's propagation coefficients and a deep copy of
# the model weights.
best_val_auc = final_test_auc = 0
best_val_acc = final_test_acc = 0
for epoch in range(1, 1001):
    loss = train()
    val_auc = test_edge(val_data)
    test_auc = test_edge(test_data)
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        final_test_auc = test_auc
        # Take an explicit copy: when the model lives on the CPU,
        # .detach().cpu().numpy() shares memory with the live parameter, so
        # later in-place optimizer updates would silently overwrite the
        # snapshot and best_temp would no longer match the best epoch.
        best_temp = model.encoder.prop1.temp.detach().cpu().numpy().copy()
        best_model_dict = deepcopy(model.state_dict())

    if epoch % 100 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
          f'Test: {test_auc:.4f}')
        print(best_temp)
print(f'Final Test: {final_test_auc:.4f}')

Epoch: 100, Loss: 0.3826, Val: 0.9312, Test: 0.9293
[ 0.44827567 -0.05709306  0.1695483   0.00950561  0.09784034  0.00276275
 -0.03042223 -0.15381549 -0.21595468 -0.25329435 -0.04284211]
Epoch: 200, Loss: 0.3724, Val: 0.9336, Test: 0.9320
[ 0.39782367 -0.11272177  0.24843474  0.07444385  0.15271318  0.06123738
 -0.00939125 -0.14402719 -0.24750811 -0.30726757 -0.13649332]
Epoch: 300, Loss: 0.3673, Val: 0.9356, Test: 0.9338
[ 0.37302589 -0.15991681  0.28408788  0.12392276  0.17925603  0.11345807
  0.00547589 -0.12364276 -0.26346194 -0.33278888 -0.19528616]
Epoch: 400, Loss: 0.3679, Val: 0.9366, Test: 0.9348
[ 0.36636784 -0.19421816  0.2957065   0.15777214  0.18692592  0.15211853
  0.0108205  -0.10718151 -0.27748144 -0.35051948 -0.24023334]
Epoch: 500, Loss: 0.3629, Val: 0.9371, Test: 0.9354
[ 0.36955899 -0.21966779  0.29575787  0.18633027  0.18771939  0.18895886
  0.01564491 -0.08736831 -0.2862714  -0.36003561 -0.27510558]
Epoch: 600, Loss: 0.3616, Val: 0.9376, Test: 0.9360
[ 0.37494104 

In [8]:
# Persist the coefficients captured at the best validation epoch.
# The target directory is created first so that a missing ./results folder
# does not raise FileNotFoundError after the whole training run has finished.
import os

os.makedirs('./results', exist_ok=True)
np.save(
    f'./results/gamma_{args.K}_{args.Init}_{args.dataset}_gprgnn_unsupervised.npy', best_temp)

In [9]:
def test_logistic(train_z, train_y, test_z, test_y, solver='lbfgs',
             multi_class='auto', *args, **kwargs):
        r"""Evaluates latent space quality via a logistic regression downstream
        task."""
        from sklearn.linear_model import LogisticRegression

        clf = LogisticRegression(solver=solver, multi_class=multi_class, *args,
                                 **kwargs).fit(train_z.detach().cpu().numpy(),
                                               train_y.detach().cpu().numpy())
        return clf.score(test_z.detach().cpu().numpy(),
                         test_y.detach().cpu().numpy())


In [10]:
# Reload the dataset from scratch (the earlier `data` was replaced by the
# link-split result) and build the node-level train/val/test masks.
dname = args.dataset
dataset, data = DataLoader(dname)
data = data.to(device)

train_rate, val_rate = args.train_rate, args.val_rate

# Training budget per class and total validation budget, both rounded to
# whole counts.
percls_trn = int(round(train_rate * len(data.y) / dataset.num_classes))
val_lb = int(round(val_rate * len(data.y)))

# Fraction of labels actually consumed by train + validation after rounding.
labelled_count = percls_trn * dataset.num_classes + val_lb
TrueLBrate = labelled_count / len(data.y)
print('True Label rate: ', TrueLBrate)

permute_masks = random_planetoid_splits
data = permute_masks(data, dataset.num_classes, percls_trn, val_lb)


True Label rate:  0.7998461834262642


In [11]:
# Restore the weights saved at the best validation epoch and evaluate the
# resulting embeddings with a logistic-regression classifier.
model.load_state_dict(best_model_dict)
model.eval()
# Pure inference: disable autograd so no computation graph is built for the
# forward pass over the full dataset.
with torch.no_grad():
    z = model.encode(data.x, data.edge_index)
acc = test_logistic(z[data.train_mask], data.y[data.train_mask],
                     z[data.test_mask], data.y[data.test_mask], max_iter=150)
print(acc)

0.22478386167146974
